## Load Files and Vectorize 

In [26]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [27]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [28]:
from sklearn.cluster import KMeans # import from sklearn
k_means = KMeans(n_clusters=3) # Must set number of clusters at initialization time!
k_means.fit(df_corpus2) # Run the clustering algorithm
cluster_assignments = k_means.predict(df_corpus2)

ValueError: could not convert string to float: 'Doubles is a common street food of Trinidadian origin. It is a common breakfast item and snack of locals made with two baras (flat fried dough) filled with curry channa (curried chickpeas). Doubles was first created in Fairfield, Princes Town by Emamool Deen (a.k.a. Mamudeen).\n\nSee also\nCuisine of Trinidad and Tobago\n\nReferences\nExternal links\nThe Origin of Trinidad\nDoubles Recipe'

In [48]:
vec = TfidfVectorizer(stop_words='english', max_features=10)

In [49]:
vec = vec.fit(df_corpus2['content'])

In [50]:
#vec.vocabulary_

In [51]:
corpus2_vectors = vec.transform(df_corpus2['content']).toarray()

In [52]:
user_doc = "A watermelon is a type of edible fruit, they are 92% water. About 6% of a watermelon is sugar, which makes it very sweet. There are many different types of watermelon. Some have a green rind on the outside and a red-pink flesh on the inside, with brown seeds. Some can have yellow flesh, and some can be seedless. The green rind on the outside is not usually eaten, though it can be used as a vegetable. It can also be stewed or pickled. Most watermelons are oblong or spherical. In Japan, watermelons are grown in different shapes. Many people like to eat watermelon in the summer because the fruit is cool and refreshing. Watermelons are a great source of vitamin A, vitamin C, vitamin B6 and vitamin B1. They also contain potassium, magnesium, carotenoid antioxidant, and lycopene. Watermelons are fruits that come from a vine-like plant."

In [53]:
user_doc_vector = vec.transform([user_doc]).toarray()

In [54]:
user_doc_vector

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.72840959, 0.        , 0.        , 0.68514193, 0.        ]])

In [55]:
distances = cdist(user_doc_vector,
                  corpus2_vectors,
                  metric='cosine')[0]

In [56]:
distances.shape

(14216,)

In [57]:
ranking = np.argsort(distances)

In [58]:
ranking.shape

(14216,)

In [59]:
top_ten = ranking[:10]
top_ten

array([ 3879,  7216,  8986,  1943, 11400, 10636,  9629,  8609,  5384,
        1488])

In [60]:
distances[top_ten]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Second Model 

In [74]:
import textstat
import operator
import wikipedia
import wikipediaapi

def give_harder_level(text):
    """
    Takes in text and returns a harder level read
    """
    ##find all the articles that are easier than this article 
    all_harder_text = []
    input_score = textstat.flesch_kincaid_grade(text)
    for i in range(len(df_corpus2['score'])):
        if df_corpus2['score'][i] > input_score:
            all_harder_text.append(df_corpus2['content'][i])
    
    vec = TfidfVectorizer(stop_words='english', max_features=2000)
    vec = vec.fit(all_harder_text)
    corpus2_vectors = vec.transform(all_harder_text).toarray()
    user_doc = text
    user_doc_vector = vec.transform([user_doc]).toarray()    
    distances = cdist(user_doc_vector,
                  corpus2_vectors,
                  metric='cosine')[0]
    ranking = np.argsort(distances)
    top = ranking[0]
    best_match = df_corpus2['content'][top]
        
    print(distances[top])
    return (best_match)

In [77]:
sample = "A watermelon is a type of edible fruit."

In [78]:
give_harder_level(sample)

0.19034598889396925


'Throughout its history, the United Kingdom has been a major producer and source of musical creation, drawing its artistic basis from the history of the United Kingdom, from church music, Western culture and the ancient and traditional folk music and instrumentation of England, Scotland, Northern Ireland and Wales.\nIn parts the 20th century, influences from the music of the United States became dominant in popular music. Following this was the explosion of the British Invasion, while subsequent notable movements in British music include the new wave of British heavy metal and Britpop. The United Kingdom has one of the world\'s largest music industries today, with many British musicians having influenced modern music.\n\nBackground\nEarly music\nMusic in the British Isles, from the earliest recorded times until the Baroque and the rise of recognisably modern classical music, was a diverse and rich culture, including sacred and secular music and ranging from the popular to the elite. Ea