## Load Files and Vectorize 

In [4]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [5]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [48]:
vec = TfidfVectorizer(stop_words='english', max_features=10)

In [49]:
vec = vec.fit(df_corpus2['content'])

In [50]:
#vec.vocabulary_

In [51]:
corpus2_vectors = vec.transform(df_corpus2['content']).toarray()

In [52]:
user_doc = "A watermelon is a type of edible fruit, they are 92% water. About 6% of a watermelon is sugar, which makes it very sweet. There are many different types of watermelon. Some have a green rind on the outside and a red-pink flesh on the inside, with brown seeds. Some can have yellow flesh, and some can be seedless. The green rind on the outside is not usually eaten, though it can be used as a vegetable. It can also be stewed or pickled. Most watermelons are oblong or spherical. In Japan, watermelons are grown in different shapes. Many people like to eat watermelon in the summer because the fruit is cool and refreshing. Watermelons are a great source of vitamin A, vitamin C, vitamin B6 and vitamin B1. They also contain potassium, magnesium, carotenoid antioxidant, and lycopene. Watermelons are fruits that come from a vine-like plant."

In [53]:
user_doc_vector = vec.transform([user_doc]).toarray()

In [54]:
user_doc_vector

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.72840959, 0.        , 0.        , 0.68514193, 0.        ]])

In [55]:
distances = cdist(user_doc_vector,
                  corpus2_vectors,
                  metric='cosine')[0]

In [56]:
distances.shape

(14216,)

In [57]:
ranking = np.argsort(distances)

In [58]:
ranking.shape

(14216,)

In [59]:
top_ten = ranking[:10]
top_ten

array([ 3879,  7216,  8986,  1943, 11400, 10636,  9629,  8609,  5384,
        1488])

In [60]:
distances[top_ten]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [61]:
df_corpus2.loc[ranking, 'content']

3879     With a climate as varied and extreme as India,...
7216     International finance (also referred to as int...
8986     A pakudos is a visual motif used by the Mangya...
1943     Construct (psychology), also hypothetical cons...
11400    The epicanthic fold is the skin fold of the up...
10636    Chuan (Chinese: 串, Dungan: Чўан, pinyin: chuàn...
9629     Global Village Tech Park is a software technol...
8609     Tejuíno is a cold beverage made from fermented...
5384     Gens was used by Lewis H. Morgan (in Ancient S...
1488     "Natural Science" is a song by the Canadian ro...
10581    Argument to moderation (Latin: argumentum ad t...
8306     Burgess may refer to:\n\nPeople\nBurgess (name...
11978    Trust for Nature is a not-for-profit organisat...
7376     Kochi is a city in the Indian state of Kerala....
7953     A self-directed Individual Retirement Account ...
6197     Vets Prevail, is a veteran-created online ment...
13135    Humans of some cultures eat octopus. The arms .

## Second Model 

In [10]:
import textstat
import operator
import wikipedia
import wikipediaapi

def give_harder_level(text):
    """
    Takes in text and returns a harder level read
    """
    ##find all the articles that are easier than this article 
    all_harder_text = []
    input_score = textstat.flesch_kincaid_grade(text)
    for i in range(len(df_corpus2['score'])):
        if df_corpus2['score'][i] > input_score:
            all_harder_text.append(df_corpus2['content'][i])
    
    vec = TfidfVectorizer(stop_words='english', max_features=2000)
    vec = vec.fit(all_harder_text)
    corpus2_vectors = vec.transform(all_harder_text).toarray()
    user_doc = text
    user_doc_vector = vec.transform([user_doc]).toarray() 
    user_doc_vector_short = np.argsort(user_doc_vector)[-25:]
    distances = cdist(user_doc_vector_short,
                  corpus2_vectors[-25:],
                  metric='cosine')[0]
    ranking = np.argsort(distances)
    top = ranking[0]
    best_match = df_corpus2['content'][top]
        
    print(distances[top])
    return (best_match)

In [11]:
sample = "On one level, this is a business dispute that highlights tensions in the music industry (both intractably ancient and very current). When Swift signed her six-album deal that furnished Big Machine with rights to her masters, she was partaking in a classic arrangement for new artists: handing over future control of music in exchange for start-up promotional, recording, and distribution help. As she’s risen to megastardom, she’s chafed at that arrangement the same way that many successful musicians have chafed at not having ownership of their work. Prince, who famously protested Warner Bros. Records in 1993 by writing the word slave on his cheek, struck a deal for his own back catalog at great cost in 2014. Paul McCartney preached for years about the importance of artists owning music—and one person he preached to, Michael Jackson, ended up buying the rights to the Beatles’ catalog from under him. West himself recently filed suit to gain control of his own masters."

In [12]:
give_harder_level(sample)

0.6822204939076502


'Value or values may refer to:\n\nValue (philosophy),\nValue (ethics) it may be described as treating actions themselves as abstract objects, putting value to them\nValues (Western philosophy) expands the notion of value beyond that of ethics, but limited to Western sources\nSocial imaginary is the set of values, institutions, laws, and symbols common to a particular social group\nValue (economics), a measure of the benefit that may be gained from goods or service\nTheory of value (economics), the study of the concept of economic value\nValue (marketing), the difference between a customer\'s evaluation of benefits and costs\nValue investing, an investment paradigm\n\nOther uses\nValue, also known as lightness or tone, a representation of variation in the perception of a color or color space\'s brightness\nValue (computer science), an expression that implies no (further) (mathematical) processing; a "normal form"\nValue (mathematics), a property such as number assigned to or calculated 