## Load Files and Vectorize 

In [5]:
import os
import numpy as np
import pandas as pd
import pickle
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [6]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [7]:
with open(r"data/df_best_category.pkl", "rb") as input_file:
    df_best_category = pickle.load(input_file)

In [8]:
with open (r"data/dictionary.pkl", "rb") as input_file: 
    dictionary = pickle.load(input_file)

In [9]:
with open (r"data/lda_model_1.pkl", "rb") as input_file:
    lda_model_1 = pickle.load(input_file)

In [11]:
df_best_category.head()

Unnamed: 0,category,article_index
0,2,0
1,5,1
2,5,2
3,4,3
4,6,4


## Fourth Model 

In [290]:
import textstat
import operator
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from operator import itemgetter

In [291]:
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text, lemmatizer=lemmatizer):
    return lemmatizer.lemmatize(text, pos='v')

In [292]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [293]:
def give_simpler_level(text, vectorizer=vectorizer):
    """
    Takes in text and returns a harder level read
    """
    unseen_document = str(text)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_vector = dictionary.doc2bow(preprocess(unseen_document))
    text_topics = lda_model_1[bow_vector]
    topic_group = max(text_topics,key=itemgetter(1))[0]
    
    category_df = df_best_category.loc[df_best_category['category'] == topic_group] 
    
#     #find all the articles that are easier than this article 
#     all_harder_text = []
#     input_score = textstat.flesch_kincaid_grade(text)
#     for i in range(len(df_corpus2['score'])):
#         if df_corpus2['score'][i] > input_score:
#             all_harder_text.append(df_corpus2['content'][i])
    
    all_harder_text = []
    
    for i in category_df['article_index']: 
        temp = df_corpus2['content'][i]
        all_harder_text.append(temp)
        
    vec = TfidfVectorizer(stop_words='english', max_features=2000)
    vec = vec.fit(all_harder_text)
    corpus2_vectors = vec.transform(all_harder_text).toarray()
    user_doc = text
    user_doc_vector = vec.transform([user_doc]).toarray() 
    #user_doc_vector_short = np.argsort(user_doc_vector)
    
    distances = cdist(user_doc_vector,
                  corpus2_vectors,
                  metric='cosine')[0]
    ranking = np.argsort(distances)
    top = ranking[0]
    best_match = df_corpus2['content'][top]
        
    print(distances[top])
    return (best_match)

In [294]:
sample = "Wine is an alcoholic drink made from fermented grapes.[1] Yeast consumes the sugar in the grapes and converts it to ethanol, carbon dioxide, and heat. Different varieties of grapes and strains of yeasts produce different styles of wine. These variations result from the complex interactions between the biochemical development of the grape, the reactions involved in fermentation, the terroir, and the production process. Many countries enact legal appellations intended to define styles and qualities of wine. These typically restrict the geographical origin and permitted varieties of grapes, as well as other aspects of wine production. Wines not made from grapes include rice wine and fruit wines such as plum, cherry, pomegranate, currant and elderberry."

In [295]:
give_harder_level(sample)

0.3362925132672765


"Hallmark Movies Now (formerly called Feeln and originally SpiritClips) is an on-demand streaming video service that offers family-friendly movies, documentaries and short films. The company was founded in 2007 by Academy Award-winning producer Robert N. Fried. Hallmark Movies Now is owned by Hallmark Cards and based out of Los Angeles.\n\nHistory\nHallmark Movies Now was founded in 2007 as SpiritClips. The company was started by Robert N. Fried, a film producer and studio executive who has produced movies such as Rudy and Hoosiers. Fried created SpiritClips as a collection of family-friendly and inspirational films.In April 2012, SpiritClips was acquired by Hallmark Cards, the largest greeting card manufacturer in the United States. Prior to the acquisition, SpiritClips had been working with Hallmark as the official provider for Hallmark's Hall of Fame series of inspirational films. In September 2014, the company was rebranded as Feeln. In October 2017, the service was rebranded as Ha

# Experiment

In [82]:
df_corpus2.head()

Unnamed: 0,content,score
0,"Nature Reviews Nephrology (before 2009, Nature...",20.6
1,Food blogging represents a complex interweavin...,13.6
2,The International Motorcycle Shows are a serie...,13.3
3,"Education in Ancient Greece was vastly ""democr...",11.3
4,Rural economics is the study of rural economie...,12.5


In [125]:
n_features = 2000

In [53]:
def load_vectorizer(pickle_file='data/vectorizer.pkl'):
    """Loads the trained TF/IDF vectorizer."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
    
def load_corpus_vectors(pickle_file='data/corpus_vectors.pkl'):
    """Loads the corpus vectors."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)

In [54]:
def get_vocab_arr(vec):
    n_features = len(vec.vocabulary_)
    vocab_arr = np.empty(n_features, dtype=object)
    for word, idx in vec.vocabulary_.items():
        vocab_arr[idx] = word
    return vocab_arr

In [55]:

def get_top_k_vector(vector, feature_ranking, k=20):
    """Return the top k vector according to feature_ranking."""
    return vector[:, feature_ranking[:k]]
    

In [56]:
sample2 = """Data science is the study of the extraction of knowledge from data. It uses various techniques from many fields, including signal processing, mathematics, probability, machine learning, computer programming, statistics, data engineering, pattern matching, and data visualization, with the goal of extracting useful knowledge from the data. With computer systems able to handle more data, big data is an important aspect of data science.

A person that does data science is called a data scientist. Data scientists solve complicated data problems using mathematics, statistics and computer science, although very good skill in these subjects are not required.[1] However, a data scientist is most likely to be an expert in only one or two of these disciplines, meaning that cross disciplinary teams can be a key component of data science.

Good data scientists are able to apply their skills to achieve many kinds of purposes. Their skills and competencies vary widely.
"""

In [57]:
def vectorize_text(text):
    vec = load_vectorizer()
    corpus_vectors = load_corpus_vectors().toarray()
    sample_vector = vec.transform([sample2]).toarray()
    feature_ranking = np.argsort(sample_vector[0])[::-1]
    vocab_arr = get_vocab_arr(vec)
    
    distances = cdist(
    get_top_k_vector(sample_vector, feature_ranking),
    get_top_k_vector(corpus_vectors, feature_ranking),
    )
    
    nearest_article_idxs = np.argsort(distances)
    nearest_articles = df_corpus2.loc[nearest_article_idxs[0], :]
    top_10 = nearest_articles[:10]
    
    return top_10.sort_values(['score'])

In [58]:
vectorize_text(sample2)

Unnamed: 0,content,score
11080,Globalize is a cross-platform JavaScript libra...,12.4
9969,Scientific Data is a peer-reviewed open access...,13.1
11804,"A database is an organized collection of data,...",15.6
12240,A data model (or datamodel) is an abstract mod...,15.8
10389,"In computer science, a collection or container...",16.9
9193,A column-oriented DBMS (or columnar database m...,17.3
6182,Rare or extreme events are events that occur w...,18.1
8130,Data science is a multi-disciplinary field tha...,18.4
10891,A geographic information system (GIS) is a sys...,20.4
7678,The GESIS – Leibniz Institute for the Social S...,20.9


In [13]:
vec = load_vectorizer()
corpus_vectors = load_corpus_vectors().toarray()

In [15]:
sample_vector = vec.transform([sample2]).toarray()

In [16]:
sample_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [17]:
feature_ranking = np.argsort(sample_vector[0])[::-1]

In [18]:
feature_ranking[:20]

array([ 525, 1632,  446, 1759, 1162, 1694, 1635,  118, 1043,  860, 1455,
       1442, 1129, 1331,  228, 1916,  127, 1483,  674, 1786])

In [20]:
vocab_arr = get_vocab_arr(vec)
vocab_arr[feature_ranking[:20]]

array(['data', 'science', 'computer', 'statistics', 'mathematics',
       'skills', 'scientists', 'able', 'knowledge', 'good', 'programming',
       'processing', 'machine', 'pattern', 'aspect', 'useful', 'achieve',
       'purposes', 'engineering', 'subjects'], dtype=object)

In [22]:
sample_vector.shape, corpus_vectors.shape

((1, 2000), (14216, 2000))

In [23]:
(
    get_top_k_vector(sample_vector, feature_ranking).shape,
    get_top_k_vector(corpus_vectors, feature_ranking).shape
)

((1, 20), (14216, 20))

In [24]:
distances = cdist(
    get_top_k_vector(sample_vector, feature_ranking),
    get_top_k_vector(corpus_vectors, feature_ranking),
)

In [25]:
get_top_k_vector(sample_vector, feature_ranking)

array([[0.83470286, 0.22166612, 0.18837988, 0.14557046, 0.13767567,
        0.13719084, 0.13478882, 0.1069378 , 0.10534487, 0.09932174,
        0.07654316, 0.07365873, 0.07297837, 0.07146458, 0.06887271,
        0.06806918, 0.06750797, 0.06658652, 0.06529604, 0.06506513]])

In [26]:
nearest_article_idxs = np.argsort(distances)

In [27]:
nearest_article_idxs.shape

(1, 14216)

In [28]:
nearest_article_idxs[0, 0]

8130

In [29]:
distances[0]

array([0.96926104, 0.96382101, 0.96926104, ..., 0.94577355, 0.96527653,
       0.96926104])

In [30]:
nearest_articles = df_corpus2.loc[nearest_article_idxs[0], :]

In [48]:
top_10 = nearest_articles[:10]

In [49]:
top_10.sort_values(['score'])

Unnamed: 0,content,score
11080,Globalize is a cross-platform JavaScript libra...,12.4
9969,Scientific Data is a peer-reviewed open access...,13.1
11804,"A database is an organized collection of data,...",15.6
12240,A data model (or datamodel) is an abstract mod...,15.8
10389,"In computer science, a collection or container...",16.9
9193,A column-oriented DBMS (or columnar database m...,17.3
6182,Rare or extreme events are events that occur w...,18.1
8130,Data science is a multi-disciplinary field tha...,18.4
10891,A geographic information system (GIS) is a sys...,20.4
7678,The GESIS – Leibniz Institute for the Social S...,20.9


In [None]:
def show_article(dataframe):
    

In [52]:
nearest_articles.loc[11804, 'content']

'A database is an organized collection of data, generally stored and accessed electronically from a computer system.  Where databases are more complex they are often developed using formal design and modeling techniques.\nThe database management system (DBMS) is the software that interacts with end users, applications, and the database itself to capture and analyze the data.  The DBMS software additionally encompasses the core facilities provided to administer the database.  The sum total of the database, the DBMS and the associated applications can be referred to as a "database system".  Often the term "database" is also used to loosely refer to any of the DBMS, the database system or an application associated with the database.\nComputer scientists may classify database-management systems according to the database models that they support. Relational databases became dominant in the 1980s. These model data as rows and columns in a series of tables, and the vast majority use SQL for w