## Load Files and Vectorize 

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [2]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [3]:
with open(r"data/df_best_category.pkl", "rb") as input_file:
    df_best_category = pickle.load(input_file)

In [4]:
with open (r"data/dictionary.pkl", "rb") as input_file: 
    dictionary = pickle.load(input_file)

In [5]:
with open (r"data/lda_model_1.pkl", "rb") as input_file:
    lda_model_1 = pickle.load(input_file)

In [6]:
with open(r"data/df_best_category.pkl", 'rb') as input_file:
      df_best_category = pickle.load(input_file)

In [7]:
df_best_category.head()

Unnamed: 0,category,article_index
0,2,0
1,5,1
2,5,2
3,4,3
4,6,4


## Fourth Model 

In [290]:
import textstat
import operator
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from operator import itemgetter

In [291]:
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text, lemmatizer=lemmatizer):
    return lemmatizer.lemmatize(text, pos='v')

In [292]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [293]:
def give_simpler_level(text, vectorizer=vectorizer):
    """
    Takes in text and returns a harder level read
    """
    unseen_document = str(text)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_vector = dictionary.doc2bow(preprocess(unseen_document))
    text_topics = lda_model_1[bow_vector]
    topic_group = max(text_topics,key=itemgetter(1))[0]
    
    category_df = df_best_category.loc[df_best_category['category'] == topic_group] 
    
#     #find all the articles that are easier than this article 
#     all_harder_text = []
#     input_score = textstat.flesch_kincaid_grade(text)
#     for i in range(len(df_corpus2['score'])):
#         if df_corpus2['score'][i] > input_score:
#             all_harder_text.append(df_corpus2['content'][i])
    
    all_harder_text = []
    
    for i in category_df['article_index']: 
        temp = df_corpus2['content'][i]
        all_harder_text.append(temp)
        
    vec = TfidfVectorizer(stop_words='english', max_features=2000)
    vec = vec.fit(all_harder_text)
    corpus2_vectors = vec.transform(all_harder_text).toarray()
    user_doc = text
    user_doc_vector = vec.transform([user_doc]).toarray() 
    #user_doc_vector_short = np.argsort(user_doc_vector)
    
    distances = cdist(user_doc_vector,
                  corpus2_vectors,
                  metric='cosine')[0]
    ranking = np.argsort(distances)
    top = ranking[0]
    best_match = df_corpus2['content'][top]
        
    print(distances[top])
    return (best_match)

In [294]:
sample = "Wine is an alcoholic drink made from fermented grapes.[1] Yeast consumes the sugar in the grapes and converts it to ethanol, carbon dioxide, and heat. Different varieties of grapes and strains of yeasts produce different styles of wine. These variations result from the complex interactions between the biochemical development of the grape, the reactions involved in fermentation, the terroir, and the production process. Many countries enact legal appellations intended to define styles and qualities of wine. These typically restrict the geographical origin and permitted varieties of grapes, as well as other aspects of wine production. Wines not made from grapes include rice wine and fruit wines such as plum, cherry, pomegranate, currant and elderberry."

In [295]:
give_harder_level(sample)

0.3362925132672765


"Hallmark Movies Now (formerly called Feeln and originally SpiritClips) is an on-demand streaming video service that offers family-friendly movies, documentaries and short films. The company was founded in 2007 by Academy Award-winning producer Robert N. Fried. Hallmark Movies Now is owned by Hallmark Cards and based out of Los Angeles.\n\nHistory\nHallmark Movies Now was founded in 2007 as SpiritClips. The company was started by Robert N. Fried, a film producer and studio executive who has produced movies such as Rudy and Hoosiers. Fried created SpiritClips as a collection of family-friendly and inspirational films.In April 2012, SpiritClips was acquired by Hallmark Cards, the largest greeting card manufacturer in the United States. Prior to the acquisition, SpiritClips had been working with Hallmark as the official provider for Hallmark's Hall of Fame series of inspirational films. In September 2014, the company was rebranded as Feeln. In October 2017, the service was rebranded as Ha

# Experiment

In [82]:
df_corpus2.head()

Unnamed: 0,content,score
0,"Nature Reviews Nephrology (before 2009, Nature...",20.6
1,Food blogging represents a complex interweavin...,13.6
2,The International Motorcycle Shows are a serie...,13.3
3,"Education in Ancient Greece was vastly ""democr...",11.3
4,Rural economics is the study of rural economie...,12.5


In [125]:
n_features = 2000

In [231]:
def load_vectorizer(pickle_file='data/vectorizer.pkl'):
    """Loads the trained TF/IDF vectorizer."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
    
def load_corpus_vectors(pickle_file='data/corpus_vectors.pkl'):
    """Loads the corpus vectors."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)

In [232]:
vec = load_vectorizer()
corpus_vectors = load_corpus_vectors().toarray()

In [314]:
sample2 = """Over the course of evolution, organism size has diversified markedly. Changes in size are thought to have occurred because of developmental, morphological and/or ecological pressures. To perform phylogenetic tests of the potential effects of these pressures, here we generated a dataset of more than ten thousand descriptions of insect eggs, and combined these with genetic and life-history datasets. We show that, across eight orders of magnitude of variation in egg volume, the relationship between size and shape itself evolves, such that previously predicted global patterns of scaling do not adequately explain the diversity in egg shapes. We show that egg size is not correlated with developmental rate and that, for many insects, egg size is not correlated with adult body size. Instead, we find that the evolution of parasitoidism and aquatic oviposition help to explain the diversification in the size and shape of insect eggs. Our study suggests that where eggs are laid, rather than universal allometric constants, underlies the evolution of insect egg size and shape.

"""

In [315]:
sample_vector = vec.transform([sample2]).toarray()

In [316]:
sample_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [317]:
feature_ranking = np.argsort(sample_vector[0])[::-1]

In [318]:
feature_ranking[:20]

array([1693, 1671,  703,  725,  636,  840,  150,  608, 1341, 1332, 1943,
       1905, 1794, 1272, 1499,  422, 1407, 1425,  496,  649])

In [319]:
def get_vocab_arr(vec):
    n_features = len(vec.vocabulary_)
    vocab_arr = np.empty(n_features, dtype=object)
    for word, idx in vec.vocabulary_.items():
        vocab_arr[idx] = word
    return vocab_arr

In [320]:
vocab_arr = get_vocab_arr(vec)
vocab_arr[feature_ranking[:20]]

array(['size', 'shape', 'evolution', 'explain', 'ecological', 'genetic',
       'adult', 'diversity', 'perform', 'patterns', 'volume', 'universal',
       'suggests', 'occurred', 'rate', 'combined', 'potential',
       'previously', 'course', 'effects'], dtype=object)

In [321]:

def get_top_k_vector(vector, feature_ranking, k=20):
    """Return the top k vector according to feature_ranking."""
    return vector[:, feature_ranking[:k]]
    

In [322]:
sample_vector.shape, corpus_vectors.shape

((1, 2000), (14216, 2000))

In [323]:
(
    get_top_k_vector(sample_vector, feature_ranking).shape,
    get_top_k_vector(corpus_vectors, feature_ranking).shape
)

((1, 20), (14216, 20))

In [324]:
distances = cdist(
    get_top_k_vector(sample_vector, feature_ranking),
    get_top_k_vector(corpus_vectors, feature_ranking),
)

In [325]:
get_top_k_vector(sample_vector, feature_ranking)

array([[0.74080085, 0.31553463, 0.30303357, 0.2056208 , 0.12568278,
        0.1199519 , 0.11225869, 0.11158558, 0.10528558, 0.10059519,
        0.09742111, 0.09688993, 0.09660782, 0.09636812, 0.09622919,
        0.09246032, 0.09134841, 0.08726221, 0.08724759, 0.08648294]])

In [326]:
nearest_article_idxs = np.argsort(distances)

In [327]:
nearest_article_idxs.shape

(1, 14216)

In [328]:
nearest_article_idxs[0, 0]

6973

In [329]:
distances[0]

array([0.96672001, 0.97049476, 0.97224366, ..., 0.9392796 , 0.96603653,
       0.97224366])

In [330]:
nearest_articles = df_corpus2.loc[nearest_article_idxs[0], :]

In [331]:
nearest_articles

Unnamed: 0,content,score
6973,Sexual dimorphism is the condition where the t...,14.8
10607,"Norm, the Norm or NORM may refer to:\n\nIn aca...",56.6
206,The study of geometric morphometrics in anthro...,16.6
1648,The Toyota New Global Architecture (TNGA) are ...,13.1
1977,"Anthropometry (from Greek ἄνθρωπος anthropos, ...",18.7
10522,Central place theory is a geographical theory ...,19.1
5281,The history of anthropometry includes the use ...,15.7
1966,A lek is an aggregation of male animals gather...,12.2
2594,Northern Manitoba is the most northerly region...,12.3
8006,This list displays the official cities in the ...,15.8


In [332]:
nearest_articles.loc[1664, 'content']

'The end of history is a political and philosophical concept that supposes that a particular political, economic, or social system may develop that would constitute the end-point of humanity\'s sociocultural evolution and the final form of human government. \nEnd of history or The End of History may also refer to:\n\n"The End of History?", 1989 essay by Francis Fukuyama on the political and philosophical concept, published in The National Interest\nThe End of History and the Last Man, 1992 political book by Fukuyama expanding on his 1989 essay\nThe End of History (album), a 2006 album by Fionn Regan\nEnd-of-history illusion, a psychological illusion that one will not undergo significant developmental changes in the future\nThe End of History, a 55% ABV beer made by the BrewDog brewery and packaged inside small stuffed animals\nNorte, the End of History, 2013 Filipino drama film\n\nSee also\nEnd of the world (disambiguation)'