## Load Files and Vectorize 

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [2]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [3]:
with open(r"data/df_best_category.pkl", "rb") as input_file:
    df_best_category = pickle.load(input_file)

In [4]:
with open (r"data/dictionary.pkl", "rb") as input_file: 
    dictionary = pickle.load(input_file)

In [5]:
with open (r"data/lda_model_1.pkl", "rb") as input_file:
    lda_model_1 = pickle.load(input_file)

In [6]:
with open(r"data/df_best_category.pkl", 'rb') as input_file:
      df_best_category = pickle.load(input_file)

In [42]:
df_best_category.head()

Unnamed: 0,category,article_index
0,2,0
1,5,1
2,5,2
3,4,3
4,6,4


## Third Model 

In [290]:
import textstat
import operator
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from operator import itemgetter

In [291]:
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text, lemmatizer=lemmatizer):
    return lemmatizer.lemmatize(text, pos='v')

In [292]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [293]:
def give_harder_level(text, vectorizer=vectorizer):
    """
    Takes in text and returns a harder level read
    """
    unseen_document = str(text)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_vector = dictionary.doc2bow(preprocess(unseen_document))
    text_topics = lda_model_1[bow_vector]
    topic_group = max(text_topics,key=itemgetter(1))[0]
    
    category_df = df_best_category.loc[df_best_category['category'] == topic_group] 
    
    ##find all the articles that are easier than this article 
#     all_harder_text = []
#     input_score = textstat.flesch_kincaid_grade(text)
#     for i in range(len(df_corpus2['score'])):
#         if df_corpus2['score'][i] > input_score:
#             all_harder_text.append(df_corpus2['content'][i])
    
    all_harder_text = []
    
    for i in category_df['article_index']: 
        temp = df_corpus2['content'][i]
        all_harder_text.append(temp)
        
    vec = TfidfVectorizer(stop_words='english', max_features=2000)
    vec = vec.fit(all_harder_text)
    corpus2_vectors = vec.transform(all_harder_text).toarray()
    user_doc = text
    user_doc_vector = vec.transform([user_doc]).toarray() 
    #user_doc_vector_short = np.argsort(user_doc_vector)
    
    distances = cdist(user_doc_vector,
                  corpus2_vectors,
                  metric='cosine')[0]
    ranking = np.argsort(distances)
    top = ranking[0]
    best_match = df_corpus2['content'][top]
        
    print(distances[top])
    return (best_match)

In [294]:
sample = "Wine is an alcoholic drink made from fermented grapes.[1] Yeast consumes the sugar in the grapes and converts it to ethanol, carbon dioxide, and heat. Different varieties of grapes and strains of yeasts produce different styles of wine. These variations result from the complex interactions between the biochemical development of the grape, the reactions involved in fermentation, the terroir, and the production process. Many countries enact legal appellations intended to define styles and qualities of wine. These typically restrict the geographical origin and permitted varieties of grapes, as well as other aspects of wine production. Wines not made from grapes include rice wine and fruit wines such as plum, cherry, pomegranate, currant and elderberry."

In [295]:
give_harder_level(sample)

0.3362925132672765


"Hallmark Movies Now (formerly called Feeln and originally SpiritClips) is an on-demand streaming video service that offers family-friendly movies, documentaries and short films. The company was founded in 2007 by Academy Award-winning producer Robert N. Fried. Hallmark Movies Now is owned by Hallmark Cards and based out of Los Angeles.\n\nHistory\nHallmark Movies Now was founded in 2007 as SpiritClips. The company was started by Robert N. Fried, a film producer and studio executive who has produced movies such as Rudy and Hoosiers. Fried created SpiritClips as a collection of family-friendly and inspirational films.In April 2012, SpiritClips was acquired by Hallmark Cards, the largest greeting card manufacturer in the United States. Prior to the acquisition, SpiritClips had been working with Hallmark as the official provider for Hallmark's Hall of Fame series of inspirational films. In September 2014, the company was rebranded as Feeln. In October 2017, the service was rebranded as Ha

# Experiment

In [82]:
df_corpus2.head()

Unnamed: 0,content,score
0,"Nature Reviews Nephrology (before 2009, Nature...",20.6
1,Food blogging represents a complex interweavin...,13.6
2,The International Motorcycle Shows are a serie...,13.3
3,"Education in Ancient Greece was vastly ""democr...",11.3
4,Rural economics is the study of rural economie...,12.5


In [125]:
n_features = 2000

In [126]:
# vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english')

In [127]:
# vectorizer.fit(df_corpus2['content'])
# corpus_vectors = vectorizer.transform(df_corpus2['content'])

In [128]:
# with open('data/vectorizer.pkl', 'wb') as f:
#     pickle.dump(vectorizer, f)
    
# with open('data/corpus_vectors.pkl', 'wb') as f:
#     pickle.dump(corpus_vectors, f)

In [8]:
def load_vectorizer(pickle_file='data/vectorizer.pkl'):
    """Loads the trained TF/IDF vectorizer."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
    
def load_corpus_vectors(pickle_file='data/corpus_vectors.pkl'):
    """Loads the corpus vectors."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)

In [9]:
vec = load_vectorizer()
corpus_vectors = load_corpus_vectors().toarray()

In [10]:
sample2 = """The overwhelming majority of scientific knowledge is published as text, which is difficult to analyse by either traditional statistical analysis or modern machine learning methods. By contrast, the main source of machine-interpretable data for the materials research community has come from structured property databases1,2, which encompass only a small fraction of the knowledge present in the research literature. Beyond property values, publications contain valuable knowledge regarding the connections and relationships between data items as interpreted by the authors. To improve the identification and use of this knowledge, several studies have focused on the retrieval of information from scientific literature using supervised natural language processing3,4,5,6,7,8,9,10, which requires large hand-labelled datasets for training. Here we show that materials science knowledge present in the published literature can be efficiently encoded as information-dense word embeddings11,12,13 (vector representations of words) without human labelling or supervision. Without any explicit insertion of chemical knowledge, these embeddings capture complex materials science concepts such as the underlying structure of the periodic table and structure–property relationships in materials. Furthermore, we demonstrate that an unsupervised method can recommend materials for functional applications several years before their discovery. This suggests that latent knowledge regarding future discoveries is to a large extent embedded in past publications. Our findings highlight the possibility of extracting knowledge and relationships from the massive body of scientific literature in a collective manner, and point towards a generalized approach to the mining of scientific literature.

"""

In [11]:
sample_vector = vec.transform([sample2]).toarray()

In [12]:
sample_vector

array([[0.        , 0.04854106, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [13]:
feature_ranking = np.argsort(sample_vector[0])[::-1]

In [14]:
feature_ranking[:20]

array([1043, 1160, 1105, 1634, 1464, 1540, 1129, 1478, 1530,  525, 1774,
        972, 1418, 1479, 1632, 1057, 1565,  375,  822, 1016])

In [15]:
def get_vocab_arr(vec):
    n_features = len(vec.vocabulary_)
    vocab_arr = np.empty(n_features, dtype=object)
    for word, idx in vec.vocabulary_.items():
        vocab_arr[idx] = word
    return vocab_arr

In [16]:
vocab_arr = get_vocab_arr(vec)
vocab_arr[feature_ranking[:20]]

array(['knowledge', 'materials', 'literature', 'scientific', 'property',
       'relationships', 'machine', 'publications', 'regarding', 'data',
       'structure', 'information', 'present', 'published', 'science',
       'large', 'research', 'chemical', 'functional', 'items'],
      dtype=object)

In [17]:

def get_top_k_vector(vector, feature_ranking, k=20):
    """Return the top k vector according to feature_ranking."""
    return vector[:, feature_ranking[:k]]
    

In [18]:
sample_vector.shape, corpus_vectors.shape

((1, 2000), (14216, 2000))

In [19]:
(
    get_top_k_vector(sample_vector, feature_ranking).shape,
    get_top_k_vector(corpus_vectors, feature_ranking).shape
)

((1, 20), (14216, 20))

In [20]:
distances = cdist(
    get_top_k_vector(sample_vector, feature_ranking),
    get_top_k_vector(corpus_vectors, feature_ranking),
)

In [21]:
get_top_k_vector(sample_vector, feature_ranking)

array([[0.48112503, 0.38004914, 0.3022274 , 0.24705762, 0.22184728,
        0.21975153, 0.1666513 , 0.14229918, 0.14204404, 0.12707356,
        0.12040253, 0.10665622, 0.10262172, 0.10155444, 0.10123808,
        0.09654214, 0.09598864, 0.08716042, 0.08672557, 0.08236192]])

In [22]:
nearest_article_idxs = np.argsort(distances)

In [23]:
nearest_article_idxs.shape

(1, 14216)

In [24]:
nearest_article_idxs[0, 0]

2762

In [25]:
distances[0]

array([0.89863915, 0.89513509, 0.89863915, ..., 0.87572626, 0.88134487,
       0.89863915])

In [26]:
nearest_articles = df_corpus2.loc[nearest_article_idxs[0], :]

In [27]:
nearest_articles

Unnamed: 0,content,score
2762,The economics of science aims to understand th...,16.0
10687,Constructivist epistemology is a branch in phi...,15.8
12084,"GRAKN.AI is an open-source, distributed knowle...",15.1
3353,Bloom's taxonomy is a set of three hierarchica...,16.6
11571,"In philosophy, empiricism is a theory that sta...",21.7
6768,"In epistemology, and more specifically, the so...",22.1
5502,Robin Wall Kimmerer (also credited as Robin W....,14.9
7833,Intuition is the ability to acquire knowledge ...,20.6
13337,"Metacognition is ""cognition about cognition"", ...",16.3
7670,An inquiry is any process that has the aim of ...,29.9


In [332]:
nearest_articles.loc[1664, 'content']

'The end of history is a political and philosophical concept that supposes that a particular political, economic, or social system may develop that would constitute the end-point of humanity\'s sociocultural evolution and the final form of human government. \nEnd of history or The End of History may also refer to:\n\n"The End of History?", 1989 essay by Francis Fukuyama on the political and philosophical concept, published in The National Interest\nThe End of History and the Last Man, 1992 political book by Fukuyama expanding on his 1989 essay\nThe End of History (album), a 2006 album by Fionn Regan\nEnd-of-history illusion, a psychological illusion that one will not undergo significant developmental changes in the future\nThe End of History, a 55% ABV beer made by the BrewDog brewery and packaged inside small stuffed animals\nNorte, the End of History, 2013 Filipino drama film\n\nSee also\nEnd of the world (disambiguation)'