## Load Files and Vectorize 

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [2]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [3]:
with open(r"data/df_best_category.pkl", "rb") as input_file:
    df_best_category = pickle.load(input_file)

In [4]:
with open (r"data/dictionary.pkl", "rb") as input_file: 
    dictionary = pickle.load(input_file)

In [5]:
with open (r"data/lda_model_1.pkl", "rb") as input_file:
    lda_model_1 = pickle.load(input_file)

In [6]:
df_best_category.head()

Unnamed: 0,category,article_index
0,2,0
1,5,1
2,5,2
3,4,3
4,6,4


## Fourth Model 

In [7]:
import textstat
import operator
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from operator import itemgetter

In [291]:
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text, lemmatizer=lemmatizer):
    return lemmatizer.lemmatize(text, pos='v')

In [292]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
len(df_corpus2)

14216

In [None]:
### First part of model 
def give_simpler_level(text, vectorizer=vectorizer):
    """
    Takes in text and returns the topics the text belongs to based on the LDA model.
    """
    unseen_document = str(text)
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_vector = dictionary.doc2bow(preprocess(unseen_document))
    text_topics = lda_model_1[bow_vector]
    topic_group = max(text_topics,key=itemgetter(1))[0]
    
    category_df = df_best_category.loc[df_best_category['category'] == topic_group] 
    
    

In [None]:
def topic_to_articles():
    """
    Takes in a selection of a topic and returns all the articles in the topic.
    """

In [None]:
def articles_to_difficulty():
    """
    Takes in the topic selected and allows user to traverse along difficulty and similarity. 
    Returns a specific article to user. 
    """
    #find all the articles that are easier than this article 
#     all_harder_text = []
#     input_score = textstat.flesch_kincaid_grade(text)
#     for i in range(len(df_corpus2['score'])):
#         if df_corpus2['score'][i] > input_score:
#             all_harder_text.append(df_corpus2['content'][i])

# Model Revision

In [82]:
df_corpus2.head()

Unnamed: 0,content,score
0,"Nature Reviews Nephrology (before 2009, Nature...",20.6
1,Food blogging represents a complex interweavin...,13.6
2,The International Motorcycle Shows are a serie...,13.3
3,"Education in Ancient Greece was vastly ""democr...",11.3
4,Rural economics is the study of rural economie...,12.5


In [125]:
n_features = 2000

In [8]:
def load_vectorizer(pickle_file='data/vectorizer.pkl'):
    """Loads the trained TF/IDF vectorizer."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
    
def load_corpus_vectors(pickle_file='data/corpus_vectors.pkl'):
    """Loads the corpus vectors."""
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)

In [9]:
def get_vocab_arr(vec):
    n_features = len(vec.vocabulary_)
    vocab_arr = np.empty(n_features, dtype=object)
    for word, idx in vec.vocabulary_.items():
        vocab_arr[idx] = word
    return vocab_arr

In [10]:

def get_top_k_vector(vector, feature_ranking, k=20):
    """Return the top k vector according to feature_ranking."""
    return vector[:, feature_ranking[:k]]
    

In [11]:
sample2 = """Data science is the study of the extraction of knowledge from data. It uses various techniques from many fields, including signal processing, mathematics, probability, machine learning, computer programming, statistics, data engineering, pattern matching, and data visualization, with the goal of extracting useful knowledge from the data. With computer systems able to handle more data, big data is an important aspect of data science.

A person that does data science is called a data scientist. Data scientists solve complicated data problems using mathematics, statistics and computer science, although very good skill in these subjects are not required.[1] However, a data scientist is most likely to be an expert in only one or two of these disciplines, meaning that cross disciplinary teams can be a key component of data science.

Good data scientists are able to apply their skills to achieve many kinds of purposes. Their skills and competencies vary widely.
"""

In [12]:
def vectorize_text(text):
    vec = load_vectorizer()
    corpus_vectors = load_corpus_vectors().toarray()
    sample_vector = vec.transform([sample2]).toarray()
    feature_ranking = np.argsort(sample_vector[0])[::-1]
    vocab_arr = get_vocab_arr(vec)
    
    distances = cdist(
    get_top_k_vector(sample_vector, feature_ranking),
    get_top_k_vector(corpus_vectors, feature_ranking),
    )
    
    nearest_article_idxs = np.argsort(distances)
    nearest_articles = df_corpus2.loc[nearest_article_idxs[0], :]
    top_10 = nearest_articles[:10]
    
    return top_10.sort_values(['score'])

In [13]:
vectorize_text(sample2)

Unnamed: 0,content,score
11080,Globalize is a cross-platform JavaScript libra...,12.4
9969,Scientific Data is a peer-reviewed open access...,13.1
11804,"A database is an organized collection of data,...",15.6
12240,A data model (or datamodel) is an abstract mod...,15.8
10389,"In computer science, a collection or container...",16.9
9193,A column-oriented DBMS (or columnar database m...,17.3
6182,Rare or extreme events are events that occur w...,18.1
8130,Data science is a multi-disciplinary field tha...,18.4
10891,A geographic information system (GIS) is a sys...,20.4
7678,The GESIS – Leibniz Institute for the Social S...,20.9


In [14]:
vec = load_vectorizer()
corpus_vectors = load_corpus_vectors().toarray()

In [15]:
sample_vector = vec.transform([sample2]).toarray()

In [16]:
sample_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [17]:
feature_ranking = np.argsort(sample_vector[0])[::-1]

In [18]:
feature_ranking[:20]

array([ 525, 1632,  446, 1759, 1162, 1694, 1635,  118, 1043,  860, 1455,
       1442, 1129, 1331,  228, 1916,  127, 1483,  674, 1786])

In [19]:
vocab_arr = get_vocab_arr(vec)
vocab_arr[feature_ranking[:20]]

array(['data', 'science', 'computer', 'statistics', 'mathematics',
       'skills', 'scientists', 'able', 'knowledge', 'good', 'programming',
       'processing', 'machine', 'pattern', 'aspect', 'useful', 'achieve',
       'purposes', 'engineering', 'subjects'], dtype=object)

In [20]:
sample_vector.shape, corpus_vectors.shape

((1, 2000), (14216, 2000))

In [21]:
(
    get_top_k_vector(sample_vector, feature_ranking).shape,
    get_top_k_vector(corpus_vectors, feature_ranking).shape
)

((1, 20), (14216, 20))

In [22]:
distances = cdist(
    get_top_k_vector(sample_vector, feature_ranking),
    get_top_k_vector(corpus_vectors, feature_ranking),
)

In [23]:
get_top_k_vector(sample_vector, feature_ranking)

array([[0.83470286, 0.22166612, 0.18837988, 0.14557046, 0.13767567,
        0.13719084, 0.13478882, 0.1069378 , 0.10534487, 0.09932174,
        0.07654316, 0.07365873, 0.07297837, 0.07146458, 0.06887271,
        0.06806918, 0.06750797, 0.06658652, 0.06529604, 0.06506513]])

In [24]:
nearest_article_idxs = np.argsort(distances)

In [25]:
nearest_article_idxs.shape

(1, 14216)

In [26]:
nearest_article_idxs[0, 0]

8130

In [27]:
distances[0]

array([0.96926104, 0.96382101, 0.96926104, ..., 0.94577355, 0.96527653,
       0.96926104])

In [28]:
nearest_articles = df_corpus2.loc[nearest_article_idxs[0], :]

In [29]:
article_num = str(nearest_articles[:1].index)

In [30]:
article_num_str = article_num.strip("Int64Index([], dtype='int64')")

In [31]:
article_num_final = int(article_num_str)

In [32]:
top_df = top_10.sort_values(['score'])

NameError: name 'top_10' is not defined

In [34]:
article_num_final

8130

In [35]:
nearest_articles.loc[article_num_final, 'content']

'Data science is a multi-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data. Data science is the same concept as data mining and big data: "use the most powerful hardware, the most powerful programming systems, and the most efficient algorithms to solve problems".Data science is a "concept to unify statistics, data analysis, machine learning and their related methods" in order to "understand and analyze actual phenomena" with data. It employs techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, and information science. Turing award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and the data deluge. In 2015, the American Statistical Associa

In [25]:
with open(r"data/df_corpus2.pkl", "rb") as input_file:
    df_corpus2 = pickle.load(input_file)

In [26]:
content_clean = []
for article in df_corpus2['content']:
    temp = article.rstrip('\r\n')
    content_clean.append(temp)

In [27]:
new_corpus = df_corpus2

In [28]:
new_corpus['content_updated'] = content_clean

In [30]:
new_corpus = new_corpus.drop(['content'], axis=1)

In [31]:
new_corpus.head()

Unnamed: 0,score,content_updated
0,20.6,"Nature Reviews Nephrology (before 2009, Nature..."
1,13.6,Food blogging represents a complex interweavin...
2,13.3,The International Motorcycle Shows are a serie...
3,11.3,"Education in Ancient Greece was vastly ""democr..."
4,12.5,Rural economics is the study of rural economie...


In [32]:
new_corpus['content'] = new_corpus['content_updated']

In [35]:
new_corpus = new_corpus.drop(['content_updated'], axis=1)

In [36]:
# with open('data/new_corpus.pkl', 'wb') as f:
#     pickle.dump(new_corpus, f)

In [37]:
new_corpus.head()

Unnamed: 0,score,content
0,20.6,"Nature Reviews Nephrology (before 2009, Nature..."
1,13.6,Food blogging represents a complex interweavin...
2,13.3,The International Motorcycle Shows are a serie...
3,11.3,"Education in Ancient Greece was vastly ""democr..."
4,12.5,Rural economics is the study of rural economie...
