## Load Files and Vectorize 

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [2]:
# Article corpus: later cells index it with `.loc` and read its 'score' and
# 'content' columns.
# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open(r"data/new_corpus.pkl", mode="rb") as input_file:
    new_corpus = pickle.load(input_file)

In [3]:
# Table with 'category' and 'article_index' columns (previewed in a later cell).
# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open(r"data/df_best_category.pkl", mode="rb") as input_file:
    df_best_category = pickle.load(input_file)

In [4]:
# Pickled token dictionary -- presumably the one the LDA model was built with;
# it is not used in the cells visible here (TODO confirm it is still needed).
# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open(r"data/dictionary.pkl", mode="rb") as input_file:
    dictionary = pickle.load(input_file)

In [5]:
# Pickled topic model -- presumably a trained LDA model; it is not used in the
# cells visible here (TODO confirm it is still needed).
# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open(r"data/lda_model_1.pkl", mode="rb") as input_file:
    lda_model_1 = pickle.load(input_file)

In [6]:
# Preview the first rows of the category table ('category', 'article_index').
df_best_category.head()

Unnamed: 0,category,article_index
0,2,0
1,5,1
2,5,2
3,4,3
4,6,4


## Second Function

In [8]:
import textstat
import operator
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from operator import itemgetter

In [9]:
def load_vectorizer(pickle_file='data/vectorizer.pkl'):
    """Read the trained TF/IDF vectorizer back from disk.

    Parameters
    ----------
    pickle_file : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The object that was pickled in ``pickle_file``.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(pickle_file, mode='rb') as handle:
        vectorizer = pickle.load(handle)
    return vectorizer
    
def load_corpus_vectors(pickle_file='data/corpus_vectors.pkl'):
    """Read the pre-computed corpus vectors back from disk.

    Parameters
    ----------
    pickle_file : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The object that was pickled in ``pickle_file``.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(pickle_file, mode='rb') as handle:
        corpus_vectors = pickle.load(handle)
    return corpus_vectors

In [10]:
def get_vocab_arr(vec):
    """Invert the vectorizer's vocabulary into an index -> term array.

    Parameters
    ----------
    vec : object
        Anything exposing a ``vocabulary_`` mapping of term -> column index.

    Returns
    -------
    numpy.ndarray
        Object array with one slot per vocabulary entry, where position
        ``i`` holds the term mapped to column ``i``.
    """
    term_to_index = vec.vocabulary_
    vocab_arr = np.empty(len(term_to_index), dtype=object)
    for term in term_to_index:
        vocab_arr[term_to_index[term]] = term
    return vocab_arr

In [11]:
def get_top_k_vector(vector, feature_ranking, k=50):
    """Keep only the k best-ranked feature columns of ``vector``.

    Parameters
    ----------
    vector : 2-D array-like supporting ``[:, indices]`` indexing
        Rows are items, columns are features.
    feature_ranking : sequence of int
        Column indices ordered from most to least important.
    k : int, default 50
        Number of leading entries of ``feature_ranking`` to keep.

    Returns
    -------
    Same type as ``vector``
        The selected columns, in ranking order.
    """
    leading_features = feature_ranking[:k]
    return vector[:, leading_features]
    

In [18]:
def top_50_text(text, n_results=50, n_features=50):
    """Return the corpus articles nearest to ``text``, ordered by score.

    The text is vectorized with the pickled TF/IDF vectorizer and its
    ``n_features`` highest-weighted terms are selected. Every corpus vector
    is then compared with the sample on just those terms, using Euclidean
    distance (the ``cdist`` default).

    Parameters
    ----------
    text : str
        Document to find neighbours for.
    n_results : int, default 50
        Number of nearest articles to return.
    n_features : int, default 50
        Number of top-weighted sample terms used for the comparison.

    Returns
    -------
    pandas.DataFrame
        The ``n_results`` nearest rows of the module-level ``new_corpus``,
        sorted ascending by their 'score' column.
    """
    vec = load_vectorizer()
    sample_vector = vec.transform([text]).toarray()
    # Feature indices ordered from the sample's largest weight to its smallest.
    feature_ranking = np.argsort(sample_vector[0])[::-1]

    # Select the few relevant columns while the corpus is still sparse, and
    # only then densify; the result is the same as densifying first, without
    # materializing the full documents x vocabulary matrix.
    # Assumes the pickled corpus vectors are a SciPy sparse matrix (the
    # original code already relied on `.toarray()`).
    corpus_top_k = get_top_k_vector(
        load_corpus_vectors().tocsr(), feature_ranking, k=n_features
    ).toarray()

    distances = cdist(
        get_top_k_vector(sample_vector, feature_ranking, k=n_features),
        corpus_top_k,
    )

    # `distances` has a single row (one sample); keep its closest positions.
    nearest_article_idxs = np.argsort(distances)[0][:n_results]
    # NOTE(review): `.loc` interprets these row positions as index labels, so
    # this presumes new_corpus's index labels line up with the row order of
    # the corpus vectors -- confirm.
    nearest_articles = new_corpus.loc[nearest_article_idxs, :]

    return nearest_articles.sort_values(['score'])

In [19]:
# Example input document used to exercise top_50_text / get_level_change below.
sample = """Data science is the study of the extraction of knowledge from data. It uses various techniques from many fields, including signal processing, mathematics, probability, machine learning, computer programming, statistics, data engineering, pattern matching, and data visualization, with the goal of extracting useful knowledge from the data. With computer systems able to handle more data, big data is an important aspect of data science.

A person that does data science is called a data scientist. Data scientists solve complicated data problems using mathematics, statistics and computer science, although very good skill in these subjects are not required.[1] However, a data scientist is most likely to be an expert in only one or two of these disciplines, meaning that cross disciplinary teams can be a key component of data science.

Good data scientists are able to apply their skills to achieve many kinds of purposes. Their skills and competencies vary widely.
"""

In [43]:
# Nearest corpus articles for the sample text, sorted ascending by 'score'.
df = top_50_text(sample)

In [44]:
# Display with a fresh 0..n-1 row index; the corpus labels move into an
# 'index' column (returns a new frame, `df` itself is unchanged).
df.reset_index()

Unnamed: 0,index,score,content
0,13252,7.7,Oregon is a state located in the Western Unite...
1,1233,8.5,Walls And Mirrors is a computer science textbo...
2,6277,12.0,"In computing, the Global File System 2 or GFS2..."
3,11080,12.4,Globalize is a cross-platform JavaScript libra...
4,2539,12.5,Pure Data (Pd) is a visual programming languag...
5,9969,13.1,Scientific Data is a peer-reviewed open access...
6,12275,14.1,"In computer science and computer programming, ..."
7,610,14.4,Digital anthropology is the anthropological st...
8,5831,14.6,A relational database is a digital database ba...
9,10459,14.7,Symbolic regression is a type of regression an...


In [45]:
def get_level_change(x, text):
    """Return the similar article whose score is closest to ``x``.

    The candidates are the articles returned by ``top_50_text(text)``. Among
    them, the one whose 'score' has the smallest absolute difference from
    ``x`` is chosen; on a tie the lowest-scored candidate wins, because the
    candidates arrive sorted by score. Candidates with a missing (NaN) score
    are ignored.

    Parameters
    ----------
    x : float
        Target score (presumably a readability grade level -- confirm
        against how the 'score' column is computed).
    text : str
        Document used to look up similar articles.

    Returns
    -------
    str
        The 'content' of the selected article.

    Raises
    ------
    ValueError
        If no candidate has a usable score.
    """
    candidates = top_50_text(text).reset_index(drop=True)
    # Vectorized |score - x|. NaN gaps are dropped so a missing score can
    # never be picked as the "closest" match.
    gaps = (candidates['score'] - x).abs().dropna()
    if gaps.empty:
        raise ValueError("no scored articles available to compare against")
    # Single-step .loc lookup instead of chained indexing.
    return candidates.loc[gaps.idxmin(), 'content']
    
    

In [59]:
# Example: among the articles similar to `sample`, show the one whose score is
# closest to 17.
get_level_change(17,sample)

'Wireless sensor network (WSN) refers to a group of spatially dispersed and dedicated sensors for monitoring and recording the physical conditions of the environment and organizing the collected data at a central location. WSNs measure environmental conditions like temperature, sound, pollution levels, humidity, wind, and so on.\nThese are similar to wireless ad hoc networks in the sense that they rely on wireless connectivity and spontaneous formation of networks so that sensor data can be transported wirelessly. WSNs are spatially distributed autonomous sensors to monitor physical or environmental conditions, such as temperature, sound, pressure, etc. and to cooperatively pass their data through the network to a main location. The more modern networks are bi-directional, also enabling control of sensor activity.  The development of wireless sensor networks was motivated by military applications such as battlefield surveillance; today such networks are used in many industrial and cons