## Final Functions

In [1]:
import pickle
import os
import requests
from flask import Flask, request, render_template, jsonify

import numpy as np
import pandas as pd
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

import gensim
import operator
from operator import itemgetter

In [9]:
import pymongo
from pymongo import MongoClient

# Connect to MongoDB with the client's default connection settings and
# select the 'articles' collection of the 'wikicache' database.
mc = MongoClient()
db = mc['wikicache']
articles = db['articles']

In [12]:
# Pull every document of the collection into memory, then build one
# DataFrame row per document.
data = [document for document in articles.find()]
clean_df = pd.DataFrame(data)


In [14]:
clean_df.head()

Unnamed: 0,_id,summary,text,title,url
0,5d1fc2f4258b4b335c5b2093,Avans University of Applied Sciences (Dutch: A...,Avans University of Applied Sciences (Dutch: A...,Avans University of Applied Sciences,https://en.wikipedia.org/wiki/Avans_University...
1,5d1fc2f5258b4b335c5b2094,"Guy de Lussigny (30 August 1929 in Cambrai, no...","Guy de Lussigny (30 August 1929 in Cambrai, no...",Guy de Lussigny,https://en.wikipedia.org/wiki/Guy_de_Lussigny
2,5d1fc2f6258b4b335c5b2095,"In literature, a trope is a common plot conven...","In literature, a trope is a common plot conven...",Trope (literature),https://en.wikipedia.org/wiki/Trope_(literature)
3,5d1fc2f6258b4b335c5b2096,The Global Television Network (more commonly c...,The Global Television Network (more commonly c...,Global Television Network,https://en.wikipedia.org/wiki/Global_Televisio...
4,5d1fc2f7258b4b335c5b2097,Nickelodeon Guts (stylized as Nickelodeon GUTS...,Nickelodeon Guts (stylized as Nickelodeon GUTS...,Nickelodeon Guts,https://en.wikipedia.org/wiki/Nickelodeon_Guts


In [15]:
clean_df['text']

11978

## Using Old Corpus

In [5]:
# Load the previously pickled corpus table. top_50_text indexes it by row and
# reads its 'score' column; get_level_change reads 'score' and 'content'.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open(r"data/new_corpus.pkl", "rb") as input_file:
    new_corpus = pickle.load(input_file)

In [6]:
# Load the pickled dictionary object from the local data folder.
# NOTE(review): presumably a gensim Dictionary (gensim is imported above) —
# TODO confirm; no other cell visible here reads `dictionary`.
with open (r"data/dictionary.pkl", "rb") as input_file: 
    dictionary = pickle.load(input_file)

In [27]:
def download_and_open(url, mode='rb', folder='data'):
    """Download ``url`` into ``folder`` unless it is already cached, then open it.

    The local file name is the last path component of ``url``.

    Args:
        url: Address of the file to fetch.
        mode: Mode passed to ``open`` for the returned handle.
        folder: Directory used as the download cache; created when missing.

    Returns:
        An open file object for the cached copy. The caller is responsible
        for closing it (for example by using it in a ``with`` statement).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filename = os.path.basename(url)
    pathname = os.path.join(folder, filename)
    if not os.path.exists(pathname):
        if folder:
            # A fresh checkout has no cache directory yet.
            os.makedirs(folder, exist_ok=True)
        response = requests.get(url)
        # Fail loudly on 4xx/5xx so an error page is never cached as the file.
        response.raise_for_status()
        # Write under a temporary name and rename afterwards, so an interrupted
        # write cannot leave a truncated file that later calls would trust.
        partial_path = pathname + '.part'
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, pathname)
    return open(pathname, mode)

In [7]:
def load_vectorizer(pickle_file='https://text-ascent.s3-us-west-2.amazonaws.com/vectorizer.pkl'):
    """Fetch (or reuse the cached copy of) the pickled TF/IDF vectorizer and unpickle it.

    Args:
        pickle_file: URL of the pickle, resolved through ``download_and_open``.

    Returns:
        The unpickled vectorizer object.
    """
    with download_and_open(pickle_file, 'rb') as pickled:
        vectorizer = pickle.load(pickled)
    return vectorizer
    
def load_corpus_vectors(pickle_file='https://text-ascent.s3-us-west-2.amazonaws.com/corpus_vectors.pkl'):
    """Fetch (or reuse the cached copy of) the pickled corpus vectors and unpickle them.

    Args:
        pickle_file: URL of the pickle, resolved through ``download_and_open``.

    Returns:
        The unpickled corpus-vector object.
    """
    with download_and_open(pickle_file, 'rb') as pickled:
        vectors = pickle.load(pickled)
    return vectors

In [10]:
# Load the pickled vectorizer from the local data folder into a module-level
# name. load_vectorizer() reads a file of the same name through the download cache.
with open (r"data/vectorizer.pkl", "rb") as input_file: 
    vectorizer = pickle.load(input_file)

In [11]:
# Load the pickled corpus vectors from the local data folder into a
# module-level name. top_50_text does not use this global: it binds a local
# of the same name from load_corpus_vectors().
with open (r"data/corpus_vectors.pkl", "rb") as input_file: 
    corpus_vectors = pickle.load(input_file)

In [12]:
def get_vocab_arr(vec):
    """Invert a vectorizer's vocabulary into an index-ordered array of words.

    Args:
        vec: Object exposing ``vocabulary_``, a mapping of word -> column index.

    Returns:
        An object-dtype NumPy array whose entry at each index is the word
        mapped to that index.
    """
    vocabulary = vec.vocabulary_
    vocab_arr = np.empty(len(vocabulary), dtype=object)
    for term in vocabulary:
        vocab_arr[vocabulary[term]] = term
    return vocab_arr

In [15]:
import sys
import numpy
#numpy.set_printoptions(threshold=sys.maxsize)

In [33]:
#get_vocab_arr(vectorizer)

In [36]:
corpus_vectors

<bound method _cs_matrix.maximum of <14216x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 3288779 stored elements in Compressed Sparse Row format>>

In [28]:
def get_top_k_vector(vector, feature_ranking, k=20):
    """Keep only the columns of ``vector`` named by the first ``k`` ranked features.

    Args:
        vector: Two-dimensional, column-indexable matrix.
        feature_ranking: Column indices ordered from most to least important.
        k: Number of leading entries of ``feature_ranking`` to keep.

    Returns:
        ``vector`` restricted to those columns, in ranking order.
    """
    top_features = feature_ranking[:k]
    return vector[:, top_features]

In [31]:
def top_50_text(text, n_articles=50):
    """Render the corpus articles most similar to ``text`` as an HTML table.

    The text is vectorized with the loaded vectorizer, its highest-weighted
    features are selected via ``get_top_k_vector``, and every corpus row is
    compared to the sample on those features only, using ``cdist`` with its
    default metric.

    Args:
        text: Raw text to find similar articles for.
        n_articles: How many nearest articles to keep (50 by default).

    Returns:
        An HTML string (``DataFrame.to_html``) holding the nearest rows of
        ``new_corpus``, sorted by their 'score' column in ascending order.
    """
    vec = load_vectorizer()
    sample_vector = vec.transform([text]).toarray()
    # Feature indices ordered from the highest to the lowest weight in the sample.
    feature_ranking = np.argsort(sample_vector[0])[::-1]

    # Pick the ranked columns while the corpus matrix is still sparse and
    # densify only that narrow slice, instead of the whole corpus matrix.
    corpus_top_k = get_top_k_vector(load_corpus_vectors(), feature_ranking).toarray()

    distances = cdist(
        get_top_k_vector(sample_vector, feature_ranking),
        corpus_top_k,
    )

    # argsort yields row positions, so select positionally with .iloc; .loc
    # would only be right when new_corpus carries a default 0..n-1 index.
    nearest_article_idxs = np.argsort(distances[0])[:n_articles]
    nearest_articles = new_corpus.iloc[nearest_article_idxs]

    return nearest_articles.sort_values(['score']).to_html()


In [32]:
top_50_text(sample)

Unnamed: 0,score,content
13252,7.7,Oregon is a state located in the Western Unite...
1233,8.5,Walls And Mirrors is a computer science textbo...
6277,12.0,"In computing, the Global File System 2 or GFS2..."
11080,12.4,Globalize is a cross-platform JavaScript libra...
2539,12.5,Pure Data (Pd) is a visual programming languag...
9969,13.1,Scientific Data is a peer-reviewed open access...
12275,14.1,"In computer science and computer programming, ..."
610,14.4,Digital anthropology is the anthropological st...
5831,14.6,A relational database is a digital database ba...
10459,14.7,Symbolic regression is a type of regression an...


In [None]:
def show_5_top_50(table):
    """Placeholder for a helper that was declared but never implemented.

    NOTE(review): judging by the name alone, this is presumably meant to show
    five entries of a top-50 table — TODO confirm the intent and implement.

    Raises:
        NotImplementedError: Always, until a real body is written.
    """
    raise NotImplementedError("show_5_top_50 is not implemented yet")

In [16]:
def _top_50_frame(text, n_articles=50):
    """Return the ``n_articles`` corpus rows nearest to ``text``, sorted by 'score'.

    Uses the same ranking as top_50_text, but keeps the DataFrame instead of
    rendering it to HTML so that callers can read its columns.
    """
    vec = load_vectorizer()
    sample_vector = vec.transform([text]).toarray()
    # Feature indices ordered from the highest to the lowest weight in the sample.
    feature_ranking = np.argsort(sample_vector[0])[::-1]
    # Slice the ranked columns before densifying to avoid a full dense corpus copy.
    corpus_top_k = get_top_k_vector(load_corpus_vectors(), feature_ranking).toarray()
    distances = cdist(get_top_k_vector(sample_vector, feature_ranking), corpus_top_k)
    # argsort yields row positions, hence positional selection with .iloc.
    nearest_idxs = np.argsort(distances[0])[:n_articles]
    return new_corpus.iloc[nearest_idxs].sort_values(['score'])


def get_level_change(x, text):
    """Return the similar article whose score is closest to ``x``.

    top_50_text returns an HTML string, which has no 'score' column to search,
    so the candidate rows are gathered as a DataFrame by ``_top_50_frame``.

    Args:
        x: Target score.
        text: Raw text used to find the candidate articles.

    Returns:
        The 'content' value of the candidate whose 'score' has the smallest
        absolute difference from ``x``; on ties, the first candidate in
        ascending score order wins.

    Raises:
        ValueError: If there are no candidate articles.
    """
    top_50_df = _top_50_frame(text).reset_index(drop=True)
    article_id = (top_50_df['score'] - x).abs().idxmin()
    level_change = top_50_df['content'][article_id]
    return level_change
    
    

In [25]:
sample = """Data science is the study of the extraction of knowledge from data. It uses various techniques from many fields, including signal processing, mathematics, probability, machine learning, computer programming, statistics, data engineering, pattern matching, and data visualization, with the goal of extracting useful knowledge from the data. With computer systems able to handle more data, big data is an important aspect of data science.

A person that does data science is called a data scientist. Data scientists solve complicated data problems using mathematics, statistics and computer science, although very good skill in these subjects are not required.[1] However, a data scientist is most likely to be an expert in only one or two of these disciplines, meaning that cross disciplinary teams can be a key component of data science.

Good data scientists are able to apply their skills to achieve many kinds of purposes. Their skills and competencies vary widely.
"""

In [18]:
wireless = get_level_change(17,sample)

In [19]:
print(wireless)

Wireless sensor network (WSN) refers to a group of spatially dispersed and dedicated sensors for monitoring and recording the physical conditions of the environment and organizing the collected data at a central location. WSNs measure environmental conditions like temperature, sound, pollution levels, humidity, wind, and so on.
These are similar to wireless ad hoc networks in the sense that they rely on wireless connectivity and spontaneous formation of networks so that sensor data can be transported wirelessly. WSNs are spatially distributed autonomous sensors to monitor physical or environmental conditions, such as temperature, sound, pressure, etc. and to cooperatively pass their data through the network to a main location. The more modern networks are bi-directional, also enabling control of sensor activity.  The development of wireless sensor networks was motivated by military applications such as battlefield surveillance; today such networks are used in many industrial and consum