## Clean_df creation

In [1]:
import pickle
import os
import requests
from flask import Flask, request, render_template, jsonify

import numpy as np
import pandas as pd
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

import gensim
import operator
from operator import itemgetter

In [2]:
import pymongo
from pymongo import MongoClient

# Connect to MongoDB with the client's default connection settings, then
# grab a handle on the `articles` collection inside the `wikicache` database.
mc = MongoClient()
db = mc.get_database('wikicache')
articles = db.get_collection('articles')

In [3]:
# Materialise every document returned by the unfiltered `find()` cursor,
# then build one DataFrame row per document.
data = [document for document in articles.find()]
clean_df = pd.DataFrame(data)


In [4]:
# Preview the first five rows of the freshly loaded frame.
clean_df.head(n=5)

Unnamed: 0,_id,summary,text,title,url
0,5d1fc2f4258b4b335c5b2093,Avans University of Applied Sciences (Dutch: A...,Avans University of Applied Sciences (Dutch: A...,Avans University of Applied Sciences,https://en.wikipedia.org/wiki/Avans_University...
1,5d1fc2f5258b4b335c5b2094,"Guy de Lussigny (30 August 1929 in Cambrai, no...","Guy de Lussigny (30 August 1929 in Cambrai, no...",Guy de Lussigny,https://en.wikipedia.org/wiki/Guy_de_Lussigny
2,5d1fc2f6258b4b335c5b2095,"In literature, a trope is a common plot conven...","In literature, a trope is a common plot conven...",Trope (literature),https://en.wikipedia.org/wiki/Trope_(literature)
3,5d1fc2f6258b4b335c5b2096,The Global Television Network (more commonly c...,The Global Television Network (more commonly c...,Global Television Network,https://en.wikipedia.org/wiki/Global_Televisio...
4,5d1fc2f7258b4b335c5b2097,Nickelodeon Guts (stylized as Nickelodeon GUTS...,Nickelodeon Guts (stylized as Nickelodeon GUTS...,Nickelodeon Guts,https://en.wikipedia.org/wiki/Nickelodeon_Guts


In [13]:
# Show the `title` column to eyeball which articles were loaded.
clean_df.loc[:, 'title']

0                     Avans University of Applied Sciences
1                                          Guy de Lussigny
2                                       Trope (literature)
3                                Global Television Network
4                                         Nickelodeon Guts
5                                Theatre of ancient Greece
6                                       Bliss point (food)
7                      Transportation theory (mathematics)
8        Department of Planning and Environment (New So...
9               Capital University of Science & Technology
10                                       Itasca State Park
11       University of International Business and Econo...
12                                         Global Tel Link
13                                          Caucasian race
14                                          Actors Theater
15                               Indian Actors Association
16                                             Star Movi

In [5]:
# Number of entries in the `text` column (one per row of clean_df).
clean_df['text'].shape[0]

11978

In [6]:
# Upper bound on the TF-IDF vocabulary size (passed as `max_features`).
n_features = 2000

# TF-IDF vectorizer using the built-in English stop-word list and the
# vocabulary cap above.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
)


In [7]:
# Fit the vectorizer on the `text` column. `fit` returns the vectorizer
# itself, which is why its repr is displayed as this cell's output.
vectorizer.fit(clean_df['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=2000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
# Persist the fitted vectorizer to data/vectorizer.pkl.
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError if `data/` is absent.
os.makedirs('data', exist_ok=True)
with open('data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [8]:
# Sanity check: size of the fitted vocabulary (compare with n_features).
len(vectorizer.vocabulary_)

2000

In [9]:
# Transform every article's text with the fitted vectorizer and convert the
# result to a dense array via `.toarray()`.
# NOTE(review): dense storage of ~12k rows x 2000 float64 features is on the
# order of 190 MB - confirm this is acceptable for downstream consumers.
corpus_vectors = (
    vectorizer
    .transform(clean_df['text'])
    .toarray()
)

In [10]:
# Persist the dense corpus matrix to data/corpus_vectors.pkl.
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError if `data/` is absent.
os.makedirs('data', exist_ok=True)
with open('data/corpus_vectors.pkl', 'wb') as f:
    pickle.dump(corpus_vectors, f)

In [33]:
# Plain Python list of the article texts, in row order.
clean_text = list(clean_df['text'])

In [38]:
# Sanity check: the list should have one entry per row of clean_df.
len(clean_text)

11978

In [40]:
# Score each article with textstat's Flesch-Kincaid grade function,
# preserving the order of clean_text.
complexity = list(map(textstat.flesch_kincaid_grade, clean_text))

In [42]:
# Attach the readability scores as a new `score` column.
# `complexity` is in the same order as clean_df's rows, so build the Series
# on clean_df's own index. A bare pd.Series(complexity) gets a fresh 0..n-1
# index, and column assignment aligns on index labels - with any non-default
# index that would silently yield NaN or misassigned scores.
clean_df['score'] = pd.Series(complexity, index=clean_df.index)

In [43]:
# Preview the first five rows again to confirm the `score` column is present.
clean_df.head(n=5)

Unnamed: 0,_id,summary,text,title,url,score
0,5d1fc2f4258b4b335c5b2093,Avans University of Applied Sciences (Dutch: A...,Avans University of Applied Sciences (Dutch: A...,Avans University of Applied Sciences,https://en.wikipedia.org/wiki/Avans_University...,18.0
1,5d1fc2f5258b4b335c5b2094,"Guy de Lussigny (30 August 1929 in Cambrai, no...","Guy de Lussigny (30 August 1929 in Cambrai, no...",Guy de Lussigny,https://en.wikipedia.org/wiki/Guy_de_Lussigny,25.1
2,5d1fc2f6258b4b335c5b2095,"In literature, a trope is a common plot conven...","In literature, a trope is a common plot conven...",Trope (literature),https://en.wikipedia.org/wiki/Trope_(literature),16.6
3,5d1fc2f6258b4b335c5b2096,The Global Television Network (more commonly c...,The Global Television Network (more commonly c...,Global Television Network,https://en.wikipedia.org/wiki/Global_Televisio...,15.0
4,5d1fc2f7258b4b335c5b2097,Nickelodeon Guts (stylized as Nickelodeon GUTS...,Nickelodeon Guts (stylized as Nickelodeon GUTS...,Nickelodeon Guts,https://en.wikipedia.org/wiki/Nickelodeon_Guts,14.4


In [44]:
# with open('data/clean_df.pkl', 'wb') as f:
#     pickle.dump(clean_df, f)