In [39]:
#importing libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.externals import joblib
from flask import Flask, request, jsonify
import webhoseio


# Single lemmatizer instance reused by the lemmatization cells further down.
wnl = WordNetLemmatizer()

# Columns picked out of the raw export, in this order, before the
# preparation cell removes the ones it does not keep.
cols = [
    "uuid", "ord_in_thread", "author", "published",
    "title", "text", "language", "crawled",
    "site_url", "country", "domain_rank", "thread_title",
    "spam_score", "main_img_url", "replies_count", "participants_count",
    "likes", "comments", "shares", "type",
]

In [54]:
def title_cleaner(title):
    """Reduce a string to lowercase alphabetic tokens without English stopwords.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and tokenized with ``nltk.word_tokenize``, tokens
    present in NLTK's English stopword list are discarded, and the surviving
    tokens are returned joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
    tokens = nltk.word_tokenize(letters_only)
    stop_set = set(stopwords.words("english"))
    kept = []
    for token in tokens:
        if token not in stop_set:
            kept.append(token)
    return ' '.join(kept)

def get_wordnet_pos(treebank_tag):
    '''Map a Treebank POS tag to the WordNet POS constant with the same role.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag falls back
    to 'n'.
    '''
    # Attribute names are resolved lazily so `wordnet` is only touched
    # when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return 'n' #basecase POS

In [91]:
# Load the raw export, keep only the model's input columns, and fill gaps.
# NOTE(review): read_table parses tab-delimited input by default although the
# file is named .csv — confirm the delimiter of this export.
dropped_cols = ['uuid', 'thread_title', 'spam_score', 'main_img_url',
                'published', 'crawled', 'type']
test_data = (
    pd.read_table(u'Data_3-31.csv')[cols]
    .drop(dropped_cols, axis=1)
    .reset_index(drop=True)
)

# Fill the free-text columns by assignment. A chained
# `test_data['title'].fillna('', inplace=True)` acts on a column selection and
# is not guaranteed to write back to the frame (it is a no-op under pandas
# Copy-on-Write), which would let the blanket fill below put 0 into title/text.
test_data[['title', 'text']] = test_data[['title', 'text']].fillna('')
# Every remaining missing value becomes 0.
test_data = test_data.fillna(0)

# Cleaned title/text of the first row only; the lemmatization cell reads these.
title = title_cleaner(test_data['title'][0])
text = title_cleaner(test_data['text'][0])

# Show the rows that come from breitbart.com.
test_data[test_data['site_url'] == 'breitbart.com']

Unnamed: 0,ord_in_thread,author,title,text,language,site_url,country,domain_rank,replies_count,participants_count,likes,comments,shares
631,0.0,Europa-News,Albania’s Soros-Affiliated PM Is Ruining Our C...,"March 27, 2017 Albania’s Soros-Affiliated PM I...",english,breitbart.com,DE,967.0,0.0,1.0,0.0,0.0,0.0
632,0.0,Charlie Spiering,Exclusive: Scott Pruitt Promises ‘EPA Orginali...,\r\nThe newly appointed Environmental Protecti...,english,breitbart.com,US,967.0,0.0,1.0,481.0,0.0,481.0
633,0.0,"Thomas D. Williams, Ph.D.",Report: Ozone Hole Has Shrunk by More Than Fou...,"\r\nIn the period from 2000-2015, the hole in ...",english,breitbart.com,US,967.0,0.0,1.0,0.0,0.0,0.0
634,0.0,Chris Tomlinson,"Hungary Refuses to Take 5,000 Migrants From Sw...",\r\nHungary has refused a request by the Swedi...,english,breitbart.com,US,967.0,0.0,1.0,0.0,0.0,0.0
635,0.0,Jeff Poor,Gowdy on Calls for Nunes to Be Replaced as Int...,\r\nMonday on Fox News Channel’s “The First 10...,english,breitbart.com,US,967.0,0.0,1.0,0.0,0.0,0.0
636,0.0,Breitbart Jerusalem,"McClatchy: Breitbart’s Klein, Bannon’s Man in ...","McClatchy: Breitbart’s Klein, Bannon’s Man in ...",english,breitbart.com,US,967.0,0.0,1.0,40.0,0.0,40.0
637,0.0,Breitbart Jerusalem,Fatah Students: Blood Of Martyrs Will Build ‘P...,Fatah Students: Blood Of Martyrs Will Build ‘P...,english,breitbart.com,US,967.0,0.0,1.0,23.0,0.0,23.0
638,0.0,Breitbart Jerusalem,"European, Arab States join Israel and U.S. in ...","European, Arab States join Israel and U.S. in ...",english,breitbart.com,US,967.0,0.0,1.0,10.0,0.0,10.0
639,23.0,Midwest Patriot,,"I guess lesbians can be quite aggressive, but ...",english,breitbart.com,US,967.0,100.0,78.0,0.0,0.0,0.0
640,24.0,Dyskord,,Don't forget Muslims have urges and its perfec...,english,breitbart.com,US,967.0,100.0,78.0,0.0,0.0,0.0


In [84]:
def _join_lemmas(tagged_tokens):
    """Lemmatize (word, treebank_tag) pairs with `wnl` and join them with spaces."""
    lemmas = []
    for word, tag in tagged_tokens:
        lemmas.append(wnl.lemmatize(word, pos=get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# POS-tag the whitespace-split cleaned strings, then lemmatize each token
# using the WordNet POS derived from its tag.
title_tag = pos_tag(title.split())
title_clean_wnl = _join_lemmas(title_tag)
text_tag = pos_tag(text.split())
text_clean_wnl = _join_lemmas(text_tag)

In [85]:
# Persisted artifacts, resolved relative to the working directory.
LABEL_ENCODER_PATH = 'label_encoder.pkl'
CLASSIFIER_PATH = 'classifier.pkl'

# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
le = joblib.load(LABEL_ENCODER_PATH)
classifier = joblib.load(CLASSIFIER_PATH)

In [86]:
def _clean_and_lemmatize(raw_text):
    """Return raw_text cleaned by title_cleaner with each token lemmatized by its POS tag."""
    tagged_tokens = pos_tag(title_cleaner(raw_text).split())
    return ' '.join(
        wnl.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged_tokens
    )

# Encode the categorical columns as floats.
# NOTE(review): `le` is re-fitted on every test column here, so the integer
# codes come from the test data alone and the state loaded from
# 'label_encoder.pkl' is discarded. The codes will only match what the
# classifier saw if training produced identical mappings — confirm against
# the training notebook and persist one encoder per column if it did not.
l = ['country','site_url','author','language']
for col in l:
    test_data[col] = le.fit_transform(test_data[col]).astype(float)

# Clean and lemmatize each row's own title/text. Assigning the row-0 strings
# (`title_clean_wnl` / `text_clean_wnl`) to the whole column would broadcast
# the first article's text to every row, so every row would be scored on the
# same text.
for text_col in ('title', 'text'):
    test_data[text_col] = test_data[text_col].apply(_clean_and_lemmatize)

# Preview only the first rows instead of rendering the whole frame.
test_data.head()

Unnamed: 0,ord_in_thread,author,title,text,language,site_url,country,domain_rank,replies_count,participants_count,likes,comments,shares
0,0.0,0.0,thought greenwich winnetka hillary clinton lose,ap reluctant focus much hillary clinton electi...,0.0,0.0,0.0,283.0,0.0,1.0,0.0,0.0,0.0


In [87]:
# Score the prepared frame with the loaded classifier.
# presumably one row of class probabilities per input row (scikit-learn
# predict_proba convention) — confirm for the pickled model.
prediction = classifier.predict_proba(test_data)

In [88]:
# Display the first entry of `prediction`.
prediction[0]

array([ 0.14,  0.53,  0.33])