In [1]:
#importing libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.externals import joblib
from flask import Flask, request, jsonify
from clean_article import *
import webhoseio


wnl = WordNetLemmatizer()
cols = ["uuid", "ord_in_thread", "author", "published", "title", "text", "language", "crawled", "site_url", "country", "domain_rank", "thread_title", "spam_score", "main_img_url", "replies_count", "participants_count", "likes", "comments", "shares", "type"]



In [2]:
def title_cleaner(title):
    title = re.sub('[^a-zA-Z]',' ', title)
    title = title.lower()
    title = nltk.word_tokenize(title) 
    eng_stopwords = set(stopwords.words("english"))
    title = [w for w in title if not w in eng_stopwords]
    title = ' '.join([word for word in title])
    return(title)

def get_wordnet_pos(treebank_tag):
    '''Treebank to wordnet POS tag'''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n' #basecase POS

In [5]:
test_data = pd.read_table('Data_3-31.csv')
test_data = test_data[cols]
del test_data['uuid']
test_data = test_data.reset_index()
del test_data['index']
del test_data['thread_title']
del test_data['spam_score']
del test_data['main_img_url']
del test_data['published']
del test_data['crawled']
del test_data['type']
test_data['title'].fillna('', inplace=True)
test_data['text'].fillna('', inplace=True)
test_data.fillna(0, inplace=True)
title = title_cleaner(test_data['title'][0])
text = title_cleaner(test_data['text'][0])
# print(title, text)
test_data

Unnamed: 0,ord_in_thread,author,title,text,language,site_url,country,domain_rank,replies_count,participants_count,likes,comments,shares
0,3.0,Anonymous,,ANYTHING THAT COMES FROM NBC IS SIMPLY BULL . ...,english,abeldanger.net,US,0.0,3.0,1.0,0.0,0.0,0.0
1,4.0,Markus Allen,,Illegal and inappropriate behavior? Oh noooo...,english,abeldanger.net,US,0.0,4.0,1.0,0.0,0.0,0.0
2,0.0,tokyowashi (noreply@blogger.com),Russia and China Catch Security Council in a D...,"Source: Russia Insider \r\n\r\nMarch 25, 2017 ...",english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
3,1.0,Anonymous,,"Senator Black should be in that job , the JEWI...",english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
4,2.0,Anonymous,,"This sure sounds familiar , next scam uncovere...",english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
5,0.0,tokyowashi (noreply@blogger.com),BREAKING: Trump Calls For Clintons To Be INVES...,Source: Cash Daily \r\n\r\nPresident Trump's t...,english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
6,1.0,Anonymous,,"Senator Black should be in that job , the JEWI...",english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
7,2.0,Anonymous,,"This sure sounds familiar , next scam uncovere...",english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
8,1.0,Anonymous,,"Senator Black should be in that job , the JEWI...",english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0
9,0.0,tokyowashi (noreply@blogger.com),#2935: Obama Fake News Carbon Bridge / Boeing ...,From: United States Marine Field McConnell Pl...,english,abeldanger.net,US,0.0,2.0,1.0,0.0,0.0,0.0


In [6]:
title_tag = pos_tag(title.split())
title_clean_wnl = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in title_tag])
text_tag = pos_tag(text.split())
text_clean_wnl = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in text_tag])

In [137]:
le = joblib.load('label_encoder') 
classifier = joblib.load()

In [138]:
title_df = pd.DataFrame(title_vectorizer.fit_transform(np.array([title_clean_wnl])).toarray())
text_df = pd.DataFrame(text_vectorizer.fit_transform(np.array([text_clean_wnl])).toarray())
test_one = pd.concat([test_data, title_df, text_df], axis = 1)

del test_one['title']
del test_one['thread_title']
del test_one['text']
del test_one['spam_score']
del test_one['main_img_url']
del test_one['published']
del test_one['crawled']

In [139]:
l = ['country','site_url','author','language']
le=LabelEncoder()
for col in l:
    le.fit(test_one[col])
    test_one[col]=le.transform(test_one[col])
    
test_one

Unnamed: 0,ord_in_thread,author,language,site_url,country,domain_rank,replies_count,participants_count,likes,comments,...,40,41,42,43,44,45,46,47,48,49
0,0.0,0,0,0,0,0.0,2.0,1.0,0.0,0.0,...,4,6,3,3,5,4,14,19,3,5


In [166]:
train_data = pd.read_csv('../train_data.csv')
y_train_type = pd.read_csv('../y_train_type.csv')
x_train, x_test, y_train, y_test = train_test_split(train_data, y_train_type, test_size=0.05, random_state=42)
y_train = y_train['0'].tolist()
y_test = y_test['0'].tolist()
del x_train['Unnamed: 0']


forest = RandomForestClassifier(n_estimators = 50)
forest = forest.fit(x_train, y_train)

# x_train
predictions = np.array([[0.0, 0.0, 0.0]])
for i in range(5):
    pred_one = forest.predict_proba(test_one)
    predictions += pred_one
predictions/5

array([[ 0.08,  0.56,  0.36]])