In [18]:
#importing libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.externals import joblib


In [19]:
wnl = WordNetLemmatizer()
def title_cleaner(title):
    title = re.sub('[^a-zA-Z]',' ', title)
    title = title.lower()
    title = nltk.word_tokenize(title) 
    eng_stopwords = set(stopwords.words("english"))
    title = [w for w in title if not w in eng_stopwords]
    title = ' '.join([word for word in title])
    return(title)

def get_wordnet_pos(treebank_tag):
    '''Treebank to wordnet POS tag'''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n' #basecase POS

In [48]:
#reading the file
data = pd.read_pickle('DataAll_5-3.pkl')

del data['uuid']
data = data.dropna().reset_index()
del data['index']
del data['thread_title']
# del data['spam_score']
del data['main_img_url']
del data['published']
del data['crawled']

data[2:50]

Unnamed: 0,ord_in_thread,author,title,text,language,site_url,country,domain_rank,spam_score,replies_count,participants_count,likes,comments,shares,type
2,0.0,Barracuda Brigade,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,100percentfedup.com,US,25689,0.0,0.0,1.0,0.0,0.0,0.0,bias
3,0.0,Fed Up,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,100percentfedup.com,US,25689,0.068,0.0,0.0,0.0,0.0,0.0,bias
4,0.0,Fed Up,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,100percentfedup.com,US,25689,0.865,0.0,0.0,0.0,0.0,0.0,bias
5,0.0,Barracuda Brigade,Hillary Goes Absolutely Berserk On Protester A...,Print Hillary goes absolutely berserk! She exp...,english,100percentfedup.com,US,25689,0.0,0.0,1.0,0.0,0.0,0.0,bias
6,0.0,Fed Up,BREAKING! NYPD Ready To Make Arrests In Weiner...,BREAKING! NYPD Ready To Make Arrests In Weiner...,english,100percentfedup.com,US,25689,0.701,0.0,0.0,0.0,0.0,0.0,bias
7,0.0,Fed Up,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,BREAKING! NYPD Ready To Make Arrests In Weiner...,english,100percentfedup.com,US,25689,0.188,0.0,0.0,0.0,0.0,0.0,bias
8,0.0,Fed Up,BREAKING: CLINTON CLEARED...Was This A Coordin...,\nLimbaugh said that the revelations in the Wi...,english,100percentfedup.com,US,25689,0.144,0.0,0.0,0.0,0.0,0.0,bias
9,0.0,Fed Up,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""â¦Bu...",Email \nThese people are sick and evil. They w...,english,100percentfedup.com,US,25689,0.995,0.0,0.0,0.0,0.0,0.0,bias
10,0.0,EdJenner,YIKES! HILLARY GOES OFF THE RAILSâ¦Pulls A Ho...,,english,100percentfedup.com,US,25689,0.0,0.0,1.0,0.0,0.0,0.0,bias
11,0.0,Fed Up,SAY GOODBYE! These 23 Hollywood Celebs Threate...,\nWho? Comedian. \nWhere would she move? Spain...,english,100percentfedup.com,US,25689,0.998,0.0,0.0,0.0,0.0,0.0,bias


In [37]:
types = set(data['type'].tolist())
print(types)
result_type = [0 if val in ['fake'] else 1 for val in data['type']]
del data['type']

{'hate', 'bias', 'trustworthy', 'conspiracy', 'state', 'bs', 'junksci', 'fake', 'biased'}


In [38]:
num_title = len(data['title'])
title_clean_original = []
for i in range(0,num_title):
    if( (i+1)%1000 == 0 ):
         # print progress
        print("Done with %d title for title_clean_original" %(i+1)) 
    title_clean_original.append(title_cleaner(data['title'][i]))
    
num_text = len(data['text'])
text_clean_original = []
for i in range(0,num_text):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for text_clean_original" %(i+1)) 
    text_clean_original.append(title_cleaner(data['text'][i]))
        

Done with 1000 title for title_clean_original
Done with 2000 title for title_clean_original
Done with 3000 title for title_clean_original
Done with 4000 title for title_clean_original
Done with 5000 title for title_clean_original
Done with 6000 title for title_clean_original
Done with 7000 title for title_clean_original
Done with 500 title for text_clean_original
Done with 1000 title for text_clean_original
Done with 1500 title for text_clean_original
Done with 2000 title for text_clean_original
Done with 2500 title for text_clean_original
Done with 3000 title for text_clean_original
Done with 3500 title for text_clean_original
Done with 4000 title for text_clean_original
Done with 4500 title for text_clean_original
Done with 5000 title for text_clean_original
Done with 5500 title for text_clean_original
Done with 6000 title for text_clean_original
Done with 6500 title for text_clean_original
Done with 7000 title for text_clean_original
Done with 7500 title for text_clean_original


In [39]:
title_clean_wnl = []
for i, val in enumerate(title_clean_original):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for title_clean_wnl" %(i+1)) 
    l = pos_tag(val.split())
    temp = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in l])
    title_clean_wnl.append(temp)
    
text_clean_wnl = []
for i, val in enumerate(text_clean_original):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for text_clean_wnl" %(i+1)) 
    l = pos_tag(val.split())
    temp = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in l])
    text_clean_wnl.append(temp)

Done with 500 title for title_clean_wnl
Done with 1000 title for title_clean_wnl
Done with 1500 title for title_clean_wnl
Done with 2000 title for title_clean_wnl
Done with 2500 title for title_clean_wnl
Done with 3000 title for title_clean_wnl
Done with 3500 title for title_clean_wnl
Done with 4000 title for title_clean_wnl
Done with 4500 title for title_clean_wnl
Done with 5000 title for title_clean_wnl
Done with 5500 title for title_clean_wnl
Done with 6000 title for title_clean_wnl
Done with 6500 title for title_clean_wnl
Done with 7000 title for title_clean_wnl
Done with 7500 title for title_clean_wnl
Done with 500 title for text_clean_wnl
Done with 1000 title for text_clean_wnl
Done with 1500 title for text_clean_wnl
Done with 2000 title for text_clean_wnl
Done with 2500 title for text_clean_wnl
Done with 3000 title for text_clean_wnl
Done with 3500 title for text_clean_wnl
Done with 4000 title for text_clean_wnl
Done with 4500 title for text_clean_wnl
Done with 5000 title for te

In [40]:
data['text'] = text_clean_wnl
data['title'] = title_clean_wnl
train_data = data.interpolate()
le = LabelEncoder()
def encode_columns():
    l = ['country','site_url','author','language']
    for col in l:
        le.fit(train_data[col])
        train_data[col]=le.transform(train_data[col])
        train_data[col] = train_data[col].astype(float)
train_data['domain_rank'] = train_data['domain_rank'].astype(float)
encode_columns()
train_data.head()

Unnamed: 0,ord_in_thread,author,title,text,language,site_url,country,domain_rank,replies_count,participants_count,likes,comments,shares
0,0.0,210.0,muslim bust stole million gov benefit,print pay back money plus interest entire fami...,3.0,0.0,16.0,25689.0,0.0,1.0,0.0,0.0,0.0
1,0.0,2178.0,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,3.0,0.0,16.0,25689.0,0.0,1.0,0.0,0.0,0.0
2,0.0,210.0,break weiner cooperate fbi hillary email inves...,red state fox news sunday report morning antho...,3.0,0.0,16.0,25689.0,0.0,1.0,0.0,0.0,0.0
3,0.0,628.0,pin drop speech father daughter kidnap kill is...,email kayla mueller prisoner torture isi chanc...,3.0,0.0,16.0,25689.0,0.0,0.0,0.0,0.0,0.0
4,0.0,628.0,fantastic trump point plan reform healthcare b...,email healthcare reform make america great sin...,3.0,0.0,16.0,25689.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# create y_train data for spam and result
y_train_type = pd.DataFrame(result_type)
# write trained_data to csv for quick training later
train_data.to_csv('train_data.csv')
y_train_type.to_csv('y_train_type.csv')

In [42]:
vec = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 250) 

def featurize(vectorizer):
    return DataFrameMapper([(['ord_in_thread'], None),
                            ('author', None),
                            ('title', vectorizer),
                            ('text', vectorizer),
                            ('language', None),
                            ('site_url', None),                                       
                            ('country', None),
                            ('domain_rank', None),
                            ('replies_count', None),
                            ('participants_count', None),
                            ('likes', None),
                            ('comments', None),
                            ('shares', None)])

In [43]:
# create 50/50  random test/train split (can change test size ratio) for cross validation
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data, y_train_type, test_size=0.50, random_state=42)
y_train = y_train[0].tolist()
y_test = y_test[0].tolist()



In [44]:
x_train.dtypes
train_data['domain_rank'] = train_data['domain_rank'].astype(float)

In [45]:
from sklearn.pipeline import Pipeline
forest = RandomForestClassifier(n_estimators = 100)
pipe = Pipeline([('featurize', featurize(vec)), ('forest', forest)])
pipe.fit_transform(x_train, y_train)
x_test_pred = pipe.predict(x_test)
accuracy = metrics.accuracy_score(y_test,x_test_pred)
accuracy



0.9656640936917753

In [29]:
from sklearn.externals import joblib
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(pipe, 'classifier.pkl')

['classifier.pkl']

In [192]:
from sklearn import linear_model, datasets
logreg = linear_model.LogisticRegression(C=1e5)
logreg = logreg.fit(x_train, y_train)

x_test_pred = logreg.predict(x_test)
accuracy  = metrics.accuracy_score(y_test,x_test_pred)
accuracy

0.87324542747766909