In [174]:
#importing libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [175]:
def title_cleaner(title):
    title = re.sub('[^a-zA-Z]',' ', title)
    title = title.lower()
    title = nltk.word_tokenize(title) 
    eng_stopwords = set(stopwords.words("english"))
    title = [w for w in title if not w in eng_stopwords]
    title = ' '.join([word for word in title])
    return(title)

wnl = WordNetLemmatizer()
def get_wordnet_pos(treebank_tag):
    '''Treebank to wordnet POS tag'''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n' #basecase POS

In [176]:
#reading the file
data = pd.read_csv('fake.csv')

del data['uuid']
data = data.dropna().reset_index()
del data['index']

In [177]:
positive = data[data['spam_score'] < 0.5]
negative = data[data['spam_score'] >= 0.5]
no_of_positive = len(positive)
no_of_negative = len(negative)
print ("Number of not spam news: ", no_of_positive)
print ("Number of spam news: :", no_of_negative)
print ("Percentage of spam vs ham:" ,no_of_negative*100/no_of_positive)

Number of not spam news:  4656
Number of spam news: : 46
Percentage of spam vs ham: 0.9879725085910653


In [178]:
types = set(data['type'].tolist())
print(types)
result_spam = [0 if val < 0.5 else 1 for val in data['spam_score']]
result_type = [2 if val in ['bs', 'fake'] else 1 if val in ['bias', 'hate'] else 0 for val in data['type']]
del data['type']

{'fake', 'hate', 'conspiracy', 'state', 'junksci', 'bias', 'bs'}


In [179]:
data

Unnamed: 0,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares
0,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0
1,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0
2,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.000,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0
3,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0
4,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0
5,0,Barracuda Brigade,2016-11-02T16:31:28.550+02:00,Hillary Goes Absolutely Berserk On Protester A...,Print Hillary goes absolutely berserk! She exp...,english,2016-11-02T16:31:28.550+02:00,100percentfedup.com,US,25689.0,Hillary Goes Absolutely Berserk On Protester A...,0.000,http://bb4sp.com/wp-content/uploads/2016/11/Fu...,0,1,0,0,0
6,0,Fed Up,2016-11-04T19:40:00.000+02:00,BREAKING! NYPD Ready To Make Arrests In Weiner...,BREAKING! NYPD Ready To Make Arrests In Weiner...,english,2016-11-05T02:13:46.065+02:00,100percentfedup.com,US,25689.0,BREAKING! NYPD Ready To Make Arrests In Weiner...,0.701,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0
7,0,Fed Up,2016-11-05T01:19:00.000+02:00,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,BREAKING! NYPD Ready To Make Arrests In Weiner...,english,2016-11-05T05:59:07.458+02:00,100percentfedup.com,US,25689.0,WOW! WHISTLEBLOWER TELLS CHILLING STORY Of Mas...,0.188,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0
8,0,Fed Up,2016-11-06T23:54:00.000+02:00,BREAKING: CLINTON CLEARED...Was This A Coordin...,\nLimbaugh said that the revelations in the Wi...,english,2016-11-07T10:20:06.409+02:00,100percentfedup.com,US,25689.0,BREAKING: CLINTON CLEARED...Was This A Coordin...,0.144,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0
9,0,Fed Up,2016-11-07T02:43:00.000+02:00,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",Email \nThese people are sick and evil. They w...,english,2016-11-07T10:20:27.252+02:00,100percentfedup.com,US,25689.0,"EVIL HILLARY SUPPORTERS Yell ""F*ck Trump""…Burn...",0.995,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0


In [180]:
len_of_reviews = np.array([])
for val in data['title']:
    temp = len(val)
    len_of_reviews = np.append(len_of_reviews,temp)
print ("The average length of all the reviews: ",len_of_reviews.max())

The average length of all the reviews:  277.0


In [181]:
num_title = len(data['title'])
title_clean_original = []
for i in range(0,num_title - 1):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for title_clean_original" %(i+1)) 
    title_clean_original.append(title_cleaner(data['title'][i]))
    
num_text = len(data['text'])
text_clean_original = []
for i in range(0,num_text - 1):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for text_clean_original" %(i+1)) 
    text_clean_original.append(title_cleaner(data['text'][i]))
        

Done with 500 title for title_clean_original
Done with 1000 title for title_clean_original
Done with 1500 title for title_clean_original
Done with 2000 title for title_clean_original
Done with 2500 title for title_clean_original
Done with 3000 title for title_clean_original
Done with 3500 title for title_clean_original
Done with 4000 title for title_clean_original
Done with 4500 title for title_clean_original
Done with 500 title for text_clean_original
Done with 1000 title for text_clean_original
Done with 1500 title for text_clean_original
Done with 2000 title for text_clean_original
Done with 2500 title for text_clean_original
Done with 3000 title for text_clean_original
Done with 3500 title for text_clean_original
Done with 4000 title for text_clean_original
Done with 4500 title for text_clean_original


In [182]:
title_clean_wnl = []
for i, val in enumerate(title_clean_original):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for title_clean_wnl" %(i+1)) 
    l = pos_tag(val.split())
    temp = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in l])
    title_clean_wnl.append(temp)
    
text_clean_wnl = []
for i, val in enumerate(text_clean_original):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for text_clean_wnl" %(i+1)) 
    l = pos_tag(val.split())
    temp = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in l])
    text_clean_wnl.append(temp)

Done with 500 title for title_clean_wnl
Done with 1000 title for title_clean_wnl
Done with 1500 title for title_clean_wnl
Done with 2000 title for title_clean_wnl
Done with 2500 title for title_clean_wnl
Done with 3000 title for title_clean_wnl
Done with 3500 title for title_clean_wnl
Done with 4000 title for title_clean_wnl
Done with 4500 title for title_clean_wnl
Done with 500 title for text_clean_wnl
Done with 1000 title for text_clean_wnl
Done with 1500 title for text_clean_wnl
Done with 2000 title for text_clean_wnl
Done with 2500 title for text_clean_wnl
Done with 3000 title for text_clean_wnl
Done with 3500 title for text_clean_wnl
Done with 4000 title for text_clean_wnl
Done with 4500 title for text_clean_wnl


In [183]:
title_vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 5) 

text_vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 50) 

In [184]:
title_df = pd.DataFrame(title_vectorizer.fit_transform(title_clean_wnl).toarray())
text_df = pd.DataFrame(text_vectorizer.fit_transform(text_clean_wnl).toarray())
train_data = pd.concat([data, title_df, text_df], axis = 1)
train_data = train_data.interpolate()


In [185]:
# clean trained dataset
del train_data['title']
del train_data['thread_title']
del train_data['text']
del train_data['spam_score']
del train_data['main_img_url']
del train_data['published']
del train_data['crawled']

In [186]:
l = ['country','site_url','author','language']
le=LabelEncoder()
for col in l:
    le.fit(train_data[col])
    train_data[col]=le.transform(train_data[col])

In [187]:
# create y_train data for spam and result
y_train_spam = pd.DataFrame(result_spam)
y_train_type = pd.DataFrame(result_type)

In [188]:
train_data

Unnamed: 0,ord_in_thread,author,language,site_url,country,domain_rank,replies_count,participants_count,likes,comments,...,40,41,42,43,44,45,46,47,48,49
0,0,104,3,0,10,25689.0,0,1,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,1023,3,0,10,25689.0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0,104,3,0,10,25689.0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,310,3,0,10,25689.0,0,0,0,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,310,3,0,10,25689.0,0,0,0,0,...,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0
5,0,104,3,0,10,25689.0,0,1,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,310,3,0,10,25689.0,0,0,0,0,...,0.0,1.0,0.0,3.0,1.0,2.0,0.0,0.0,0.0,3.0
7,0,310,3,0,10,25689.0,0,0,0,0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
8,0,310,3,0,10,25689.0,0,0,0,0,...,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0
9,0,310,3,0,10,25689.0,0,0,0,0,...,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [189]:
# write trained_data to csv for quick training later
train_data.to_csv('train_data.csv')
y_train_spam.to_csv('y_train_spam.csv')
y_train_type.to_csv('y_train_type.csv')

In [190]:
# create 50/50  random test/train split (can change test size ratio) for cross validation
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data, y_train_type, test_size=0.50, random_state=42)
y_train = y_train[0].tolist()
y_test = y_test[0].tolist()

In [191]:
forest = RandomForestClassifier(n_estimators = 100) 
forest = forest.fit(x_train, y_train)
x_test_pred = forest.predict(x_test)
accuracy  = metrics.accuracy_score(y_test,x_test_pred)
accuracy

0.96469587409612934

In [192]:
from sklearn import linear_model, datasets
logreg = linear_model.LogisticRegression(C=1e5)
logreg = logreg.fit(x_train, y_train)

x_test_pred = logreg.predict(x_test)
accuracy  = metrics.accuracy_score(y_test,x_test_pred)
accuracy

0.87324542747766909