In [1]:
#importing libraries
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import sent_tokenize
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
_ENG_STOPWORDS = None  # filled in on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it once."""
    global _ENG_STOPWORDS
    if _ENG_STOPWORDS is None:
        # Loaded lazily so defining this cell does not touch the corpus, and
        # cached so the list is not rebuilt for every document that is cleaned.
        _ENG_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENG_STOPWORDS


def title_cleaner(title):
    """Normalise a piece of text into a space-separated string of tokens.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased;
      3. it is tokenised with ``nltk.word_tokenize``;
      4. tokens found in the NLTK English stop-word list are removed;
      5. the remaining tokens are joined with single spaces.

    Parameters
    ----------
    title : str
        Raw text (used in this notebook for both titles and article bodies).

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', title)
    tokens = nltk.word_tokenize(letters_only.lower())
    eng_stopwords = _english_stopwords()
    return ' '.join(w for w in tokens if w not in eng_stopwords)

# Module-level lemmatizer instance.
wnl = WordNetLemmatizer()


def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into a WordNet POS value.

    The tag is matched on its leading letter: J -> wordnet.ADJ,
    V -> wordnet.VERB, N -> wordnet.NOUN, R -> wordnet.ADV.
    Any other tag yields the fallback value 'n'.
    """
    # (tag prefix, attribute name on `wordnet`), checked in this order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so `wordnet` is untouched otherwise.
            return getattr(wordnet, attr_name)
    return 'n'  # basecase POS

In [3]:
#reading the file
# Load the dataset, discard the 'uuid' column, drop every row that contains a
# missing value, and renumber the surviving rows from 0.
# reset_index(drop=True) throws the old index away directly, rather than first
# turning it into an 'index' column that then has to be deleted by name.
data = (
    pd.read_csv('fake.csv')
    .drop(columns=['uuid'])
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Split the rows at a spam_score threshold of 0.5 and report both group sizes.
spam_score = data['spam_score']
positive = data[spam_score.lt(0.5)]  # rows scored below 0.5 (reported as "not spam")
negative = data[spam_score.ge(0.5)]  # rows scored 0.5 or above (reported as "spam")
no_of_positive, no_of_negative = len(positive), len(negative)

print("Number of not spam news: ", no_of_positive)
print("Number of spam news: :", no_of_negative)
# Spam count expressed as a percentage of the not-spam count.
print("Percentage of spam vs ham:", no_of_negative * 100 / no_of_positive)

Number of not spam news:  4656
Number of spam news: : 46
Percentage of spam vs ham: 0.9879725085910653


In [5]:
# Take the 'type' column out of `data` (it is removed from the frame) and keep
# it aside to build the labels below.
type_column = data.pop('type')
types = set(type_column.tolist())
print(types)

# Binary spam label: 0 when spam_score is below 0.5, otherwise 1.
result_spam = []
for score in data['spam_score']:
    result_spam.append(0 if score < 0.5 else 1)

# Three-way label derived from the article type:
# 'bs'/'fake' -> 2, 'bias'/'hate' -> 1, every other type -> 0.
label_by_type = {'bs': 2, 'fake': 2, 'bias': 1, 'hate': 1}
result_type = [label_by_type.get(kind, 0) for kind in type_column]

{'fake', 'hate', 'conspiracy', 'state', 'junksci', 'bias', 'bs'}


In [6]:
# Character length of every title, as a float array. Built in one pass rather
# than with np.append inside a loop, which copies the whole array each time.
len_of_reviews = np.array([len(val) for val in data['title']], dtype=float)
# The value reported is the maximum title length, so the label says so.
print ("The maximum length of all the titles: ",len_of_reviews.max())

The average length of all the reviews:  277.0


In [7]:
def _clean_column(texts, label):
    """Run title_cleaner over every entry of `texts` and return the results.

    Parameters
    ----------
    texts : iterable of str
        The raw strings to clean, in order.
    label : str
        Name of the list being built; used only in the progress message.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order.
    """
    cleaned = []
    for count, raw in enumerate(texts, start=1):
        if count % 500 == 0:
            # print progress
            print("Done with %d title for %s" % (count, label))
        cleaned.append(title_cleaner(raw))
    return cleaned


# Iterate over the column values themselves so that every row, including the
# last one, is cleaned; a range(0, n - 1) index loop stops one row short.
num_title = len(data['title'])
title_clean_original = _clean_column(data['title'], 'title_clean_original')

num_text = len(data['text'])
text_clean_original = _clean_column(data['text'], 'text_clean_original')
        

Done with 500 title for title_clean_original
Done with 1000 title for title_clean_original
Done with 1500 title for title_clean_original
Done with 2000 title for title_clean_original
Done with 2500 title for title_clean_original
Done with 3000 title for title_clean_original
Done with 3500 title for title_clean_original
Done with 4000 title for title_clean_original
Done with 4500 title for title_clean_original
Done with 500 title for title_clean_original
Done with 1000 title for title_clean_original
Done with 1500 title for title_clean_original
Done with 2000 title for title_clean_original
Done with 2500 title for title_clean_original
Done with 3000 title for title_clean_original
Done with 3500 title for title_clean_original
Done with 4000 title for title_clean_original
Done with 4500 title for title_clean_original


In [8]:
def _lemmatize_all(cleaned_texts, progress_message):
    """POS-tag and lemmatize each string in `cleaned_texts`.

    Each string is split on whitespace, tagged with pos_tag, and every word is
    lemmatized with `wnl` using the POS returned by get_wordnet_pos. The
    lemmas are re-joined with single spaces.

    `progress_message` is a %-format string taking one integer; it is printed
    after every 500 entries.

    Returns a list with one lemmatized string per input string.
    """
    lemmatized = []
    for position, text in enumerate(cleaned_texts):
        done = position + 1
        if done % 500 == 0:
            # print progress
            print(progress_message % done)
        tagged = pos_tag(text.split())
        lemmas = [wnl.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged]
        lemmatized.append(' '.join(lemmas))
    return lemmatized


title_clean_wnl = _lemmatize_all(
    title_clean_original, "Done with %d title for title_clean_wnl"
)
text_clean_wnl = _lemmatize_all(
    text_clean_original, "Done with %d title for text_clean_wnl"
)

Done with 500 title for title_clean_wnl
Done with 1000 title for title_clean_wnl
Done with 1500 title for title_clean_wnl
Done with 2000 title for title_clean_wnl
Done with 2500 title for title_clean_wnl
Done with 3000 title for title_clean_wnl
Done with 3500 title for title_clean_wnl
Done with 4000 title for title_clean_wnl
Done with 4500 title for title_clean_wnl
Done with 500 title for text_clean_wnl
Done with 1000 title for text_clean_wnl
Done with 1500 title for text_clean_wnl
Done with 2000 title for text_clean_wnl
Done with 2500 title for text_clean_wnl
Done with 3000 title for text_clean_wnl
Done with 3500 title for text_clean_wnl
Done with 4000 title for text_clean_wnl
Done with 4500 title for text_clean_wnl


In [9]:
# Word-level count vectorizer with no custom tokenizer, preprocessor or
# stop-word list; max_features limits the vocabulary to 277 entries.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=277,
)

In [27]:
def _count_frame(documents):
    """Fit the shared `vectorizer` on `documents` and return the dense counts as a DataFrame."""
    return pd.DataFrame(vectorizer.fit_transform(documents).toarray())


# Count matrices for the lemmatized titles and article bodies. The same
# vectorizer is re-fit by each call, so afterwards it holds the vocabulary
# learned from text_clean_wnl.
df = _count_frame(title_clean_wnl)
df1 = _count_frame(text_clean_wnl)

# NOTE(review): only `data` is passed to concat, so df and df1 are not part of
# train_data - confirm whether the text features were meant to be included.
train_data = pd.concat([data], axis = 1).interpolate()

In [28]:
# clean trained dataset
# Remove, one at a time and in this order, the columns that are not kept as
# model inputs.
for unused_column in (
    'title',
    'thread_title',
    'text',
    'spam_score',
    'main_img_url',
    'published',
    'crawled',
):
    del train_data[unused_column]

In [29]:
# Columns whose values are replaced by integer codes from a LabelEncoder.
l = ['country', 'site_url', 'author', 'language']
le = LabelEncoder()
for col in l:
    # The single encoder is re-fit on each column in turn, so after the loop
    # it holds the classes of the last column only.
    train_data[col] = le.fit_transform(train_data[col])

In [30]:
# create y_train data for spam and result
# Each label list becomes a one-column DataFrame (column label 0).
y_train_spam = pd.DataFrame(data=result_spam)
y_train_type = pd.DataFrame(data=result_type)

In [31]:
# write trained_data to csv for quick training later
# Each frame is written, index included, to its own CSV file in the working
# directory.
for frame, csv_name in (
    (train_data, 'train_data.csv'),
    (y_train_spam, 'y_train_spam.csv'),
    (y_train_type, 'y_train_type.csv'),
):
    frame.to_csv(csv_name)

In [35]:
# Hold out a random 50% of the rows as a test set (change test_size to alter
# the ratio). This is a single train/test split, not k-fold cross validation;
# random_state is fixed so the same split is produced on every run.
# train_test_split is imported with the other libraries at the top.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, y_train_type, test_size=0.50, random_state=42
)
# The targets come back as one-column frames; flatten column 0 to plain lists.
y_train = y_train[0].tolist()
y_test = y_test[0].tolist()

In [37]:
# Random forest with 40 trees. random_state is pinned (to the same seed as the
# train/test split) so the fitted model and the accuracy shown below are
# reproducible from one run to the next.
forest = RandomForestClassifier(n_estimators = 40, random_state = 42)
forest = forest.fit(x_train, y_train)
# Despite its name, x_test_pred holds the predicted labels for x_test.
x_test_pred = forest.predict(x_test)
accuracy  = metrics.accuracy_score(y_test,x_test_pred)
accuracy

0.93449595916631223

In [88]:
# Logistic regression fitted on the same split as the random forest.
# linear_model (and datasets) are imported with the other libraries at the
# top, so nothing is re-imported here.
logreg = linear_model.LogisticRegression(C=1e5)
logreg = logreg.fit(x_train, y_train)

# Despite its name, x_test_pred holds the predicted labels for x_test; it
# overwrites the random-forest predictions stored under the same name.
x_test_pred = logreg.predict(x_test)
accuracy  = metrics.accuracy_score(y_test,x_test_pred)
accuracy

0.87111867290514677