In [53]:
#importing libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
def title_cleaner(title):
    title = re.sub('[^a-zA-Z]',' ', title)
    title = title.lower()
    title = nltk.word_tokenize(title) 
    eng_stopwords = set(stopwords.words("english"))
    title = [w for w in title if not w in eng_stopwords]
    title = ' '.join([word for word in title])
    return(title)

wnl = WordNetLemmatizer()
def get_wordnet_pos(treebank_tag):
    '''Treebank to wordnet POS tag'''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n' #basecase POS

In [10]:
#reading the file
data = pd.read_csv('fake.csv')

In [11]:
del data['uuid']

In [12]:
data = data.dropna().reset_index()

In [13]:
del data['index']

In [14]:
positive = data[data['spam_score'] < 0.5]
negative = data[data['spam_score'] >= 0.5]
no_of_positive = len(positive)
no_of_negative = len(negative)
print ("Number of not spam news: ", no_of_positive)
print ("Number of spam news: :", no_of_negative)
print ("Percentage of spam vs ham:" ,no_of_negative*100/no_of_positive)

Number of not spam news:  4656
Number of spam news: : 46
Percentage of spam vs ham: 0.9879725085910653


In [15]:
result = [0 if val < 0.5 else 1 for val in data['spam_score']]

In [16]:
len_of_reviews = np.array([])
for val in data['title']:
    temp = len(val)
    len_of_reviews = np.append(len_of_reviews,temp)

print ("The average length of all the reviews: ",len_of_reviews.max())

The average length of all the reviews:  277.0


In [17]:
num_title = len(data['title'])
title_clean_original = []
for i in range(0,num_title - 1):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for title_clean_original" %(i+1)) 
    title_clean_original.append(title_cleaner(data['title'][i]))
    
num_text = len(data['text'])
text_clean_original = []
for i in range(0,num_text - 1):
    if( (i+1)%500 == 0 ):
         # print progress
        print("Done with %d title for title_clean_original" %(i+1)) 
    text_clean_original.append(title_cleaner(data['text'][i]))
        

Done with 500 title for title_clean_original
Done with 1000 title for title_clean_original
Done with 1500 title for title_clean_original
Done with 2000 title for title_clean_original
Done with 2500 title for title_clean_original
Done with 3000 title for title_clean_original
Done with 3500 title for title_clean_original
Done with 4000 title for title_clean_original
Done with 4500 title for title_clean_original
Done with 500 title for title_clean_original
Done with 1000 title for title_clean_original
Done with 1500 title for title_clean_original
Done with 2000 title for title_clean_original
Done with 2500 title for title_clean_original
Done with 3000 title for title_clean_original
Done with 3500 title for title_clean_original
Done with 4000 title for title_clean_original
Done with 4500 title for title_clean_original


In [18]:
title_clean_wnl = []
for val in title_clean_original:
    l = pos_tag(val.split())
    temp = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in l])
    title_clean_wnl.append(temp)
    
text_clean_wnl = []
for val in text_clean_original:
    l = pos_tag(val.split())
    temp = ' '.join([wnl.lemmatize(w,pos=get_wordnet_pos(t)) for w,t in l])
    text_clean_wnl.append(temp)

In [19]:
vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 277) 

In [21]:
df = pd.DataFrame(vectorizer.fit_transform(data['title']).toarray())
df1 = pd.DataFrame(vectorizer.fit_transform(data['title']).toarray())

In [22]:
train_data = pd.concat([data,df,df1], axis = 1)

In [23]:
del train_data['title']

In [24]:
del train_data['thread_title']

In [25]:
del train_data['text']

In [26]:
del train_data['type']
del train_data['spam_score']

In [27]:
del train_data['main_img_url']

In [28]:
del train_data['published']
del train_data['crawled']

In [47]:
l = ['country','site_url','author','language']
le=LabelEncoder()
for col in l:
    le.fit(train_data[col])
    train_data[col]=le.transform(train_data[col])

In [48]:
y_train = result

In [50]:
forest = RandomForestClassifier(n_estimators = 50) 
forest = forest.fit(train_data, y_train)

In [54]:
prediction = forest.predict(train_data)
accuracy  = metrics.accuracy_score(y_train,prediction)
accuracy

1.0