In [1]:
#import the required libraries

from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 

from sklearn import metrics
from sklearn.metrics import confusion_matrix
%matplotlib inline

#import seaborn as sns

import numpy as np 
import pandas as pd 

import os
import re
import nltk 

In [2]:
#read the train and test datasets

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
#print the shape i.e. total rows and columns

print(train.shape, test.shape)

(20800, 5) (5200, 4)


## Data Cleaning and preprocessing:

In [6]:
#find the total null values in each column

print("Train data info: \n")
print(train.isnull().sum())
print("\nTest data info: \n")
print(test.isnull().sum())

Train data info: 

id           0
title      558
author    1957
text        39
label        0
dtype: int64

Test data info: 

id          0
title     122
author    503
text        7
dtype: int64


In [7]:
#Fill missing values with blank spaces
test = test.fillna(' ')          
train = train.fillna(' ')

#Combine the columns title, author and text into one named total 
test['total'] = test['title'] + ' ' + test['author'] + test['text']          
train['total'] = train['title'] + ' ' + train['author'] + train['text']

In [8]:
#returns a pd series in which only real news is present
#as label 1 corresponds to real news
train[train['label']==1].total

0        House Dem Aide: We Didn’t Even See Comey’s Let...
2        Why the Truth Might Get You Fired Consortiumne...
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
6        Life: Life Of Luxury: Elton John’s 6 Favorite ...
                               ...                        
20788    Maine’s Gov. LePage Threatens To ‘Investigate’...
20791    Lawyer Who Kept Hillary Campaign Chief Out of ...
20793    Idiot Who Destroyed Trump Hollywood Star Gets ...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799    What Keeps the F-35 Alive David Swanson  David...
Name: total, Length: 10413, dtype: object

'''!()-[]{};:'"\, <>./?@#$%^&*_~'''

In [11]:
#Downloading nltk data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
#import stopwords from the nltk corpus
from nltk.corpus import stopwords

# use stopwords of english language
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shrik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

#we will iterate through each row of the train dataset

for index, row in train.iterrows():
    
    #every row of 'total' column will be replaced by this filtered sentence
    filter_sentence = ''
    sentence = row['total']
    
    #remove all special characters, punctuations, symbols, digits using sub() method of python regex module
    sentence = re.sub(r'[^\w\s]','',sentence) 
   
    #tokenize the sentences
    words = nltk.word_tokenize(sentence)
    
    #list comprehension to remove stop words
    words = [w for w in words if not w in stop_words]  #stopwords removal
    
    for word in words:
    #lemmatize every word i.e convert every word to root form
    #also convert every word to it's lowercase
        filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(word)).lower()
    
    #change the data at the particular index by using loc attribute of pandas dataframe
    train.loc[index,'total'] = filter_sentence


## Feature Extraction with just CountVectorizer()(Bag of Words):

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer  #Tfidf Vector
from sklearn.feature_extraction.text import CountVectorizer   #Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer   #Bag of Words + Tfidf

In [205]:
train = train[['total','label']]

In [206]:
X_train = train['total']       #News in X_train
Y_train = train['label']       #Label(Fake or Real) in Y_train

In [207]:
#Convert into matrix form
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)
freq_term_matrix = count_vectorizer.transform(X_train)

In [208]:
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(freq_term_matrix, Y_train, random_state=0)

In [209]:
logreg = LogisticRegression(C=1e5, max_iter=1000)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
print("Accuracy of logistic regression model with BOW: {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of logistic regression model with BOW: 97.04%


array([[2476,   88],
       [  66, 2570]], dtype=int64)

In [210]:
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
pred = NB.predict(X_test)
print("Accuracy of Multionomial Naive Bayes model with BOW: {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of Multionomial Naive Bayes model with BOW: 91.88%


array([[2515,   49],
       [ 373, 2263]], dtype=int64)

In [213]:
from sklearn import svm
svmmod = svm.SVC()
svmmod.fit(X_train, y_train)
pred = svmmod.predict(X_test)
print("Accuracy of Support Vector Machine model with BOW: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of Support Vector Machine model with BOW: 93.88%


array([[2429,  135],
       [  55, 2581]], dtype=int64)

In [211]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
sc = SGDClassifier()
SGDmodel = sc.fit(X_train, y_train)
prediction = SGDmodel.predict(X_test)
print("Accuracy of Stochastic gradient descent model with BOW: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
cm = confusion_matrix(y_test, prediction)
cm

Accuracy of Stochastic gradient descent model with BOW: 96.5%


array([[2484,   80],
       [ 102, 2534]], dtype=int64)

In [212]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()
RandomForestmodel = RF.fit(X_train, y_train) 
prediction = RandomForestmodel.predict(X_test)
print("Accuracy of Random Forest Classifier model with BOW: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
cm = confusion_matrix(y_test, prediction)
cm

Accuracy of Random Forest Classifier model with BOW: 93.88%


array([[2492,   72],
       [ 246, 2390]], dtype=int64)

## Feature Extraction using CountVectorizer and TfIdf-Transformer

In [214]:
train = train[['total','label']]

In [215]:
X_train = train['total']
Y_train = train['label']

In [216]:
#Feature extraction using count vectorization and tfidf.
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X_train)
freq_term_matrix = count_vectorizer.transform(X_train)
tfidf = TfidfTransformer()
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

In [217]:
tf_idf_matrix

<20800x220387 sparse matrix of type '<class 'numpy.float64'>'
	with 5987666 stored elements in Compressed Sparse Row format>

In [218]:
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, Y_train, random_state=0)

In [219]:
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
print("Accuracy of logistic regression model with BOW and TF-IDF: {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of logistic regression model with BOW and TF-IDF: 97.79%


array([[2493,   71],
       [  44, 2592]], dtype=int64)

In [220]:
from sklearn.linear_model import LogisticRegression

In [221]:
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
pred = NB.predict(X_test)
print("Accuracy of Multionomial Naive Bayes model with BOW and TF-IDF: {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of Multionomial Naive Bayes model with BOW and TF-IDF: 83.48%


array([[2558,    6],
       [ 853, 1783]], dtype=int64)

In [222]:
from sklearn import svm
svmmod = svm.SVC()
svmmod.fit(X_train, y_train)
pred = svmmod.predict(X_test)
print("accuracy : {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

accuracy : 97.33%


array([[2480,   84],
       [  55, 2581]], dtype=int64)

In [224]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
sc = SGDClassifier()
SGDmodel = sc.fit(X_train, y_train)
prediction = SGDmodel.predict(X_test)
print("Accuracy of Stochastic gradient descent model with BOW and TF-IDF: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
cm = confusion_matrix(y_test, prediction)
cm

Accuracy of Stochastic gradient descent model with BOW and TF-IDF: 97.56%


array([[2485,   79],
       [  48, 2588]], dtype=int64)

In [225]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()
RandomForestmodel = RF.fit(X_train, y_train) 
prediction = RandomForestmodel.predict(X_test)
print("Accuracy of Random Forest Classifier model with BOW and TF-IDF: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of Random Forest Classifier model with BOW and TF-IDF: 93.54%


array([[2480,   84],
       [  55, 2581]], dtype=int64)

## Feature Extraction with CountVectorizer, TfidfVectorizer and N-grams

In [16]:
train = train[['total','label']]

In [17]:
X_train = train['total']
Y_train = train['label']

In [18]:
#Feature extraction using count vectorization and tfidf.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (3, 3))
tfidf_vectorizer.fit_transform(X_train)
tf_idf_matrix = tfidf_vectorizer.transform(X_train)

In [19]:
#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, Y_train, random_state=0)

In [20]:
tf_idf_matrix

<20800x7914553 sparse matrix of type '<class 'numpy.float64'>'
	with 9385902 stored elements in Compressed Sparse Row format>

In [231]:
logreg = LogisticRegression(C=1e5, max_iter = 1000)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)
print("Accuracy of logistic regression model with BOW and TF-IDF and N-Grams(Trigram): {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of logistic regression model with BOW and TF-IDF and N-Grams(Trigram): 89.88%


array([[2065,  499],
       [  27, 2609]], dtype=int64)

In [232]:
from sklearn.naive_bayes import MultinomialNB

NB = MultinomialNB()
NB.fit(X_train, y_train)
pred = NB.predict(X_test)
print("Accuracy of Multionomial Naive Bayes model with BOW and TF-IDF and N-Grams(Trigram): {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of Multionomial Naive Bayes model with BOW and TF-IDF and N-Grams(Trigram): 93.65%


array([[2308,  256],
       [  74, 2562]], dtype=int64)

In [234]:
from sklearn import svm
svmmod = svm.SVC()
svmmod.fit(X_train, y_train)
pred = svmmod.predict(X_test)
print("Accuracy of SVM model with BOW and TF-IDF and N-Grams(Trigram): {}%".format(round(accuracy_score(y_test, pred)*100,2)))
cm = confusion_matrix(y_test, pred)
cm

Accuracy of SVM model with BOW and TF-IDF and N-Grams(Trigram): 94.98%


array([[2382,  182],
       [  79, 2557]], dtype=int64)

In [21]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
sc = SGDClassifier()
SGDmodel = sc.fit(X_train, y_train)
prediction = SGDmodel.predict(X_test)
print("Accuracy of Stochastic gradient descent model with BOW and TF-IDF and N-Grams(Trigram): {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
cm = confusion_matrix(y_test, prediction)
cm

Accuracy of Stochastic gradient descent model with BOW and TF-IDF and N-Grams(Trigram): 93.54%


array([[2277,  287],
       [  49, 2587]], dtype=int64)

## Saving the model with best accuracy by using the concept of pipelining:

## We can see that the logistic regression model with countVectorizer and TfidfTransformer gives the best accuracy(accuracy of 97.79%). So, we will save the model with concept of pipelining. 

In [23]:
#Assiging the variables again as once transformed vectors can't be transformed again using pipeline.
X_train = train['total']
Y_train = train['label']

In [24]:
from sklearn.pipeline import Pipeline
import joblib
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', linear_model.LogisticRegression(C=1e5)),
])

In [26]:
pipeline.fit(X_train, Y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=100000.0))])

In [27]:
pipeline.predict(["flynn hillary clinton big woman campus breitbart daniel j flynnever get feeling life circle roundabout rather head straight line toward intended destination hillary clinton remains big woman campus leafy liberal wellesley massachusetts everywhere else vote likely inauguration dress remainder day way miss havisham forever wore wedding dress speaking great expectations hillary rodham overflowed 48 year ago first addressed wellesley graduating class the president college informed gathered 1969 student needed debate far i could ascertain spokesman kind like democratic primary 2016 minus term unknown even seven sisters school i glad miss adams made clear i speaking today u 400 u miss rodham told classmate after appointing edger bergen charlie mccarthys mortimer snerds attendance bespectacled granny glass awarding matronly wisdom least john lennon wisdom took issue previous speaker despite becoming first win election seat u s senate since reconstruction edward brooke came criticism calling empathy goal protestors criticized tactic though clinton senior thesis saul alinsky lamented black power demagogue elitist arrogance repressive intolerance within new left similar word coming republican necessitated brief rebuttal trust rodham ironically observed 1969 one word i asked class rehearsal wanted say everyone came said talk trust talk lack trust u way feel others talk trust bust what say what say feeling permeates generation perhaps even understood distrusted the trust bust certainly busted clintons 2016 plan she certainly even understand people distrusted after whitewater travelgate vast conspiracy benghazi missing email clinton found distrusted voice friday there load compromising road broadening political horizon and distrust american people trump edged 48 percent 38 percent question immediately prior novembers election stood major reason closing horizon clinton described vanquisher supporter embracing lie con alternative fact assault truth reason she failed explain american people chose lie truth as history major among today know well people power invent fact attack question mark beginning end free society offered that hyperbole like many people emerge 1960s hillary clinton embarked upon long strange trip from high school goldwater girl wellesley college republican president democratic politician clinton drank time place gave degree more significantly went idealist cynic comparison two wellesley commencement address show way back lamented long leader viewed politics art possible challenge practice politics art making appears impossible possible now big woman campus odd woman white house wonder current station even possible why arent i 50 point ahead asked september in may asks isnt president the woman famously dubbed congenital liar bill safire concludes lie mind getting stood election day like finding jilted bride wedding day inspires dangerous delusion"])

array([0], dtype=int64)

In [28]:
#saving the pipeline
filename = 'fakenewspipeline.sav'
joblib.dump(pipeline, filename)

['fakenewspipeline.sav']

In [29]:
filename = './fakenewspipeline.sav'

In [32]:
loaded_model = joblib.load(filename)
result = loaded_model.predict(["flynn hillary clinton big woman campus breitbart daniel j flynnever get feeling life circle roundabout rather head straight line toward intended destination hillary clinton remains big woman campus leafy liberal wellesley massachusetts everywhere else vote likely inauguration dress remainder day way miss havisham forever wore wedding dress speaking great expectations hillary rodham overflowed 48 year ago first addressed wellesley graduating class the president college informed gathered 1969 student needed debate far i could ascertain spokesman kind like democratic primary 2016 minus term unknown even seven sisters school i glad miss adams made clear i speaking today u 400 u miss rodham told classmate after appointing edger bergen charlie mccarthys mortimer snerds attendance bespectacled granny glass awarding matronly wisdom least john lennon wisdom took issue previous speaker despite becoming first win election seat u s senate since reconstruction edward brooke came criticism calling empathy goal protestors criticized tactic though clinton senior thesis saul alinsky lamented black power demagogue elitist arrogance repressive intolerance within new left similar word coming republican necessitated brief rebuttal trust rodham ironically observed 1969 one word i asked class rehearsal wanted say everyone came said talk trust talk lack trust u way feel others talk trust bust what say what say feeling permeates generation perhaps even understood distrusted the trust bust certainly busted clintons 2016 plan she certainly even understand people distrusted after whitewater travelgate vast conspiracy benghazi missing email clinton found distrusted voice friday there load compromising road broadening political horizon and distrust american people trump edged 48 percent 38 percent question immediately prior novembers election stood major reason closing horizon clinton described vanquisher supporter embracing lie con alternative fact assault truth reason she failed explain american people chose lie truth as history major among today know well people power invent fact attack question mark beginning end free society offered that hyperbole like many people emerge 1960s hillary clinton embarked upon long strange trip from high school goldwater girl wellesley college republican president democratic politician clinton drank time place gave degree more significantly went idealist cynic comparison two wellesley commencement address show way back lamented long leader viewed politics art possible challenge practice politics art making appears impossible possible now big woman campus odd woman white house wonder current station even possible why arent i 50 point ahead asked september in may asks isnt president the woman famously dubbed congenital liar bill safire concludes lie mind getting stood election day like finding jilted bride wedding day inspires dangerous delusion"])
print(result) 

[0]
