# Fake News Classification (Naive Bayes and SVM)

## 1. Import Packages and Read Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#import keras
import re
import string

In [3]:
# Load the dataset from data.csv. No row limit (nrows) is passed to read_csv,
# so every record in the file is read.
df = pd.read_csv('data.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [5]:
# Row count of the frame as loaded (before any rows are dropped).
len(df)

4009

## 3. Clean Data

In [6]:
# Fetch the NLTK stop-word corpus so stopwords.words() can be read below.
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests in clean_doc.
stop_words = set(stopwords.words('english'))

# Lemmatizer instance; clean_doc in this cell does not use it.
wordnet = WordNetLemmatizer()

def clean_doc(doc):
    """Normalize one raw text document into a space-joined string of tokens.

    Steps, in order:
      1. remove ``@handle`` mentions,
      2. replace punctuation and whitespace characters with a space,
      3. remove any remaining ``@...`` fragments,
      4. tokenize with ``nltk.word_tokenize``,
      5. lower-case every token,
      6. drop tokens present in the module-level ``stop_words`` set.

    Parameters
    ----------
    doc : str
        Raw text. ``None`` and NaN are treated as an empty document; any
        other non-string value is converted with ``str``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly empty).
    """
    # Missing values (None / float NaN) would make re.sub raise TypeError.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ""
    doc = str(doc)

    # Strip whole handles before the punctuation pass: "_" is in the
    # punctuation class below, so "@john_doe" would otherwise be split into
    # "@john doe" and the trailing "doe" would survive as a token.
    doc = re.sub(r"@\w+", "", doc)

    # Replace punctuation (and whitespace characters) with a space.
    doc = re.sub(r"[\s+\.\!\/_,|%^*#(+\"\')?<>:-]", " ", doc)

    # Remove any "@" still glued to non-space characters.
    doc = re.sub(r"@\S+", "", doc)

    tokens = nltk.word_tokenize(doc)

    # Lower-case first so the stop-word lookup is case-insensitive.
    tokens = [word.lower() for word in tokens]
    tokens = [w for w in tokens if w not in stop_words]

    return " ".join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yipin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Report how many rows are missing each of the three fields.
print("The number of nan headlines is", df['Headline'].isna().sum())
print("The number of nan bodies is", df['Body'].isna().sum())
# This line counts missing Label values, so the message says "labels".
print("The number of nan labels is", df['Label'].isna().sum())

# Drop rows whose Body is missing; the cell displays the remaining row count.
df = df.dropna(subset=['Body'])
len(df)

The number of nan headlines is 0
The number of nan bodies is 21
The number of nan bodies is 0


3988

In [8]:
# Fetch the 'punkt' resource before tokenizing (clean_doc calls
# nltk.word_tokenize).
nltk.download('punkt')

# Add a cleaned copy of each text column next to the raw one.
df['Headline_clean'] = df['Headline'].apply(clean_doc)
df['Body_clean'] = df['Body'].apply(clean_doc)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yipin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 4. Text Vectorization

In [9]:
# Vectorizer settings.
MAX_FEATURES = 10000  # passed as max_features to the vectorizer
VECTORIZER = 'tfidf'  # which vectorizer to build: 'tfidf' or 'bow'

# Every cleaned headline followed by every cleaned body, as one Series;
# the vectorizer vocabulary is fit on this.
corpus = pd.concat((df['Headline_clean'], df['Body_clean']))

from sklearn.feature_extraction.text import TfidfVectorizer
def fit_tfidf(documents):
    """Fit a TF-IDF vectorizer on ``documents`` and return it.

    ``documents`` must expose ``.values`` (e.g. a pandas Series of strings).
    The vectorizer uses scikit-learn's English stop-word list and keeps at
    most ``MAX_FEATURES`` terms.
    """
    return TfidfVectorizer(
        input='content',
        stop_words='english',
        max_features=MAX_FEATURES,
    ).fit(documents.values)

from sklearn.feature_extraction.text import CountVectorizer
def fit_bow(documents):
    """Fit a bag-of-words (term count) vectorizer on ``documents`` and return it.

    ``documents`` must expose ``.values`` (e.g. a pandas Series of strings).
    The vectorizer uses scikit-learn's English stop-word list and keeps at
    most ``MAX_FEATURES`` terms.
    """
    return CountVectorizer(
        input='content',
        stop_words='english',
        max_features=MAX_FEATURES,
    ).fit(documents.values)

# Build the vectorizer selected by VECTORIZER. Both branches must bind the
# same name, `vectorizer`, because the transform calls below rely on it.
if VECTORIZER == 'tfidf':
    vectorizer = fit_tfidf(corpus)
elif VECTORIZER == 'bow':
    vectorizer = fit_bow(corpus)
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        "Unknown VECTORIZER %r; expected 'tfidf' or 'bow'" % (VECTORIZER,)
    )

# NOTE(review): the vocabulary was fit on all rows of df, including rows that
# later end up in the test split.
headline_matrix = vectorizer.transform(df['Headline_clean'])
body_matrix = vectorizer.transform(df['Body_clean'])

In [10]:
# Show the summary of the vectorized article bodies.
body_matrix

<3988x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 599965 stored elements in Compressed Sparse Row format>

## 5. Split the Dataset to Training and Testing Dataset

In [11]:
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, hstack
# Put headline features and body features side by side, then densify.
# NOTE(review): .toarray() materializes the full dense matrix in memory.
X = hstack([headline_matrix, body_matrix]).toarray()
y = df['Label']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## 6. Baseline Model Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
# Baseline: Multinomial Naive Bayes trained on the training split.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Accuracy = fraction of predictions that equal the true label.
prediction = clf.predict(X_test)
nb_test_accuracy = np.mean(prediction == y_test)
nb_train_accuracy = np.mean(clf.predict(X_train) == y_train)

print("The accuracy for testing data is", nb_test_accuracy)
print("The accuracy for training data is", nb_train_accuracy)

The accuracy for testing data is 0.9388164493480441
The accuracy for training data is 0.9739217652958877


## 7. Grid Search for SVM (results may differ slightly between runs)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
# Hyper-parameter grid for the SGD-trained linear classifier.
# NOTE(review): newer scikit-learn releases renamed the 'squared_loss' option
# to 'squared_error' -- confirm against the installed version.
param_grid = {
    'loss': ['hinge', 'squared_loss'],
    'penalty': ['l2', 'l1'],
    'alpha': [0.01, 0.001],
    'max_iter': [10],
}

# SGD shuffles the training data between epochs, so an unseeded estimator
# gives different results on every run. Pin the seed (same value as the
# train/test split) so the grid search is reproducible.
clf2 = GridSearchCV(SGDClassifier(random_state=42), param_grid)
clf2.fit(X_train, y_train)
print("Best parameters are: ", clf2.best_params_)

Best parameters are:  {'alpha': 0.001, 'penalty': 'l2', 'max_iter': 10, 'loss': 'hinge'}


In [14]:
from sklearn.metrics import classification_report 
# Predict on the held-out split with the fitted grid-search object.
y_pred = clf2.predict(X_test)

print("The classification report is:")
print(classification_report(y_test, y_pred))

# Accuracy = fraction of predictions that equal the true label.
svm_test_accuracy = np.mean(y_pred == y_test)
svm_train_accuracy = np.mean(clf2.predict(X_train) == y_train)

print("The accuracy for testing data is", svm_test_accuracy)
print("The accuracy for training data is", svm_train_accuracy)

The classification report is:
             precision    recall  f1-score   support

          0       0.99      0.96      0.98       560
          1       0.95      0.99      0.97       437

avg / total       0.97      0.97      0.97       997

The accuracy for testing data is 0.9729187562688064
The accuracy for training data is 0.9983283182881979
