<a href="https://colab.research.google.com/github/profshai/natural-language-processing/blob/main/fake_news_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Classifier using Natural language Classifier

Dataset source: https://www.kaggle.com/c/fake-news/data#

In [59]:
import pandas as pd
import numpy as np

In [60]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [61]:
df.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

In [62]:
X=df.drop('label', axis=1)

In [63]:
X.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [64]:
y = df['label']

In [65]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [66]:
len(df)

20800

In [67]:
df.shape

(20800, 5)

In [68]:
df.dropna()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [69]:
news=df.copy()

In [70]:
news.reset_index(inplace=True)

In [71]:
news.head()

Unnamed: 0,index,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [72]:
news['title'] = news['title'].astype(str)

### Cleaning the texts

In [73]:
import re
import nltk
# stopwords -  words which are not useful in our prediction
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stemming helps to reduce the dimensions of the final sparce matrix
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, len(news)):
  # replace anything not a-z or A-Z with a space.
  review = re.sub('[^a-zA-Z]', ' ', news['title'][i])
  review = review.lower()
  review = review.split()

  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove("not")
  all_stopwords.remove("isn't")
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(corpus)

### Creating the Bag of Words model

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()
y = news['label']

In [76]:
X.shape

(20800, 5000)

In [77]:
y.shape

(20800,)

In [78]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Naive Bayes model

In [79]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [80]:
y_pred = classifier.predict(X_test)

In [81]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy score is {accuracy_score(y_test, y_pred)}")

[[1469  577]
 [ 210 1904]]
Accuracy score is 0.8108173076923076


### MultinomialNB Algorithm


The accuracy of the model is 81 percent.

In [82]:
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [83]:
from sklearn.metrics import plot_confusion_matrix
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)
print(cm)
print("Accuracy score is   %0.3f" % score)

[[1830  216]
 [ 196 1918]]
Accuracy score is   0.901


The accuracy of the model is 90 percent.

### Passive Aggressive Classifier Algorithm

In [84]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier()

In [85]:
linear_clf.fit(X_train, y_train)
pred3 = linear_clf.predict(X_test)
score = accuracy_score(y_test, pred3)
cm = confusion_matrix(y_test, pred3)
print(cm)
print("Accuracy score is   %0.3f" % score)

# cm = metrics.confusion_matrix(y_test, pred)
# plot_confusion_matrix(cm, classes=['FAKE Data', 'REAL Data'])

[[1882  164]
 [ 165 1949]]
Accuracy score is   0.921


The accuracy of the model is 92 percent.

Predicting if a single title is positive or negative

In [86]:
new_title = 'Why should the truth be told'
new_review = re.sub('[^a-zA-Z]', ' ', new_title)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]


This is classified as a fake news.

End of Notebook!