## Fake News Detection

#### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle
import re

In [2]:
# Fetch the NLTK stopword corpus and print the English list that the
# `stemmings` helper filters tokens against.
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the two raw CSV files; a later cell combines them into one labelled frame.
fake_data=pd.read_csv("Fake.csv")
true_data=pd.read_csv("True.csv")

In [None]:
# Preview the raw "true" articles. `true_news` is only created in a later cell,
# so this cell inspects the frame that has actually been loaded at this point.
true_data.head()

In [None]:
# Preview the raw "fake" articles. `fake_news` is only created in a later cell,
# so this cell inspects the frame that has actually been loaded at this point.
fake_data.head()


In [None]:
# Keep only the article body and headline of each source, then tag every row
# with the class it came from.
true_news = true_data[["text", "title"]].assign(label="true")
fake_news = fake_data[["text", "title"]].assign(label="fake")

In [None]:
# Sanity-check both labelled frames (displayed as a tuple of two previews).
true_news.head(),fake_news.head()


In [None]:
# Stack both classes into one frame and add a numeric target: 1 = fake, 0 = true.
df = pd.concat([true_news, fake_news], axis=0, ignore_index=True)
df['status'] = (df['label'] == "fake").astype('int64')
# Shuffle the rows so the two classes are interleaved. The fixed seed makes the
# order reproducible on re-run; drop=True discards the old index directly
# instead of materialising it as an "index" column that must be dropped again.
df = df.sample(frac=1, random_state=20).reset_index(drop=True)
df.head()

In [None]:
# Count missing values per column. `isnull` is an alias of `isna`, so a single
# call already gives the full picture.
df.isna().sum()

In [None]:
_STEMMING_DEFAULTS = {}  # lazily-filled cache for the default stemmer / stopword set


def stemmings(content, stemmer=None, stop_words=None):
    """Normalise a text into a space-joined string of stemmed, non-stopword tokens.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, stopwords are dropped and each
    remaining token is stemmed.

    Args:
        content: Text to normalise (a str).
        stemmer: Object exposing ``stem(word)``. Defaults to a shared
            ``PorterStemmer`` instance created on first use.
        stop_words: Container of lower-case words to drop. Defaults to NLTK's
            English stopword list, loaded once and cached as a frozenset.

    Returns:
        The processed tokens joined by single spaces ('' when nothing remains).
    """
    if stemmer is None:
        stemmer = _STEMMING_DEFAULTS.get("stemmer")
        if stemmer is None:
            stemmer = _STEMMING_DEFAULTS["stemmer"] = PorterStemmer()
    if stop_words is None:
        stop_words = _STEMMING_DEFAULTS.get("stop_words")
        if stop_words is None:
            # Loading the corpus list is costly; do it once rather than per token.
            stop_words = _STEMMING_DEFAULTS["stop_words"] = frozenset(
                stopwords.words('english')
            )

    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [None]:

# Run the `stemmings` normaliser over every article body and print the result.
df1 = df["text"].apply(stemmings)
print(df1)

In [None]:
# Bar chart of how many articles fall into each class (status: 1 = fake, 0 = true).
plt.figure(figsize=(10,5))
# The column is passed as x= because recent seaborn releases take only `data`
# positionally; a bare "status" would be interpreted as the data argument.
sns.countplot(x="status", data=df)
plt.title("distribution of news")
plt.grid()

In [4]:
# Reload the dataset from disk. NOTE(review): this rebinds `df`, and none of the
# visible cells writes train.csv — confirm where the file comes from.
df = pd.read_csv("train.csv")

In [5]:
# Peek at the first rows of the reloaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,index,text,title,label,status
0,0,0,WASHINGTON (Reuters) - The United States shoul...,EPA chief says Paris climate agreement 'bad de...,true,0
1,1,1,President Trump just tweeted out a new policy ...,BREAKING NEWS: President Trump Announces Major...,fake,1
2,2,2,WASHINGTON (Reuters) - U.S. Republican preside...,Trump says New Hampshire win not necessary to ...,true,0
3,3,3,MOSCOW (Reuters) - The Kremlin said on Thursda...,Kremlin: U.S. sanctions aimed at turning busin...,true,0
4,4,4,Kellyanne Conway s response to Williams criti...,MUST WATCH: Kellyanne Conway PUNCHES BACK Afte...,fake,1


In [6]:
# (rows, columns) of the reloaded frame.
df.shape

(42000, 6)

In [7]:
# Missing-value count per column; the model below is fitted on `text` as-is.
df.isnull().sum()

Unnamed: 0    0
index         0
text          0
title         0
label         0
status        0
dtype: int64

In [8]:
# Classification target: the string class column ('true' / 'fake').
labels = df.label

In [9]:
# Quick look at the first few target values.
labels.head()

0    true
1    fake
2    true
3    true
4    fake
Name: label, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    labels,
    test_size=0.2,
    random_state=20,
)

In [25]:
# Inspect a few held-out article bodies.
x_test.head()

36020    21st Century Wire says Just days before the US...
34849    NEW YORK (Reuters) - Hospitals and health insu...
33327    KATHMANDU (Reuters) - Nepalis began voting in ...
8604     On Tuesday, the United States received a major...
10801    SEOUL (Reuters) - Soon after North Korea teste...
Name: text, dtype: object

In [14]:
# Initialise a TfidfVectorizer: drop English stop words and ignore terms that
# occur in more than 70% of the documents (max_df=0.7).
vector = TfidfVectorizer(stop_words='english', max_df=0.7)

In [15]:
# Fit the vocabulary on the training texts only, then transform both splits
# with that same fitted vectorizer.
tf_train = vector.fit_transform(x_train)
tf_test = vector.transform(x_test)

In [16]:
# Initialise a PassiveAggressiveClassifier and fit it on the TF-IDF features.
# NOTE(review): no random_state is passed, so repeated fits may not give
# identical results — confirm whether a fixed seed is wanted here.
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [17]:
# Predict on the test dataset.
y_pred = pac.predict(tf_test)

In [18]:
# Accuracy of the predictions against the held-out labels.
score = accuracy_score(y_test, y_pred)

In [19]:
# Report the accuracy as a percentage rounded to two decimals.
print(f"Accuracy : {round(score*100,2)}%")

Accuracy : 99.63%


In [23]:
# Confusion matrix, with the class order pinned to ['fake', 'true'].
confusion_matrix(y_test, y_pred, labels=['fake', 'true'])

array([[4368,   17],
       [  14, 4001]], dtype=int64)

In [21]:
# Save the trained classifier. The context manager closes the file handle once
# the dump finishes (or fails), instead of leaving it open until garbage collection.
import pickle
filename = 'finalized_last_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(pac, model_file)

In [22]:
# Save the fitted vectorizer. The context manager closes the file handle once
# the dump finishes (or fails), instead of leaving it open until garbage collection.
filename = 'vectorizer_last.pkl'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vector, vectorizer_file)