## Fake News Prediction

0 = Fake news
1 = Real news

In [5]:
#importing the dependencies
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [23]:
# Load the fake/real news CSV and preview its first rows.
DATASET_PATH = "/content/fake_or_real_news.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [24]:
# Fetch NLTK's stopwords corpus; stemming() filters words against
# stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Show the English stopwords list that will be removed during stemming.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Preprocessing

In [26]:
# (rows, columns) of the loaded frame.
df.shape

(6335, 4)

In [27]:
# Column names as read from the CSV.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [28]:
# Give the unnamed first column a proper name: "index".
df = df.rename(columns={'Unnamed: 0': 'index'})
df.columns

Index(['index', 'title', 'text', 'label'], dtype='object')

In [29]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0,0
index,0
title,0
text,0
label,0


In [30]:
# Order the rows by the "index" column.
df = df.sort_values("index")

In [32]:
# Encode the string labels as integers: FAKE -> 0, REAL -> 1.
# Series.map turns every value missing from the mapping into NaN, so running
# the plain mapping a second time on already-encoded labels would wipe the
# column. Only map while the column is not yet fully encoded, and fail loudly
# if any value is not a known label instead of passing NaN downstream.
label_codes = {"FAKE": 0, "REAL": 1}
if not df["label"].isin(list(label_codes.values())).all():
    encoded_labels = df["label"].map(label_codes)
    if encoded_labels.isnull().any():
        unexpected = df.loc[encoded_labels.isnull(), "label"].unique()
        raise ValueError(f"Unexpected label values: {unexpected!r}")
    df["label"] = encoded_labels.astype("int64")
df.head()

Unnamed: 0,index,title,text,label
5603,2,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,1
565,3,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",1
2099,5,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,1
2197,6,"Despite Constant Debate, Americans' Abortion O...",It's been a big week for abortion news.\n\nCar...,1
5974,7,Obama Argues Against Goverment Shutdown Over P...,President Barack Obama said Saturday night tha...,1


In [33]:
# Separate the target column from the remaining (predictor) columns.
y = df["label"]
X = df.drop(columns="label")

In [34]:
# Dump the feature frame and the label series for inspection.
print(X,y)

      index                                              title  \
5603      2  Study: women had to drive 4 times farther afte...   
565       3        Trump, Clinton clash in dueling DC speeches   
2099      5  As Reproductive Rights Hang In The Balance, De...   
2197      6  Despite Constant Debate, Americans' Abortion O...   
5974      7  Obama Argues Against Goverment Shutdown Over P...   
...     ...                                                ...   
1213  10549  French Political Leader: “France is about to E...   
4863  10551  Man with unfashionable front door feared by ne...   
5397  10553  Obamacare Architect on Rising Premiums: It’s N...   
208   10555  Clinton Vs. Trump: Latest Electoral Prediction...   
1021  10557  A List of Best Password Managers Offering Both...   

                                                   text  
5603  Ever since Texas laws closed about half of the...  
565   Donald Trump and Hillary Clinton, now at the s...  
2099  WASHINGTON -- Forty-three y

Stemming:
Stemming is the process of reducing a word to its root form.


eg:
actor, actress, acting --> act

In [35]:
# Porter stemmer instance used by stemming().
port_stem = PorterStemmer()

In [38]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
  """Return NLTK's English stopwords as a frozenset, loading them only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def stemming(text):
  """Normalize a piece of text into a space-separated string of word stems.

  Steps: replace every non-letter character with a space, lowercase, split on
  whitespace, drop English stopwords, and Porter-stem the remaining words.

  Args:
    text: The raw string to clean.

  Returns:
    The stemmed, stopword-free words joined by single spaces.
  """
  # Look the stopwords up once per call (cached across calls) instead of
  # rebuilding the list for every word; set membership is also O(1).
  stop_words = _english_stopwords()
  words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

In [39]:
# Overwrite the article bodies with their stemmed, stopword-free form.
df["text"] = df["text"].apply(stemming)

In [40]:
# Inspect the stemmed article bodies.
print(df["text"])

5603    ever sinc texa law close half state abort clin...
565     donald trump hillari clinton start line gener ...
2099    washington forti three year suprem court estab...
2197    big week abort news carli fiorina passion inac...
5974    presid barack obama said saturday night congre...
                              ...                        
1213    post octob theodor shoebat major polit leader ...
4863    man unfashion front door fear neighbour man wi...
5397    washington free beacon octob dr ezekiel emmanu...
208     news fun look poll use data decid candid win s...
1021    want support anonym independ investig news ple...
Name: text, Length: 6335, dtype: object


In [43]:
# Overwrite the headlines with their stemmed, stopword-free form.
df["title"] = df["title"].apply(stemming)

In [44]:
# Preview the frame after stemming both text columns.
df.head()

Unnamed: 0,index,title,text,label
5603,2,studi women drive time farther texa law close ...,ever sinc texa law close half state abort clin...,1
565,3,trump clinton clash duel dc speech,donald trump hillari clinton start line gener ...,1
2099,5,reproduct right hang balanc debat moder drop ball,washington forti three year suprem court estab...,1
2197,6,despit constant debat american abort opinion r...,big week abort news carli fiorina passion inac...,1
5974,7,obama argu gover shutdown plan parenthood,presid barack obama said saturday night congre...,1


In [45]:
# Features are the stemmed titles only; the target is the label column.
# NOTE(review): the stemmed "text" column is not used as a feature here —
# confirm that training on titles alone is intended.
X = df["title"].values
y = df['label'].values

In [46]:
# Inspect the title strings and their labels.
print(X)
print(y)

['studi women drive time farther texa law close abort clinic'
 'trump clinton clash duel dc speech'
 'reproduct right hang balanc debat moder drop ball' ...
 'obamacar architect rise premium big increas'
 'clinton vs trump latest elector predict greg laden blog scienc technolog'
 'list best password manag offer free premium servic']
[1 1 1 ... 0 0 0]


In [47]:
# Number of labels.
y.shape

(6335,)

In [48]:
# Number of title strings; should match the number of labels.
X.shape

(6335,)

In [50]:
# Convert the stemmed titles into a sparse TF-IDF matrix, learning the
# vocabulary/IDF weights and transforming in a single pass.
# NOTE(review): the vectorizer is fitted on every sample before the train/test
# split is made, so the vocabulary and IDF weights also see the test rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [51]:
# Show the sparse matrix entries as (row, column) and TF-IDF weight.
print(X)

  (0, 13)	0.30253041518054014
  (0, 1132)	0.4085539043677144
  (0, 1140)	0.2794972924014823
  (0, 1834)	0.32776348602297023
  (0, 2208)	0.4085539043677144
  (0, 3460)	0.26717449838551593
  (0, 5890)	0.31479190396476253
  (0, 6121)	0.30043282809416283
  (0, 6178)	0.22869064960211646
  (0, 6786)	0.27578917729197844
  (1, 1109)	0.47842946242579415
  (1, 1133)	0.2046109522218675
  (1, 1505)	0.48462828389371915
  (1, 1854)	0.5629327470851186
  (1, 5735)	0.38126697110127633
  (1, 6315)	0.1792128711072826
  (2, 451)	0.37295270458832164
  (2, 456)	0.41297486788543813
  (2, 1520)	0.2245577845465415
  (2, 1837)	0.3205950664107954
  (2, 2711)	0.40196197494988445
  (2, 3927)	0.33607513336210826
  (2, 5081)	0.4471840379158042
  (2, 5166)	0.25031411284866495
  (3, 13)	0.3595695283301892
  :	:
  (6332, 294)	0.46890831307940434
  (6332, 584)	0.33149264414550705
  (6332, 3020)	0.42313864614514557
  (6332, 4226)	0.3264852331593853
  (6332, 4698)	0.4781656999001524
  (6332, 5174)	0.39491298143864556
  (6

In [52]:
# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [53]:
# Shapes of the full, training and test matrices.
print(X.shape,X_train.shape,X_test.shape)

(6335, 6884) (5068, 6884) (1267, 6884)


In [54]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [55]:
# Train on the training split only.
model.fit(X_train,y_train)

In [56]:
# Accuracy on the data the model was trained on.
y_train_pred = model.predict(X_train)
accuracy_score(y_train,y_train_pred)

0.9088397790055248

In [57]:
# Accuracy on the held-out test split.
y_test_pred = model.predict(X_test)
accuracy_score(y_test,y_test_pred)

0.8176795580110497

# Predictive System

In [62]:
# Classify one held-out sample (row 81 of the test matrix) and report the
# verdict in words: label 0 means fake, anything else means real.
X_news = X_test[81]
prediction = model.predict(X_news)
print(prediction)

verdict = 'The news is Fake' if prediction[0] == 0 else 'The news is Real'
print(verdict)

[0]
The news is Fake


In [63]:
# True label of the same test sample, for comparison with the prediction.
y_test[81]

0

In [73]:
pip install joblib



In [75]:
import joblib

# Persist the trained classifier to model.pkl.
with open('model.pkl', 'wb') as model_file:
    joblib.dump(model, model_file)

In [76]:
# Persist the fitted TF-IDF vectorizer to vectorizer.pkl.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    joblib.dump(vectorizer, vectorizer_file)