In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
news_dataset = pd.read_csv('/content/fake_and_real_news_dataset.csv')


In [5]:
news_dataset.shape

(4594, 4)

In [6]:
news_dataset.head()


Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,REAL
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,REAL
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL


In [7]:
news_dataset.isnull().sum()


idd      0
title    1
text     0
label    0
dtype: int64

In [8]:
news_dataset = news_dataset.fillna('')


In [9]:
news_dataset['content'] = news_dataset['text']+''+news_dataset['title']


In [10]:
x = news_dataset.drop(columns='label',axis=1)
y = news_dataset['label']

In [11]:
print(x)
print(y)

             idd                                              title  \
0     Fq+C96tcx+  ‘A target on Roe v. Wade ’: Oklahoma bill maki...   
1     bHUqK!pgmv  Study: women had to drive 4 times farther afte...   
2     4Y4Ubf%aTi        Trump, Clinton clash in dueling DC speeches   
3     _CoY89SJ@K  Grand jury in Texas indicts activists behind P...   
4     +rJHoRQVLe  As Reproductive Rights Hang In The Balance, De...   
...          ...                                                ...   
4589  ukZm6JTO#x                 Russia Calls the War Party's Bluff   
4590  yu0xKEiapJ  Bernie Sanders: The Democratic primary gave me...   
4591  c4Y370E_9c  Pipeline Police Strip Search Native Girl, Then...   
4592  bBbeuCUeMH  Currency Crisis: Alasdair MacLeod On The Vexed...   
4593  vE44sWBnd9                   Paper Tiger ISIS Digs Into Mosul   

                                                   text  \
0     UPDATE: Gov. Fallin vetoed the bill on Friday....   
1     Ever since Texas laws c

In [12]:
port_stem = PorterStemmer()

In [13]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]','',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ''.join(stemmed_content)
  return stemmed_content

In [14]:
news_dataset['content'] = news_dataset['content'].apply(stemming)


In [15]:
print(news_dataset['content'])


0       updategovfallinvetoedthebillonfridayheadherefo...
1       eversincetexaslawsclosedabouthalfofthestatesab...
2       donaldtrumpandhillaryclintonnowatthestartingli...
3       ahoustongrandjuryinvestigatingcriminalallegati...
4       washingtonfortythreeyearsafterthesupremecourte...
                              ...                        
4589    licensedmcacoldwarhasreachedunprecedentedhyste...
4590    printsenberniesanderslaidoutthewayshewouldleve...
4591    asthepressuretostartconstructiononthedakotaacc...
4592    tweethomegoldgoldnewscurrencycrisisalasdairmac...
4593    writtenbyericmargolisasaformersoldierandwarcor...
Name: content, Length: 4594, dtype: object


In [16]:
x = news_dataset['content'].values
y = news_dataset['label'].values


In [17]:
print(x)


['updategovfallinvetoedthebillonfridayheadhereformorelawmakersinoklahomaapprovedabillthursdaythatwouldmakeperformingabortionsafelonyandrevokethemedicallicensesofmostphysicianswhoassistinsuchproceduresthissweepingmeasurewhichopponentsdescribedasunconstitutionalandunprecedentednowheadstogovmaryfallinrshewillhavefivedaysnotincludingsundaytodecidewhethertosignthebillvetoitorallowittobecomelawwithouthersignatureaccordingtoaspokesmanthegovernorwillwithholdcommentonthatbillasshedoesonmostbillsuntilsheandherstaffhavehadachancetoreviewitmichaelmcnuttaspokesmanforfallinsaidinanemailtheoklahomabillisthefirstsuchmeasureofitskindaccordingtothecenterforreproductiverightswhichsaysthatotherstatesseekingtobanabortionhavesimplybannedtheprocedureratherthanattachingpenaltieslikethisaccordingtothemeasureknownassbapersonwhoperformsorinducesanabortionwillbeguiltyofafelonyandpunishedwithbetweenoneandthreeyearsinthestatepenitentiarythislegislationalsosaysthatanyphysicianwhoparticipatesinanabortiondeemedunprofe

In [18]:
print(y)


['REAL' 'REAL' 'REAL' ... 'FAKE' 'FAKE' 'FAKE']


In [19]:
y.shape


(4594,)

In [20]:
vectorizer = TfidfVectorizer()
vectorizer.fit(x)
x = vectorizer.transform(x)


In [21]:
print(x)


  (0, 4051)	1.0
  (1, 1312)	1.0
  (2, 993)	1.0
  (3, 101)	1.0
  (4, 4242)	1.0
  (5, 4218)	1.0
  (6, 1100)	1.0
  (7, 4510)	1.0
  (8, 3503)	1.0
  (9, 1423)	1.0
  (10, 3299)	1.0
  (11, 3726)	1.0
  (12, 715)	1.0
  (13, 660)	1.0
  (14, 4227)	1.0
  (15, 2398)	1.0
  (16, 2516)	1.0
  (17, 210)	1.0
  (18, 4428)	1.0
  (19, 43)	1.0
  (20, 3484)	1.0
  (21, 3445)	1.0
  (22, 3311)	1.0
  (23, 4372)	1.0
  (24, 131)	1.0
  :	:
  (4569, 1356)	1.0
  (4570, 419)	1.0
  (4571, 663)	1.0
  (4572, 2996)	1.0
  (4573, 3052)	1.0
  (4574, 2432)	1.0
  (4575, 3304)	1.0
  (4576, 4301)	1.0
  (4577, 1469)	1.0
  (4578, 1810)	1.0
  (4579, 230)	1.0
  (4580, 1453)	1.0
  (4581, 3897)	1.0
  (4582, 3879)	1.0
  (4583, 1497)	1.0
  (4584, 4118)	1.0
  (4585, 4274)	1.0
  (4586, 3477)	1.0
  (4587, 2213)	1.0
  (4588, 3404)	1.0
  (4589, 2248)	1.0
  (4590, 2982)	1.0
  (4591, 286)	1.0
  (4592, 4012)	1.0
  (4593, 4520)	1.0


In [22]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,stratify=y,random_state=2)


In [23]:
model = LogisticRegression()


In [24]:
model.fit(x_train,y_train)

In [25]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction,y_train)



In [26]:
print('Accuracy score of the training data:',training_data_accuracy)


Accuracy score of the training data: 1.0


In [27]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction,y_test)


In [28]:
print('Accuracy score of the test data:',test_data_accuracy)


Accuracy score of the test data: 0.5048966267682263


In [29]:
x_new = x_test[3]
prediction = model.predict(x_new)
print(prediction)
if(prediction[0]==0):
  print(' the news is real')
else:
  print('the news is fake')


['FAKE']
the news is fake
