In [60]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, classification_report

In [61]:
!pip install nltk

[0m

In [62]:
import nltk

In [63]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /usr/share/nltk_

True

In [64]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
ps=PorterStemmer()

In [65]:
fake=pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')

In [66]:
true=pd.read_csv('../input/fake-and-real-news-dataset/True.csv', encoding = "ISO-8859-1")

In [67]:
true['label']=1
fake['label']=0

In [68]:
fake.shape,true.shape

((23481, 5), (21417, 5))

In [69]:
df=pd.concat([fake,true])

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [71]:
df[df.duplicated()]

Unnamed: 0,title,text,subject,date,label
9942,HILLARY TWEETS MESSAGE In Defense Of DACA…OOPS...,No time to waste we've got to fight with eve...,politics,"Sep 9, 2017",0
11446,FORMER DEMOCRAT WARNS Young Americans: “Rioter...,"Who is silencing political speech, physically...",politics,"Mar 10, 2017",0
14925,[VIDEO] #BlackLivesMatter Terrorists Storm Dar...,They were probably just looking for a safe sp...,politics,"Nov 16, 2015",0
445,Senate tax bill stalls on deficit-focused 'tri...,WASHINGTON (Reuters) - The U.S. Senate on Thur...,politicsNews,"November 30, 2017",1
778,Trump warns 'rogue regime' North Korea of grav...,BEIJING (Reuters) - U.S. President Donald Trum...,politicsNews,"November 8, 2017",1
...,...,...,...,...,...
21228,France unveils labor reforms in first step to ...,PARIS (Reuters) - French President Emmanuel Ma...,worldnews,"August 31, 2017",1
21263,Guatemala top court sides with U.N. graft unit...,GUATEMALA CITY (Reuters) - Guatemala s top cou...,worldnews,"August 29, 2017",1
21290,"Europeans, Africans agree renewed push to tack...",PARIS (Reuters) - Europe s big four continen...,worldnews,"August 28, 2017",1
21353,Thailand's ousted PM Yingluck has fled abroad:...,BANGKOK (Reuters) - Ousted Thai prime minister...,worldnews,"August 25, 2017",1


In [72]:
df=df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44689 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44689 non-null  object
 1   text     44689 non-null  object
 2   subject  44689 non-null  object
 3   date     44689 non-null  object
 4   label    44689 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.0+ MB


In [73]:
df=df.reset_index(drop=True)

In [74]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [75]:
df=df.drop(columns=['date'],axis=1)

In [76]:
df.subject.value_counts()

politicsNews       11220
worldnews           9991
News                9050
politics            6838
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: subject, dtype: int64

In [77]:
import re
corpus=[]
for i in range(len(df)):
  review=re.sub('[^a-zA-Z]',' ',df.title[i])
  review=review.lower()
  review=review.split()
  review=[ps.stem(i) for i in review if i not in set(stopwords.words('english'))]
  review=' '.join(review)
  corpus.append(review)

corpus  

['donald trump send embarrass new year eve messag disturb',
 'drunk brag trump staffer start russian collus investig',
 'sheriff david clark becom internet joke threaten poke peopl eye',
 'trump obsess even obama name code websit imag',
 'pope franci call donald trump christma speech',
 'racist alabama cop brutal black boy handcuff graphic imag',
 'fresh golf cours trump lash fbi deputi director jame comey',
 'trump said insan racist stuff insid oval offic wit back',
 'former cia director slam trump un bulli openli suggest act like dictat tweet',
 'watch brand new pro trump ad featur much kiss make sick',
 'papa john founder retir figur racism bad busi',
 'watch paul ryan told us care struggl famili live blue state',
 'bad news trump mitch mcconnel say repeal obamacar',
 'watch lindsey graham trash media portray trump kooki forget word',
 'heiress disney empir know gop scam us shred tax bill',
 'tone deaf trump congrat rep scalis lose weight almost die',
 'internet brutal mock disney n

In [78]:
len(corpus)

44689

In [79]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.models import Sequential

In [80]:
vocab_size=10000
one_hot_repr=[one_hot(word,vocab_size) for word in corpus]
one_hot_repr[0]

[267, 5171, 3468, 9212, 1553, 6756, 1031, 1394, 8104]

In [81]:
sent_length=20
pad_doc=pad_sequences(one_hot_repr,padding='pre',maxlen=sent_length)
pad_doc[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        267, 5171, 3468, 9212, 1553, 6756, 1031, 1394, 8104], dtype=int32)

In [82]:
emb_vec_fea=100
model=Sequential()
model.add(Embedding(vocab_size,emb_vec_fea,input_length=sent_length))
model.add(LSTM(200))
model.add(Dense(1,activation='sigmoid'))


In [83]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics='accuracy')

In [84]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 100)           1000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 1,241,001
Trainable params: 1,241,001
Non-trainable params: 0
_________________________________________________________________


In [85]:
X=pad_doc
y=df.label

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [87]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbf7c6c2350>

In [88]:
y_pred=model.predict(X_test)
y_pred=y_pred>0.5
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      4738
           1       0.93      0.95      0.94      4200

    accuracy                           0.94      8938
   macro avg       0.94      0.94      0.94      8938
weighted avg       0.94      0.94      0.94      8938

