In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
path='/content/drive/MyDrive/FakeNews Classification using LSTM RNN.ipynb'

#### About the Project:
This project uses bidirectional LSTM RNN to predict if a news is fake or not.

Bidirectional LSTM RNN tends to have the context of the upcoming word as well previous words to be able to predict. On the other side, LSTM RNN tends to have the context of the previous words only.

This project involves some basic EDA and Text pre processing steps such as Tokenization, stopwords, Stemming/Lemmatization and converting words into vectors before passing through it fully connected layers.

In [3]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv('/content/drive/MyDrive/fake_news_train.csv')

In [5]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
# Removing Unneccessary Features such as ID:
df.drop('id',axis=1, inplace=True)

In [7]:
df.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

#### Dealing with Missing Values:

We have text data so it won't be very meaningful if we replace the null values with something else so we shall drop the null values. for example, we can't replace the author's null value with something else.

Additionally, we have enough record therefore removing around 2k records won't affect the dataset.

In [8]:
# Dropping the NaN values:
df=df.dropna()

In [9]:
df.shape

(18285, 4)

In [11]:
# getting the indepedent features to be able to perform text pre processing on the columns contaning text.
X=df.iloc[:,:-1]
y=df.iloc[:,-1:]

In [12]:
print(X.shape)
print(y.shape)

(18285, 3)
(18285, 1)


In [13]:
#checking if the dataset is balanced or imbalanced:
y.value_counts()

label
0        10361
1         7924
dtype: int64

In [15]:
#importing the dependencies:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [16]:
#Vocbulary_Size-Number of unique words
voc_size=5000

#### One-Hot Representation:
that is to create one hot representation of the title column. that includes assigning an index to each word based on the location of words present in the vocabulary. the reason of choosing title over text is to make the training process work faster as it has less number of words.

here we can play with either title column or text column. we are going to use the title column to make the predictions.


In [17]:
messages=X.copy()

In [18]:
messages['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

#### Text Preprocessing:
It involves Tokenization, Stopwords, Stemming, Lammetization

In [19]:
# importing dependencies
import nltk
import re
from nltk.corpus import stopwords

In [20]:
#downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [23]:
#downloading wordnet lemmatizer 
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Here using Lemmatizer over Stemming is more fruitful as it reduces words to dictionary root words which will help improve the accuracy of the model.

In [24]:
messages.head()

Unnamed: 0,title,author,text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [25]:
messages.reset_index(inplace=True)

In [27]:
#Dataset Preprocessing/Data Cleaning
corpus=[]
for i in range(len(messages)):
  review=re.sub(r'[^a-zA-Z]',' ',messages['title'][i])
  review=review.lower()
  review=review.split() # Tokenization
  review=[lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')] # Stopwords and Lemmatization
  review=' '.join(review)
  corpus.append(review)

In [28]:
corpus[1]

'flynn hillary clinton big woman campus breitbart'

In [29]:
# Applying One-Hot Encoding:
# it assigns indexes to all the unique words based on the vocbulary size.
one_hot_repr=[one_hot(words, voc_size) for words in corpus]
one_hot_repr

[[4794, 4725, 602, 961, 2092, 1597, 1700, 3701, 906, 585],
 [4585, 1051, 863, 4004, 793, 1318, 917],
 [4014, 2547, 3228, 3928],
 [3226, 4423, 2966, 3188, 278, 951],
 [125, 793, 2722, 3399, 875, 954, 793, 308, 1844, 3746],
 [3043,
  1472,
  4348,
  2507,
  3209,
  2178,
  1535,
  3588,
  1812,
  3827,
  2050,
  145,
  3658,
  3280,
  917],
 [4478, 1895, 1945, 2824, 2682, 4954, 565, 2352, 3100, 1797, 3584],
 [971, 516, 3585, 4309, 946, 3829, 2178, 4463, 3100, 1797, 3584],
 [1090, 2158, 3994, 299, 1925, 828, 201, 3850, 2178, 2663],
 [3531, 1711, 890, 3845, 2625, 1957, 3969, 4041],
 [3042, 3188, 2649, 2902, 4397, 3619, 4466, 1898, 3719, 4065, 2537],
 [3188, 4141, 2092, 4311, 2178, 946],
 [1514, 3908, 3924, 1830, 136, 4826, 198, 4965, 1674],
 [2922, 4110, 3099, 3406, 3911, 1024, 807, 3100, 1797, 3584],
 [3355, 2753, 3808, 4506, 1019, 3100, 1797, 3584],
 [4508, 2455, 1193, 4844, 3912, 967, 3418, 137, 1887, 3045],
 [1604, 2299, 1051],
 [4792, 2215, 3686, 4860, 2178, 481, 1607, 917],
 [4109, 1

In [31]:
# comparison of sentences vs indexes of the words in the sentences
print(corpus[1])
# Index of the words present in the corpus[1]
print(one_hot_repr[1])

flynn hillary clinton big woman campus breitbart
[4585, 1051, 863, 4004, 793, 1318, 917]


#### Embedding Representation

This is to convert words into vectors. 
we also have to apply padding to make all the sentences of equal length as each sentence is of different length.

In [32]:
#checking the maximum length of any sentence in the corpus
max_length=0
for i in corpus:
  length=len(i.split())
  if length>max_length:
    max_length=length
print(max_length) # maximum length of any sentence in the corpus

47


#### Padding:

That is done to make the length of all the sentences the same.

In [33]:
# Padding
sent_length=50
embedded_docs=pad_sequences(one_hot_repr,padding='pre',maxlen=sent_length)
embedded_docs

array([[   0,    0,    0, ..., 3701,  906,  585],
       [   0,    0,    0, ...,  793, 1318,  917],
       [   0,    0,    0, ..., 2547, 3228, 3928],
       ...,
       [   0,    0,    0, ..., 3100, 1797, 3584],
       [   0,    0,    0, ..., 1967, 1916, 2628],
       [   0,    0,    0, ..., 4224, 1651, 2291]], dtype=int32)

In [37]:
# difference between before padding and after padding
print(one_hot_repr[0])
print(embedded_docs[0])

[4794, 4725, 602, 961, 2092, 1597, 1700, 3701, 906, 585]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0 4794 4725
  602  961 2092 1597 1700 3701  906  585]


#### Embedding Layer:
That is to convert words into vectors:

In [38]:
# Creating Model: Applying LSTM and Embedding Layer
embedding_vector_features=40 # feature representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length)) # creating embedding layer
model.add(Dropout(0.3)) # 30% neurons to be disabled.
model.add(Bidirectional(LSTM(100))) # creating Bidirectional LSTM layer with 100 neurons
# it would have 100 neurons forward and 100 neurons backward
model.add(Dropout(0.3)) # 30% neurons to be disabled.
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 50, 40)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 200)              112800    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
__________________________________________________

In [39]:
type(embedded_docs)

numpy.ndarray

In [40]:
# Converting lists into Arrays
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [41]:
#train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_final,y_final, test_size=0.3, random_state=42)

#### Model Training:
After converting words into vectors, we are going to train the model.

In [42]:
model.fit(X_train,y_train,validation_split=0.2, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f64b83df850>

#### Model Evalutation

In [43]:
y_pred=model.predict(X_test)



In [44]:
y_pred=np.where(y_pred>0.5,1,0)

In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [48]:
confusion_matrix(y_test,y_pred)

array([[2836,  271],
       [ 259, 2120]])

In [49]:
accuracy_score(y_test,y_pred)

0.9033904484141451