In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
path='/content/drive/MyDrive/FakeNews Classification using LSTM RNN.ipynb'

#### About the Project:
This project aims to predict if a given news is fake or not. that is a binary classificaton problem involving different steps of text preprocessing.

Alternatively, multiple steps to be performed including Text Pre-processing which further includes Tokenization, stopwords, Stemming, Lematization, conversion of words into vectors.

then we will train the LSTM RNN to build a model.

In [3]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv('/content/drive/MyDrive/fake_news_train.csv')

In [5]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [6]:
# Removing Unneccessary Features such as ID:
df.drop('id',axis=1, inplace=True)

In [7]:
df.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

#### Dealing with Missing Values:

We have text data so it won't be very meaningful if we replace the null values with something else so we shall drop the null values. for example, we can't replace the author's null value with something else.

Additionally, we have enough record therefore removing around 2k records won't affect the dataset.

In [8]:
# Dropping the NaN values:
df=df.dropna()

In [9]:
df.shape

(18285, 4)

In [10]:
# getting the indepedent features
X=df.iloc[:,:-1]
y=df.iloc[:,-1:]

In [11]:
print(X.shape)
print(y.shape)

(18285, 3)
(18285, 1)


In [101]:
#importing the dependencies:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [13]:
#Checking the tensorflow version
tf.__version__

'2.11.0'

In [14]:
#Vocbulary_Size-Number of unique words
voc_size=5000

#### One-Hot Representation:
that is to create one hot representation of the title column. that includes assigning an index to each word based on the location of words present in the vocabulary. the reason of choosing title over text is to make the training process work faster as it has less number of words.

















In [15]:
messages=X.copy()

In [16]:
messages['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

#### Text Preprocessing:

In [17]:
# importing dependencies
import nltk
import re
from nltk.corpus import stopwords


In [18]:
#downloading stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
#downloading wordnet lemmatizer 
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [20]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Here using Lemmatizer over Stemming is more fruitful as it reduces words to dictionary root words which will help improve the accuracy of the model.

In [21]:
messages.head()

Unnamed: 0,title,author,text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [22]:
messages.reset_index(inplace=True)

#### Text PreProcessing:
In the below steps, we are going to perform Tokenization, Stopwords and Lemmatization before we convert words into vectors.
This is all done to clean the data and reduce the dimensionality of the data by reducing the number of unique words.

In [23]:
#Dataset Preprocessing/Data Cleaning
for i in range(len(messages)):
  review=re.sub(r'[^a-zA-Z]',' ',messages['title'][i])
  review=review.lower()
  review=review.split() # Tokenization
  review=[lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')] # Stopwords and Lemmatization
  review=' '.join(review)
  corpus.append(review)

In [28]:
corpus[1]

'flynn hillary clinton big woman campus breitbart'

In [None]:
# Applying One-Hot Encoding:
# it assigns indexes to all the unique words based on the vocbulary size.
one_hot_repr=[one_hot(words, voc_size) for words in corpus]
one_hot_repr

In [30]:
corpus[1]

'flynn hillary clinton big woman campus breitbart'

In [32]:
# Index of the words present in the corpus[1]
one_hot_repr[1]

[1213, 2207, 548, 2391, 54, 4744, 2405]

#### Embedding Representation

This is to convert words into vectors. 
we also have to apply padding to make all the sentences of equal length as each sentence is of different length.

In [51]:
#checking the maximum length of any sentence in the corpus
max_length=0
for i in corpus:
  length=len(i.split())
  if length>max_length:
    max_length=length
print(max_length) # maximum length of any sentence in the corpus

47


In [53]:
# Padding
sent_length=50
embedded_docs=pad_sequences(one_hot_repr,padding='pre',maxlen=sent_length)
embedded_docs

array([[   0,    0,    0, ...,  603,  717,  888],
       [   0,    0,    0, ...,   54, 4744, 2405],
       [   0,    0,    0, ..., 2885, 1723,  288],
       ...,
       [   0,    0,    0, ...,  897, 2684, 3881],
       [   0,    0,    0, ...,  825, 4250, 2080],
       [   0,    0,    0, ..., 4627,  675, 1819]], dtype=int32)

In [55]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0, 4123, 1748, 3941, 3551,
         37, 2731, 1943,  603,  717,  888], dtype=int32)

In [56]:
one_hot_repr[0]

[4123, 1748, 3941, 3551, 37, 2731, 1943, 603, 717, 888]

In [102]:
# Creating Model: Applying LSTM and Embedding Layer
embedding_vector_features=40 # feature representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length)) # creating embedding layer
model.add(Dropout(0.3)) # 30% neurons to be disabled.
model.add(LSTM(100)) # creating LSTM layer with 100 neurons
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 50, 40)            0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               56400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [103]:
len(embedded_docs), y.shape,embedded_docs.shape

(18285, (18285, 1), (18285, 50))

In [104]:
# Converting lists into Arrays
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [105]:
X_final.shape, y_final.shape


((18285, 50), (18285, 1))

In [106]:
#train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_final,y_final, test_size=0.3, random_state=42)

#### Model Training:
After converting words into vectors, we are going to train the model.

In [107]:
model.fit(X_train,y_train,validation_split=0.2, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5295f1ed00>

#### Model Evalutation

In [108]:
y_pred=model.predict(X_test)



In [109]:
y_pred=np.where(y_pred>0.5,1,0) # alternatively AUC and ROC can be used to find the threshold value

In [110]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [111]:
confusion_matrix(y_test,y_pred)

array([[2815,  292],
       [ 207, 2172]])

In [112]:
accuracy_score(y_test,y_pred)

0.9090411957710536

In [113]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      3107
           1       0.88      0.91      0.90      2379

    accuracy                           0.91      5486
   macro avg       0.91      0.91      0.91      5486
weighted avg       0.91      0.91      0.91      5486

