<a href="https://colab.research.google.com/github/salonigupta1/Fake-News-Classifier/blob/main/FakeNewsClassificationUsingBidirectionalLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd

In [12]:
df=pd.read_csv('spam.csv', encoding='latin-1')

In [13]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
 
df['v1']= label_encoder.fit_transform(df['v1'])
 
df['v1'].unique()

array([0, 1])

In [16]:
###Drop Nan Values
df=df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)


In [18]:
## Get the Independent Features

X=df['v2']

In [19]:
## Get the Dependent features
y=df['v1']

In [20]:
X.shape

(5572,)

In [21]:
y.shape

(5572,)

In [22]:
import tensorflow as tf

In [23]:
tf.__version__

'2.6.0'

In [38]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional

In [25]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [26]:
messages=X.copy()

In [27]:
messages[1]

'Ok lar... Joking wif u oni...'

In [28]:
import nltk
import re
from nltk.corpus import stopwords

In [29]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [31]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    #print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages[i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [33]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 

### Embedding Representation

In [34]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)

In [35]:
embedded_docs[0]

array([   0,    0,    0,    0,  307,  439, 2074, 4366, 2379,  317, 1180,
        537, 1532, 4103, 3560, 1085, 2630, 1756, 4480, 1946], dtype=int32)

In [51]:
## Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(LSTM(100))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
## Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               112800    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
len(embedded_docs),y.shape

(5572, (5572,))

In [54]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [55]:
X_final.shape,y_final.shape

((5572, 20), (5572,))

In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [57]:
y_test

array([0, 0, 1, ..., 0, 0, 1])

In [58]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc02d7950d0>

### Adding Dropout 

In [None]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Dropout(0.3))
model1.add(LSTM(100))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

### Performance Metrics And Accuracy

In [59]:
y_pred=model1.predict(X_test)

In [60]:
from sklearn.metrics import confusion_matrix

In [61]:
y_pred = np.argmax(y_pred, axis=1)

In [62]:
confusion_matrix(y_test,y_pred)

array([[1587,    0],
       [ 252,    0]])

In [63]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8629690048939641