***PROBLEM STATEMENT***

# Can you use this data set to make an algorithm able to determine if an article is fake news or not?

**Dataset Link - https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset**

In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input mount so the
# dataset file names can be checked before they are read.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional

In [None]:
# Load True.csv and Fake.csv into two separate frames.
true_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/fake-and-real-news-dataset/True.csv'
)
fake_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/fake-and-real-news-dataset/Fake.csv'
)

In [None]:
# Report (rows, columns) of the frame loaded from True.csv.
print(true_data.shape)

In [None]:
# Report (rows, columns) of the frame loaded from Fake.csv.
print(fake_data.shape)

In [None]:
# Target encoding: 0 marks rows from True.csv, 1 marks rows from Fake.csv.
true_data['label'] = 0
fake_data['label'] = 1

In [None]:
# The combined `data` frame does not exist yet at this point in the notebook,
# so referencing it here fails with NameError on a fresh "Restart & Run All".
# Build the combined view locally from the two frames that are already loaded.
subject_source = pd.concat([true_data, fake_data], axis=0, ignore_index=True)

# Horizontal bar chart: number of articles per `subject` across both files.
plt.figure(figsize=(8, 8))
sns.countplot(y="subject", data=subject_source)

In [None]:
!pip install WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') needs it
# when the titles are cleaned.
nltk.download("stopwords")

In [None]:
# Stack the two labelled frames row-wise and renumber the index from 0.
data = pd.concat(
    objs=[true_data, fake_data],
    axis='index',
    ignore_index=True,
)

In [None]:
# (rows, columns) of the combined frame.
data.shape

**The dataset has 44898 observations and 5 features**

In [None]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Separate the target column from the remaining feature columns.
y = data['label']
x = data.drop(columns=['label'])

**Cleaning the `title` feature: keeping letters only, lowercasing, removing English stop words, and stemming with the Porter Stemmer**

In [None]:
ps = PorterStemmer()

# Build the stop-word set once. The previous version rebuilt it for every
# single word of every title, which dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

# Clean each title: keep letters only, lowercase, split on whitespace,
# drop English stop words, stem what is left, and re-join into one string.
corpus = []
for title in x['title']:
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    stemmed = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stemmed))

In [None]:
# Size of the integer index space that the cleaned tokens are mapped into.
voc_size = 10000

# Encode every cleaned title as a list of integer word indexes.
# NOTE(review): per the Keras docs, one_hot hashes words into the index space,
# so distinct words are not guaranteed distinct indexes — confirm this is acceptable.
onehot_repr = [one_hot(sentence, voc_size) for sentence in corpus]

# Show only a small sample; displaying the full list floods the notebook output.
onehot_repr[:5]

**Padding the one-hot encoded titles at the front ('pre' padding) so that every sequence has the same fixed length**

In [None]:
# Fixed sequence length fed to the network.
sentlen = 20

# Bring every encoded title to exactly `sentlen` entries, padding at the front.
embedding_doc = pad_sequences(
    sequences=onehot_repr,
    maxlen=sentlen,
    padding='pre',
)
embedding_doc

# LSTM Model

In [None]:
# Width of each learned word vector.
embedding_feature = 60

# Two stacked LSTM layers, each followed by batch normalisation and dropout,
# ending in a single sigmoid unit for the binary label.
# NOTE(review): the 1-unit ReLU layer right before the sigmoid squeezes the
# whole representation through one non-negative scalar — confirm it is intended.
model = Sequential([
    Embedding(voc_size, embedding_feature, input_length=sentlen),
    LSTM(64, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    LSTM(32),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    Dense(1, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

**SPLITTING THE DATASET INTO 70% TRAIN SET AND 30% VALIDATION SET**

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: one row of padded word indexes per title.
x_final = np.array(embedding_doc)

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y,
    test_size=0.3,
    random_state=1,
)

**EARLY STOPPING: TRAINING HALTS ONCE THE MONITORED METRIC HAS NOT IMPROVED BY AT LEAST 0.001 FOR 5 CONSECUTIVE EPOCHS, AND THE WEIGHTS FROM THE BEST EPOCH ARE RESTORED.**

In [None]:
# Stop once the monitored loss has failed to improve by at least 0.001 for
# 5 epochs in a row, then roll back to the best weights seen.
# 'val_loss' is the callback's default monitor; it is spelled out for clarity.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

**FITTING THE MODEL**

In [None]:
# Train for at most 10 epochs, validating on the hold-out split after each one.
# NOTE(review): the same hold-out split drives early stopping and the final
# score, so the reported accuracy may be slightly optimistic — confirm this is acceptable.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

**TRAINING AND VALIDATION LOSS CURVES FOR THE LSTM MODEL, WITH NUMBER OF EPOCHS ON THE X-AXIS AND LOSS ON THE Y-AXIS**

In [None]:
# One row per epoch; plot training loss against validation loss.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title='validation curve')

In [None]:
# Score the trained network on the hold-out split. The first two entries of
# the result are read as loss and accuracy (the model is compiled with
# metrics=['accuracy']).
result = model.evaluate(x_test, y_test)
loss, accuracy = result[0], result[1]
print(f"[+] Accuracy: {accuracy*100:.2f}%")

In [None]:
# Raw model outputs for the hold-out rows; thresholded into class labels in the next cell.
pred = model.predict(x_test)

In [None]:
# Turn each predicted score into a hard label: 1 when strictly above 0.5, else 0.
prediction = [1 if score.item() > 0.5 else 0 for score in pred]

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual labels, columns the predicted labels.
cm = confusion_matrix(y_true=list(y_test), y_pred=prediction)
cm

# **Bidirectional LSTM Model**

In [None]:
# Width of each learned word vector.
embedding_feature = 60

# Same layer stack as the plain LSTM model, but each recurrent layer is
# wrapped in Bidirectional so the sequence is read in both directions.
# NOTE(review): the 1-unit ReLU layer right before the sigmoid squeezes the
# whole representation through one non-negative scalar — confirm it is intended.
model = Sequential([
    Embedding(voc_size, embedding_feature, input_length=sentlen),
    Bidirectional(LSTM(64, return_sequences=True)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    Bidirectional(LSTM(32)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    Dense(1, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fresh callback for the second model: stop once the monitored loss has failed
# to improve by at least 0.001 for 5 epochs in a row, then restore the best weights.
# 'val_loss' is the callback's default monitor; it is spelled out for clarity.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train the bidirectional model with the same schedule as the plain LSTM:
# at most 10 epochs, batches of 128, validated on the hold-out split.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

**TRAINING AND VALIDATION LOSS CURVES FOR THE BIDIRECTIONAL LSTM MODEL, WITH NUMBER OF EPOCHS ON THE X-AXIS AND LOSS ON THE Y-AXIS**

In [None]:
# One row per epoch; plot training loss against validation loss.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title='validation curve')

In [None]:
# Score the bidirectional network on the hold-out split. The first two entries
# of the result are read as loss and accuracy (the model is compiled with
# metrics=['accuracy']).
result = model.evaluate(x_test, y_test)
loss, accuracy = result[0], result[1]
print(f"[+] Accuracy: {accuracy*100:.2f}%")

In [None]:
# Raw outputs of the bidirectional model for the hold-out rows; thresholded in the next cell.
pred = model.predict(x_test)

In [None]:
# Turn each predicted score into a hard label: 1 when strictly above 0.5, else 0.
prediction = [1 if score.item() > 0.5 else 0 for score in pred]

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual labels, columns the predicted labels.
cm = confusion_matrix(y_true=list(y_test), y_pred=prediction)
cm

# We applied two models to this dataset, an LSTM and a Bidirectional LSTM. The Bidirectional LSTM gives the better accuracy, 92.43%, compared with 89.84% for the LSTM model.

> # **Predicted class distribution(True Positive and True Negative)**

# **12450 records are correctly classified out of 13470 (obtained from the confusion matrix of the Bidirectional LSTM model).**