# Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
# Import pandas for efficiently storing and manipulating large datasets
import pandas as pd 

In [2]:
# Load the dataset CSV into a pandas DataFrame.
df = pd.read_csv("Datasets/datasets.csv")

In [3]:
# Display the first rows of the DataFrame (head() shows 5 rows by default).
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Discard every row that contains at least one NaN value, then preview the result.
df = df.dropna(axis=0, how='any')
print(df.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [5]:
# Independent features: every column of df except the target column 'label'.
X = df.drop(columns=['label'])

In [6]:
# Dependent feature (target): the 'label' column as a Series.
y = df.loc[:, 'label']

In [7]:
# Dimensions of the feature frame X as (rows, columns).
X.shape

(18285, 4)

In [8]:
# Dimensions of the target Series y (one entry per row of X).
y.shape

(18285,)

In [9]:
# TensorFlow provides extensive support for neural network development, NLP tasks, and time series analysis
import tensorflow as tf




In [None]:
# Install lastest version of tensorflow
!pip install tensorflow


In [None]:
# Show which TensorFlow version is loaded in this kernel.
tf.__version__

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
### Vocabulary size
# Upper bound for the hashed word indices; also the input dimension of the Embedding layer.
voc_size = 5_000

### One-hot Representation

In [None]:
# One-hot encoding converts categorical data (here, words) into a numerical format, because machine learning models typically require numerical input.

In [None]:
# Work on an independent (deep) copy so that X itself is left untouched.
messages = X.copy(deep=True)

In [None]:
# Title of the row whose index label is 1 (single .loc lookup instead of chained indexing).
messages.loc[1, 'title']

In [None]:
# Give `messages` a fresh 0..n-1 index; the previous index is kept as an 'index' column.
# dropna() left gaps in the index labels, and the preprocessing loop looks rows up
# by the integers 0..len(messages)-1, so the labels must be contiguous.
# Deriving the frame from X (instead of reset_index(inplace=True)) keeps this cell
# idempotent: re-running it no longer adds a 'level_0' column or raises.
messages = X.reset_index()

In [None]:
# NLTK provides tools for tasks such as tokenization, stemming, tagging, and parsing

import nltk
import re
from nltk.corpus import stopwords

In [None]:
# Download the NLTK stop-word lists used during preprocessing.
nltk.download('stopwords')

In [None]:
# Dataset preprocessing: clean, lowercase, stop-word-filter and stem every title.
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension re-read the list for every single word and scanned it linearly.
stop_words = set(stopwords.words('english'))

corpus = []
for title in messages['title']:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    # Drop stop words, stem what remains and store the title as one string.
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [None]:
# Preview the first few cleaned titles instead of dumping the entire corpus
# into the notebook output.
corpus[:5]

In [None]:
# Hash every cleaned title into a list of integer word indices bounded by voc_size.
# hash_function='md5' is passed explicitly because the default (Python's built-in
# hash) is salted per process, so the indices would differ after every kernel
# restart and the results would not be reproducible.
onehot_repr = [one_hot(words, voc_size, hash_function='md5') for words in corpus]

# Preview only the first few encoded titles rather than the whole list.
onehot_repr[:5]

### Embedding Representation

In [None]:
# Pad at the front so that every index sequence has exactly sent_length entries.
sent_length = 20
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

In [None]:
# Padded index sequence of the first title.
embedded_docs[0]

In [None]:
## Creating model
# Architecture: Embedding -> LSTM -> single sigmoid unit (binary classification).
embedding_vector_features = 40  # length of each learned word vector
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))  # 100 LSTM units
model.add(Dense(1, activation='sigmoid'))  # one probability per sample
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Sanity check: the number of padded sequences should match the number of labels.
len(embedded_docs),y.shape

In [None]:
# NumPy supplies the multi-dimensional array type; copy the padded sequences and
# the labels into plain arrays.
import numpy as np

X_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
# Shapes of the final feature matrix and label vector.
X_final.shape,y_final.shape

In [None]:
# Hold out 30% of the samples as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.3,
    random_state=42,
)


### Model Training

### Performance Metrics And Accuracy

In [None]:
### Training the model
# Three epochs with mini-batches of 64 samples; the test split is also passed
# as validation data, so its loss/accuracy are reported after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3,
    batch_size=64,
)

In [None]:
# Evaluate the trained model on the test split. The model was compiled with
# metrics=['accuracy'], so evaluate() yields the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.2f}, Test Accuracy: {accuracy:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix

# Predict probabilities for the test data and threshold them at 0.5 to obtain
# binary (0/1) predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int)

In [None]:
# Confusion matrix of the true test labels (y_test) against the binary predictions (y_pred).
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)

In [None]:
from sklearn.metrics import confusion_matrix

# NOTE(review): this cell repeats the prediction and confusion-matrix steps of the
# two preceding cells under a different variable name.

# Predict probabilities for the test data and threshold them at 0.5.
test_predictions = (model.predict(X_test) > 0.5).astype(int)

# Confusion matrix of the true test labels (y_test) against test_predictions.
confusion_mat = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(confusion_mat)

In [None]:
import matplotlib.pyplot as plt  # Import the pyplot module from matplotlib
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create a heatmap of the confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose thresholded prediction matches the true label.
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute the four summary metrics on the test split in one pass over the scorers.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric with four decimal places.
print(f"Overall Accuracy: {accuracy:.4f}")
print(f"Overall Precision: {precision:.4f}")
print(f"Overall Recall: {recall:.4f}")
print(f"Overall F1-Score: {f1:.4f}")