In [41]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
#Pretty much crucial for all NLP tasks
from tensorflow.keras.preprocessing.sequence import pad_sequences
#To ensure that the sentences are of the same size
import nltk
#Natural Language ToolKit
import re
#Regular expressions
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
#To implement stemming and stopwords (removing high freq words, finding the stem of a word)
#Fundamental preprocessing to Information Retrieval tasks.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df=pd.read_csv('/content/drive/My Drive/data/FakeNewsClassifier/train.csv')
# https://www.kaggle.com/c/fake-news/data#

In [6]:
df = df.dropna()
#Dropping NaN values
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [8]:
#We are trying to predict the label which is the dependent feature here.
#Extracting the independent features
X = df.drop('label', axis = 1)

#Extracting the dependent features
Y = df['label']

#Checking how many 0,1 labels
Y.value_counts()

0    10361
1     7924
Name: label, dtype: int64

In [20]:
vocabulary_size = 5000
#5000 very common words, this excludes stopwords
messages = X.copy()
print(messages['title'][0])
messages.reset_index(inplace = True)


House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It


In [18]:
#Data pre-processing
ps = PorterStemmer()
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    #Removing all special characters apart from a-z , A-Z
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    #Removing the stop words and performing stemming.
    #An alternate could be to do lemmatization
    review = ' '.join(review)
    #reconstructing the sentence and appending to the corpus
    corpus.append(review)

#corpus

In [21]:
one_hot_representation = [one_hot(words, vocabulary_size) for words in corpus]
#A one-hot word representation for the words in our corpus

In [23]:
embedding_length = 20
embedded_representation = pad_sequences(one_hot_representation, padding = 'pre', maxlen = embedding_length)
#An embedded one-hot-representation of our vocabulary
#Since we're using an bidirectional LSTM, we can pre-pad or post-pad with zeros with no difference in result

In [39]:
embedding_features = 50
architecture = Sequential()
architecture.add(Embedding(vocabulary_size, embedding_features, input_length = embedding_length))
architecture.add(Bidirectional(LSTM(128)))
architecture.add(Dropout(0.4))
architecture.add(Dense(1 , activation = 'relu'))
architecture.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(architecture.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 50)            250000    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               183296    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 433,553
Trainable params: 433,553
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
X = np.array(embedded_representation)
Y = np.array(Y)

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y , shuffle = True, test_size = 0.2, random_state = 69)

In [40]:
architecture.fit(X_train, Y_train, validation_data = (X_test, Y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3e7241b2b0>

In [45]:
#Performance metrics
predictions = architecture.predict_classes(X_test)
print(confusion_matrix(Y_test, predictions))
print(accuracy_score(Y_test, predictions))
print(classification_report(Y_test, predictions))



[[1941  187]
 [ 124 1405]]
0.9149576155318567
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      2128
           1       0.88      0.92      0.90      1529

    accuracy                           0.91      3657
   macro avg       0.91      0.92      0.91      3657
weighted avg       0.92      0.91      0.92      3657

