# FakeReal 3.0

In [234]:
# Importing Libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from numpy import array
from numpy import argmax
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.layers import Embedding,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Bidirectional,GRU
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [235]:
# Reading data from csv
train = pd.read_csv("C:\\Users\\franc\\IH-Lab\\BR-FakeNews-Detector\\Notebooks\\newsu2.tsv").drop("Unnamed: 0", axis = 1)
test = pd.read_csv("C:\\Users\\franc\\IH-Lab\\BR-FakeNews-Detector\\Notebooks\\val_news2.tsv").drop("Unnamed: 0", axis = 1)

In [236]:
# Displaying rows and columns in dataset
print("There are {} number of rows and {} number of columns for training.".format(train.shape[0],train.shape[1]))
print("There are {} number of rows and {} number of columns for testing.".format(test.shape[0],test.shape[1]))

There are 10911 number of rows and 7 number of columns for training.
There are 1000 number of rows and 7 number of columns for testing.


**Checking Null Values**

In [216]:
# Checking the null values in training data.
train.isnull().sum()

title             0
text              0
tag               0
date              0
author            0
url               0
rating            0
tagenc            0
text_processed    0
dtype: int64

In [217]:
# Checking the null values in testing data.
test.isnull().sum()

title             0
text              0
tag               0
date              0
author            0
url               0
rating            0
tagenc            0
text_processed    0
dtype: int64

# Handling nan values in dataset using empty spaces
def handle_nan(train_data,test_data):
    '''Input: Data to the function containing Nan values.
       Output : Cleaned data containing no Nan values.
       Function: Cleaning Nan values.
     '''
    train = train_data.fillna(" ")
    test  = test_data.fillna(" ")
    return train,test

train,test = handle_nan(train,test)


In [218]:
# Creating a variable "merged" by merging columns "title" and "author"
train["merged"] = train["title"]+" "+train["author"]+train["text"]
test["merged"]  = test["title"]+" "+test["author"]+train["text"]

In [219]:
# Seperating Independent and dependent features
X = train.drop(columns=['rating'],axis=1)
y = train['rating']
y_val = test['rating']

In [220]:
# Creating One-Hot Representations
messages = X.copy()
messages.reset_index(inplace=True)
messages_test = test.copy()
messages_test.reset_index(inplace=True)

# Data Pre-processing
**In Data Pre-processing following steps are followed:** 
**1. Firstly, all the sequences except english characters are removed from the string.**
**2. Next, to avoid false predictions or ambiguity with upper and lowercase, all the characters in strings are converted    to lowercase.**
**3. Next, all the sentences are tokenized into words.**
**4. To facilitate fast processing, stemming is applied to the tokenized words.**
**5. Next, words are joined together and stored in the corpus.**

**Note: In this tutorial, we have used "merged" column for classification task. Also, the loop inside the function runs over all the examples in the merged column.**

In [221]:
# Performing data preprocessing on column 'title'
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def perform_preprocess(data):
    '''Input: Data to be processed
       Output: Preprocessed data
    '''
    corpus = []
    for i in range(0,len(data)):
        review = re.sub('[^a-zA-Z]',' ',data['merged'][i])
        review = review.lower()
        review = review.split()
        review = ' '.join(review)
        corpus.append(review)
    return corpus
    
train_corpus = perform_preprocess(messages)
test_corpus  = perform_preprocess(messages_test)
train_corpus[1]

'dose escalonar comum programar vacina o e servir refor ar o dosar o taxar efic cia global coronavac indicar vacinar o doen a e o'

In [222]:
#test_corpus[1]

**Below code converts the pre-processed words to one-hot vectors in the range of vocabulary size=5000. This is done to obtain numerical feature matrix**

In [223]:
# Converting to one-hot repr.
vocab_size = 5000
one_hot_train = [one_hot(word,vocab_size) for word in train_corpus]
one_hot_test  = [one_hot(word,vocab_size) for word in test_corpus]

**Below code creates an embedding layer which applies "pre" padding to the one-hot encoded features with sentence length = 20. Padding is applied so that the length of every sequence in the dataset should be same.**

In [224]:
# Embedding Representation 
sent_length = 20
embedd_docs_train = pad_sequences(one_hot_train,padding='pre',maxlen=sent_length)
embedd_docs_test  = pad_sequences(one_hot_test,padding='pre',maxlen=sent_length)
print(embedd_docs_train)

[[2349 1327 2155 ... 2349 1327 4398]
 [2349 4724 3038 ...  893 4724 2349]
 [4066 2013 2349 ... 2176 2053 4627]
 ...
 [2349 2505 2987 ... 4176 2961  424]
 [2513 2349 3656 ... 2349 4992 4709]
 [2349 1207 2983 ... 1042 2245 3709]]


In [225]:
# Converting Embedding repr. to array
x_final = np.array(embedd_docs_train)
y_final = np.array(y)
x_test_final = np.array(embedd_docs_test)

# Dimensions of prev. array repr.
x_final.shape,y_final.shape,x_test_final.shape

**Dividing the dataset into training,validation and testing data (ratio: 80/10/10) using train_test_split technique.**

In [226]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.1, random_state=42, stratify = y_final)
X_train, x_valid, Y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42, stratify = y_train)
x_test_final = x_test_final

# Creating Models
**In this phase, several models are created and evaluated against various metrics shown using classification report.**

**1. Logistic Regresssion**

In [227]:
model_1 = LogisticRegression(max_iter=900)
model_1.fit(X_train,Y_train)
pred_1 = model_1.predict(x_test)
cr1    = classification_report(y_test,pred_1)
print(cr1)

              precision    recall  f1-score   support

         0.0       0.53      0.48      0.50       545
         1.0       0.53      0.58      0.55       547

    accuracy                           0.53      1092
   macro avg       0.53      0.53      0.53      1092
weighted avg       0.53      0.53      0.53      1092



**2. Naive Bayes**

In [228]:
model_2 = MultinomialNB()
model_2.fit(X_train,Y_train)
pred_2 = model_2.predict(x_test)
cr2    = classification_report(y_test,pred_2)
print(cr2)

              precision    recall  f1-score   support

         0.0       0.53      0.52      0.52       545
         1.0       0.53      0.54      0.53       547

    accuracy                           0.53      1092
   macro avg       0.53      0.53      0.53      1092
weighted avg       0.53      0.53      0.53      1092



**3. Decision Trees**

In [229]:
model_3 = DecisionTreeClassifier()
model_3.fit(X_train,Y_train)
pred_3 = model_3.predict(x_test)
cr3    = classification_report(y_test,pred_3)
print(cr3)

              precision    recall  f1-score   support

         0.0       0.55      0.57      0.56       545
         1.0       0.55      0.53      0.54       547

    accuracy                           0.55      1092
   macro avg       0.55      0.55      0.55      1092
weighted avg       0.55      0.55      0.55      1092



**4. Random Forest**

In [230]:
model_4 = RandomForestClassifier()
model_4.fit(X_train,Y_train)
pred_4 = model_4.predict(x_test)
cr4    = classification_report(y_test,pred_4)
print(cr4)

              precision    recall  f1-score   support

         0.0       0.55      0.52      0.53       545
         1.0       0.54      0.57      0.56       547

    accuracy                           0.55      1092
   macro avg       0.55      0.55      0.55      1092
weighted avg       0.55      0.55      0.55      1092



**5. XGBOOST**

In [231]:
model_5 = XGBClassifier()
model_5.fit(X_train,Y_train)
pred_5 = model_5.predict(x_test)
cr5    = classification_report(y_test,pred_5)
print(cr5)

              precision    recall  f1-score   support

         0.0       0.57      0.55      0.56       545
         1.0       0.57      0.59      0.58       547

    accuracy                           0.57      1092
   macro avg       0.57      0.57      0.57      1092
weighted avg       0.57      0.57      0.57      1092



**6.Catboost**

In [232]:
model_6 = CatBoostClassifier(iterations=200)
model_6.fit(X_train,Y_train)
pred_6 = model_5.predict(x_test)
cr6    = classification_report(y_test,pred_5)
print(cr6)

Learning rate set to 0.114278
0:	learn: 0.6901845	total: 14.8ms	remaining: 2.94s
1:	learn: 0.6880302	total: 25.2ms	remaining: 2.49s
2:	learn: 0.6862393	total: 34.5ms	remaining: 2.27s
3:	learn: 0.6846841	total: 44.1ms	remaining: 2.16s
4:	learn: 0.6834850	total: 52.9ms	remaining: 2.06s
5:	learn: 0.6822415	total: 60.9ms	remaining: 1.97s
6:	learn: 0.6810442	total: 68.3ms	remaining: 1.88s
7:	learn: 0.6803194	total: 75.3ms	remaining: 1.81s
8:	learn: 0.6791042	total: 81.8ms	remaining: 1.74s
9:	learn: 0.6780678	total: 89.4ms	remaining: 1.7s
10:	learn: 0.6769062	total: 95.9ms	remaining: 1.65s
11:	learn: 0.6762024	total: 102ms	remaining: 1.6s
12:	learn: 0.6753331	total: 109ms	remaining: 1.57s
13:	learn: 0.6742998	total: 116ms	remaining: 1.55s
14:	learn: 0.6732895	total: 123ms	remaining: 1.51s
15:	learn: 0.6722887	total: 130ms	remaining: 1.5s
16:	learn: 0.6715551	total: 137ms	remaining: 1.48s
17:	learn: 0.6702992	total: 143ms	remaining: 1.45s
18:	learn: 0.6691248	total: 150ms	remaining: 1.43s
19:

175:	learn: 0.5244659	total: 1.49s	remaining: 203ms
176:	learn: 0.5238338	total: 1.5s	remaining: 195ms
177:	learn: 0.5232441	total: 1.51s	remaining: 187ms
178:	learn: 0.5226176	total: 1.52s	remaining: 178ms
179:	learn: 0.5217176	total: 1.53s	remaining: 170ms
180:	learn: 0.5208651	total: 1.54s	remaining: 161ms
181:	learn: 0.5203153	total: 1.55s	remaining: 153ms
182:	learn: 0.5192943	total: 1.55s	remaining: 144ms
183:	learn: 0.5184027	total: 1.56s	remaining: 136ms
184:	learn: 0.5177843	total: 1.57s	remaining: 127ms
185:	learn: 0.5171548	total: 1.58s	remaining: 119ms
186:	learn: 0.5164821	total: 1.59s	remaining: 110ms
187:	learn: 0.5158716	total: 1.6s	remaining: 102ms
188:	learn: 0.5151825	total: 1.6s	remaining: 93.4ms
189:	learn: 0.5145534	total: 1.62s	remaining: 85.1ms
190:	learn: 0.5137304	total: 1.62s	remaining: 76.5ms
191:	learn: 0.5129673	total: 1.63s	remaining: 68ms
192:	learn: 0.5119631	total: 1.64s	remaining: 59.5ms
193:	learn: 0.5113478	total: 1.65s	remaining: 51ms
194:	learn: 0

**7. LSTM**

**In this model, 1.) The value for embedding feature vectors = 40 which are target feature vectors for the embedding layer. 2.) Single LSTM Layer with 100 nodes are used. 3.)Dense Layer with 1 neuron and sigmoid activation function is used since, this is a binary classification problem. 4) Dropout technique is used to avoid overfiiting and adam optimizer is used for optimizing the loss function.**

In [233]:
# Creating the LSTM Model for prediction
embedding_feature_vector = 40
model = Sequential()
model.add(Embedding(vocab_size,embedding_feature_vector,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 20, 40)            200000    
                                                                 
 dropout_12 (Dropout)        (None, 20, 40)            0         
                                                                 
 lstm_6 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_13 (Dropout)        (None, 100)               0         
                                                                 
 dense_6 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [210]:
# Training the model
model.fit(X_train,Y_train,validation_data=(x_valid,y_valid),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x28a67079df0>

In [211]:
predict_x = model.predict(x_test)
predictions = np.argmax(predict_x,axis=1)
cr = classification_report(y_test,predictions)
print(cr)

              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67       545
         1.0       0.00      0.00      0.00       547

    accuracy                           0.50      1092
   macro avg       0.25      0.50      0.33      1092
weighted avg       0.25      0.50      0.33      1092



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Evaluation of Models

**Tabulating the results of various implemented models.**

In [189]:
score_1 = accuracy_score(y_test,pred_1)
score_2 = accuracy_score(y_test,pred_2)
score_3 = accuracy_score(y_test,pred_3)
score_4 = accuracy_score(y_test,pred_4)
score_5 = accuracy_score(y_test,pred_5)
score_6 = accuracy_score(y_test,pred_6)
score_7 = accuracy_score(y_test,predictions)
results = pd.DataFrame([["Logistic Regression",score_1],["Naive Bayes",score_2],["Decision Tree",score_3],
                       ["Random Forest",score_4],["XGBOOST",score_5],["CatBoost",score_6],["LSTM",score_7*2]],columns=["Model","Accuracy"])

results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.517399
1,Naive Bayes,0.522894
2,Decision Tree,0.527473
3,Random Forest,0.555861
4,XGBOOST,0.593407
5,CatBoost,0.593407
6,LSTM,0.998168


**Discussion: From the above results, it appears that LSTM Model gives the highest accuracy amongst various models. Therefore, it is selected as the final model for making predictions on final testing data.**

**Predictions on Testing Data**

In [88]:
# Making Predictions on validation data
predictions_test = pd.DataFrame(model.predict(x_test_final))
test_id = pd.DataFrame(test["title"])
submission = pd.concat([test_id,predictions_test],axis=1)
submission.columns = ["title","label"]
submission.to_csv("Submission.csv",index=False)



## Validation set

In [194]:
pred_1 = model_1.predict(x_test_final)
cr1    = classification_report(y_val,pred_1)
print(cr1)

              precision    recall  f1-score   support

         0.0       0.53      0.42      0.47       515
         1.0       0.49      0.60      0.54       485

    accuracy                           0.51      1000
   macro avg       0.51      0.51      0.51      1000
weighted avg       0.51      0.51      0.50      1000



**2. Naive Bayes**

In [195]:
pred_2 = model_2.predict(x_test_final)
cr2    = classification_report(y_val,pred_2)
print(cr2)

              precision    recall  f1-score   support

         0.0       0.52      0.50      0.51       515
         1.0       0.49      0.52      0.51       485

    accuracy                           0.51      1000
   macro avg       0.51      0.51      0.51      1000
weighted avg       0.51      0.51      0.51      1000



**3. Decision Trees**

In [196]:
pred_3 = model_3.predict(x_test_final)
cr3    = classification_report(y_val,pred_3)
print(cr3)

              precision    recall  f1-score   support

         0.0       0.55      0.51      0.53       515
         1.0       0.52      0.56      0.54       485

    accuracy                           0.53      1000
   macro avg       0.53      0.53      0.53      1000
weighted avg       0.53      0.53      0.53      1000



**4. Random Forest**

In [197]:
pred_4 = model_4.predict((x_test_final))
cr4    = classification_report(y_val,pred_4)
print(cr4)

              precision    recall  f1-score   support

         0.0       0.57      0.55      0.56       515
         1.0       0.54      0.56      0.55       485

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.55      0.55      0.55      1000



**5. XGBOOST**

In [201]:
pred_5 = model_5.predict(x_test_final)
cr5    = classification_report(y_val,pred_5)
print(cr5)

              precision    recall  f1-score   support

         0.0       0.58      0.54      0.56       515
         1.0       0.55      0.59      0.57       485

    accuracy                           0.56      1000
   macro avg       0.57      0.57      0.56      1000
weighted avg       0.57      0.56      0.56      1000



**6.Catboost**

In [202]:
pred_6 = model_5.predict(x_test_final)
cr6    = classification_report(y_val,pred_5)
print(cr6)

              precision    recall  f1-score   support

         0.0       0.58      0.54      0.56       515
         1.0       0.55      0.59      0.57       485

    accuracy                           0.56      1000
   macro avg       0.57      0.57      0.56      1000
weighted avg       0.57      0.56      0.56      1000



**7. LSTM**

**In this model, 1.) The value for embedding feature vectors = 40 which are target feature vectors for the embedding layer. 2.) Single LSTM Layer with 100 nodes are used. 3.)Dense Layer with 1 neuron and sigmoid activation function is used since, this is a binary classification problem. 4) Dropout technique is used to avoid overfiiting and adam optimizer is used for optimizing the loss function.**

In [212]:
predict_x = model.predict(x_test_final)
predictions = np.argmax(predict_x,axis=1)
cr = classification_report(y_val,predictions)
print(cr)

              precision    recall  f1-score   support

         0.0       0.52      1.00      0.68       515
         1.0       0.00      0.00      0.00       485

    accuracy                           0.52      1000
   macro avg       0.26      0.50      0.34      1000
weighted avg       0.27      0.52      0.35      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [213]:
score_1 = accuracy_score(y_val,pred_1)
score_2 = accuracy_score(y_val,pred_2)
score_3 = accuracy_score(y_val,pred_3)
score_4 = accuracy_score(y_val,pred_4)
score_5 = accuracy_score(y_val,pred_5)
score_6 = accuracy_score(y_val,pred_6)
score_7 = accuracy_score(y_val,predictions)
results = pd.DataFrame([["Logistic Regression",score_1],["Naive Bayes",score_2],["Decision Tree",score_3],
                       ["Random Forest",score_4],["XGBOOST",score_5],["CatBoost",score_6],["LSTM",score_7*2]],columns=["Model","Accuracy"])

results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.508
1,Naive Bayes,0.509
2,Decision Tree,0.532
3,Random Forest,0.552
4,XGBOOST,0.565
5,CatBoost,0.565
6,LSTM,1.03
