<a href="https://colab.research.google.com/github/satnavpt/Wiki-Wiki-Editor/blob/master/DataSamples/Fake_News_Stage/Main_Fake_News_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **`Imports`**

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import classification_report, accuracy_score 
import tensorflow
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Bidirectional
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **`Data Preprocessing`**

In [None]:
# Load the training set and discard every row that has a missing value in any column.
df = pd.read_csv('/content/drive/MyDrive/Fake News Stage/UTK_train.csv').dropna()

# Normalise each text: delete every run of characters that is neither a word
# character (letter, digit, underscore) nor a space, then lower-case the result.
# Digits are kept on purpose, since numbers may be facts that help indicate
# whether a sentence contains fake news. one_hot applies further implicit
# preprocessing when the text is encoded.
# NOTE(review): newlines and tabs are deleted as well (not replaced by a space),
# so words separated only by a line break end up joined together.
_non_word_run = re.compile(r'[^\w ]+')
text = df.text.apply(lambda raw: _non_word_run.sub("", raw).lower())

# Plain Python list of the target labels, in the same row order as `text`.
label = df['label'].tolist()

In [None]:
# Vocabulary size: the size of the index space that one_hot hashes words into.
# According to the Oxford English Dictionary there are an estimated 171,146 words
# currently in use in English; the extra headroom allows for new words,
# abbreviations and slang that may appear in Wikipedia articles over time.
# A space larger than the true vocabulary also reduces the chance that two
# different words are hashed onto the same index.
voc_size = 200000

# Encode every cleaned text as a list of integer word indices, because the model
# only operates on numbers. one_hot also splits the text into words on whitespace.
# NOTE(review): one_hot is hash-based, so an index is not guaranteed to be unique
# to one word, and the Keras docs state that the default Python `hash` is not
# stable across interpreter sessions - confirm that inference code run in a new
# session reproduces the indices used for training.
one_hot_representation = [one_hot(document, voc_size) for document in text]

# Fixed number of word indices kept per text. The average English sentence is
# 15 - 20 words, so 30 is a conservative cap: it limits information loss on longer
# sentences without padding short ones excessively.
sentence_len = 30

# Give every encoded text the same shape, which the LSTM requires:
#   - shorter sequences get zeros appended ('post' padding, chosen so that
#     training is efficient);
#   - longer sequences are cut down to sentence_len indices.
# NOTE(review): `truncating` is left at its default, which per the Keras docs is
# 'pre' (the leading words are dropped) - confirm this is intended.
embedded_sentence = pad_sequences(one_hot_representation, maxlen = sentence_len, padding = 'post')

## **`LSTM Model`**

In [None]:
# A Sequential model is built layer by layer. Its input is one padded sentence of
# sentence_len word indices and its output is a single float between 0 and 1.
misinformation_model = Sequential([
    # Map each word index to a trainable 100-dimensional embedding vector, giving
    # a (sentence_len, 100) matrix per sentence. An embedding encodes the meaning
    # of a word so that words close together in the learned vector space are
    # expected to be similar in meaning.
    # input_dim is voc_size + 1 because index 0 is reserved for padding, which
    # mask_zero = True marks as masked.
    # NOTE(review): confirm the padding mask is still honoured after the pooling
    # layer below; pooling layers may not propagate masks.
    Embedding(input_dim = (voc_size + 1), output_dim = 100, input_length = sentence_len, mask_zero = True),

    # During training, randomly zero 10% of the inputs and rescale the rest so the
    # expected sum over all inputs is unchanged. This helps prevent overfitting.
    Dropout(0.1, noise_shape = None, seed = None),

    # Slide a window of 3 time steps (stride 3) over the sequence and keep the
    # maximum of each feature, shrinking 30 steps to 10 while retaining the
    # strongest features.
    MaxPooling1D(pool_size = 3, strides = 3, padding = 'same', data_format = 'channels_last'),

    # LSTM is an RNN that is effective on long sequences such as sentences because
    # its memory cell retains past information for longer. It is run in both
    # directions and the two 100-unit outputs are concatenated into 200 features.
    # tanh is the activation the author chose to help with the vanishing-gradient
    # problem.
    # use_bias is a real boolean here; the previous string 'true' only worked
    # because any non-empty string is truthy.
    Bidirectional(
        LSTM(100, activation = 'tanh', use_bias = True, kernel_initializer = 'glorot_uniform',
             recurrent_initializer = 'orthogonal', bias_initializer = 'zeros', unit_forget_bias = True),
        merge_mode = 'concat',
    ),

    Dropout(0.1, noise_shape = None, seed = None),

    # A single fully connected sigmoid neuron turns the 200 features into one
    # misinformation score.
    Dense(1, activation = 'sigmoid', use_bias = True, kernel_initializer = 'glorot_uniform'),
])
misinformation_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'])

# summary() prints the table itself and returns None, so it is called directly:
# wrapping it in print() would emit an extra "None" line after the table.
misinformation_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 100)           20000100  
                                                                 
 dropout (Dropout)           (None, 30, 100)           0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 10, 100)          0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 200)              160800    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 1)                 2

In [None]:
# Hold out 20% of the encoded sentences (and their labels) for validation.
# A fixed random_state makes the split reproducible across calls, so the model
# can be fine-tuned against the same train/validation partition every time.
data_train, data_valid, label_train, label_valid = train_test_split(
    embedded_sentence,
    df['label'],
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Train on the training split for three epochs; every other fit option is left at
# its default.
misinformation_model.fit(x = data_train, y = label_train, epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f2999714a50>

In [None]:
# Persist the trained network as a single HDF5 file.
misinformation_model.save('A_misinformation_model_LSTM.h5')

# Pickle the one_hot function alongside the model.
# NOTE(review): pickle stores a module-level function by reference (its import
# path), not any fitted state, so this file does not capture the word-to-index
# mapping used for training - confirm that whatever loads it can reproduce the
# same indices.
with open("A_one_hot_encoder", "wb") as encoder_file:
    pickle.dump(one_hot, encoder_file)

## **`Model Evaluation & Testing`**

In [None]:
# Score the held-out validation split: a predicted probability of 0.5 or more
# counts as class 1, anything lower as class 0.
valid_probabilities = misinformation_model.predict(data_valid)
test_pred = (valid_probabilities >= 0.5).astype(int)

# Fraction of validation rows whose thresholded prediction matches the label.
accuracy_score(label_valid, test_pred, normalize = True)

0.8640962537599125

In [None]:
# Build the labelled test set from the two CSV files, then drop incomplete rows.
# NOTE(review): merge() is called without `on`, so it joins on whichever columns
# the two frames share - presumably an id column; verify against the CSV headers.
df_test_data = pd.read_csv('UTK_test.csv')
df_test_label = pd.read_csv('UTK_submit.csv')
df_test = df_test_data.merge(df_test_label).dropna()

# Apply the same cleaning, hashing and padding that the training data received.
test_text = df_test.text.apply(lambda raw: re.sub(r'[^\w ]+', "", raw).lower())
test_one_hot_representation = [one_hot(document, voc_size) for document in test_text]
test_embedded_sentence = pad_sequences(test_one_hot_representation, maxlen = sentence_len, padding = 'post')
test_label = df_test.label.tolist()

# Threshold the predicted probabilities at 0.5 and report the test accuracy.
test_probabilities = misinformation_model.predict(test_embedded_sentence)
test_pred = (test_probabilities >= 0.5).astype(int)
accuracy_score(test_label, test_pred, normalize = True)