In [None]:
import numpy as np
import pandas as pd

## Loading and preparing the dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
fake = pd.read_csv('drive/MyDrive/SAP test/Fake.csv')
real = pd.read_csv('drive/MyDrive/SAP test/True.csv')

In [None]:
print(fake.shape,real.shape)
#No imbalance of dataset

(23481, 4) (21417, 4)


In [None]:
real['real'] = 1
fake['real'] = 0
df = pd.concat([real,fake])

Data cleaning codes taken from [here](https://www.kaggle.com/madz2000/nlp-using-glove-embeddings-99-87-accuracy)

In [None]:

!pip install BeautifulSoup4
from bs4 import BeautifulSoup
import re,string,unicodedata
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
# Removing URL's
def remove_between_square_brackets(text):
    return re.sub(r'http\S+', '', text)
#Removing the stopwords from text
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop_words:
            final_text.append(i.strip())
    return " ".join(final_text)
#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_stopwords(text)
    return text
#Apply function on review column
df['text']=df['text'].apply(denoise_text)

Since the purpose of this test is to access my ability to apply NLP modelling techniques, I will be using only the "text" column which consist the main bulk of the data.

In [None]:
x = df['text']
y = df['real']

In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1,random_state=123)

x_train,x_val,y_train,y_val = train_test_split(x,y, test_size=0.1,random_state=123)

print(x_train.shape,y_train.shape,x_val.shape,y_val.shape,x_test.shape,y_test.shape)

(40408,) (40408,) (4490,) (4490,) (4490,) (4490,)


## Analysing the problem and choosing text processing technique

News constantly evolve overtime and the context of news changes as well. For example, in the year 2019, no one knows about COVID-19, but in the following year, news of COVID-19 appears everywhere. Because of this, it is important for us to constantly check and update the model with new dataset so that it is able to keep up with trends.

To minimise the impact of changing trend, models like Tfidf or bag-of-words that solely tries to match documents based on words occurance might give high results in training and test set but when it comes to new unknown data a year later, the accuracy might drop significantly. Additionally, it might also result in a Out-of-Vocab problem.

Because of this limitation and the context of the problem, I will be exploring the following word embedding methods that uses words dictionary on a global scale rather than just limiting on the local dataset. Word embedding methods also places emphases on the semantic relationships between words instead of solely relying on just the frequency. These are the following word embedding methods that I have shortlisted:

1. GloVe
2. Word2Vec
3. CountVectorization

The last on the list is a not a word embedding method but a frequency based embedding method. I picked on non-word embedding method to get a broader understanding of how each method would perform in this dataset. 

In [None]:
from sklearn import metrics

def f1scores(y_true,y_pred):
  print("F1 scores {}".format(metrics.f1_score(y_true,y_pred)))

def accuracyScores(y_true, y_pred):
  print("Accuracy Scores {}".format(metrics.accuracy_score(y_true,y_pred)))

In [None]:
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf

## GloVe

In [None]:
## Run this code to download the glove embedding file
# import zipfile

# !wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
# zip_ref = zipfile.ZipFile('glove.twitter.27B.zip', 'r')
# zip_ref.extractall('')
# zip_ref.close()

In [None]:
embedding_file = 'drive/MyDrive/SAP test/glove.twitter.27B.100d.txt'

def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(embedding_file))

In [None]:
max_features = 10000 ##Will take into account of only the top 10000 most common words
maxlen = 300

In [None]:
#Tokenizing the text column
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
tokenized_train_text = tokenizer.texts_to_sequences(x_train)
x_train_text = pad_sequences(tokenized_train_text, maxlen=maxlen)

tokenized_val_text = tokenizer.texts_to_sequences(x_val)
x_val_text = pad_sequences(tokenized_val_text, maxlen=maxlen)

tokenized_test_text = tokenizer.texts_to_sequences(x_test)
x_test_text = pad_sequences(tokenized_test_text, maxlen=maxlen)

In [None]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    #Retrieve weights of words that appeared in the text column 
    embedding_vector = embeddings_index.get(word) 
    
    ##Add the weights to the embedding matrix, if it exist inside the embedding_index
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

  if self.run_code(code, result):


Creating the model with Weights given by GloVe 

In [None]:
##Codes taken from  https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
batch_size = 256
epochs = 5
embed_size = 100

##Training the Model
#Defining Neural Network
model = Sequential()
#Non-trainable embeddidng layer with weights taken from the GloVe dataset
model.add(Embedding(max_features, output_dim=embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False))
#LSTM 
model.add(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
model.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=keras.optimizers.Adam(lr = 0.01), loss='binary_crossentropy', metrics=['accuracy',f1_m])



In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 100)          1000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 300, 128)          117248    
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 1,168,769
Trainable params: 168,769
Non-trainable params: 1,000,000
_________________________________________________________________


In [None]:
history = model.fit(x_train_text, y_train, batch_size = batch_size , validation_data = (x_val_text,y_val) , epochs = epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
_, accuracy_gl, f1_score_gl = model.evaluate(x=x_test_text, y= y_test )



In [None]:
print("GloVe: Accuracy score   {}".format(accuracy_gl))
print("GloVe: F1 score {}".format(f1_score_gl))


GloVe: Accuracy score   0.9919821619987488
GloVe: F1 score 0.990760087966919


**word2Vec**

In [None]:
!ls

drive  sample_data


In [None]:
%cd  drive/MyDrive/SAP test/
!ls

/content/drive/MyDrive/SAP test
 Fake.csv		      GoogleNews-vectors-negative300.bin.zip
'fake news detection.ipynb'   True.csv
'Final Draft.ipynb'	     'Untitled document.gdoc'
 glove.twitter.27B.100d.txt


In [None]:
## Run this code to download the glove embedding file
import zipfile
zip_ref = zipfile.ZipFile('GoogleNews-vectors-negative300.bin.zip', 'r')
zip_ref.extractall('')
zip_ref.close()

In [None]:
!ls

 Fake.csv		      GoogleNews-vectors-negative300.bin
'fake news detection.ipynb'   GoogleNews-vectors-negative300.bin.zip
'Final Draft.ipynb'	      True.csv
 glove.twitter.27B.100d.txt  'Untitled document.gdoc'


In [None]:
from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
EMBEDDING_DIM=300

In [None]:
embeddings_index = {}
for word, vector in zip(word_vectors.vocab, word_vectors.vectors):
  coefs = np.asarray(vector, dtype='float32')
  embeddings_index[word] = coefs

In [None]:
vocab_size = len(word_index)+1
embedding_matrix_w2v = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix_w2v[i] = embedding_vector
    except KeyError:
        embedding_matrix_w2v[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del word_vectors 

In [None]:
##Training the Model
#Defining Neural Network
model_w2v = Sequential()
#Non-trainable embeddidng layer
model_w2v.add(Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_matrix_w2v], input_length=maxlen, trainable=False))
#LSTM 
model_w2v.add(LSTM(units=128 , return_sequences = True , recurrent_dropout = 0.25 , dropout = 0.25))
model_w2v.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model_w2v.add(Dense(units = 32 , activation = 'relu'))
model_w2v.add(Dense(1, activation='sigmoid'))
model_w2v.compile(optimizer=keras.optimizers.Adam(lr = 0.01), loss='binary_crossentropy', metrics=['accuracy',f1_m])



In [None]:
model_w2v.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 300)          38178900  
_________________________________________________________________
lstm_6 (LSTM)                (None, 300, 128)          219648    
_________________________________________________________________
lstm_7 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 38,450,069
Trainable params: 271,169
Non-trainable params: 38,178,900
_________________________________________________________________


In [None]:
history_w2v = model_w2v.fit(x_train_text, y_train, batch_size = batch_size , validation_data = (x_val_text,y_val) , epochs = epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
_, accuracy_w2v, f1_score_w2v = model_w2v.evaluate(x=x_test_text, y= y_test )



In [None]:
print("word2Vec: Accuracy score   {}".format(accuracy_w2v))
print("word2Vec: F1 score {}".format(f1_score_w2v))


word2Vec: Accuracy score   0.9982182383537292
word2Vec: F1 score 0.9980553984642029


**Count Vectorization and Naive Baye**

In [None]:
x_train

20107    progressive Hillary insults women victims sexu...
7390     Mitch McConnell probably expecting Fox News ho...
11612    MOSCOW (Reuters) - U.N. special envoy Syria St...
20116    Communities like Ferguson, Baltimore Milwaukee...
19686    NAYPYITAW (Reuters) - Myanmar leader Aung San ...
                               ...                        
7763     WINSTON-SALEM, N.C. (Reuters) - North Carolina...
15377    MOSCOW (Reuters) - Kremlin aide said Wednesday...
17730    TAMPA, Fla. (Reuters) - Defense Secretary Jim ...
6613     April 29, Los Angeles County Superior Court Ju...
15725    NEW YORK (Reuters) - wealthy Turkish gold trad...
Name: text, Length: 40408, dtype: object

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

vect = CountVectorizer()
x_train_cv = vect.fit_transform(x_train)
x_test_cv = vect.transform(x_test)

nb = MultinomialNB()
nb.fit(x_train_cv,y_train)
y_pred = nb.predict(x_test_cv)

accuracy_cv, f1_score_cv = metrics.accuracy_score(y_test,y_pred), metrics.f1_score(y_test,y_pred)

In [None]:
print("CountVectorizater: Accuracy score   {}".format(accuracy_w2v))
print("CountVectorizater: F1 score {}".format(f1_score_w2v))

CountVectorizater: Accuracy score   0.9982182383537292
CountVectorizater: F1 score 0.9980553984642029


# Summary

Generally, across all model, the Accuracy and F1-score seems to be at the level of 99% and above. This might be due to the biasness in the dataset as pointed out in this [kaggle notebook.](https://www.kaggle.com/josutk/only-one-word-99-2)

However, assuming that there is no bias in the dataset, it might be more ideal to use the **GloVe text-processing with the LSTM model.**  

GloVe is preferred over CountVectorization because it uses a global dataset which reduces the chance of Out-of-Vocab problem and therefore it increases the sustainability and ease of maintainability in the future. Additionally, GloVe is able to capture semantic relationship between words. 

As compared to word2Vec, GloVe is also preferred as it uses a "count-based" model while word2Vec uses a "predictive model". 

Count-based model is easily parallelisable which makes it more ideal as it will be faster to train new model when the current model gets updated due to the evolving news content.  

However, if dimensionality is an issue, word2vec will be preferred.

