# 1: UNDERSTAND THE PROBLEM STATEMENT AND BUSINESS CASE

The goal is to detect fake news using Recurrent Neural Networks.
Natural Language Processing (NLP) works by converting words into numbers.
These numbers are then used to train an AI/ML model to make predictions.
We will analyze thousands of news articles to detect whether each one is fake or not.

# 2: IMPORT LIBRARIES AND DATASETS

In [None]:
!pip install --upgrade tensorflow-gpu==2.0

In [None]:
!pip install jupyterthemes
!pip install plotly
!pip install --upgrade nbformat
!pip install nltk
!pip install spacy # spaCy is an open-source software library for advanced natural language processing
!pip install WordCloud
!pip install gensim # Gensim is an open-source library for unsupervised topic modeling and natural language processing
import nltk
nltk.download('punkt')

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# import keras
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False) 

In [None]:
import os

# List every file available under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


In [None]:
# Load the real ("True") and fake news articles into two separate dataframes
df_true = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
df_fake = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")

In [None]:
# Preview the first rows of the real-news dataframe
df_true.head()

In [None]:
# Report how many articles each dataframe holds
print(f"The number of rows of True news are : {len(df_true)}")
print(f"The number of rows of Fake news are : {len(df_fake)}")

In [None]:
# Count the missing values in each column of the real-news dataframe
df_true.isnull().sum()

In [None]:
# Count the missing values in each column of the fake-news dataframe
df_fake.isnull().sum()

In [None]:
# Print a summary of the real-news dataframe
df_true.info()

In [None]:
# Print a summary of the fake-news dataframe
df_fake.info()

# 3: PERFORM FEATURE ENGINEERING

* Create a new column called `isfake`: 1 = fake news, 0 = real news
* Combine the 2 dataframes together

In [None]:
# add a target class column to indicate whether the news is real or fake
# (every real news article gets the label 0)
df_true['isfake'] = 0
df_true.head()

In [None]:
# every fake news article gets the target class label 1
df_fake['isfake'] = 1
df_fake.head()

In [None]:
# Concatenate Real and Fake News into a single dataframe.
# reset_index(drop = True) discards the two original indexes so the combined
# frame gets one fresh index
df = pd.concat([df_true, df_fake]).reset_index(drop = True)
df

In [None]:
# Drop the 'date' column; inplace = True modifies df itself instead of returning a new dataframe
df.drop(columns = ['date'], inplace = True)

In [None]:
# Preview the dataframe after dropping the 'date' column
df.head()

In [None]:
# combine title and text together (separated by a single space) in a new 'original' column
df['original'] = df['title'] + ' ' + df['text']
df.head()

In [None]:
# Show the combined title + text of the first article
df['original'][0]

#  4: PERFORM DATA CLEANING

In [None]:
# download the NLTK stopwords corpus
nltk.download("stopwords")

In [None]:
# Obtain additional stopwords from nltk
from nltk.corpus import stopwords
# Use the English stop word list
stop_words = stopwords.words('english')
# Extend the list with extra words that should also be filtered out
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
# Remove stopwords and remove words with 3 or fewer characters
def preprocess(text):
    """Tokenize ``text`` and drop stop words and very short tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. A token is
    kept only if it is longer than 3 characters and appears in neither
    gensim's ``STOPWORDS`` nor the notebook-level ``stop_words`` list.

    Returns the list of kept tokens in their original order.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3
        and token not in gensim_stopwords
        and token not in stop_words
    ]

In [None]:
# Apply the function to the dataframe: 'clean' holds the list of kept tokens for each article
df['clean'] = df['original'].apply(preprocess)

In [None]:
# Show original news (first 100 characters of the first article)
df['original'][0][:100]

In [None]:
# Show cleaned up news after removing stopwords (first 20 tokens of the first article)
print(df['clean'][0][:20])

In [None]:
# Preview the dataframe with the new 'clean' column
df.head()

In [None]:
# Obtain the total words present in the dataset by flattening the
# per-article token lists into one list (duplicates included)
list_of_words = [word for tokens in df.clean for word in tokens]

In [None]:
# Printing the whole list would flood the notebook output with one entry per
# word occurrence in the corpus, so only a small sample is shown
print(list_of_words[:20])
print(len(list_of_words))
# Obtain the total number of unique words
total_words = len(set(list_of_words))
total_words

In [None]:
# join each article's tokens back into one space-separated string
df['clean_joined'] = df['clean'].apply(" ".join)

In [None]:
# Preview the dataframe with the new 'clean_joined' column
df.head()

In [None]:
# Show the cleaned, re-joined text of the first article
df['clean_joined'][0]

In [None]:
# Show the cleaned token list of the third article
df['clean'][2]

In [None]:
# Show the original text of the third article for comparison
df['original'][2]

# 5: VISUALIZE CLEANED UP DATASET

In [None]:
# plot the number of samples in each 'subject' category
plt.figure(figsize = (8, 8))
sns.countplot(y = "subject", data = df)

In [None]:
# plot the number of samples in each 'isfake' class (0 = real, 1 = fake)
plt.figure(figsize = (8, 8))
sns.countplot(y = "isfake", data = df)

In [None]:
# plot the word cloud for text that is Fake (isfake == 1)
plt.figure(figsize = (20,20)) 
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df.isfake == 1].clean_joined))
plt.imshow(wc, interpolation = 'bilinear')

In [None]:
# plot the word cloud for text that is Real (isfake == 0)
plt.figure(figsize = (20,20)) 
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = stop_words).generate(" ".join(df[df.isfake == 0].clean_joined))
plt.imshow(wc, interpolation = 'bilinear')

In [None]:
# length of the longest document will be needed to create word embeddings
# (-1 remains the result when there are no documents at all)
maxlen = max((len(nltk.word_tokenize(text)) for text in df.clean_joined), default = -1)
print("The maximum number of words in any document is =", maxlen)

In [None]:
# visualize the distribution of number of words in a text
import plotly.express as px

# number of tokens in each cleaned document
token_counts = [len(nltk.word_tokenize(text)) for text in df.clean_joined]
fig = px.histogram(x = token_counts, nbins = 100)
fig.show()

# 6: PREPARE THE DATA BY PERFORMING TOKENIZATION AND PADDING

## TOKENIZER

Tokenizer allows us to vectorize text corpus by turning each text into a sequence of integers

* **SENTENCE**: 

* "budget fight looms republicans ..."

* **TOKENS**:

* [3138, 3581, 2895, ...]

In [None]:
# split data into test and train (test_size = 0.2 keeps 20% of the documents for testing)
from sklearn.model_selection import train_test_split
# NOTE(review): no random_state is passed, so the split (and every metric
# computed from it) can differ from run to run — consider fixing a seed
x_train, x_test, y_train, y_test = train_test_split(df.clean_joined, df.isfake, test_size = 0.2)

In [None]:
from nltk import word_tokenize

# Create a tokenizer to tokenize the words and create sequences of tokenized words
# num_words is set to the unique-word count computed earlier
tokenizer = Tokenizer(num_words = total_words)
# The vocabulary is fitted on the training texts only
tokenizer.fit_on_texts(x_train)
# Encode both splits as integer sequences using that vocabulary
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

In [None]:
# train_sequences[0] encodes the first *training* document; after the shuffled
# train/test split that is generally not df.clean_joined[0], so print the
# matching text from x_train instead
print("The encoding for document\n",x_train.iloc[0],"\n is : ",train_sequences[0])

In [None]:
# Add padding: maxlen can either be 4406 or a smaller number; maxlen = 40 seems to work well based on results
MAX_SEQUENCE_LENGTH = 40
# Train and test must be padded/truncated the same way; leaving padding at its
# default for the test set would pad short test documents at the front while
# the training documents are padded at the end
padded_train = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post', truncating = 'post')
padded_test = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post', truncating = 'post')

In [None]:
# Show the padded encodings of the first two training documents
for position, encoded_doc in enumerate(padded_train[:2], start = 1):
    print("The padded encoding for document",position," is : ",encoded_doc)

# 7: UNDERSTAND THE THEORY AND INTUITION BEHIND RECURRENT NEURAL NETWORKS AND LSTM

## RECURRENT NEURAL NETWORKS
* Feedforward Neural Networks map a fixed size input (such as image) to a fixed size output (classes or probabilities)
* A drawback in Feedforward Neural Networks is that they don't have any time dependency or memory effect
* An RNN is a type of ANN that is designed to take the temporal dimension into consideration by having memory (an internal state)

* An RNN contains a temporal loop in which the hidden layer not only gives an output but feeds itself as well
* An extra dimension is added, which is time
* An RNN can recall what happened in the previous time step, so it works well with sequences of text

##  VANISHING GRADIENT PROBLEM

* LSTM networks work much better compared to vanilla RNN since they overcome the vanishing gradient problem
* The error has to propagate through all the previous layers resulting in a vanishing gradient
* As the gradient gets smaller, the network weights are no longer updated
* As more layers are added, the gradients of the loss function approach zero, making the network hard to train

## VANISHING GRADIENT PROBLEM

* ANN gradients are calculated during **backpropagation**
* In backpropagation, we calculate the derivatives of the network by moving from the outermost layer (close to output) back to the initial layers (close to inputs)
* The chain rule is used during this calculation in which the derivatives from the final layer are multiplied by the derivatives from early layers
* The gradients keep diminishing exponentially and therefore the weights and biases are no longer being updated

## GRADIENT DESCENT 

* Gradient Descent is an optimization algorithm used to obtain the optimized network **weight** and **bias** values
* It works by iteratively trying to **minimize the cost function**
* It works by calculating the gradient of the cost function and moving in the negative direction until the local/global minimum is achieved
* If the positive of the gradient is taken, local/global maximum is achieved
* The size of steps taken is called **LEARNING RATE**
* If the learning rate increases, the area covered in the search space will increase so we might reach global minimum faster. However, we can overshoot the target

## GRADIENT DESCENT WORKS AS FOLLOWS: 

1. Calculate the gradient (derivative) of the loss function
2. Pick random values for weights (m,b) and substitute
3. Calculate the step size (how much are we going to update the parameters?): 
     **step  size = learning rate * gradient**
4. Update the parameters and repeat:
     **new weight = old weight - step size**

# 8: UNDERSTAND THE INTUITION BEHIND LONG SHORT TERM MEMORY (LSTM) NETWORKS

## LSTM INTUITION

* LSTM networks work better compared to vanilla RNN since they overcome vanishing gradient problem
* In practice, RNNs fail to establish long term dependencies
* LSTM networks are type of RNN that are designed to remember long term dependencies by default
* LSTM can remember and recall information for a prolonged period of time  

# 9: BUILD AND TRAIN THE MODEL

## EMBEDDING LAYER

* Embedding layers learn the low-dimensional continuous representation of input discrete variables
* For example, let's assume that we have 100,000 unique values in our data and want to train the model with this data. Even though we can train the model to generate accurate results, it would require more data to train
* Alternatively, by introducing an embedding layer, you can specify the number of low-dimensional features that you would need to represent the input data; in this case let's take the value of 200
* Now, what happens is the embedding layer learns a way to represent the 100,000 variables with only 200 variables (think of it as PCA or an Autoencoder)
* This helps subsequent layers to learn more effectively with less computer resource

In [None]:
# Sequential Model, assembled from an ordered list of layers
model = Sequential([
    # embedding layer: one 128-dimensional vector per word index
    Embedding(total_words, output_dim = 128),
    # Bi-Directional RNN and LSTM
    Bidirectional(LSTM(128)),
    # Dense layers
    Dense(128, activation = 'relu'),
    # a single sigmoid unit because it's binary classification (isfake is 0 or 1)
    Dense(1, activation = 'sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

In [None]:
# Display the unique-word count that was passed to the Embedding layer
total_words

In [None]:
# convert y_train (the 'isfake' labels of the training split) into a NumPy array
y_train = np.asarray(y_train)

# Train the model

* We started with entire training set
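* We divide the training data into 2 sets: 90% to train the model and 10% held out for validation (`validation_split = 0.1`); this is a single hold-out split rather than k-fold cross-validation
* We monitor the error on the validation set to make sure that the model is not overfitting the training data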
* If the error in the training data is going down and the error in the validation data is going down as well,  it's a good sign: the model is able to generalize
* If the error in the training data is going down and the error in the validation data is going up, it means the model started to overfit the training data and we need to stop the training

In [None]:
# Train for 2 epochs in batches of 64; validation_split = 0.1 sets aside 10% of
# the training data to report validation loss/accuracy
model.fit(padded_train, y_train, batch_size = 64, validation_split = 0.1, epochs = 2)

In [None]:
# make prediction on the padded test documents
pred = model.predict(padded_test)

In [None]:
# the model is trained on the 'isfake' label, so its output scores class 1 (fake):
# if the predicted value is >0.5 the news is fake (1), else it is real (0)
prediction = [1 if score.item() > 0.5 else 0 for score in pred]

In [None]:
# getting the accuracy of the thresholded predictions against the true test labels
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(list(y_test), prediction)

print("Model Accuracy : ", accuracy)

## CONFUSION MATRIX

* I want to visually represent what was the actual ground truth
* In one run, we misclassified 14 samples as fake instead of true and 12 samples as true instead of fake; the exact counts vary between runs

In [None]:
# get the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(list(y_test), prediction)
plt.figure(figsize = (25, 25))
# fmt = 'd' annotates each cell with its plain integer count; the default
# format abbreviates large counts into scientific notation
sns.heatmap(cm, annot = True, fmt = 'd')

In [None]:
# category dict: maps the 'isfake' label to a readable name
# (real articles were labelled 0 and fake articles 1 when the target column was created)
category = {0: 'Real News', 1: 'Fake News'}

## Exercise

In [None]:
# Sequential Model for the exercise: same architecture as before, but with a
# wider embedding (240 instead of 128 dimensions)
model = Sequential([
    # embedding layer: one 240-dimensional vector per word index
    Embedding(total_words, output_dim = 240),
    # Bi-Directional RNN and LSTM
    Bidirectional(LSTM(128)),
    # Dense layers
    Dense(128, activation = 'relu'),
    # a single sigmoid unit because it's binary classification (isfake is 0 or 1)
    Dense(1, activation = 'sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()