## Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import warnings
# Ignore every warning category for the rest of the session (presumably to keep
# cell output readable). NOTE: this also hides library deprecation notices.
warnings.filterwarnings('ignore')

import tensorflow as tf

In [None]:
# Check our GPU availability by running the `nvidia-smi` shell command
!nvidia-smi

## Load the dataset

We will load the two datasets separately and add a target column `fake` that is 1 for fake news and 0 for real news. We then concatenate both dataframes into a single combined dataframe for modelling.

In [None]:
# Read the fake-news CSV into a DataFrame and preview its first rows
fake_news = pd.read_csv(
    filepath_or_buffer="../input/fake-and-real-news-dataset/Fake.csv"
)
fake_news.head(n=5)

In [None]:
# Read the real-news CSV into a DataFrame and preview its first rows
true_news = pd.read_csv(
    filepath_or_buffer="../input/fake-and-real-news-dataset/True.csv"
)
true_news.head(n=5)

In [None]:
# Tag every row of the fake-news frame with the target value fake=1
fake_news.loc[:, 'fake'] = 1
fake_news.head(n=5)

In [None]:
# Tag every row of the real-news frame with the target value fake=0
true_news.loc[:, 'fake'] = 0
true_news.head(n=5)

In [None]:
# Stack the fake and true frames into a single modelling frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own 0-based labels, so every label would appear twice and
# label-based lookups (.loc) on the combined frame would be ambiguous.
news = pd.concat([fake_news, true_news], ignore_index=True)
news.sample(5)

In [None]:
# Count the missing values in each column
news.isnull().sum(axis=0)

In [None]:
# Print a summary of the combined frame (columns, non-null counts, dtypes)
news.info()

## Exploratory Data Analysis and Data Visualizations

In [None]:
# Class balance: number of rows for each value of the target column
sns.countplot(data=news, x='fake')


In [None]:
# Show the first two article bodies labelled as fake
news.loc[news['fake'] == 1, 'text'].head(n=2)

In [None]:
# Show the first two article bodies labelled as true
news.loc[news['fake'] == 0, 'text'].head(n=2)

In [None]:
# Article count per subject, split by the target label
plt.figure(figsize=(10, 5))
sns.countplot(data=news, x='subject', hue='fake')

## Feature Engineering

We will create new columns called Month and Year from the date and analyse whether fake or true news is correlated with the month or year of publication.

In [None]:
# Parse the raw date strings (values that cannot be parsed become NaT), then
# derive calendar Year and Month columns from the parsed dates
news = news.assign(date=pd.to_datetime(news['date'], errors='coerce'))
news = news.assign(Year=news['date'].dt.year, Month=news['date'].dt.month)

news.head(n=5)

In [None]:
# Check the impact of the year on the target variable
sns.countplot(data=news, x='Year', hue='fake')

In [None]:
# Check the impact of the month on the target variable
sns.countplot(data=news, x='Month', hue='fake')

**We will combine the title and text column**

In [None]:
# Merge the headline into the article body so the model sees both.
# A single space is inserted between them: plain `title + text` would fuse the
# last word of the title with the first word of the body into one bogus token.
news['text'] = news['title'] + ' ' + news['text']
# The title now lives inside `text`, so the separate column is dropped.
news = news.drop(columns=['title'])
news.head()

## Preparing the final data

We will remove the subject attribute, since it perfectly separates the target variable. We will remove the Year attribute, since it also divides the target variable clearly. We will remove the Month attribute, since it too demarcates the target variable very clearly.

For now we will just go ahead with the "text attribute"

In [None]:
# Discard the subject, date, Year and Month columns
news = news.drop(columns=['subject', 'date', 'Year', 'Month'])
news.head(n=5)

## Split the dataset into training and testing

In [None]:
# Shuffle the rows of the dataframe before extracting features and labels.
# random_state pins the permutation: without it every run shuffles differently,
# so the seeded train/validation split would still change from run to run.
news = news.sample(frac=1, random_state=42)
news.head()

In [None]:
# Hold out 20% of the rows for validation; the split itself is seeded
from sklearn.model_selection import train_test_split

(train_sentences,
 val_sentences,
 train_labels,
 val_labels) = train_test_split(
    news['text'].to_numpy(),
    news['fake'].to_numpy(),
    random_state=42,
    test_size=0.2,
)


In [None]:
# Sizes of the four arrays produced by the split, in the order they were returned
tuple(map(len, (train_sentences, val_sentences, train_labels, val_labels)))

In [None]:
# Check the first 2 training sentences and the first 10 training labels
train_sentences[:2], train_labels[:10]

## Converting text into numbers
When dealing with a text problem, one of the first things you'll have to do before you can build a model is to convert your text to numbers.

There are a few ways to do this, namely:

* Tokenization - direct mapping of a token (a token could be a word or a character) to a number.
* Embedding - create a matrix of feature vectors, one per token (the size of the feature vector can be defined, and this embedding can be learned).


## Text Vectorization (tokenization)

In [None]:
# Average number of whitespace-separated tokens (words) per training article
round(
    sum(len(sentence.split()) for sentence in train_sentences)
    / len(train_sentences)
)

In [None]:
# Setup text vectorization variables
max_vocab_length = 10000  # maximum vocabulary size handed to the vectorizer
max_length = 418  # output sequence length; presumably the average token count
                  # computed in the previous cell -- TODO confirm


# TextVectorization moved out of the `experimental.preprocessing` namespace and
# the old path is missing from newer TensorFlow/Keras releases. Try the stable
# location first and fall back so the notebook still runs on older versions.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Maps raw strings to fixed-length integer token-id sequences
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length)

In [None]:
# Fit the text vectorizer to the training text only; the validation
# sentences are not passed to adapt()
text_vectorizer.adapt(train_sentences)

In [None]:
# Create a sample sentence and tokenize it
sample_sentence = "Please Do Not Forget To Upvoted"
text_vectorizer([sample_sentence])

In [None]:
# Pick one training sentence at random, print it, then show its token ids
import random

random_sentence = random.choice(train_sentences)
print(f"Original text;\n{random_sentence}\n\n Vectorized Version:")
text_vectorizer([random_sentence])

In [None]:
# Retrieve the vocabulary held by the vectorizer and report its size
words = text_vectorizer.get_vocabulary()
len(words)

## Creating an Embedding using an Embedding Layer
To make our embedding we're going to use TensorFlow's embedding layer

The parameters we care most about for our embedding layer:

* input_dim = the size of our vocabulary
* output_dim = the size of output embedding vector, for example, a value of 100 would mean each token gets represented by a vector 100 long
* input_length = length of the sequences being passed to the embedding layer

In [None]:
from tensorflow.keras import layers

# Embedding layer: one 128-dimensional vector per vocabulary index,
# initialised from a uniform distribution
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
    embeddings_initializer='uniform',
)
embedding

In [None]:
# Get a random sentence from the training set and show its embedded form.
# The draw is bound to `random_sentence`, the same name that is printed and
# embedded below, so this cell shows the sentence sampled here rather than
# one left over from an earlier cell.
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedd version: ")
embedding(text_vectorizer([random_sentence]))

## Modelling

We will be using an LSTM (long short-term memory) neural network.

In [None]:
# Build the LSTM classifier with the Keras functional API
from tensorflow.keras import layers

# Each example enters the model as a single raw string
inputs = layers.Input(shape=(1,), dtype='string')

# raw text -> integer token ids -> embedding vectors -> LSTM output
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(units=64)(x)

# One sigmoid unit: a score in [0, 1] for the positive class (fake=1)
outputs = layers.Dense(units=1, activation='sigmoid')(x)

# Tie the input and output tensors together into a model
model = tf.keras.Model(inputs=inputs, outputs=outputs, name='model_LSTM')

In [None]:
# Print a summary of the model built above
model.summary()

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, passing the held-out split as validation data
model_history = model.fit(
    x=train_sentences,
    y=train_labels,
    validation_data=(val_sentences, val_labels),
    epochs=5,
)

**We got 99% accuracy on the validation data**

In [None]:
# Make predictions on the validation sentences and preview the first 10 outputs
model_prediction = model.predict(val_sentences)
model_prediction[:10]

In [None]:
# Turn the sigmoid outputs into rounded 0/1 values with the singleton axis
# removed, so they can be compared against val_labels
model_preds = tf.round(tf.squeeze(model_prediction))
model_preds[:10]

In [None]:
# Evaluation metrics on the validation split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

print(f"Accuracy Score: {accuracy_score(y_true=val_labels, y_pred=model_preds)}")
print(f"Recall Score : {recall_score(y_true=val_labels, y_pred=model_preds)}")
print(f"Precsion Score : {precision_score(y_true=val_labels, y_pred=model_preds)}")
print(f"f1 Score : {f1_score(y_true=val_labels, y_pred=model_preds)}")

**References for Feature Engineering and EDA**: https://www.kaggle.com/suvofalcon/fake-real-news-tensorflow-hub-99-accuracy