In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively echo the full path of every file mounted under the Kaggle input directory
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Library imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import warnings
warnings.simplefilter("ignore")

import tensorflow as tf
import tensorflow_hub as hub

from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

## Load the dataset and validate the data load

We will load the two individual datasets and add a target attribute to each, set to 1 if the news is fake and 0 if it is real. We will then combine both dataframes into a single dataframe for modelling.

In [None]:
# Load the fake news articles and label them with the positive class (fake = 1)
fake_news = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
fake_news["fake"] = 1

# Load the real news articles and label them with the negative class (fake = 0)
real_news = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")
real_news["fake"] = 0

# Stack the two frames into one frame for modelling.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of Fake.csv are repeated by True.csv, so a label-based
# lookup such as news.loc[0] would return two rows.
news = pd.concat([fake_news, real_news], ignore_index=True)
news.head()

In [None]:
# Count the missing values in each column of the combined frame
news.isna().sum()

In [None]:
# Print the column names, dtypes and non-null counts of the combined frame
news.info()

## Exploratory Data Analysis and Data Visualizations

In [None]:
# Plot the class balance of the target (1 = fake, 0 = real), then print the exact per-class counts
sns.countplot(x='fake', data=news)
print("Distributions...")
print(news['fake'].value_counts())

In [None]:
# Peek at the first two article bodies labelled as fake (fake == 1)
news.loc[news['fake'] == 1, 'text'].head(2)

In [None]:
# Peek at the first two article bodies labelled as real (fake == 0)
news.loc[news['fake'] == 0, 'text'].head(2)

In [None]:
# Count the articles per subject, split by class (hue), to see how subject relates to the target

plt.figure(figsize=(10, 6))
sns.countplot(x='subject', data=news, hue='fake')

## Feature Engineering

#### We will create new columns called Month and Year from Date and analyse whether fake or real news correlates with the month or year of publication

In [None]:
# Parse the raw date strings; errors='coerce' turns any unparseable value into NaT
# instead of raising, so Year and Month are missing (NaN) for those rows.
news['date'] = pd.to_datetime(news['date'], errors='coerce')
# Derive the calendar year and month from the parsed timestamps
news['Year'] = news['date'].dt.year
news['Month'] = news['date'].dt.month

news.head()

In [None]:
# Count the articles per publication year, split by class, to see how Year relates to the target
sns.countplot(x='Year', data=news, hue='fake')

All news from the year 2015 in the dataset is fake news, so this level of the Year attribute perfectly separates the target variable.

In [None]:
# Count the articles per publication month, split by class, to see how Month relates to the target
sns.countplot(x='Month', data=news, hue='fake')

This shows an interesting pattern: fake news outnumbers real news up to month 8, after which the number of real news articles increases drastically. In other words, if the month is <= 8, the probability that an article is fake is higher.

#### We will combine the title and text column

In [None]:
# Merge the headline into the body so the model sees both in a single text feature.
# The space separator matters: without it the last word of the title is glued to
# the first word of the body, producing a spurious merged token.
news['text'] = news['title'] + ' ' + news['text']
news.drop(labels=['title'], axis=1, inplace=True)

news.head()

## Preparing the final data

- We will remove the subject attribute, since it perfectly separates the target variable.
- We will remove the Year attribute, since it also divides the target variable cleanly.
- We will remove the Month attribute, since it also clearly demarcates the target variable.

For now we will just go ahead with the "text" attribute.

In [None]:
# Keep only the combined text and the target; the other attributes are discarded
news = news.drop(columns=['subject', 'date', 'Year', 'Month'])
news.head()

## Train-Test Split

In [None]:
# Shuffle the dataframe, then extract the feature and the label.
# The shuffle is seeded so the row order is repeatable: an unseeded shuffle feeds
# a different row order into the seeded train_test_split calls on every run,
# so their random_state alone would not make the splits reproducible.

news = news.sample(frac=1, random_state=101)
feature_text = news['text']
target = news['fake']

In [None]:
# Hold out 30% of the data as the final test set
features_train, features_test, target_train, target_test = train_test_split(
    feature_text, target, test_size=0.3, random_state=101)

# Split the remaining training data again (30% of it) into a validation set,
# used to evaluate the neural network while it trains
features_train, features_val, target_train, target_val = train_test_split(
    features_train, target_train, test_size=0.3, random_state=101)

print("Training Features shape: ", features_train.shape)
print("Training Target shape: ", target_train.shape)

print("Validation Features shape: ", features_val.shape)
print("Validation Target shape: ", target_val.shape)

print("Test Features shape: ", features_test.shape)
# This line reports the test target; it was previously mislabelled as "Training Target shape"
print("Test Target shape: ", target_test.shape)

In [None]:
# Inspect the first 10 texts of the training split
features_train[: 10]

In [None]:
# Inspect the first 10 labels of the training split (1 = fake, 0 = real)
target_train[: 10]

## Build and Train the Neural Network Model

In [None]:
# Global model constants shared by the model-building, training and plotting cells

# Passed as the hub layer's input_shape; empty because the layer is fed one
# string per example (presumably the whole article text).
INPUT_SHAPE = []

OUTPUT_UNITS = 1
HIDDEN_UNITS_SINGLE = 16
HIDDEN_UNITS_DEEP = 8
ACTIVATION_HIDDEN = tf.keras.activations.relu
ACTIVATION_OUTPUT = tf.keras.activations.sigmoid
LEARNING_RATE = 1e-3
OPTIMIZER = tf.keras.optimizers.Adam(LEARNING_RATE)
# The output layer uses ACTIVATION_OUTPUT (sigmoid), so the model emits
# probabilities, not logits. from_logits must therefore be False; with True the
# loss is told to treat already-squashed probabilities as raw logits.
LOSS_FUNCTION = tf.keras.losses.BinaryCrossentropy(from_logits=False)
L2_REGULARIZER = tf.keras.regularizers.L2(0.001)
DROPOUT_RATE = 0.2

EPOCHS = 3

### Define the Model Evaluation Metrics

In [None]:
# Define the metrics evaluated during training and validation.
# Each metric's `name` is the key under which it is recorded in the training
# history (e.g. 'accuracy', and 'val_accuracy' for the validation data).

METRICS = [tf.keras.metrics.TruePositives(name='tp'),
          tf.keras.metrics.FalsePositives(name='fp'),
          tf.keras.metrics.TrueNegatives(name='tn'),
          tf.keras.metrics.FalseNegatives(name='fn'), 
          tf.keras.metrics.BinaryAccuracy(name='accuracy'),
          tf.keras.metrics.Precision(name='precision'),
          tf.keras.metrics.Recall(name='recall'),
          tf.keras.metrics.AUC(name='auc')]

#### In order to perform text processing, we would be using a pre-trained embedding layer from tensorflow-hub
We will create a Keras Layer that uses tensorflow hub model to embed sentences

#### The first layer is a TensorFlow Hub layer. This layer uses a pre-trained Saved Model to map a sentence into its embedding vector. The model that we are using (google/nnlm-en-dim128/2) splits the sentence into tokens, embeds each token and then combines the embedding.

In [None]:
# TF-Hub handle of the pre-trained NNLM (128-dimensional, English) embedding model
model_embeddings = "https://tfhub.dev/google/nnlm-en-dim128/2"
# Wrap the hub model as a Keras layer that takes raw strings; trainable=True
# allows its weights to be updated (fine-tuned) during training.
hub_layer = hub.KerasLayer(model_embeddings, input_shape=INPUT_SHAPE, dtype=tf.string, trainable=True)

# Sanity check: embed the first two training texts and display the result
hub_layer(features_train[:2])

### Build and Compile the Model

In [None]:
# Defining a function which will build and compile the model

def make_model(metrics=METRICS, output_bias=None):
    """Build and compile a model with one hidden layer of HIDDEN_UNITS_SINGLE neurons.

    Architecture: the module-level TF-Hub embedding layer, one hidden Dense
    layer, and a Dense output layer with OUTPUT_UNITS units.

    Args:
        metrics: Keras metrics to track while training; defaults to METRICS.
        output_bias: Optional initial value for the output layer's bias. When
            None, the bias starts at zero (the Dense layer default).

    Returns:
        The compiled tf.keras.Sequential model.
    """
    # output_bias used to be converted to an initializer and then dropped; it is
    # now actually handed to the output layer. 'zeros' keeps the Dense default
    # when no bias is requested.
    if output_bias is None:
        bias_initializer = 'zeros'
    else:
        bias_initializer = tf.keras.initializers.Constant(output_bias)

    model = tf.keras.Sequential()
    # NOTE: hub_layer is a single module-level instance, so every model built by
    # this function shares (and fine-tunes) the same embedding weights.
    model.add(hub_layer)
    model.add(tf.keras.layers.Dense(units=HIDDEN_UNITS_SINGLE, activation=ACTIVATION_HIDDEN))
    model.add(tf.keras.layers.Dense(units=OUTPUT_UNITS, activation=ACTIVATION_OUTPUT,
                                    bias_initializer=bias_initializer))
    model.compile(optimizer=OPTIMIZER, loss=LOSS_FUNCTION, metrics=metrics)
    return model

In [None]:
# Defining a function to plot training loss vs validation loss

def plotTrainLossVsValLoss(epochs_history):
    """Plot training loss against validation loss, one point per epoch.

    Args:
        epochs_history: The object returned by training a neural network
            (e.g. by `model.fit`); its `.history` dict must contain the
            'loss' and 'val_loss' lists.
    """
    loss_train = epochs_history.history['loss']
    loss_val = epochs_history.history['val_loss']

    # Take the x-axis from the recorded history rather than the global EPOCHS,
    # so the plot still works when training ran for a different number of epochs.
    epochs = range(1, len(loss_train) + 1)

    # A single figure only: a second plt.figure() call would leave an empty
    # figure in the cell output.
    plt.figure(figsize=(12, 8))
    plt.plot(epochs, loss_train, 'g', label='Training loss')
    plt.plot(epochs, loss_val, 'b', label='Validation loss')
    plt.title('Training Loss vs Validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Defining a function to plot training accuracy vs validation accuracy

def plotTrainAccuracyVsValAccuracy(epochs_history):
    """Plot training accuracy against validation accuracy, one point per epoch.

    Args:
        epochs_history: The object returned by training a neural network
            (e.g. by `model.fit`); its `.history` dict must contain the
            'accuracy' and 'val_accuracy' lists.
    """
    accuracy_train = epochs_history.history['accuracy']
    accuracy_val = epochs_history.history['val_accuracy']

    # Take the x-axis from the recorded history rather than the global EPOCHS,
    # so the plot still works when training ran for a different number of epochs.
    epochs = range(1, len(accuracy_train) + 1)

    plt.figure(figsize=(12, 8))
    plt.plot(epochs, accuracy_train, 'g', label='Training Accuracy')
    plt.plot(epochs, accuracy_val, 'b', label='Validation Accuracy')
    plt.title('Training Accuracy vs Validation Accuracy')
    plt.xlabel('Epochs')
    # The y-axis shows accuracy; it was previously mislabelled as 'Loss'
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Defining a function to plot the confusion matrix
# Let us visualize the Confusion Matrix and detail out some key metrics including the classification report

def plot_cm(labels, predictions, threshold=0.5):
    """Plot the confusion matrix and print several performance metrics.

    All metrics are computed from the `labels` and `predictions` arguments,
    binarised at `threshold`; no notebook-level variables are read.

    Args:
        labels: True binary class labels (0 or 1).
        predictions: Predicted scores; any array-like that flattens to one
            score per label (e.g. shape (n,) or (n, 1)).
        threshold: Scores strictly greater than this count as class 1.
    """
    y_true = np.asarray(labels).ravel()
    y_pred = (np.asarray(predictions).ravel() > threshold).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent, so the
    # cm[i][j] lookups below cannot go out of range.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title("Confusion Matrix %0.2f" % threshold)
    plt.xlabel("Predicted Label")
    plt.ylabel("Actual Label")

    print('True Negatives: ', cm[0][0])
    print('Incorrectly Detected (False Positives): ', cm[0][1])
    print('Missed (False Negatives): ', cm[1][0])
    print('True Positives: ', cm[1][1])
    # Row 1 of the matrix holds every sample whose actual label is 1
    print('Total Actual Positives: ', np.sum(cm[1]))
    print("\n")
    print("F1-Score")
    print(f1_score(y_true, y_pred))
    print("\n")
    print("Accuracy Score")
    print(accuracy_score(y_true, y_pred))
    print("\n")
    print("Classification Report")
    print(classification_report(y_true, y_pred))

In [None]:
# Build the single-hidden-layer model and print its layer-by-layer summary

simplemodel = make_model()
simplemodel.summary()

In [None]:
# Train the model on the training split for EPOCHS epochs, evaluating on the
# validation split as it goes, and report the wall-clock training time

start = datetime.now()
epochs_history_simple = simplemodel.fit(features_train, target_train, epochs=EPOCHS,
                          validation_data=(features_val, target_val),
                          verbose=1)
end = datetime.now()
print(f"The training of simple model completed in time - {end - start}")

### Check Performance Graphs

In [None]:
# Plot training loss vs validation loss from the simple model's training history
plotTrainLossVsValLoss(epochs_history=epochs_history_simple)

In [None]:
# Plot training accuracy vs validation accuracy from the simple model's training history
plotTrainAccuracyVsValAccuracy(epochs_history=epochs_history_simple)

### Model Validation

In [None]:
# Run the trained model on the held-out test texts to get its predictions
target_predictions = simplemodel.predict(features_test)

In [None]:
# Plot the confusion matrix and print the metrics for the test-set predictions
plot_cm(labels=target_test, predictions=target_predictions)