In [None]:
# read the data from the Fake.csv and True.csv files and create training, validation and test sets

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from tf_mcc import MCC

In [2]:
from zipfile import ZipFile

def unzip(file_name, final_path):
    """Extract every member of a zip archive into a directory.

    Args:
        file_name: Path of the zip archive, opened in read mode.
        final_path: Directory that receives the extracted members.

    Returns:
        None.
    """
    with ZipFile(file_name, mode="r") as archive:
        archive.extractall(final_path)

# Unpack both dataset archives into the dataset directory, fake news first.
for archive_path in ("dataset/Fake.csv.zip", "dataset/True.csv.zip"):
    unzip(archive_path, "dataset/")
        

In [4]:
# Load the two raw article tables, one per class (fake first, then true).
fake, true = (
    pd.read_csv(csv_path) for csv_path in ("dataset/Fake.csv", "dataset/True.csv")
)

In [71]:
# true.isnull().any()
# fake.isnull().any()

In [5]:
# Discard articles whose text is exactly one space character.
# Only that exact value is matched; other whitespace-only or missing texts are kept.
fake = fake.drop(index=fake.index[fake["text"] == " "])
true = true.drop(index=true.index[true["text"] == " "])

In [6]:
# Attach the binary target to each table: 0 marks fake articles, 1 marks true articles.
for frame, label in ((fake, 0), (true, 1)):
    frame["classification"] = label

In [7]:
# Stack the fake and true tables row-wise into a single frame.
# ignore_index=True discards the per-file row labels and renumbers the rows.
frames = [fake, true]
result = pd.concat(objs=frames, axis=0, ignore_index=True)

In [8]:
# Preview the combined frame: every row except the last one.
result.iloc[:-1]

Unnamed: 0,title,text,subject,date,classification
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44265,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,worldnews,"August 21, 2017",1
44266,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44267,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44268,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1


In [9]:
# Shuffle the combined frame: drawing the full fraction without replacement
# returns every row exactly once, in random order.
# The fixed seed keeps that order reproducible between runs.
shuffle_seed = 1234
corpus = result.sample(frac=1, replace=False, random_state=shuffle_seed)
corpus.head()

Unnamed: 0,title,text,subject,date,classification
1220,Comey Might Have Just Made Sure We Don’t End ...,Many of us have been ready to impeach Donald T...,News,"June 8, 2017",0
43761,Exclusive: Cambodia says opposition party coul...,PHNOM PENH (Reuters) - Cambodia s government h...,worldnews,"September 5, 2017",1
24279,Whirlpool's washer war is balancing act for Trump,"Clyde, Ohio (Reuters) - In the middle of Whir...",politicsNews,"October 4, 2017",1
11119,BOOM! TRUMP POLL Numbers Going Up…Up…Up! While...,"Way before the election ever took place, the l...",politics,"Apr 17, 2017",0
29975,U.S. Holocaust museum alarmed over 'hateful sp...,WASHINGTON (Reuters) - The U.S. Holocaust Memo...,politicsNews,"November 22, 2016",1


In [10]:
# Pick the model input columns and the target column, and expose both as arrays
# ready to be split into training, validation and test sets.
X_features_names = ["title"]

# Selecting with a list of columns keeps the features two-dimensional (rows x columns).
X_features = corpus.loc[:, X_features_names].values
X_data = X_features

y_data = corpus.loc[:, "classification"].values


In [11]:
# Two-stage split: hold out 40% of the rows, then cut that hold-out in half,
# giving 60% training, 20% validation and 20% test overall.
# Both calls use the same fixed seed so the partition is repeatable.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_data, y_data, test_size=0.4, random_state=1234
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1234
)

In [12]:
# sanity check: show the shapes of the training features and training labels side by side
(X_train.shape, y_train.shape)

((26562, 1), (26562,))

In [28]:
# X_train[:5]

In [None]:
# save to csv file (index = False)

In [26]:
# Wrap each feature split in a DataFrame with a single "title" column.
df_X_train, df_X_val, df_X_test = (
    pd.DataFrame(titles, columns=["title"]) for titles in (X_train, X_val, X_test)
)

# Wrap each label split in a DataFrame with a single "classification" column.
df_y_train, df_y_val, df_y_test = (
    pd.DataFrame(labels, columns=["classification"]) for labels in (y_train, y_val, y_test)
)

In [27]:
# Write every split to its own CSV file; index=False keeps the row index out of the file.
csv_targets = (
    (df_X_train, "dataset/X_train.csv"),
    (df_X_val, "dataset/X_val.csv"),
    (df_X_test, "dataset/X_test.csv"),
    (df_y_train, "dataset/y_train.csv"),
    (df_y_val, "dataset/y_val.csv"),
    (df_y_test, "dataset/y_test.csv"),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)

In [12]:
# Hyperparameters for the text pipeline.
vocabulary_size = 10_000  # upper bound on the number of tokens kept in the vocabulary
max_len = 250  # every vectorized sequence is padded or truncated to this length
embedding_dim = 16  # not used in this cell; presumably consumed where the model is built (confirm)

# Settings for the text vectorization layer, gathered in one place.
vectorizer_options = {
    "max_tokens": vocabulary_size,
    "standardize": "lower_and_strip_punctuation",
    "output_mode": "int",
    "output_sequence_length": max_len,
}

# The layer turns raw strings into a (batch_size, max_len) tensor of vocabulary indices.
vectorize_layer = TextVectorization(**vectorizer_options)

# Build the vocabulary from the training titles only.
vectorize_layer.adapt(X_train)

# vectorize_layer.get_vocabulary()
# vectorize_layer.vocabulary_size()

In [None]:
# model predictions on the test set

In [24]:
# run the model on the test inputs and keep the raw predicted scores
# NOTE(review): `model` and `test_text` are not defined in the cells visible here — confirm they are created earlier in the notebook
y_pred = model.predict(test_text)



In [25]:
# inspect the shape of the prediction array
y_pred.shape

(8855, 1)

In [26]:
# view the predictions as boolean decisions using a 0.5 cut-off (display only; the result is not stored)
y_pred > 0.5

array([[False],
       [False],
       [ True],
       ...,
       [ True],
       [False],
       [ True]])

In [27]:
# score the model on the test inputs against y_test; return_dict = True gives a mapping from metric name to value
# NOTE(review): `model` and `test_text` are not defined in the cells visible here — confirm they are created earlier in the notebook
evaluation = model.evaluate(x = test_text, y = y_test, return_dict = True)



In [28]:
# raw (metric name, value) pairs from the evaluation, at full precision
evaluation.items()

dict_items([('loss', 0.10133900493383408), ('binary_accuracy', 0.9597967267036438), ('precision', 0.9430451393127441), ('recall', 0.9753578901290894), ('mcc', 0.9200799465179443)])

In [29]:
# Report each test metric on its own line, rounded to four decimal places.
for metric in evaluation:
    value = evaluation[metric]
    print(f"{metric}: {value:.4f}")

loss: 0.1013
binary_accuracy: 0.9598
precision: 0.9430
recall: 0.9754
mcc: 0.9201
