In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras.layers as tfl
from keras import Sequential
import numpy as np
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TerminateOnNaN, EarlyStopping
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.utils import get_file
import zipfile

## Data Preparation

In [2]:
# Import dataset
dataset_dir = "data/compiledData.csv"
df = pd.read_csv(dataset_dir)
df.head()

Unnamed: 0,title,isFakeNews,src
0,Donald Trump Sends Out Embarrassing New Year’...,True,fake-and-real-news-dataset
1,Drunk Bragging Trump Staffer Started Russian ...,True,fake-and-real-news-dataset
2,Sheriff David Clarke Becomes An Internet Joke...,True,fake-and-real-news-dataset
3,Trump Is So Obsessed He Even Has Obama’s Name...,True,fake-and-real-news-dataset
4,Pope Francis Just Called Out Donald Trump Dur...,True,fake-and-real-news-dataset


In [3]:
max_sequence_length = df['title'].apply(lambda x: len(x.split())).max()
print("Max sequence length:", max_sequence_length)

Max sequence length: 42


In [4]:
dataset_len = len(df)
dataset_len

134694

In [5]:
def train_test_validation_split(df):
    X = df['title']
    y = df['isFakeNews']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_validation_split(df)

In [15]:
def text_preprocessor(text):
    import tensorflow as tf
    punctuation = "!"
    stopwords = {'whom', 'all', 'shouldn', 'wouldn', 'how', 's', 'they', 'were', 'mustn', 'after', 'who', 'its', 'our', 't', 'a', 'very', 'an', 'do', 'be', 'to', 'can', 'had', 'i', 'these', 'himself', 'up', 'just', 'them', 'now', 'has', 'too', 'below', 'did', 'shan', 'until', 'during', 'him', 'into', 'have', "you'd", 'haven', 'theirs', 'ourselves', 'once', "isn't", 'than', "it's", 'wasn', 'yours', "mightn't", 'here', 'ours', 'her', 'doing', 'd', 'yourself', 'y', 'before', 'does', 'then', 'between', 'some', 'with', "needn't", 'further', 'she', 'down', 'on', "you'll", 'for', 'other', 'any', 'their', 'from', 'each', 'most', 'because', 'and', 'few', 'in', "you've", 'o', 'but', 'didn', "shouldn't", 'that', "weren't", 'which', 'or', "hasn't", 'own', 'about', 'what', "aren't", 'couldn', 'doesn', 'as', "wouldn't", 'hasn', 'no', 'm', 'hers', 'hadn', 'aren', 'while', 'will', "don't", "shan't", 'why', 'at', 'mightn', 'themselves', 'weren', "that'll", 'isn', 'only', 'the', 'been', "couldn't", 'don', 'should', 'same', 'both', 'where', 'was', 'me', 'through', "hadn't", 've', 'against', 'if', 'under', 'such', 'is', 'll', "haven't", 'ain', 're', "didn't", 'nor', 'not', 'being', 'are', 'your', 'over', 'off', 'having', 'by', "won't", 'myself', 'out', 'more', "wasn't", "doesn't", 'won', 'this', 'my', 'again', 'ma', 'his', 'when', 'you', 'there', 'herself', 'yourselves', 'itself', 'of', "she's", 'needn', 'we', "mustn't", 'above', "you're", 'so', 'it', "should've", 'am', 'he', 'those'}
    text = tf.strings.lower(text)
    text = tf.strings.strip(text)
    text = tf.strings.regex_replace(text, "<[^>]+>", "") 
    text = tf.strings.regex_replace(text, '[%s]' % punctuation, "") 
    for stopword in stopwords:
        text = tf.strings.regex_replace(text, r"\b%s\b" % stopword, "") 
    
    return text

In [16]:
def create_tokenizer(train_ds, max_words, max_seqlen, output_mode = "int", standardize = "lower_and_strip_punctuation"):
    train_text = train_ds.to_list()
    tokenizer = tfl.TextVectorization(
        standardize=standardize,
        max_tokens=max_words,
        output_sequence_length=42,
        output_mode=output_mode
    )
    tokenizer.adapt(train_text)
    return tokenizer

In [17]:
max_words = 1000000

## Tokenization and Embedding

In [18]:
tokenizer = create_tokenizer(X_train, max_words, max_sequence_length, standardize=text_preprocessor)

In [14]:
def load_pretrained_embeddings_v1(url, output_file, embedding_file, embedding_dim, vocabulary, max_words, max_seqlen):
    embedding_vecs = dict()
    word_idx = dict(zip(vocabulary, range(len(vocabulary))))
    file_dir = get_file(output_file, url)

    with zipfile.ZipFile(file_dir, "r") as f:
        f.extractall("content/")

    with open(embedding_file, "r", encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vec = np.asarray(values[1:], dtype='float32')
            embedding_vecs[word] = embedding_vec

    embedding_matrix = np.zeros((max_words, embedding_dim))
    
    for word, idx in word_idx.items():
        if idx < max_words:
            embedding_vec = embedding_vecs.get(word)
        if embedding_vec is not None:
            embedding_matrix[idx] = embedding_vec
    
    embedding = tfl.Embedding(max_words, embedding_dim, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), mask_zero=False, input_length=max_seqlen, trainable=False)
    return embedding

In [15]:
vocabulary = tokenizer.get_vocabulary()
embedding = load_pretrained_embeddings_v1("https://nlp.stanford.edu/data/glove.twitter.27B.zip", "content/glove.twitter.27B.zip", "content/glove.twitter.27B.100d.txt", 100, vocabulary=vocabulary, max_words=max_words, max_seqlen=max_sequence_length)

Downloading data from https://nlp.stanford.edu/data/glove.twitter.27B.zip


## Neural Network Architecture

In [15]:
def create_recurrent_neural_network(embedding_layer, max_words, max_seqlen, optimizer='adam'):
    model = Sequential(
        [
            embedding_layer,
            tfl.Bidirectional(tfl.LSTM(128, return_sequences=True, input_shape=(max_words, max_seqlen))),
            tfl.Bidirectional(tfl.LSTM(128, return_sequences=False)),
            tfl.Dropout(0.2),
            tfl.Dense(1, activation='sigmoid')
        ]
    )
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics = ['accuracy'])

    model.summary()
    return model

In [16]:
model = create_recurrent_neural_network(embedding, max_words, max_sequence_length)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 42, 100)           100000000 
                                                                 
 bidirectional (Bidirectiona  (None, 42, 256)          234496    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              394240    
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 1)                 257       
                                                                 
Total params: 100,628,993
Trainable params: 628,993
Non-

## Model Training

In [18]:
checkpoint_path = "tmp/checkpoints" 
callbacks = [
    ModelCheckpoint(checkpoint_path),
    ReduceLROnPlateau(),
    TerminateOnNaN(),
    EarlyStopping(patience=2)
]

In [19]:
def train_model(model, train_x, train_y, batch_size = 64, epochs=5, callbacks=callbacks):
    train_x = tokenizer(train_x)
    history = model.fit(x=train_x, y=train_y, validation_split=0.17644900953, batch_size=batch_size, epochs=epochs, callbacks=callbacks)
    return history

In [20]:
history = train_model(model, X_train, y_train, batch_size=64, epochs=1, callbacks=callbacks)





INFO:tensorflow:Assets written to: tmp\checkpoints\assets


INFO:tensorflow:Assets written to: tmp\checkpoints\assets




In [21]:
model.save("models/model_preliminary.h5")

In [22]:
saved_model = load_model("models/model_preliminary.h5")

In [24]:
saved_model.evaluate(tokenizer(X_test), y_test)



[0.13706918060779572, 0.9467458724975586]

In [19]:
import pickle
import dill

In [21]:
# Pickle the config and weights
dill.dump({ 'config': tokenizer.get_config(),
            'weights': tokenizer.get_weights(),
            'text_preprocessor': text_preprocessor }
            , open("tokenizers/tokenizer_preliminary.dill", "wb"))

In [9]:
from keras.models import load_model

In [10]:
from_disk = pickle.load(open("./tokenizers/tokenizer_preliminary.pkl", "rb"))
tokenizer = tfl.TextVectorization.from_config(from_disk['config'])
tokenizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
tokenizer.set_weights(from_disk['weights'])

In [11]:
model = load_model('./models/model_preliminary.h5')

## Model Evaluation

In [13]:
loss, accuracy = model.evaluate(tokenizer(X_test), y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.13852912187576294
Test Accuracy: 0.9461024403572083


In [17]:
y_pred_probs = model.predict(tokenizer(X_test))
y_pred = (y_pred_probs > 0.5).astype("int32")



In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Classification report
report = classification_report(y_test, y_pred, target_names=["Real", "Fake"])
print(report)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


              precision    recall  f1-score   support

        Real       0.94      0.94      0.94      9662
        Fake       0.95      0.95      0.95     10543

    accuracy                           0.95     20205
   macro avg       0.95      0.95      0.95     20205
weighted avg       0.95      0.95      0.95     20205

[[ 9112   550]
 [  539 10004]]
