# Supervised ML - Predicting fake news using only titles

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

from nltk.stem import PorterStemmer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# from wordcloud import WordCloud ##### Deprecated

import tensorflow as tf

In [None]:
# Load the news-articles dataset from the Kaggle input directory into a DataFrame
data = pd.read_csv('/kaggle/input/source-based-news-classification/news_articles.csv')
# Notebook display of the loaded frame
data

In [None]:
# Drop every row that has a missing value in any column
data = data.dropna(axis=0)
# Notebook display of the cleaned frame
data

# Exploratory Data Analysis
- We'll explore how frequently certain words appear in the titles

In [None]:
def _top_ngrams(corpus, n, ngram_range):
    """
    Shared implementation behind get_top_n_words and get_top_n_bigram.

    Counts every n-gram of the sizes allowed by 'ngram_range' over all
    documents in 'corpus' and returns a list of (term, total_count) tuples
    sorted by descending count, truncated to the first 'n' entries
    (n = None keeps every term).
    """
    vec = CountVectorizer(ngram_range = ngram_range)
    # fit_transform learns the vocabulary and builds the count matrix in one pass
    bag_of_words = vec.fit_transform(corpus) ## One row per document, one column per term
    sum_words = bag_of_words.sum(axis = 0) ## Single row holding the total count of each term
    ## vec.vocabulary_ maps each term to its column index in the count matrix
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    freq_sorted = sorted(words_freq, key = lambda x: x[1], reverse = True)
    return freq_sorted[:n]

def get_top_n_words(corpus, n = None):
    """
    A function that returns the top 'n' unigrams used in the corpus
    as a list of (word, count) tuples, most frequent first
    """
    return _top_ngrams(corpus, n, (1, 1))

def get_top_n_bigram(corpus, n = None):
    """
    A function that returns the top 'n' bigrams used in the corpus
    as a list of (bigram, count) tuples, most frequent first
    """
    return _top_ngrams(corpus, n, (2, 2))

In [None]:
# 20 most frequent single words in the stop-word-free titles, split into
# parallel lists of terms and counts for plotting
top_unigram = get_top_n_words(data['title_without_stopwords'], 20)
words_unigram = [term for term, _ in top_unigram]
count_unigram = [total for _, total in top_unigram]

# 20 most frequent word pairs.
# NOTE(review): this reads the article *text* column while the unigrams above
# read the *title* column - confirm that the difference is intended.
top_bigram = get_top_n_bigram(data['text_without_stopwords'], 20)
words_bigram = [term for term, _ in top_bigram]
count_bigram = [total for _, total in top_bigram]

In [None]:
# Plot bar charts for top unigrams
# Text styles for the chart title and the x tick labels
font_title = {'family': 'sans serif',
        'color':  'white',
        'weight': 'bold',
        'size': 16,
        }
font_text = {'family': 'sans serif',
        'color':  'white',
        'weight': 'bold',
        'size': 12,
        }

with plt.style.context("dark_background"):
    fig, ax = plt.subplots(figsize=(14,4))
    bar = ax.bar(words_unigram, count_unigram, color='#6baed6')
    # Title size comes from font_title
    ax.set_title("Top Unigrams", fontdict=font_title)
    # Fix the tick positions (one per bar) before relabelling them;
    # set_xticklabels is only meant to be used with fixed tick locations
    ax.set_xticks(list(range(len(words_unigram))))
    ax.set_xticklabels(words_unigram, fontdict=font_text, rotation=90)
    ax.grid(axis='y')

In [None]:
# Plot bar charts for top bigrams
with plt.style.context("dark_background"):
    fig, ax = plt.subplots(figsize=(14,4))
    bar = ax.bar(words_bigram, count_bigram, color='#a1dab4')
    # This chart shows bigrams, so the title must say so
    ax.set_title("Top Bigrams", fontdict=font_title)
    # Fix the tick positions (one per bar) before relabelling them;
    # set_xticklabels is only meant to be used with fixed tick locations
    ax.set_xticks(list(range(len(words_bigram))))
    ax.set_xticklabels(words_bigram, fontdict=font_text, rotation=90)
    ax.grid(axis='y')

In [None]:
# Visualising frequency of words using WordCloud package
from wordcloud import WordCloud

# Square black canvas showing at most 100 words; the fixed random_state
# keeps the layout reproducible between runs
wc = WordCloud(
    width=1000,
    height=1000,
    background_color="black",
    max_words=100,
    max_font_size=256,
    random_state=42,
)
# All article texts are joined into one space-separated string for the cloud
wc.generate(' '.join(data['text_without_stopwords']))

plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
## Visualising the fake and real news percentage
import math
different_labels = data['label'].unique()
# value_counts() orders classes by descending frequency, so the slice labels
# are taken from its index rather than hard-coded; this keeps each label
# attached to its own count whichever class is more frequent
label_counts = data['label'].value_counts()
counts = label_counts.values

plt.figure(figsize=(6,6))
plt.pie(counts, labels=label_counts.index, autopct='%1.1f%%')
plt.legend()

# Preprocessing
- Since the dataset already contains the titles without stopwords, we only need to apply stemming, tokenisation and padding to produce the sequences of numbers that we will feed into our ML model later

In [None]:
# Titles with stop words already removed (a column shipped with the dataset)
titles_stopped = data['title_without_stopwords']
# Preview the first few titles
titles_stopped.head()

In [None]:
# NLTK Porter stemmer instance used by process_title below
ps = PorterStemmer()
ps

In [None]:
# Quick check of what the stemmer returns for a single inflected word
ps.stem("roasted")

In [None]:
def process_title(title, stemmer=None):
    """
    Split a title into whitespace-separated tokens and stem each one.

    title   -- the title string to process
    stemmer -- optional object exposing a .stem(word) method; defaults to
               the module-level PorterStemmer instance 'ps'

    Returns the list of stemmed tokens. Tokens are separated on any run of
    whitespace, so repeated, leading or trailing spaces never produce empty
    tokens, and every token is already stripped before it reaches the
    stemmer.
    """
    if stemmer is None:
        stemmer = ps
    stemmed = (stemmer.stem(token) for token in title.split())
    # Drop anything the stemmer may have reduced to an empty string
    return [token for token in stemmed if token]

In [None]:
# Turn every title into its list of stemmed tokens
titles_stemmed = titles_stopped.apply(process_title)
titles_stemmed

In [None]:
# Collect the distinct stemmed tokens across all titles
vocabulary = set()

for title in titles_stemmed:
    vocabulary.update(title)

# vocab_length is passed to the Keras Tokenizer as num_words and to the
# Embedding layer as input_dim. Keras numbers words from 1 (0 is left for
# padding) and keeps only indices below num_words, so one extra slot is
# needed or the least frequent word would be silently dropped.
vocab_length = len(vocabulary) + 1

# Get max length of a sequence (0 when there are no titles)
max_seq_length = max(map(len, titles_stemmed), default=0)

In [None]:
# Viewing the words that have been added to our vocabulary
# (a set has no defined order, so this is an arbitrary sample of 10 entries)
import more_itertools
more_itertools.take(10, vocabulary)

In [None]:
# Display the value later used as the tokenizer's num_words and the embedding's input_dim
vocab_length

In [None]:
# Display the token count of the longest title, used below as the padded sequence length
max_seq_length

In [None]:
# Tokenising and padding our sequences
tokenizer = Tokenizer(num_words=vocab_length)
# Learn the token -> integer id mapping from the stemmed titles
tokenizer.fit_on_texts(titles_stemmed)
word_index = tokenizer.word_index

# Replace every token by its id, then pad each title to the same length
sequences = tokenizer.texts_to_sequences(titles_stemmed)
model_inputs = pad_sequences(sequences, maxlen=max_seq_length)

In [None]:
# Viewing the sequences converted from the titles (first 10, before padding)
sequences[:10]

In [None]:
# Viewing the words mapped to tokens, with 1 being the most frequent word
# (shows the first 10 (word, id) pairs of the tokenizer's word_index)
import more_itertools
more_itertools.take(10, word_index.items())

In [None]:
# Viewing the effect of padding the sequences: every row now has max_seq_length entries
model_inputs

In [None]:
# Shape of the padded input matrix fed to the model
model_inputs.shape

In [None]:
# Distinct class names present in the label column
data['label'].unique()

In [None]:
# Encode the class names as integers for binary classification: Real -> 0, Fake -> 1
labels = np.array(data['label'].map({'Real': 0, 'Fake': 1}))
labels

# Training

In [None]:
# Hold out part of the data for testing; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(model_inputs, labels, random_state=1)

In [None]:
# Report the dimensions of each split, in the order train inputs/labels then test inputs/labels
for caption, split in (
    ('shape of X_train:', X_train),
    ('shape of y_train:', y_train),
    ('shape of X_test:', X_test),
    ('shape of y_test:', y_test),
):
    print(caption, split.shape)

In [None]:
# Inspect the training labels
y_train

In [None]:
# Inspect the test labels
y_test

In [None]:
# Size of each word vector; also reused as the number of GRU units
embedding_dim = 64

# One integer token id per position of a padded title
inputs = tf.keras.Input(shape=(max_seq_length,))

# Look up a trainable embedding_dim-sized vector for every token id.
# `input_length` is deliberately not passed: the sequence length is already
# fixed by `inputs`, and Keras 3 deprecates that argument.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
)(inputs)

# Summarise the whole sequence into a single vector
gru = tf.keras.layers.GRU(units=embedding_dim)(embedding)

# Single sigmoid unit: a probability for the class encoded as 1
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(gru)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        # Named so it appears as 'auc' / 'val_auc' in the training history
        tf.keras.metrics.AUC(name='auc')
    ]
)

batch_size = 16
epochs = 5

history = model.fit(
    X_train,
    y_train,
    # 20% of the training data is set aside for validation
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        # NOTE(review): used with its default settings; if the default
        # patience is larger than `epochs` this callback never fires - confirm
        tf.keras.callbacks.ReduceLROnPlateau(),
        # Write the model to 'model.h5', only overwriting it when the
        # monitored quantity improves
        tf.keras.callbacks.ModelCheckpoint('model.h5', save_best_only=True)
    ]
)

In [None]:
# Two stacked panels: loss on top, AUC below, each showing the validation
# curve first and the training curve second
fig, (ax1, ax2) = plt.subplots(2, figsize=(10,6))

panels = (
    (ax1, 'val_loss', 'loss', 'Loss Over Time', 'Loss'),
    (ax2, 'val_auc', 'auc', 'AUC Over Time', 'AUC'),
)
for axis, val_key, train_key, heading, y_name in panels:
    axis.plot(history.history[val_key])
    axis.plot(history.history[train_key])
    axis.legend([val_key, train_key])
    axis.set_title(heading)
    axis.set(xlabel='Epoch', ylabel=y_name)

fig.tight_layout()

In [None]:
# Raw per-epoch values recorded during training (the data behind the curves above)
history.history

In [None]:
# Restore the weights from the checkpoint file written by ModelCheckpoint during training
model.load_weights('./model.h5')

In [None]:
# Score the restored model on the held-out test split
model.evaluate(X_test, y_test)

# Discussion
-   The final result is an accuracy of about 67%. The results could be further improved if the texts of the articles were used; even better, we could append the texts to the titles and process them together.

-   Further work could use LIME or SHAP values to explain which words the model weights most heavily when classifying a news article as real or fake.