# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [30]:
# !pip3 install -r requirements.txt
!pip3 install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.17.0-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Collecting tensorflow-intel==2.17.0 (from tensorflow)
  Using cached tensorflow_intel-2.17.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.17.0->tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.17.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.17.0->tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.17.0->tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.17.0->tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from 

In [5]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

# Count Vectoriser + Neural network


We first experiment with the use of a neural network for the classification task at hand.

We use a neural network for the following reasons:
#### Handling High-Dimensional Data: 
Neural networks are suited for handling high-dimensional data because they can learn complex patterns and relationships between features, such as text embeddings. This allows for the modelling of complex patterns between words or phrases.

#### Adaptability to Textual Nuances:
Neural networks can capture contextual information and nuances in text better than traditional models, which is crucial for accurately classifying fake news.

#### Enhanced Performance with Non-Linearities:
Neural networks can leverage non-linear fucntions (e.g., ReLU, sigmoid) to capture more intricate patterns in the data, allowing them to be able to discern subtle cues in language taht indicate misleading content.

Building upon our experiment using CNN, 

We use only 2 hidden layers with a dropout rate of 0.3 initially.
Additionally, the ADAM (Adaptive Moment Estimation) optimiser is used to adjust the weights to minimise the loss function (binary crossentropy for binary classification tasks).
Early stopping is implemented to prevent the tree from being overfitted.
Lastly, we set the number of epochs to 20, and allocate a batch size of 64.

In [3]:

# !pip install scikit-learn

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


data = pd.read_csv('processed_data.csv')

use = data['processed_full_content'].apply(lambda x: x.lower())
vectorizer = CountVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(use).toarray()


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, data['label'],test_size=0.2, random_state=42)

model = Sequential()

# Input layer and first hidden layer with dropout
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.3))

# Second hidden layer with dropout
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',  
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train,
                    epochs=20, 
                    batch_size=64, 
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.9066 - loss: 0.2463 - val_accuracy: 0.9589 - val_loss: 0.1136
Epoch 2/20
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9744 - loss: 0.0752 - val_accuracy: 0.9603 - val_loss: 0.1171
Epoch 3/20
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9876 - loss: 0.0401 - val_accuracy: 0.9627 - val_loss: 0.1275
Epoch 4/20
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9914 - loss: 0.0261 - val_accuracy: 0.9654 - val_loss: 0.1366
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9587 - loss: 0.1116
Test accuracy: 0.9588944315910339


In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred_prob = model.predict(X_test) 
y_pred = (y_pred_prob > 0.5).astype(int)  

# Calculate Accuracy, Precision, Recall, and F1 Score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Accuracy: 0.9588944566238647
Precision: 0.9434951784807986
Recall: 0.966886269070735
F1 Score: 0.9550475211918829


As we can observe from our initial run, it has unusually high performance metrics. This possibly implies overfitting, so we will try to tune the hyperparameters.

We lower the number of nodes for each layer from 128 to 64 and 64 to 32 respectively, and the dropout increased to 0.7.
We also lower the number of epochs from 20 to 5 (as well as the patience value from 3 to 2, such that early stopping is now applied after 2-3 epochs of no improvement in the validation loss) and batch size to 16 in order to introduce more noise, as model weights are now updated more frequently.

Additionally, we also reduce the max_features parameter of the count vectorizer, as we believe that reducing the number of features may also contribute to improving the generalization of the model.

Lastly, we use elastic net, a form of regularization that leverages the feature selection capabilities of l1 regularization, and the weight distribution smoothing capabilities of l2 regularization.


In [5]:
from keras.regularizers import l2
from keras.regularizers import l1
from keras.regularizers import l1_l2

second_run = data['processed_full_content'].apply(lambda x: x.lower())
vectorizer = CountVectorizer(max_features=750)
X_vectorized = vectorizer.fit_transform(second_run).toarray()


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, data['label'],test_size=0.2, random_state=42)

model = Sequential()

# Input layer and first hidden layer with dropout
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
model.add(Dropout(0.7))

# Second hidden layer with dropout
model.add(Dense(16, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)))
model.add(Dropout(0.7))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.00005),
              loss='binary_crossentropy',  
              metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

history = model.fit(X_train, y_train,
                    epochs=5, 
                    batch_size=16, 
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3193/3193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.5057 - loss: 10.2801 - val_accuracy: 0.6286 - val_loss: 3.3507
Epoch 2/5
[1m3193/3193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 955us/step - accuracy: 0.5709 - loss: 2.5045 - val_accuracy: 0.6768 - val_loss: 1.1654
Epoch 3/5
[1m3193/3193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 961us/step - accuracy: 0.6083 - loss: 1.0628 - val_accuracy: 0.7545 - val_loss: 0.7679
Epoch 4/5
[1m3193/3193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 977us/step - accuracy: 0.6277 - loss: 0.7689 - val_accuracy: 0.8168 - val_loss: 0.6533
Epoch 5/5
[1m3193/3193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6734 - loss: 0.6856 - val_accuracy: 0.8415 - val_loss: 0.6243
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 815us/step - accuracy: 0.8447 - loss: 0.6232
Test accuracy: 0.8415283560752869


In [6]:
y_pred_prob = model.predict(X_test) 
y_pred = (y_pred_prob > 0.5).astype(int)  

# Calculate Accuracy, Precision, Recall, and F1 Score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 920us/step
Accuracy: 0.8415283432508612
Precision: 0.845642540620384
Recall: 0.7940360610263523
F1 Score: 0.8190271816881259


Accuracy has now gone down to 0.83847 (compared to 0.96445) which is a more reasonable result. 

In this experiemnt, we sought to explore the capabilites of neural networks for fake news detection, only to find out that it had unusually high performance metrics. We then sought to increase the generalization capabilities, by tuning the hyperparameters of the neural network to not cause potential overfitting. 
Methods we explored were:
1) lowering the number of nodes to reduce the network's ability to learn complex patterns
2) Increasing dropout to reduce sensitivity to training data
3) Lowering number of epochs to reduce the tendency to 'overlearn'
4) Utilise regularization (L1 and L2) to penalise large weights

However, this ultimately leads to a far less complex model that not only fails to capture meaningful patterns in the data but may also fail to capture nuanced language patterns. The model is now:

- less sensitive to important features
- potentially more sensitive to noise
- higher tendency to make overly simplistic assumptions, resulting in increased bias.  

While F1 is now 0.81, the model could perform poorly against unseen test data or real-world scenarios due to its low complexity, resulting its inability to adapt to complex data.

In the next part of of the experiment, we seek to implement some additional findings we have made from other the experiments with other models in order to feasibly leverage upon neural networks for our task.

# Using k-fold partitioning to achieve better results

As we have learnt from our experiment with CNN, using count vectorizer/tf-idf will result in the the model being unable to capture language patterns and word associations since each term is treated as an independent vector.

Thus, we will instead switch to a pre-trained GloVe embedding pre-processing process.

Additionally, we will use the same k-fold partitioning method as the CNN experiment.

In [7]:

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [8]:

data = pd.read_csv('processed_data.csv')
k_fold_var = data['processed_full_content'].copy()

embeddings_index = {}
with open('./glove.6B.100d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.array(values[1:], dtype='float32')
        embeddings_index[word] = coefs

def text_to_embedding(text, embeddings_index, embedding_dim=100):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embedding_dim)
    
X_embeddings = np.array([text_to_embedding(text, embeddings_index) for text in k_fold_var])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
y = data['label']  

def create_model(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Store metrics
acc_scores, prec_scores, recall_scores, f1_scores = [], [], [], []

for train_index, test_index in skf.split(X_embeddings, y):
    X_train, X_test = X_embeddings[train_index], X_embeddings[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = create_model(X_embeddings.shape[1])
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Predictions and Metrics
    y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()
    acc_scores.append(accuracy_score(y_test, y_pred))
    prec_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

print(f"Average Accuracy: {np.mean(acc_scores):.4f}")
print(f"Average Precision: {np.mean(prec_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 945us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 965us/step
Average Accuracy: 0.9108
Average Precision: 0.8828
Average Recall: 0.9273
Average F1 Score: 0.9045
