# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [2]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

In [3]:
data = pd.read_csv('../processed_data.csv')

In [3]:
data

Unnamed: 0,label,full_content,processed_full_content
0,1,No comment is expected from Barack Obama Membe...,no comment expect barack obama member fyf911 f...
1,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,1,"Now, most of the demonstrators gathered last n...",demonstr gather last night exercis constitut p...
3,0,A dozen politically active pastors came here f...,dozen polit activ pastor came privat dinner fr...
4,1,"The RS-28 Sarmat missile, dubbed Satan 2, will...",rs-28 sarmat missil dub satan 2 replac ss-18 f...
...,...,...,...
63855,0,WASHINGTON (Reuters) - Hackers believed to be ...,washington reuter hacker believ work russian g...
63856,1,"You know, because in fantasyland Republicans n...",know fantasyland republican never question cit...
63857,0,Migrants Refuse To Leave Train At Refugee Camp...,migrant refus leav train refuge camp hungari t...
63858,0,MEXICO CITY (Reuters) - Donald Trump’s combati...,mexico citi reuter donald trump ’ comb style b...


In [4]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ------------ --------------------------- 3.1/10.0 MB 15.4 MB/s eta 0:00:01
   ---------------------- ----------------- 5.8/10.0 MB 14.1 MB/s eta 0:00:01
   ------------------------------------ --- 9.2/10.0 MB 14.6 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 14.6 MB/s eta 0:00:00
Download


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Basic BERT

BERT is a transformer model that provides powerful pre-trained embeddings for downstream tasks such as fake news classification using text.

To use BERT in Tensorflow, we utilise the `transformers` library by HuggingFace, which simplifies the process of loading pre-trained BERT models and tokenizers.

BERT is limited to a maximum input length of 512 tokens.

Fine-tuning BERT usually requires fewer epochs (2-4) and smaller batch sizes (16 or 32) due to memory constraints and pre-trained knowledge.

In [20]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
import numpy as np
import random

# Set random seed for reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:
train_d, val_d, train_labels, val_labels = train_test_split(data['processed_full_content'],data['label'],test_size=0.2,random_state=42)

In [22]:
texts_train = list(train_d)
texts_val = list(val_d)

max_length = 64

In [23]:
tokenized_texts_train = tokenizer(texts_train, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
tokenized_texts_val = tokenizer(texts_val, padding=True, truncation=True, return_tensors="pt", max_length=max_length)

In [24]:
tokenized_texts_train['input_ids'][0]

tensor([  101,  3290, 25022,  3775,  2128, 19901,  7426, 21877,  7361,  2140,
         2915,  2757,  9857,  2305,  4319, 24497, 18622,  2102,  2415,  2642,
         3290,  2112, 26568, 29469,  2954,  2048, 13675, 27605,  2078,  6080,
        14056,  2110,  4905,  4962,  2099,  2125,  2594,  2056,  5607,  2165,
         2173, 17496,  5101, 25022,  3775, 28480,  6178,  4183,  3675,  2110,
        28480,  2187,  2403, 21877,  7361,  2140,  2757,  1022,  1999,  9103,
         2099, 14056,  9758,   102])

In [25]:
tokenized_texts_train['attention_mask'][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [26]:
import torch

In [27]:
train_labels = torch.tensor(list(train_labels))
val_labels = torch.tensor(list(val_labels)) 

In [28]:
train_dataset = TensorDataset(tokenized_texts_train['input_ids'], tokenized_texts_train['attention_mask'], train_labels)
val_dataset = TensorDataset(tokenized_texts_val['input_ids'], tokenized_texts_val['attention_mask'], val_labels)

In [29]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2, num_hidden_layers=12, hidden_size=768, output_attentions=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
from torch.nn import BatchNorm1d
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW([
    {'params': model.bert.parameters(), 'lr': 5e-6},
    {'params': model.classifier.parameters(), 'lr': 5e-6}
], lr=5e-6)
criterion = torch.nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)

In [31]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [32]:
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
train_losses = []
val_losses = []
val_accuracies = []
train_accuracies = []

In [33]:
from tqdm import tqdm

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    tr_correct_preds = 0
    all_tr_labels = []
    all_tr_preds = []

    # Use tqdm to create a progress bar for the training loop
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        tr_loss = outputs.loss
        train_loss += tr_loss.item()
        tr_loss.backward()

        tr_logits = outputs.logits
        tr_preds = torch.argmax(tr_logits, dim=1)
        tr_correct_preds += torch.sum(tr_preds == labels).item()

        # Collect predictions and true labels
        all_tr_labels.extend(labels.cpu().numpy())
        all_tr_preds.extend(tr_preds.cpu().numpy())

        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    scheduler.step()

    # Calculate average training loss and accuracy
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    train_accuracy = tr_correct_preds / len(train_d)
    train_accuracies.append(train_accuracy)

    # Calculate Precision, Recall, F1 for training
    train_precision = precision_score(all_tr_labels, all_tr_preds, average='weighted')
    train_recall = recall_score(all_tr_labels, all_tr_preds, average='weighted')
    train_f1 = f1_score(all_tr_labels, all_tr_preds, average='weighted')

    # Validation phase with tqdm progress bar
    model.eval()
    val_loss = 0.0
    correct_preds = 0
    all_val_labels = []
    all_val_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_preds += torch.sum(preds == labels).item()

            # Collect predictions and true labels
            all_val_labels.extend(labels.cpu().numpy())
            all_val_preds.extend(preds.cpu().numpy())

    # Calculate average validation loss and accuracy
    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    val_accuracy = correct_preds / len(val_d)
    val_accuracies.append(val_accuracy)

    # Calculate Precision, Recall, F1 for validation
    val_precision = precision_score(all_val_labels, all_val_preds, average='weighted')
    val_recall = recall_score(all_val_labels, all_val_preds, average='weighted')
    val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted')

    # Print metrics
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")
    print(f"Training Precision: {train_precision:.4f}, Training Recall: {train_recall:.4f}, Training F1: {train_f1:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1: {val_f1:.4f}")


Epoch 1/3 - Training: 100%|██████████| 1597/1597 [1:58:19<00:00,  4.45s/it]
Epoch 1/3 - Validation: 100%|██████████| 400/400 [09:07<00:00,  1.37s/it]



Epoch 1/3
Training Loss: 0.1939, Training Accuracy: 0.9206
Training Precision: 0.9209, Training Recall: 0.9206, Training F1: 0.9207
Validation Loss: 0.1120, Validation Accuracy: 0.9574
Validation Precision: 0.9577, Validation Recall: 0.9574, Validation F1: 0.9573


Epoch 2/3 - Training: 100%|██████████| 1597/1597 [1:58:44<00:00,  4.46s/it]
Epoch 2/3 - Validation: 100%|██████████| 400/400 [08:52<00:00,  1.33s/it]



Epoch 2/3
Training Loss: 0.0942, Training Accuracy: 0.9649
Training Precision: 0.9649, Training Recall: 0.9649, Training F1: 0.9649
Validation Loss: 0.1068, Validation Accuracy: 0.9627
Validation Precision: 0.9627, Validation Recall: 0.9627, Validation F1: 0.9626


Epoch 3/3 - Training: 100%|██████████| 1597/1597 [1:57:48<00:00,  4.43s/it]
Epoch 3/3 - Validation: 100%|██████████| 400/400 [09:02<00:00,  1.36s/it]


Epoch 3/3
Training Loss: 0.0859, Training Accuracy: 0.9696
Training Precision: 0.9696, Training Recall: 0.9696, Training F1: 0.9696
Validation Loss: 0.1066, Validation Accuracy: 0.9623
Validation Precision: 0.9623, Validation Recall: 0.9623, Validation F1: 0.9623



