# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

In [2]:
# Load the preprocessed corpus.
# The displayed frame shows an "Unnamed: 0" column that simply mirrors the row
# number, i.e. the index that was written out with the CSV. Reading that first
# column back as the index keeps it out of the data columns.
data = pd.read_csv('../processed_data.csv', index_col=0)

In [3]:
# Display the loaded frame (the notebook shows a truncated view: first and last rows).
data

Unnamed: 0,label,full_content,processed_full_content
0,1,No comment is expected from Barack Obama Membe...,no comment expect barack obama member fyf911 f...
1,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,1,"Now, most of the demonstrators gathered last n...",demonstr gather last night exercis constitut p...
3,0,A dozen politically active pastors came here f...,dozen polit activ pastor came privat dinner fr...
4,1,"The RS-28 Sarmat missile, dubbed Satan 2, will...",rs-28 sarmat missil dub satan 2 replac ss-18 f...
...,...,...,...
63855,0,WASHINGTON (Reuters) - Hackers believed to be ...,washington reuter hacker believ work russian g...
63856,1,"You know, because in fantasyland Republicans n...",know fantasyland republican never question cit...
63857,0,Migrants Refuse To Leave Train At Refugee Camp...,migrant refus leav train refuge camp hungari t...
63858,0,MEXICO CITY (Reuters) - Donald Trump’s combati...,mexico citi reuter donald trump ’ comb style b...


# Basic DistilBERT

DistilBERT is a smaller, faster, and lighter version of BERT, designed to retain most of BERT's language understanding capabilities while being more computationally efficient.

DistilBERT has only 6 Transformer layers instead of BERT-base's 12, i.e. half the depth. However, it retains the same hidden size (768), meaning it still processes and represents data similarly to BERT but with fewer computational steps. This results in roughly 40% fewer parameters than BERT-base, and its authors report about 60% faster inference, which also makes it cheaper to fine-tune.

DistilBERT is trained using knowledge distillation, a technique where a smaller model (the "student", DistilBERT) learns to mimic a larger model (the "teacher", BERT) rather than learning from the raw data alone. During pre-training, the student model doesn't just learn from the training targets but also from the "soft labels" (output probabilities) provided by the teacher model. This method allows DistilBERT to capture much of the knowledge from the original BERT model even with fewer layers, preserving about 97% of BERT's language-understanding performance (as measured by its authors on the GLUE benchmark).

In [4]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
import numpy as np
import random

# Seed every random number generator the pipeline relies on so runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)  # kept for backward compatibility; the model itself is PyTorch

# The classifier is trained with PyTorch, so the torch RNG must be seeded as well:
# it drives the random initialisation of the new classification head, dropout,
# and the shuffling done by the training DataLoader.
import torch  # imported here because the notebook's own `import torch` lives in a later cell

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_d, val_d, train_labels, val_labels = train_test_split(
    data['processed_full_content'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Maximum number of tokens per example passed to the tokenizer.
max_length = 64

# Convert the train/validation text Series into plain Python lists for the tokenizer.
texts_train = train_d.tolist()
texts_val = val_d.tolist()

In [7]:
# Both splits are tokenised with identical settings: pad, truncate to
# `max_length` tokens, and return PyTorch tensors.
tokenizer_kwargs = dict(padding=True, truncation=True, return_tensors="pt", max_length=max_length)

tokenized_texts_train = tokenizer(texts_train, **tokenizer_kwargs)
tokenized_texts_val = tokenizer(texts_val, **tokenizer_kwargs)

In [9]:
# Sanity check: token ids of the first training example.
tokenized_texts_train['input_ids'][0]

tensor([  101,  3290, 25022,  3775,  2128, 19901,  7426, 21877,  7361,  2140,
         2915,  2757,  9857,  2305,  4319, 24497, 18622,  2102,  2415,  2642,
         3290,  2112, 26568, 29469,  2954,  2048, 13675, 27605,  2078,  6080,
        14056,  2110,  4905,  4962,  2099,  2125,  2594,  2056,  5607,  2165,
         2173, 17496,  5101, 25022,  3775, 28480,  6178,  4183,  3675,  2110,
        28480,  2187,  2403, 21877,  7361,  2140,  2757,  1022,  1999,  9103,
         2099, 14056,  9758,   102])

In [10]:
# Sanity check: attention mask of the first training example.
tokenized_texts_train['attention_mask'][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [11]:
import torch

In [12]:
# Turn the label Series of each split into a tensor so it can be stored in a TensorDataset.
train_labels = torch.tensor(train_labels.tolist())
val_labels = torch.tensor(val_labels.tolist())

In [13]:
# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    tokenized_texts_train['input_ids'],
    tokenized_texts_train['attention_mask'],
    train_labels,
)
val_dataset = TensorDataset(
    tokenized_texts_val['input_ids'],
    tokenized_texts_val['attention_mask'],
    val_labels,
)

In [None]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification head for two labels.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import StepLR

# Optimiser: PyTorch's own AdamW. The `AdamW` class shipped with transformers is
# deprecated in favour of `torch.optim.AdamW`; `eps` and `weight_decay` are set
# explicitly to the defaults of the transformers implementation so the
# optimisation settings stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-6, weight_decay=0.0)

# Not used by the training loop in this notebook, which reads the loss from
# `outputs.loss`; kept so the name stays defined.
criterion = torch.nn.CrossEntropyLoss()

# StepLR with step_size=1 multiplies the learning rate by `gamma` on every
# `scheduler.step()` call.
# NOTE(review): the training loop calls it once per epoch, so the learning rate
# goes 5e-6 -> 5e-7 -> 5e-8 -> ...; confirm that such a steep decay is intended.
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)



In [16]:
batch_size = 64

# Settings shared by both loaders; only the shuffling differs between them.
loader_kwargs = dict(batch_size=batch_size, num_workers=4, pin_memory=True, prefetch_factor=2)

# Training batches are shuffled; validation batches keep the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [17]:
# Number of passes over the training set.
num_epochs = 5

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to that device; as the cell's last expression the module repr is displayed.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
# Per-epoch history, appended to by the training loop (one entry per epoch).
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

In [19]:
from tqdm import tqdm

# Each epoch: optimise on train_loader, advance the LR scheduler, then score on val_loader.
for epoch in range(num_epochs):
    # ---------------- Training pass ----------------
    model.train()
    train_loss, tr_correct_preds = 0.0, 0
    all_tr_labels, all_tr_preds = [], []

    # tqdm renders a progress bar over the training batches
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training"):
        # Move the (input_ids, attention_mask, labels) triple onto the compute device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        tr_loss = outputs.loss
        train_loss += tr_loss.item()
        tr_loss.backward()

        # Hard predictions feed the running accuracy and the epoch-level metrics
        tr_logits = outputs.logits
        tr_preds = tr_logits.argmax(dim=1)
        tr_correct_preds += (tr_preds == labels).sum().item()
        all_tr_labels.extend(labels.cpu().numpy())
        all_tr_preds.extend(tr_preds.cpu().numpy())

        # Cap the gradient norm at 1.0 before applying the update
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    # The scheduler advances once per epoch
    scheduler.step()

    # Epoch-level training loss (mean of the batch losses) and accuracy
    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = tr_correct_preds / len(train_d)
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # Weighted precision / recall / F1 over all training predictions of this epoch
    train_precision, train_recall, train_f1 = (
        score(all_tr_labels, all_tr_preds, average='weighted')
        for score in (precision_score, recall_score, f1_score)
    )

    # ---------------- Validation pass ----------------
    model.eval()
    val_loss, correct_preds = 0.0, 0
    all_val_labels, all_val_preds = [], []

    # No gradients are tracked while scoring the validation set
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            preds = logits.argmax(dim=1)
            correct_preds += (preds == labels).sum().item()

            all_val_labels.extend(labels.cpu().numpy())
            all_val_preds.extend(preds.cpu().numpy())

    # Epoch-level validation loss (mean of the batch losses) and accuracy
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct_preds / len(val_d)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    # Weighted precision / recall / F1 over all validation predictions of this epoch
    val_precision, val_recall, val_f1 = (
        score(all_val_labels, all_val_preds, average='weighted')
        for score in (precision_score, recall_score, f1_score)
    )

    # Report the epoch summary
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")
    print(f"Training Precision: {train_precision:.4f}, Training Recall: {train_recall:.4f}, Training F1: {train_f1:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1: {val_f1:.4f}")


Epoch 1/5 - Training: 100%|██████████| 799/799 [04:51<00:00,  2.75it/s]
Epoch 1/5 - Validation: 100%|██████████| 200/200 [00:31<00:00,  6.27it/s]



Epoch 1/5
Training Loss: 0.2304, Training Accuracy: 0.9022
Training Precision: 0.9022, Training Recall: 0.9022, Training F1: 0.9022
Validation Loss: 0.1637, Validation Accuracy: 0.9374
Validation Precision: 0.9413, Validation Recall: 0.9374, Validation F1: 0.9375


Epoch 2/5 - Training: 100%|██████████| 799/799 [05:13<00:00,  2.55it/s]
Epoch 2/5 - Validation: 100%|██████████| 200/200 [00:31<00:00,  6.32it/s]



Epoch 2/5
Training Loss: 0.1151, Training Accuracy: 0.9564
Training Precision: 0.9565, Training Recall: 0.9564, Training F1: 0.9565
Validation Loss: 0.1224, Validation Accuracy: 0.9516
Validation Precision: 0.9518, Validation Recall: 0.9516, Validation F1: 0.9516


Epoch 3/5 - Training: 100%|██████████| 799/799 [04:58<00:00,  2.68it/s]
Epoch 3/5 - Validation: 100%|██████████| 200/200 [00:31<00:00,  6.40it/s]



Epoch 3/5
Training Loss: 0.1093, Training Accuracy: 0.9591
Training Precision: 0.9591, Training Recall: 0.9591, Training F1: 0.9591
Validation Loss: 0.1221, Validation Accuracy: 0.9519
Validation Precision: 0.9521, Validation Recall: 0.9519, Validation F1: 0.9520


Epoch 4/5 - Training: 100%|██████████| 799/799 [05:04<00:00,  2.62it/s]
Epoch 4/5 - Validation: 100%|██████████| 200/200 [00:34<00:00,  5.82it/s]



Epoch 4/5
Training Loss: 0.1080, Training Accuracy: 0.9594
Training Precision: 0.9594, Training Recall: 0.9594, Training F1: 0.9594
Validation Loss: 0.1223, Validation Accuracy: 0.9518
Validation Precision: 0.9520, Validation Recall: 0.9518, Validation F1: 0.9519


Epoch 5/5 - Training: 100%|██████████| 799/799 [05:05<00:00,  2.61it/s]
Epoch 5/5 - Validation: 100%|██████████| 200/200 [00:31<00:00,  6.26it/s]


Epoch 5/5
Training Loss: 0.1091, Training Accuracy: 0.9592
Training Precision: 0.9592, Training Recall: 0.9592, Training F1: 0.9592
Validation Loss: 0.1223, Validation Accuracy: 0.9518
Validation Precision: 0.9520, Validation Recall: 0.9518, Validation F1: 0.9519



