In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
import pandas as pd
import numpy as np
import nltk
import random
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import openai
import os
import json
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value
import time
import datetime
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
import torch
from torch import nn
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import gc
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load data

In [3]:
# Define dataset file paths
datasets = {
    "LIAR Train": "train.tsv",
    "LIAR Test": "test.tsv",
    "LIAR Valid": "valid.tsv",
    "Gossip Fake": "gossipcop_fake.csv",
    "Gossip Real": "gossipcop_real.csv",
    "Political Fake": "politifact_fake.csv",
    "Political Real": "politifact_real.csv"
}

# Define LIAR dataset column names
liar_columns = [
    "id", "label", "text", "subjects", "speaker", "job_title", "state",
    "party_affiliation", "barely_true_count", "false_count", "half_true_count",
    "mostly_true_count", "pants_fire_count", "context"
]

# Load datasets into a dictionary
dataframes = {}
for name, path in datasets.items():
    sep = "\t" if path.endswith(".tsv") else ","  # Detect separator
    columns = liar_columns if "LIAR" in name else None  # Assign columns only for LIAR datasets
    dataframes[name] = pd.read_csv(path, sep=sep, header=None if columns else "infer", names=columns)

# Print first few rows of each dataset
for name, df in dataframes.items():
    print(f"\n {name} Data:")
    print(df.columns)



 LIAR Train Data:
Index(['id', 'label', 'text', 'subjects', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_fire_count', 'context'],
      dtype='object')

 LIAR Test Data:
Index(['id', 'label', 'text', 'subjects', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_fire_count', 'context'],
      dtype='object')

 LIAR Valid Data:
Index(['id', 'label', 'text', 'subjects', 'speaker', 'job_title', 'state',
       'party_affiliation', 'barely_true_count', 'false_count',
       'half_true_count', 'mostly_true_count', 'pants_fire_count', 'context'],
      dtype='object')

 Gossip Fake Data:
Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')

 Gossip Real Data:
Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')

 Political Fake Data:
Index(['id', 'news_url', 'title',

# Data Preprocessing: Standardizing Dataset Structure

In [4]:

#  retain essential columns for the liar dataset
liar_columns = ["text", "label", "subjects", "context", "speaker", "party_affiliation","barely_true_count", "false_count", "half_true_count",
    "mostly_true_count", "pants_fire_count", "state"]
for key in ["LIAR Train", "LIAR Test", "LIAR Valid"]:
    dataframes[key] = dataframes[key][liar_columns]

#  manually add 'label' column before selecting other columns
label_mapping = {
    "Gossip Fake": "fake", "Gossip Real": "real",
    "Political Fake": "fake", "Political Real": "real"
}

for key, label in label_mapping.items():
    # first, create the label column
    dataframes[key]["label"] = label

    # check if 'title' and 'news_url' exist before renaming
    expected_columns = ["title", "news_url", "label"]
    available_columns = [col for col in expected_columns if col in dataframes[key].columns]

    if "title" in available_columns:
        dataframes[key] = dataframes[key][available_columns].rename(columns={"title": "text"})
    else:
        print(f" Warning: Column 'title' not found in {key}. Available columns: {dataframes[key].columns}")

# #  ensure all datasets have a consistent structure
# for name, df in dataframes.items():
#     print(f"\n {name} Data (After Filtering):")
#     print(df.head())


# Handling Missing Value

In [5]:
# check the size of the datasets
for name, df in dataframes.items():
    print(f"\n {name} Size of the datasets: {df.shape}")


 LIAR Train Size of the datasets: (10240, 12)

 LIAR Test Size of the datasets: (1267, 12)

 LIAR Valid Size of the datasets: (1284, 12)

 Gossip Fake Size of the datasets: (5323, 3)

 Gossip Real Size of the datasets: (16817, 3)

 Political Fake Size of the datasets: (432, 3)

 Political Real Size of the datasets: (624, 3)


In [6]:
# check missing value
for name, df in dataframes.items():
    missing_values = df.isnull().sum()
    print(f"\n {name} missing value:")
    print(missing_values)


 LIAR Train missing value:
text                    0
label                   0
subjects                2
context               102
speaker                 2
party_affiliation       2
barely_true_count       2
false_count             2
half_true_count         2
mostly_true_count       2
pants_fire_count        2
state                2210
dtype: int64

 LIAR Test missing value:
text                   0
label                  0
subjects               0
context               17
speaker                0
party_affiliation      0
barely_true_count      0
false_count            0
half_true_count        0
mostly_true_count      0
pants_fire_count       0
state                262
dtype: int64

 LIAR Valid missing value:
text                   0
label                  0
subjects               0
context               12
speaker                0
party_affiliation      0
barely_true_count      0
false_count            0
half_true_count        0
mostly_true_count      0
pants_fire_count       0
stat

In [7]:
# calculate the percentage of missing values for each dataset
for name, df in dataframes.items():
    missing_percentage = df.isnull().sum() / len(df) * 100  # Compute the missing value percentage 
    print(missing_percentage)

text                  0.000000
label                 0.000000
subjects              0.019531
context               0.996094
speaker               0.019531
party_affiliation     0.019531
barely_true_count     0.019531
false_count           0.019531
half_true_count       0.019531
mostly_true_count     0.019531
pants_fire_count      0.019531
state                21.582031
dtype: float64
text                  0.000000
label                 0.000000
subjects              0.000000
context               1.341752
speaker               0.000000
party_affiliation     0.000000
barely_true_count     0.000000
false_count           0.000000
half_true_count       0.000000
mostly_true_count     0.000000
pants_fire_count      0.000000
state                20.678769
dtype: float64
text                  0.000000
label                 0.000000
subjects              0.000000
context               0.934579
speaker               0.000000
party_affiliation     0.000000
barely_true_count     0.000000
false_cou

In [8]:
# Remove rows with missing values in specified columns for liar dataset
for key in ["LIAR Train", "LIAR Test", "LIAR Valid"]:
    initial_rows = dataframes[key].shape[0]
    dataframes[key].dropna(subset=["subjects", "speaker", "party_affiliation", "context"], inplace=True)
    removed_rows = initial_rows - dataframes[key].shape[0]
    print(f"{key} - Rows removed: {removed_rows}")
    dataframes[key]["state"].fillna("Unknown", inplace=True)

# Remove rows with missing values in news_url for gossipcop and political dataset
for key in ["Gossip Fake", "Gossip Real", "Political Fake", "Political Real"]:
    initial_rows = dataframes[key].shape[0]
    dataframes[key].dropna(subset=["news_url"], inplace=True)
    removed_rows = initial_rows - dataframes[key].shape[0]
    print(f"{key} - Rows removed: {removed_rows}")

# Make sure missing values are fixed
for name, df in dataframes.items():
    print(f"\n{name} Missing Values After Fixing:")
    print(df.isnull().sum())


LIAR Train - Rows removed: 102
LIAR Test - Rows removed: 17
LIAR Valid - Rows removed: 12
Gossip Fake - Rows removed: 256
Gossip Real - Rows removed: 13
Political Fake - Rows removed: 4
Political Real - Rows removed: 57

LIAR Train Missing Values After Fixing:
text                 0
label                0
subjects             0
context              0
speaker              0
party_affiliation    0
barely_true_count    0
false_count          0
half_true_count      0
mostly_true_count    0
pants_fire_count     0
state                0
dtype: int64

LIAR Test Missing Values After Fixing:
text                 0
label                0
subjects             0
context              0
speaker              0
party_affiliation    0
barely_true_count    0
false_count          0
half_true_count      0
mostly_true_count    0
pants_fire_count     0
state                0
dtype: int64

LIAR Valid Missing Values After Fixing:
text                 0
label                0
subjects             0
context    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframes[key]["state"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframes[key]["state"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

# Remove punctuation and stopwords

In [9]:
# Define a function to remove punctuation and stopwords
def clean_text(text):
    stop_words = set(stopwords.words("english"))  # Load the stopword list
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = text.split()  # Split the sentence into words
    words = [word for word in words if word.lower() not in stop_words]  # Filter out stopwords
    return " ".join(words)  # Reassemble the cleaned words into a sentence

# Apply clean_text() to all datasets
for key in dataframes.keys():
    dataframes[key]["text"] = dataframes[key]["text"].apply(clean_text)


# Additional dataset specific pre-processing for RoBERTa training

In [10]:
# Liar Dataset Preprocessing
def preprocess_liar(df):

    # Mapping labels to integers
    df['label'] = df['label'].map({
        'pants-fire': 0,
        'false': 1,
        'barely-true': 2,
        'half-true': 3,
        'mostly-true': 4,
        'true': 5
    })

    credibility_columns = ['barely_true_count', 'false_count', 'half_true_count',
                          'mostly_true_count', 'pants_fire_count']

    for col in credibility_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Calculating simplified credibility score to reduce text length
    df['credibility_score'] = (
        0.1 * df['pants_fire_count'] +
        0.3 * df['false_count'] +
        0.5 * df['barely_true_count'] +
        0.7 * df['half_true_count'] +
        0.9 * df['mostly_true_count']
    ) / (df[credibility_columns].sum(axis=1) + 1)

    df['combined_text'] = (
        "Statement: " + df['text'].astype(str) +
        " | Speaker: " + df['speaker'].astype(str) +
        " | Party: " + df['party_affiliation'].astype(str) +
        " | Score: " + df['credibility_score'].round(2).astype(str)
    )

    return df[['combined_text', 'label']]

# FakeNewsNet preprocessing
def preprocess_fake_news(df):
    df['label'] = df['label'].map({'real': 1, 'fake': 0})
    df['combined_text'] = "Title: " + df['text'].astype(str) + " | URL: " + df['news_url'].astype(str)
    return df[['combined_text', 'label']]


In [11]:
def load_data_and_preprocess(dataframes):
    
    # Load dataframes for the LIAR dataset
    liar_train = dataframes["LIAR Train"]
    liar_test = dataframes["LIAR Test"]
    liar_val = dataframes["LIAR Valid"]

    # Load dataframes for the FakeNewsNet datasets
    gossipcop_fake = dataframes["Gossip Fake"]
    gossipcop_real = dataframes["Gossip Real"]
    politifact_fake = dataframes["Political Fake"]
    politifact_real = dataframes["Political Real"]
    
    # Combine all fake and real news data from FakeNewsNet datasets into a single dataframe
    fake_news_data = pd.concat([gossipcop_fake, gossipcop_real, politifact_fake, politifact_real])

    # Preprocess the LIAR dataset dataframes
    liar_train = preprocess_liar(liar_train)
    liar_val = preprocess_liar(liar_val)
    liar_test = preprocess_liar(liar_test)
    
    # Preprocess the combined FakeNewsNet datasets
    fake_news_data = preprocess_fake_news(fake_news_data)
    
    # Output the length of the combined text from the first row of the LIAR training set for verification
    print(f"Sample LIAR combined text length: {len(liar_train['combined_text'].iloc[0]) if len(liar_train) > 0 else 0} chars")

    # Output the length of the combined text from the first row of the FakeNewsNet datasets for verification
    print(f"Sample FakeNews combined text length: {len(fake_news_data['combined_text'].iloc[0]) if len(fake_news_data) > 0 else 0} chars")

    # Train-test-val (70-15-15) split FakeNewsNet data
    fake_news_train_data, temp_data = train_test_split(
        fake_news_data,
        test_size=0.3,
        random_state=42,
        stratify=fake_news_data['label'] # Ensure stratified sampling based on labels
    )
    # Split the temporary data into validation and test datasets, equally divided
    fake_news_val_data, fake_news_test_data = train_test_split(
        temp_data,
        test_size=0.5,
        random_state=42,
        stratify=temp_data['label']
    )
    # Return the preprocessed datasets
    return liar_train,liar_val,liar_test,fake_news_train_data,fake_news_val_data,fake_news_test_data


In [12]:
liar_train,liar_val,liar_test,fake_news_train_data,fake_news_val_data,fake_news_test_data = load_data_and_preprocess(dataframes)

Sample LIAR combined text length: 142 chars
Sample FakeNews combined text length: 160 chars


# RoBERTa Training and Evaluation

In [15]:
# Calculate class-level weights for imbalanced classes
def balanced_class_sampling(labels):
    label_counts = Counter(labels)
    total_samples = sum(label_counts.values())
    num_classes = len(label_counts)

    class_weights = {label: total_samples / (num_classes * count) for label, count in label_counts.items()}
    weight_tensor = torch.tensor([class_weights[label] for label in label_counts.keys()], dtype=torch.float32)
    return weight_tensor

# Format time from seconds to hours, minutes, and seconds
def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Switch to CPU if GPUs are not available
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

device = get_device()


# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Tokenize the input text and prepare it for the model
def tokenize_function(examples, max_length=128,  dataset_name=None):
    tokenized = tokenizer(
        examples['combined_text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    return tokenized

# Define a custom RoBERTa model with attention pooling
class RobertaWithAttentionPooling(nn.Module):
    def __init__(self, num_labels, model_name="roberta-base", dropout_rate=0.1):
        super().__init__()
        self.config = RobertaConfig.from_pretrained(
            model_name,
            num_labels=num_labels,
            hidden_dropout_prob=dropout_rate # Load config with modified dropout
        )
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            model_name,
            config=self.config # Load RoBERTa model with the specified configuration
        )
    # Perform the forward pass and return model outputs
    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.roberta(
            input_ids=input_ids, # Input IDs from tokenizer
            attention_mask=attention_mask, # Attention mask to handle padding
            labels=labels, # Actual labels for computing loss
            **kwargs
        )
        return outputs

# Initialize the CustomTrainer with optional class weights and focal loss gamma parameter    
class CustomTrainer(Trainer): 
    def __init__(self, class_weights=None, gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs) # Initialize the base Trainer
        self.class_weights = class_weights# Class weights for handling imbalance
        self.gamma = gamma

        # Initialize training and evaluation metrics
        self.total_train_tokens = 0
        self.total_eval_tokens = 0
        self.total_train_examples = 0
        self.total_eval_examples = 0

        # Initialize per-epoch training metrics
        self.current_epoch = 0
        self.epoch_train_tokens = 0
        self.epoch_train_examples = 0
        self.epoch_stats = []

        # Determine size of the training dataset
        self.train_dataset_size = 0
        if hasattr(self, 'train_dataset') and self.train_dataset is not None:
            self.train_dataset_size = len(self.train_dataset)
    
    # Compute the loss using either focal loss with class weights or standard cross-entropy
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        input_ids = inputs['input_ids']
        batch_size = input_ids.size(0)

        # Count non-padding tokens in the input
        non_padding_tokens = (input_ids != 1).sum().item()

        # Update token and example counters for training
        self.total_train_tokens += non_padding_tokens
        self.epoch_train_tokens += non_padding_tokens
        self.total_train_examples += batch_size
        self.epoch_train_examples += batch_size

        # Pop labels from inputs and send them to the correct device
        labels = inputs.pop("labels").to(device)
        outputs = model(**inputs)
        logits = outputs.logits

        # If class weights are provided, compute the weighted cross-entropy and focal loss
        if self.class_weights is not None:
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            ce_loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
            probs = F.softmax(logits, dim=-1)
            pt = probs.gather(1, labels.unsqueeze(1)).squeeze(1)
            focal_weight = (1 - pt) ** self.gamma
            loss = (focal_weight * ce_loss).mean()
        
        else: # Standard cross-entropy loss
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))

        # Return labels to inputs and return loss and optionally outputs
        inputs["labels"] = labels
        return (loss, outputs) if return_outputs else loss

    # Perform evaluation loop, counting tokens and examples processed
    def evaluation_loop(self, dataloader, description, prediction_loss_only=None, ignore_keys=None, metric_key_prefix="eval"):
        current_eval_tokens = 0
        current_eval_examples = 0

        # Use the base Trainer's evaluation loop
        outputs = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
            ignore_keys,
            metric_key_prefix
        )

        # Count non-padding tokens and examples in the evaluation dataset
        try:
            eval_dataloader = dataloader
            for batch in eval_dataloader:
                if isinstance(batch, dict) and 'input_ids' in batch:
                    batch_size = batch['input_ids'].size(0)
                    non_padding_tokens = (batch['input_ids'] != 1).sum().item()
                    current_eval_tokens += non_padding_tokens
                    current_eval_examples += batch_size
       
        # Fallback for counting examples and approximating tokens based on training data average
        except:
            if hasattr(dataloader, 'dataset'):
                current_eval_examples = len(dataloader.dataset)
                if self.total_train_examples > 0 and self.total_train_tokens > 0:
                    avg_tokens = self.total_train_tokens / self.total_train_examples
                    current_eval_tokens = int(current_eval_examples * avg_tokens)
                else:
                    current_eval_tokens = current_eval_examples * 128  # Approximate if no data available

        # Update total evaluation tokens and examples
        self.total_eval_tokens += current_eval_tokens
        self.total_eval_examples += current_eval_examples

        return outputs

    # Update the training dataset size before training
    def train(self):
        if hasattr(self, 'train_dataset') and self.train_dataset is not None:
            self.train_dataset_size = len(self.train_dataset)
        result = super().train() # Call the base class train method and capture the result

        return result 

    # Reset epoch-level training token and example counters
    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_train_tokens = 0
        self.epoch_train_examples = 0
        return super().on_epoch_begin(args, state, control, **kwargs) # Call base method for any additional setup

    # Store training token and example statistics at the end of each epoch
    def on_epoch_end(self, args, state, control, **kwargs):
        """Called at the end of each epoch to store per-epoch statistics"""
        self.epoch_stats.append({
            'epoch': self.current_epoch + 1, # Increment the current epoch counter
            'train_tokens': self.epoch_train_tokens,
            'train_examples': self.epoch_train_examples,
        })

        self.current_epoch += 1
        return super().on_epoch_end(args, state, control, **kwargs)

    # Calculate and print statistics related to token and example usage across epochs
    def print_token_efficiency(self):
        num_epochs = self.current_epoch if self.current_epoch > 0 else int(self.args.num_train_epochs)

        print("\n===== Token Usage Statistics =====")

        if self.epoch_stats:
            print("\nPer-Epoch Training Usage:")
            for epoch_data in self.epoch_stats:
                epoch = epoch_data['epoch']
                tokens = epoch_data['train_tokens']
                examples = epoch_data['train_examples']
                print(f"Epoch {epoch}: {tokens:,} tokens for {examples:,} examples ({tokens/examples:.2f} tokens/example)")

        unique_examples_per_epoch = self.train_dataset_size
        if unique_examples_per_epoch == 0 and self.epoch_stats:
            unique_examples_per_epoch = self.epoch_stats[0]['train_examples']

        print("\nOverall Usage:")
        print(f"Total training tokens across {num_epochs} epochs: {self.total_train_tokens:,}")
        print(f"Total training examples processed across {num_epochs} epochs: {self.total_train_examples:,}")
        print(f"Unique training examples per epoch: {unique_examples_per_epoch:,}")

        if self.total_train_examples > 0:
            print(f"Average tokens per training example: {self.total_train_tokens / self.total_train_examples:.2f}")

        print(f"\nTotal evaluation tokens: {self.total_eval_tokens:,}")
        print(f"Total evaluation examples: {self.total_eval_examples:,}")
        if self.total_eval_examples > 0:
            print(f"Average tokens per evaluation example: {self.total_eval_tokens / self.total_eval_examples:.2f}")

        total_tokens = self.total_train_tokens + self.total_eval_tokens
        total_examples = self.total_train_examples + self.total_eval_examples

        print(f"\nGrand total: {total_tokens:,} tokens processed for {total_examples:,} examples")
        print(f"(Training: {num_epochs} epochs × {unique_examples_per_epoch:,} examples/epoch + Evaluation: {self.total_eval_examples:,} examples)")
        print("==================================\n")

# Calculate evaluation metrics based on predictions and actual labels
def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=1) # Convert prediction logits to class labels

    # Compute accuracy, precision, recall, and F1 score
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    macro_f1 = f1_score(labels, preds, average='macro', zero_division=0)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'macro_f1': macro_f1
    }


def map_liar_to_binary(label):
    # Mapping LIAR classes to binary: 0,1,2 -> Fake (0), 3,4,5 -> Real (1)
    return 0 if label <= 2 else 1

# Safely tokenize dataset texts with a specified tokenizer, handling large datasets in batches
def safe_tokenize(dataset, dataset_name, max_len=256):
      tokenized = dataset.map(
                lambda examples: tokenize_function(    examples,
                    max_length=max_len,
                    dataset_name=dataset_name
      ),
      batched=True,
      batch_size=32,
      remove_columns=['combined_text']  
            )
      # Set the format of tokenized data suitable for PyTorch models
      tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
      return tokenized

# Map binary FakeNewsNet labels to the corresponding LIAR labels (Fake or Real)
def map_fake_news_to_liar(label):
        return 0 if label == 0 else 5  

# Convert binary labels to multi-class labels using a mapping function
def convert_to_multi(dataset):
        return dataset.map(
            lambda x: {'label': map_fake_news_to_liar(x['label'])},
            desc="Converting to binary labels"
        )
# Start the training and evaluation pipeline for fake news detection
def train_and_evaluate_models(liar_train,liar_val,liar_test,fake_news_train_data,fake_news_val_data,fake_news_test_data):
    print("Starting fake news detection pipeline...")
    total_start_time = time.time()

    # Calculate class weights for the LIAR dataset to handle class imbalance
    y_train = liar_train['label'].values
    liar_class_weights = balanced_class_sampling(y_train).to(device)
    print("LIAR sampling weights:", liar_class_weights.cpu().numpy())

    # Calculate class weights for the FakeNewsNet dataset
    binary_y_train = fake_news_train_data['label'].values
    binary_class_weights = balanced_class_sampling(binary_y_train).to(device)

    print("FakeNewsNet class weights:", binary_class_weights.cpu().numpy())
    
    global tokenizer # Initialize the tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    max_length = 128

    # Convert dataframes to datasets
    liar_train_dataset = Dataset.from_pandas(liar_train)
    liar_val_dataset = Dataset.from_pandas(liar_val)
    liar_test_dataset = Dataset.from_pandas(liar_test)

    fake_news_train_dataset = Dataset.from_pandas(fake_news_train_data)
    fake_news_val_dataset = Dataset.from_pandas(fake_news_val_data)
    fake_news_test_dataset = Dataset.from_pandas(fake_news_test_data)

    print("Tokenizing datasets...") # Tokenize all datasets
    tokenize_start_time = time.time()

    liar_train_tokenized = safe_tokenize(liar_train_dataset, "LIAR Train")
    liar_val_tokenized = safe_tokenize(liar_val_dataset, "LIAR Val")
    liar_test_tokenized = safe_tokenize(liar_test_dataset, "LIAR Test")

    fake_news_train_tokenized = safe_tokenize(fake_news_train_dataset, "FakeNews Train")
    fake_news_val_tokenized = safe_tokenize(fake_news_val_dataset, "FakeNews Val")
    fake_news_test_tokenized = safe_tokenize(fake_news_test_dataset, "FakeNews Test")

    tokenize_time = time.time() - tokenize_start_time
    print(f"Tokenization completed in {format_time(tokenize_time)}")

    # Initialize models for LIAR and FakeNewsNet datasets
    print("Initializing models...")
    model_liar = RobertaWithAttentionPooling(num_labels=6)
    model_liar.to(device)

    model_fake_news = RobertaWithAttentionPooling(num_labels=2)
    model_fake_news.to(device)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        learning_rate=1e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        
        # fp16=torch.cuda.is_available(),
        fp16=torch.cuda.is_available() and not torch.backends.mps.is_available(),
        dataloader_num_workers=2,
        gradient_accumulation_steps=2,
        no_cuda=torch.backends.mps.is_available(),

    )

    print("Setting up trainers...")

    # Setup trainers with specific models, datasets, and training arguments
    trainer_liar = CustomTrainer(
        model=model_liar,
        args=training_args,
        train_dataset=liar_train_tokenized,
        eval_dataset=liar_val_tokenized,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        class_weights=liar_class_weights,
        gamma=1.5 
    )

    # Train and evaluate LIAR model
    print("Training LIAR model...")
    trainer_liar.train()

    print("Evaluating LIAR model...")
    liar_results = trainer_liar.evaluate(liar_test_tokenized)
    print(f"LIAR Test Results: {liar_results}")

    trainer_liar.print_token_efficiency()

    print("Cross-dataset evaluation...")

    # Adjust the FakeNewsNet dataset for evaluation with the LIAR model
    fake_news_test_adjusted = convert_to_multi(fake_news_test_tokenized)

    fake_news_results_liar_model = trainer_liar.evaluate(fake_news_test_adjusted)
    print(f"LIAR model on FakeNewsNet: {fake_news_results_liar_model}")

    # Setup and run the trainer for the FakeNewsNet model
    trainer_fake_news = CustomTrainer(
        model=model_fake_news,
        args=training_args,
        train_dataset=fake_news_train_tokenized,
        eval_dataset=fake_news_val_tokenized,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        class_weights=binary_class_weights,
        gamma=1.0
    )

    print("Training FakeNewsNet model...")
    trainer_fake_news.train()

    print("Evaluating FakeNewsNet model...")
    fake_news_results = trainer_fake_news.evaluate(fake_news_test_tokenized)
    print(f"FakeNewsNet Test Results: {fake_news_results}")

    trainer_fake_news.print_token_efficiency() # Print token efficiency stats

    # Convert multi-class labels to binary labels for cross-dataset evaluation
    def convert_to_binary(dataset):
        return dataset.map( 
            lambda x: {'label': map_liar_to_binary(x['label'])}, # Apply mapping function to each label
            desc="Converting to binary labels" # Description shown during processing
        )

    # Convert the LIAR test dataset to binary labels for cross-dataset evaluation
    liar_test_binary = convert_to_binary(liar_test_tokenized)
    print("Cross-dataset evaluation...")
    
    # Evaluate the FakeNewsNet model on the binary-labeled LIAR test dataset
    cross_eval_results = trainer_fake_news.evaluate(liar_test_binary)
    print(f"Cross-Dataset Results: {cross_eval_results}")

    # Calculate and print the total execution time of the script
    total_time = time.time() - total_start_time
    print(f"Total execution time: {format_time(total_time)}")

    return {
        "liar_trainer": trainer_liar,
        "fake_news_trainer": trainer_fake_news,
        "liar_results": liar_results,
        "fake_news_results": fake_news_results,
        "cross_dataset_results": cross_eval_results,

    }

# Call the training and evaluation function with datasets as arguments
if __name__ == "__main__":
    results = train_and_evaluate_models(liar_train,liar_val,liar_test,fake_news_train_data,fake_news_val_data,fake_news_test_data)
    # results = train_and_evaluate_models(liar_train.head(100),liar_val.head(10),liar_test.head(10),fake_news_train_data.head(100),fake_news_val_data.head(10),fake_news_test_data.head(10))



Using GPU: Tesla T4
Starting fake news detection pipeline...
LIAR sampling weights: [0.8568289  0.80575424 0.8687232  1.0203301  1.0302845  2.0406601 ]
FakeNewsNet class weights: [0.65814143 2.0808632 ]
Tokenizing datasets...


Map:   0%|          | 0/10138 [00:00<?, ? examples/s]

Map:   0%|          | 0/1272 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

Map:   0%|          | 0/16006 [00:00<?, ? examples/s]

Map:   0%|          | 0/3430 [00:00<?, ? examples/s]

Map:   0%|          | 0/3430 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenization completed in 0:00:41
Initializing models...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up trainers...
Training LIAR model...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Macro F1
100,2.5834,1.31545,0.131289,0.017237,0.131289,0.030473,0.038684
200,2.2053,0.992475,0.391509,0.494765,0.391509,0.368984,0.352723
300,1.7613,0.853297,0.407233,0.440674,0.407233,0.397842,0.402904
400,1.599,0.815159,0.40173,0.471611,0.40173,0.399321,0.404781
500,1.5891,0.804785,0.414308,0.450649,0.414308,0.412516,0.418488
600,1.5758,0.784573,0.408019,0.470032,0.408019,0.411267,0.416683
700,1.518,0.779732,0.419811,0.453709,0.419811,0.421971,0.426234
800,1.4715,0.775682,0.437893,0.474494,0.437893,0.439728,0.442921
900,1.4098,0.790262,0.418239,0.460367,0.418239,0.421268,0.426348
1000,1.3408,0.778471,0.430031,0.448515,0.430031,0.432312,0.437448


Evaluating LIAR model...


LIAR Test Results: {'eval_loss': 0.8336982131004333, 'eval_accuracy': 0.3984, 'eval_precision': 0.42646163725987024, 'eval_recall': 0.3984, 'eval_f1': 0.39998920058454995, 'eval_macro_f1': 0.412735987883807, 'eval_runtime': 4.4297, 'eval_samples_per_second': 282.187, 'eval_steps_per_second': 9.03, 'epoch': 6.921135646687697}

===== Token Usage Statistics =====

Overall Usage:
Total training tokens across 10 epochs: 2,942,960
Total training examples processed across 10 epochs: 85,414
Unique training examples per epoch: 10,138
Average tokens per training example: 34.46

Total evaluation tokens: 523,867
Total evaluation examples: 15,242
Average tokens per evaluation example: 34.37

Grand total: 3,466,827 tokens processed for 100,656 examples
(Training: 10 epochs × 10,138 examples/epoch + Evaluation: 15,242 examples)

Cross-dataset evaluation...


Converting to binary labels:   0%|          | 0/3430 [00:00<?, ? examples/s]

LIAR model on FakeNewsNet: {'eval_loss': 1.6161656379699707, 'eval_accuracy': 0.0565597667638484, 'eval_precision': 0.7209984567641788, 'eval_recall': 0.0565597667638484, 'eval_f1': 0.1045360286850049, 'eval_macro_f1': 0.03149389414677097, 'eval_runtime': 11.3724, 'eval_samples_per_second': 301.608, 'eval_steps_per_second': 9.497, 'epoch': 6.921135646687697}
Training FakeNewsNet model...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Macro F1
100,0.2598,0.028418,0.759767,0.577246,0.759767,0.656048,0.431743
200,0.0035,0.001459,0.985714,0.985978,0.985714,0.985562,0.980021
300,0.0026,0.001276,0.985714,0.985978,0.985714,0.985562,0.980021
400,0.003,0.001015,0.985714,0.985978,0.985714,0.985562,0.980021


Evaluating FakeNewsNet model...


FakeNewsNet Test Results: {'eval_loss': 0.0015432683285325766, 'eval_accuracy': 0.9854227405247813, 'eval_precision': 0.9856972651853316, 'eval_recall': 0.9854227405247813, 'eval_f1': 0.9852642543981199, 'eval_macro_f1': 0.9796221482889733, 'eval_runtime': 11.0387, 'eval_samples_per_second': 310.725, 'eval_steps_per_second': 9.784, 'epoch': 1.594810379241517}

===== Token Usage Statistics =====

Overall Usage:
Total training tokens across 10 epochs: 2,483,381
Total training examples processed across 10 epochs: 42,692
Unique training examples per epoch: 16,006
Average tokens per training example: 58.17

Total evaluation tokens: 1,001,211
Total evaluation examples: 17,150
Average tokens per evaluation example: 58.38

Grand total: 3,484,592 tokens processed for 59,842 examples
(Training: 10 epochs × 16,006 examples/epoch + Evaluation: 17,150 examples)



Converting to binary labels:   0%|          | 0/1250 [00:00<?, ? examples/s]

Cross-dataset evaluation...
Cross-Dataset Results: {'eval_loss': 1.8387531042099, 'eval_accuracy': 0.4392, 'eval_precision': 0.19289664, 'eval_recall': 0.4392, 'eval_f1': 0.2680609227348527, 'eval_macro_f1': 0.3051695386325737, 'eval_runtime': 4.4505, 'eval_samples_per_second': 280.867, 'eval_steps_per_second': 8.988, 'epoch': 1.594810379241517}
Total execution time: 0:23:31


In [14]:
# Saving the trained model
results["liar_trainer"].save_model("./liar_trained_model")
results["fake_news_trainer"].save_model("./fake_news_trained_model")