In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [37]:
# Load the three tabular inputs: one Excel sheet (df1) and two tab-separated text files (df2, df3).
# NOTE(review): read_csv is called with its default header handling, so the first line of each
# .txt file is consumed as a header row — confirm the files really start with a header line.
df1 = pd.read_excel("/content/fake_news_flag_01.xlsx")
df2, df3 = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("/content/train.txt", "/content/test.txt")
)

In [39]:
# Give both tab-separated frames the same three column names.
for split_frame in (df2, df3):
    split_frame.columns = ["ID", "Text", "Label"]

In [40]:
# Keep only the rows whose label is something other than 'unverified'.
keep_train_rows = df2["Label"].ne("unverified")
keep_test_rows = df3["Label"].ne("unverified")
df2 = df2[keep_train_rows]
df3 = df3[keep_test_rows]

In [41]:
# Collapse the string labels into a binary target: "true" and "non-rumor" -> 0, "false" -> 1.
label_to_id = {"true": 0, "non-rumor": 0, "false": 1}

# Fail loudly on labels the mapping does not cover (this also catches an accidental re-run of
# this cell, when the column already holds 0/1). Series.map would otherwise turn them into NaN,
# which then corrupts the integer label tensor built later.
for split_name, split_frame in (("train", df2), ("test", df3)):
    unexpected_labels = set(split_frame["Label"].unique()) - set(label_to_id)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected label values in {split_name} split: {sorted(map(str, unexpected_labels))}"
        )

# df2/df3 are filtered slices of the frames read from disk; assign() builds new frames instead
# of writing a column into those slices, which avoids pandas' SettingWithCopyWarning.
df2 = df2.assign(Label=df2["Label"].map(label_to_id))
df3 = df3.assign(Label=df3["Label"].map(label_to_id))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Label'] = df2['Label'].map({"true": 0, "non-rumor": 0, "false": 1})


In [43]:
# One pretrained tokenizer per backbone; each is paired with the checkpoint its model loads.
bert_tokenizer, roberta_tokenizer, distilbert_tokenizer = (
    tokenizer_cls.from_pretrained(checkpoint_name)
    for tokenizer_cls, checkpoint_name in (
        (BertTokenizer, "bert-base-uncased"),
        (RobertaTokenizer, "roberta-base"),
        (DistilBertTokenizer, "distilbert-base-uncased"),
    )
)

In [44]:
class FakeNewsDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Texts are tokenized lazily, one item at a time, in ``__getitem__``.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of class ids, one per text; stored as a ``torch.long`` tensor.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length', max_length=..., return_tensors='pt')``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors whose
            leading dimension can be squeezed away.
        max_length: Length every sequence is padded/truncated to. Defaults to 512, the value
            that used to be hard-coded; shorter values reduce padding for short texts.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = torch.tensor(labels).long()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # The tokenizer output carries a leading batch dimension of size 1; drop it so the
            # DataLoader can stack samples into (batch, seq_len).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': self.labels[idx],
        }

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.texts)


In [45]:
class BertClassifier(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder followed by a single linear classification layer."""

    def __init__(self, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output computed from the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output
        return self.fc(pooled)

class RobertaClassifier(nn.Module):
    """Pretrained ``roberta-base`` encoder followed by a single linear classification layer."""

    def __init__(self, num_classes=2):
        super().__init__()
        # Loading this checkpoint reports 'pooler.dense.*' as newly initialized, so the pooler
        # feeding `fc` starts from random weights and is learned during fine-tuning.
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        hidden_size = self.roberta.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output computed from the encoder's ``pooler_output``."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output
        return self.fc(pooled)

class DistilBertClassifier(nn.Module):
    """Pretrained ``distilbert-base-uncased`` encoder followed by a single linear classification layer."""

    def __init__(self, num_classes=2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        hidden_size = self.distilbert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear layer's output computed from the first position of ``last_hidden_state``."""
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # This class reads the hidden state at sequence position 0 rather than a pooler output.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.fc(first_token_state)

In [46]:
def train_and_evaluate(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=3):
    """Fine-tune ``model`` on ``train_loader`` and report metrics on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` that returns a logits
            tensor with classes along dimension 1.
        train_loader: Sized iterable of batches; each batch is a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        test_loader: Iterable of batches with the same keys, used for evaluation only.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model and the batches are moved to.
        num_epochs: Number of passes over ``train_loader``. Defaults to 3, the value that
            used to be hard-coded.

    Returns:
        Tuple ``(all_preds, accuracy, precision, recall, f1)`` where ``all_preds`` is the list
        of predicted class ids in ``test_loader`` order and precision/recall/F1 are
        weighted-averaged across classes.

    Side effects:
        Prints the mean training loss per epoch and the final metrics; leaves ``model`` on
        ``device`` and in eval mode.
    """
    model.to(device)

    # An empty loader would otherwise raise ZeroDivisionError when averaging the loss.
    num_batches = max(len(train_loader), 1)

    for epoch in range(num_epochs):
        # Re-assert training mode at the start of every epoch so dropout is active regardless
        # of what state the model was handed over in.
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask)  # model returns raw logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / num_batches}")

    # Evaluation: no gradients, dropout disabled.
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            logits = model(input_ids, attention_mask=attention_mask)
            all_preds.extend(torch.argmax(logits, dim=1).tolist())
            # Labels are only needed as Python ints, so they are not copied to the device.
            all_labels.extend(batch["labels"].tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return all_preds, accuracy, precision, recall, f1


In [47]:
# Extract the text and label columns once per split; every tokenizer-specific dataset below
# is built from the same lists.
train_texts, train_labels = df2["Text"].tolist(), df2["Label"].tolist()
test_texts, test_labels = df3["Text"].tolist(), df3["Label"].tolist()

# One train/test dataset pair per tokenizer.
train_dataset_bert = FakeNewsDataset(train_texts, train_labels, bert_tokenizer)
test_dataset_bert = FakeNewsDataset(test_texts, test_labels, bert_tokenizer)

train_dataset_roberta = FakeNewsDataset(train_texts, train_labels, roberta_tokenizer)
test_dataset_roberta = FakeNewsDataset(test_texts, test_labels, roberta_tokenizer)

train_dataset_distilbert = FakeNewsDataset(train_texts, train_labels, distilbert_tokenizer)
test_dataset_distilbert = FakeNewsDataset(test_texts, test_labels, distilbert_tokenizer)

# Training batches are shuffled; test batches keep dataset order.
train_loader_kwargs = {"batch_size": 8, "shuffle": True}
test_loader_kwargs = {"batch_size": 8, "shuffle": False}

train_loader_bert = DataLoader(train_dataset_bert, **train_loader_kwargs)
test_loader_bert = DataLoader(test_dataset_bert, **test_loader_kwargs)

train_loader_roberta = DataLoader(train_dataset_roberta, **train_loader_kwargs)
test_loader_roberta = DataLoader(test_dataset_roberta, **test_loader_kwargs)

train_loader_distilbert = DataLoader(train_dataset_distilbert, **train_loader_kwargs)
test_loader_distilbert = DataLoader(test_dataset_distilbert, **test_loader_kwargs)

In [48]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [62]:
# Per-class loss weights: class 1 is weighted ten times as heavily as class 0.
# The tensor is created directly on the training device.
class_weights = torch.tensor([1, 10], dtype=torch.float, device=device)

In [63]:
# Instantiate the three classifiers (same order as before: BERT, RoBERTa, DistilBERT).
bert_model = BertClassifier(num_classes=2)
roberta_model = RobertaClassifier(num_classes=2)
distilbert_model = DistilBertClassifier(num_classes=2)

# Each model gets its own AdamW optimizer with the same learning rate.
bert_optimizer, roberta_optimizer, distilbert_optimizer = (
    optim.AdamW(classifier.parameters(), lr=2e-5)
    for classifier in (bert_model, roberta_model, distilbert_model)
)

# A single class-weighted cross-entropy loss is shared by all three training runs.
criterion = nn.CrossEntropyLoss(weight=class_weights)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# Fine-tune and evaluate the BERT classifier.
print("Training BERT Model...")
bert_preds, bert_accuracy, bert_precision, bert_recall, bert_f1 = train_and_evaluate(
    model=bert_model,
    train_loader=train_loader_bert,
    test_loader=test_loader_bert,
    criterion=criterion,
    optimizer=bert_optimizer,
    device=device,
)

Training BERT Model...
Epoch 1, Loss: 0.5476851130907352
Epoch 2, Loss: 0.350029055458995
Epoch 3, Loss: 0.12675535628715387
Accuracy: 0.8333
Precision: 0.8343
Recall: 0.8333
F1 Score: 0.8338


In [65]:
# Fine-tune and evaluate the RoBERTa classifier.
print("Training Roberta  Model...")
roberta_preds, roberta_accuracy, roberta_precision, roberta_recall, roberta_f1 = train_and_evaluate(
    model=roberta_model,
    train_loader=train_loader_roberta,
    test_loader=test_loader_roberta,
    criterion=criterion,
    optimizer=roberta_optimizer,
    device=device,
)

Training Roberta  Model...
Epoch 1, Loss: 0.5267167790577962
Epoch 2, Loss: 0.4128960998585591
Epoch 3, Loss: 0.25358939099197203
Accuracy: 0.6667
Precision: 0.8194
Recall: 0.6667
F1 Score: 0.6682


In [66]:
# Fine-tune and evaluate the DistilBERT classifier.
print("Training DistilBert Model...")
distilbert_preds, distilbert_accuracy, distilbert_precision, distilbert_recall, distilbert_f1 = train_and_evaluate(
    model=distilbert_model,
    train_loader=train_loader_distilbert,
    test_loader=test_loader_distilbert,
    criterion=criterion,
    optimizer=distilbert_optimizer,
    device=device,
)

Training DistilBert Model...
Epoch 1, Loss: 0.5292764351918147
Epoch 2, Loss: 0.37377411685884
Epoch 3, Loss: 0.15299717361967152
Accuracy: 0.8551
Precision: 0.8594
Recall: 0.8551
F1 Score: 0.8565


In [89]:
import torch

def soft_ensemble_majority_vote(models, dataloaders, device):
    """Predict classes by averaging the softmax probabilities of several models.

    Despite the name, this is soft voting: per-class probabilities are averaged across models
    and the arg-max of the average is taken, rather than counting each model's hard vote.

    Args:
        models: Sequence of modules, each called as ``model(input_ids, attention_mask)`` and
            returning logits with classes along dimension 1.
        dataloaders: One iterable of batches per model, in the same order as ``models``. Each
            batch is a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors. The
            loaders are iterated in lockstep and their outputs averaged position by position,
            so they must yield the same samples in the same order (e.g. ``shuffle=False``).
            Iteration stops when the shortest loader is exhausted.
        device: Device the models and batches are moved to.

    Returns:
        List of predicted class ids, in loader order.

    Side effects:
        Moves every model to ``device`` and leaves it in eval mode.
    """
    # Put every model on the target device and in eval mode so dropout is disabled; inference
    # must not depend on whatever mode an earlier cell left the models in.
    for model in models:
        model.to(device)
        model.eval()

    all_preds = []
    with torch.no_grad():  # inference only, no gradients needed
        for batches in zip(*dataloaders):  # one batch per model, same positions
            model_probs = []
            for model, batch in zip(models, batches):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                logits = model(input_ids, attention_mask)
                model_probs.append(torch.softmax(logits, dim=1))

            # Stack to [num_models, batch_size, num_classes] and average over the model axis.
            avg_probs = torch.stack(model_probs).mean(dim=0)
            all_preds.extend(avg_probs.argmax(dim=1).tolist())

    return all_preds

In [90]:
# Pair each fine-tuned model with the test loader built from its own tokenizer (same order).
models = [bert_model, roberta_model, distilbert_model]
dataloaders = [test_loader_bert, test_loader_roberta, test_loader_distilbert]

ensemble_preds = soft_ensemble_majority_vote(
    models=models,
    dataloaders=dataloaders,
    device=device,
)

In [91]:
# Score the ensemble predictions against the test labels (weighted averages across classes).
ensemble_true = df3["Label"]
ensemble_accuracy = accuracy_score(ensemble_true, ensemble_preds)
ensemble_precision, ensemble_recall, ensemble_f1 = (
    score_fn(ensemble_true, ensemble_preds, average="weighted")
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")
print(f"Ensemble Precision: {ensemble_precision:.4f}")
print(f"Ensemble Recall: {ensemble_recall:.4f}")
print(f"Ensemble F1 Score: {ensemble_f1:.4f}")

Ensemble Accuracy: 0.8188
Ensemble Precision: 0.8420
Ensemble Recall: 0.8188
Ensemble F1 Score: 0.8232


In [92]:
# Wrap df1's cleaned text for inference. The dataset class requires labels, so an all-zero
# placeholder list of matching length is supplied.
inference_texts = df1["Cleaned_text"].tolist()
placeholder_labels = [0] * len(inference_texts)

dataset_bert = FakeNewsDataset(inference_texts, placeholder_labels, bert_tokenizer)
dataset_roberta = FakeNewsDataset(inference_texts, placeholder_labels, roberta_tokenizer)
dataset_distilbert = FakeNewsDataset(inference_texts, placeholder_labels, distilbert_tokenizer)

In [93]:
# Unshuffled loaders so predictions line up with df1's row order.
dataloader_bert, dataloader_roberta, dataloader_distilbert = (
    DataLoader(inference_dataset, batch_size=8, shuffle=False)
    for inference_dataset in (dataset_bert, dataset_roberta, dataset_distilbert)
)

In [94]:
# Run the DistilBERT classifier over df1 and store its predictions as new columns.
distilbert_model.to(device)
distilbert_model.eval()

distilbert_predictions = []
with torch.no_grad():
    for batch in dataloader_distilbert:
        logits = distilbert_model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
        )
        probs = logits.softmax(dim=1)
        distilbert_predictions.extend(probs.argmax(dim=1).tolist())

# Numeric class ids, plus a string version where class 1 -> 'True' and class 0 -> 'False'.
df1["distilbert_predictions"] = distilbert_predictions
df1["distilbert_predictions_label"] = df1["distilbert_predictions"].map({0: "False", 1: "True"})

In [95]:
# Integer version of the flag column so it can be compared with the 0/1 predictions.
fake_news_flag = df1["Fake_News_Flag"]
df1["Fake_News_Flag_Num"] = fake_news_flag.astype(int)

In [96]:
# Preview the first five rows, including the prediction columns added above.
df1.head(n=5)

Unnamed: 0,Message_ID,Max_Cosine_Similarity,Fake_News_Flag,Cleaned_text,roberta_predictions,roberta_predictions_label,Fake_News_Flag_Num,distilbert_predictions,distilbert_predictions_label
0,1266,0.573545,True,donald trump hate america want eliminate israe...,1,True,1,1,True
1,1268,0.497152,True,breaking germany called european union classif...,1,True,1,1,True
2,1269,0.527227,True,accurate photo right israel israel v hamas syr...,1,True,1,1,True
3,1273,0.536036,True,breaking idf released footage precise eliminat...,1,True,1,1,True
4,1274,0.508663,True,love israel heart please comment shabbat shalo...,1,True,1,0,False


In [97]:
# Compare DistilBERT's predictions on df1 with the numeric flag column (default binary averaging).
flag_true = df1["Fake_News_Flag_Num"]
flag_pred = df1["distilbert_predictions"]
accuracy, precision, recall, f1 = (
    metric_fn(flag_true, flag_pred)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.4362934362934363
Precision: 0.9593220338983051
Recall: 0.3991537376586742
F1 Score: 0.5637450199203188


In [88]:
# Class balance of the predictions versus the reference flag.
for column_name in ("distilbert_predictions", "Fake_News_Flag"):
    print(df1[column_name].value_counts())


distilbert_predictions
1    1553
0       1
Name: count, dtype: int64
Fake_News_Flag
True     1418
False     136
Name: count, dtype: int64
