In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the input files. Defaults to the Colab upload location but
# can be overridden through the FAKE_NEWS_DATA_DIR environment variable so the
# notebook also runs outside Colab without editing paths.
DATA_DIR = os.environ.get("FAKE_NEWS_DATA_DIR", "/content")

# df1: spreadsheet scored with the fine-tuned model later in the notebook
# (its "Cleaned_text" and "Fake_News_Flag" columns are read further down).
df1 = pd.read_excel(os.path.join(DATA_DIR, "fake_news_flag_01.xlsx"))
# df2: CSV that is split into train/test partitions for fine-tuning
# (its "Text", "Fake_News_Flag" and "length" columns are referenced below).
df2 = pd.read_csv(os.path.join(DATA_DIR, "ISOT_Fake_News_subset.csv"))

In [None]:
# Count rows of df2 that are exact duplicates of an earlier row; as the last
# expression of the cell, the count is displayed as the cell output.
df2.duplicated().sum()

np.int64(0)

In [None]:
# Drop the "length" column by rebinding `df2` instead of mutating in place.
# errors="ignore" keeps the cell safe to re-run: the previous in-place drop
# raised KeyError on a second execution because the column was already gone.
df2 = df2.drop(columns=["length"], errors="ignore")

In [None]:
# Stratified 80/20 split on the label column; the fixed random_state makes the
# partitioning reproducible across runs.
df2_train, df2_test = train_test_split(
    df2,
    test_size=0.20,
    stratify=df2["Fake_News_Flag"],
    random_state=42,
)

# Print the normalized label distribution of each partition (train first).
for partition in (df2_train, df2_test):
    print(partition["Fake_News_Flag"].value_counts(normalize=True))

Fake_News_Flag
True     0.5
False    0.5
Name: proportion, dtype: float64
Fake_News_Flag
False    0.5
True     0.5
Name: proportion, dtype: float64


In [None]:
# Leakage check: collect texts that occur in both partitions.
# An empty set means no text is shared between train and test.
shared_texts = set(df2_train["Text"]) & set(df2_test["Text"])
print(shared_texts)

set()


In [None]:
# Load one pretrained tokenizer per encoder. Each tokenizer is paired later
# with the classifier built on the checkpoint of the same name.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: Iterable of raw strings (list, tuple, pandas Series, ...).
            It is copied into a list so ``idx`` is always positional, even
            for a Series whose index is not ``0..n-1`` (e.g. after a split).
        labels: Sequence of class labels (ints or bools), one per text.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors carrying a leading dimension of size 1.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 512, the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Positional storage: a Series would otherwise be indexed by label.
        self.texts = list(texts)
        # as_tensor avoids a copy (and a warning) when labels is a tensor.
        self.labels = torch.as_tensor(labels).long()
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx`` with its label."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # squeeze(0) removes the size-1 leading dimension added by
            # return_tensors='pt' so the DataLoader can stack samples.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': self.labels[idx],
        }

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)


In [None]:
class BertClassifier(nn.Module):
    """``bert-base-uncased`` encoder followed by a linear classification head.

    Args:
        num_classes: Number of output scores produced per input.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_dim = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's pooled output."""
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        return self.fc(pooled)

class RobertaClassifier(nn.Module):
    """``roberta-base`` encoder followed by a linear classification head.

    Args:
        num_classes: Number of output scores produced per input.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        hidden_dim = self.roberta.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's pooled output."""
        # NOTE(review): the load log in this notebook reports the
        # 'pooler.dense' weights of roberta-base as newly initialized, so the
        # pooler feeding this head starts from random weights.
        pooled = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        return self.fc(pooled)

class DistilBertClassifier(nn.Module):
    """``distilbert-base-uncased`` encoder followed by a linear classification head.

    Args:
        num_classes: Number of output scores produced per input.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        hidden_dim = self.distilbert.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first sequence position."""
        hidden_states = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # This class has no pooled output in use here; index 0 along the
        # second axis selects the first token's hidden state as the summary.
        first_token = hidden_states[:, 0]
        return self.fc(first_token)

In [None]:
def train_and_evaluate(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=3):
    """Fine-tune ``model`` on ``train_loader`` and score it on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` that
            returns a logits tensor.
        train_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        test_loader: Iterable of batches with the same keys, used for scoring.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer holding ``model``'s parameters.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``. Defaults to 3,
            the value that used to be hard-coded.

    Returns:
        Tuple ``(all_preds, accuracy, precision, recall, f1)`` where
        ``all_preds`` is the list of predicted class indices in
        ``test_loader`` order and precision/recall/F1 are weighted averages.
    """
    model.to(device)

    # ---- Training ----
    for epoch in range(num_epochs):
        # Re-assert train mode every epoch so the loop stays correct even if
        # an evaluation pass is ever inserted between epochs.
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # max(..., 1) avoids ZeroDivisionError when the loader has no batches.
        print(f"Epoch {epoch+1}, Loss: {total_loss / max(len(train_loader), 1)}")

    # ---- Evaluation ----
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit in each row.
            all_preds.extend(torch.argmax(logits, dim=1).tolist())
            all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return all_preds, accuracy, precision, recall, f1


In [None]:
# Materialize texts and labels once; each tokenizer-specific dataset reuses them.
train_texts = df2_train["Text"].tolist()
train_labels = df2_train["Fake_News_Flag"].tolist()
test_texts = df2_test["Text"].tolist()
test_labels = df2_test["Fake_News_Flag"].tolist()

# One train/test dataset pair per tokenizer: the same texts and labels are
# tokenized differently depending on the encoder they are fed to.
train_dataset_bert = FakeNewsDataset(train_texts, train_labels, bert_tokenizer)
test_dataset_bert = FakeNewsDataset(test_texts, test_labels, bert_tokenizer)

train_dataset_roberta = FakeNewsDataset(train_texts, train_labels, roberta_tokenizer)
test_dataset_roberta = FakeNewsDataset(test_texts, test_labels, roberta_tokenizer)

train_dataset_distilbert = FakeNewsDataset(train_texts, train_labels, distilbert_tokenizer)
test_dataset_distilbert = FakeNewsDataset(test_texts, test_labels, distilbert_tokenizer)

# Training loaders shuffle; test loaders keep dataset order so predictions
# stay aligned with the rows of df2_test.
train_loader_bert = DataLoader(train_dataset_bert, batch_size=8, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=8, shuffle=False)

train_loader_roberta = DataLoader(train_dataset_roberta, batch_size=8, shuffle=True)
test_loader_roberta = DataLoader(test_dataset_roberta, batch_size=8, shuffle=False)

train_loader_distilbert = DataLoader(train_dataset_distilbert, batch_size=8, shuffle=True)
test_loader_distilbert = DataLoader(test_dataset_distilbert, batch_size=8, shuffle=False)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Per-class loss weights handed to CrossEntropyLoss below: class index 0 gets
# weight 1, class index 1 gets weight 10.
# NOTE(review): the label distribution printed after the split is 50/50, so
# the 10x weight is not compensating for imbalance in df2 - confirm intent.
class_weights = torch.tensor([1, 10], dtype=torch.float).to(device)

In [None]:
# Instantiate the three classifiers, each with its own AdamW optimizer
# (learning rate 2e-5) over that model's parameters.
# NOTE(review): no torch seed is set anywhere in the visible notebook, so the
# randomly initialized linear heads presumably differ between runs - confirm.
bert_model = BertClassifier(num_classes=2)
bert_optimizer = optim.AdamW(bert_model.parameters(), lr=2e-5)

roberta_model = RobertaClassifier(num_classes=2)
roberta_optimizer = optim.AdamW(roberta_model.parameters(), lr=2e-5)

distilbert_model = DistilBertClassifier(num_classes=2)
distilbert_optimizer = optim.AdamW(distilbert_model.parameters(), lr=2e-5)

# Single weighted cross-entropy loss shared by all three training runs.
criterion = nn.CrossEntropyLoss(weight=class_weights)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Fine-tune BERT on the training loader, then score it on the test loader.
# The returned predictions and metrics are kept under bert_* names.
print("Training BERT Model...")
bert_preds, bert_accuracy, bert_precision, bert_recall, bert_f1 = train_and_evaluate(bert_model, train_loader_bert, test_loader_bert, criterion, bert_optimizer, device)

Training BERT Model...
Epoch 1, Loss: 0.02891748454834504
Epoch 2, Loss: 0.000252913760288555
Epoch 3, Loss: 9.505832416338804e-05
Accuracy: 0.9991
Precision: 0.9991
Recall: 0.9991
F1 Score: 0.9991


In [None]:
# Fine-tune RoBERTa on the training loader, then score it on the test loader.
# The returned predictions and metrics are kept under roberta_* names.
# NOTE(review): the saved output below reads "RobertaT", which does not match
# this message, so the recorded output predates the current cell text.
print("Training Roberta  Model...")
roberta_preds, roberta_accuracy, roberta_precision, roberta_recall, roberta_f1 = train_and_evaluate(
    roberta_model, train_loader_roberta, test_loader_roberta, criterion, roberta_optimizer, device
)

Training RobertaT Model...
Epoch 1, Loss: 0.03126151370220627
Epoch 2, Loss: 0.0025042600963471077
Epoch 3, Loss: 4.8188610595719765e-05
Accuracy: 0.9991
Precision: 0.9991
Recall: 0.9991
F1 Score: 0.9991


In [None]:
# Fine-tune DistilBERT on the training loader, then score it on the test loader.
# The returned predictions and metrics are kept under distilbert_* names.
print("Training DistilBert Model...")
distilbert_preds, distilbert_accuracy, distilbert_precision, distilbert_recall, distilbert_f1 = train_and_evaluate(
    distilbert_model, train_loader_distilbert, test_loader_distilbert, criterion, distilbert_optimizer, device
)

Training DistilBert Model...
Epoch 1, Loss: 0.032557776493300813
Epoch 2, Loss: 0.00016941715874550442
Epoch 3, Loss: 6.28914788930243e-05
Accuracy: 0.9991
Precision: 0.9991
Recall: 0.9991
F1 Score: 0.9991


In [None]:
import torch

def soft_ensemble_majority_vote(models, dataloaders, device):
    """Predict classes by averaging the models' softmax outputs (soft voting).

    ``models[i]`` is fed the batches of ``dataloaders[i]``. The loaders are
    iterated in lockstep, so they must yield the same samples in the same
    order, each tokenized for its own model.

    Args:
        models: Iterable of modules called as ``model(input_ids,
            attention_mask)`` that return logits with classes on dim 1.
        dataloaders: Iterable of loaders, one per model. Each batch is a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors;
            any other key (e.g. ``"labels"``) is ignored.
        device: Device the models and batches are moved to.

    Returns:
        List of predicted class indices, in loader order.

    Raises:
        ValueError: If the number of models and dataloaders differ.
    """
    models = list(models)
    dataloaders = list(dataloaders)
    if len(models) != len(dataloaders):
        # zip() would otherwise silently drop the surplus models or loaders.
        raise ValueError(
            f"expected one dataloader per model, "
            f"got {len(models)} models and {len(dataloaders)} dataloaders"
        )

    for model in models:
        model.to(device)
        # Inference must run in eval mode; otherwise dropout stays active for
        # any model still in training mode and predictions become random.
        model.eval()

    all_preds = []
    with torch.no_grad():  # no gradients needed for inference
        for batches in zip(*dataloaders):  # one batch per model, in lockstep
            model_probs = []
            for model, batch in zip(models, batches):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                logits = model(input_ids, attention_mask)
                model_probs.append(torch.softmax(logits, dim=1))

            # Mean over the stacked model axis gives per-sample class probabilities.
            avg_probs = torch.mean(torch.stack(model_probs), dim=0)
            all_preds.extend(torch.argmax(avg_probs, dim=1).tolist())

    return all_preds

In [None]:
# Models and loaders are paired by position: each model receives the test set
# tokenized with its own tokenizer. The test loaders were built with
# shuffle=False, so all three yield the samples in the same order.
models = [bert_model, roberta_model, distilbert_model]
dataloaders = [test_loader_bert, test_loader_roberta, test_loader_distilbert]

ensemble_preds = soft_ensemble_majority_vote(models, dataloaders, device)

In [None]:
# Ground truth for the held-out split; ensemble_preds is compared against it.
y_true_test = df2_test["Fake_News_Flag"]
weighted = {"average": "weighted"}  # same averaging for precision/recall/F1

ensemble_accuracy = accuracy_score(y_true_test, ensemble_preds)
ensemble_precision = precision_score(y_true_test, ensemble_preds, **weighted)
ensemble_recall = recall_score(y_true_test, ensemble_preds, **weighted)
ensemble_f1 = f1_score(y_true_test, ensemble_preds, **weighted)

print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")
print(f"Ensemble Precision: {ensemble_precision:.4f}")
print(f"Ensemble Recall: {ensemble_recall:.4f}")
print(f"Ensemble F1 Score: {ensemble_f1:.4f}")

Ensemble Accuracy: 0.9991
Ensemble Precision: 0.9991
Ensemble Recall: 0.9991
Ensemble F1 Score: 0.9991


In [None]:
# Wrap df1's texts for inference. The dataset class requires labels, so an
# all-zero placeholder list is supplied; the inference loop below never reads it.
df1_texts = df1["Cleaned_text"].tolist()
placeholder_labels = [0] * len(df1)

dataset_bert = FakeNewsDataset(df1_texts, placeholder_labels, bert_tokenizer)
dataset_roberta = FakeNewsDataset(df1_texts, placeholder_labels, roberta_tokenizer)
dataset_distilbert = FakeNewsDataset(df1_texts, placeholder_labels, distilbert_tokenizer)

In [None]:
# Inference loaders over df1 in batches of 8. shuffle=False keeps the batch
# order equal to the row order of df1, which the later column assignment of
# the predictions relies on.
dataloader_bert = DataLoader(dataset_bert, batch_size=8, shuffle=False)
dataloader_roberta = DataLoader(dataset_roberta, batch_size=8, shuffle=False)
dataloader_distilbert = DataLoader(dataset_distilbert, batch_size=8, shuffle=False)

In [None]:
# Score every df1 text with the fine-tuned RoBERTa classifier.
roberta_model.to(device)
roberta_model.eval()  # inference mode (disables dropout)

roberta_predictions = []
with torch.no_grad():
    for batch in dataloader_roberta:
        logits = roberta_model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
        )
        probs = torch.softmax(logits, dim=1)
        # Predicted class = index of the highest probability in each row.
        roberta_predictions += torch.argmax(probs, dim=1).tolist()

# Store the numeric predictions, plus a string version of the same labels.
df1['roberta_predictions'] = roberta_predictions
df1['roberta_predictions_label'] = df1['roberta_predictions'].map({0: 'False', 1: 'True'})

In [None]:
# Integer copy of the ground-truth flag so it can be compared with the 0/1
# predictions (the head() output below shows True stored as 1).
df1['Fake_News_Flag_Num'] = df1['Fake_News_Flag'].astype(int)

In [None]:
# Preview the first rows of df1, including the newly added prediction columns.
df1.head()

Unnamed: 0,Message_ID,Max_Cosine_Similarity,Fake_News_Flag,Cleaned_text,roberta_predictions,roberta_predictions_label,Fake_News_Flag_Num
0,1266,0.573545,True,donald trump hate america want eliminate israe...,0,False,1
1,1268,0.497152,True,breaking germany called european union classif...,0,False,1
2,1269,0.527227,True,accurate photo right israel israel v hamas syr...,0,False,1
3,1273,0.536036,True,breaking idf released footage precise eliminat...,0,False,1
4,1274,0.508663,True,love israel heart please comment shabbat shalo...,0,False,1


In [None]:
# Compare RoBERTa's predictions on df1 with the integer ground truth.
# Precision/recall/F1 use sklearn's default averaging here (no `average=`
# argument is passed, unlike the weighted metrics computed for df2).
y_true_df1 = df1['Fake_News_Flag_Num']
y_pred_df1 = df1['roberta_predictions']

accuracy = accuracy_score(y_true_df1, y_pred_df1)
precision = precision_score(y_true_df1, y_pred_df1)
recall = recall_score(y_true_df1, y_pred_df1)
f1 = f1_score(y_true_df1, y_pred_df1)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.08751608751608751
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Print how often each label was predicted, then how often each ground-truth
# flag occurs, so the two distributions can be compared.
print(df1['roberta_predictions_label'].value_counts())
print(df1['Fake_News_Flag'].value_counts())
