In [None]:
!git clone https://github.com/pooja-premnath/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil

Cloning into 'SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 7 (delta 2), reused 7 (delta 2), pack-reused 0[K
Receiving objects: 100% (7/7), 192.82 KiB | 19.28 MiB/s, done.
Resolving deltas: 100% (2/2), done.


## XLM-RoBERTa

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from tqdm import tqdm

# Load the dataset
# NOTE(review): absolute path under /content — presumably the directory the
# repository was cloned into; confirm before running in another environment.
df = pd.read_csv("/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv")

# Map the categories to numerical labels
# Series.map yields NaN for any category string that is not a key of label_dict.
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Split the data into train and test sets
# 80/20 split with a fixed seed; no `stratify` argument is passed, so class
# proportions are not forced to match between the two splits.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

# Load the tokenizer and tokenize the data
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Each split is tokenized in one call: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset class
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (the tokenizer output in this notebook).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not alias the source data.

        Existing tensors are copied with ``clone().detach()``; feeding them to
        ``torch.tensor`` again emits a copy-construct UserWarning on every item
        fetched. Any other value goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature tensors plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Create datasets
# `.values` hands the labels over as a NumPy array, so NewsDataset indexes them
# by position rather than by the shuffled pandas index left by the split.
train_dataset = NewsDataset(train_encodings, train_labels.values)
test_dataset = NewsDataset(test_encodings, test_labels.values)

# Load the pre-trained model for sequence classification
# num_labels=4 matches the four entries of label_dict.
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=4)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for the Trainer.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values scikit-learn already falls back to
    # when a class has no predicted (or true) samples, but without raising an
    # UndefinedMetricWarning on every evaluation pass.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# Define training arguments
# NOTE(review): the other model cells in this notebook also write to
# './results' and './logs', so checkpoints and logs of different models share
# one directory.
# NOTE(review): warmup_steps=500 is a fixed count, independent of dataset size
# and epoch count — confirm it is not a large share of the total steps.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=5,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate every epoch
    save_strategy="epoch",           # Save the model every epoch
    load_best_model_at_end=True,     # Load the best model when finished training
    metric_for_best_model="accuracy" # Use accuracy to select the best model
)

# Initialize the Trainer
# NOTE(review): eval_dataset is the held-out test split, and the best checkpoint
# is selected by accuracy on that same split, so the "Test" numbers printed
# below come from data that was also used for model selection. Consider a
# separate validation split.
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # The callback that computes metrics of interest
)

# Train the model
trainer.train()

# Evaluate the model on the test set
# No dataset is passed, so this runs on the Trainer's eval_dataset (test_dataset).
results = trainer.evaluate()

# Print the results
# The keys returned by compute_metrics are read back with an "eval_" prefix.
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5973,0.587761,0.692534,0.604373,0.561308,0.692534
2,0.7745,0.648559,0.692534,0.604794,0.564575,0.692534
3,0.5846,0.573542,0.700393,0.613153,0.836402,0.700393
4,0.5049,0.593841,0.700393,0.616683,0.767263,0.700393
5,0.5224,0.573715,0.687623,0.679759,0.687119,0.687623


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.7004
Test F1 Score: 0.6132
Test Precision: 0.8364
Test Recall: 0.7004


## mBERT

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from tqdm import tqdm

# Load the dataset
# NOTE(review): absolute path under /content — presumably the directory the
# repository was cloned into; confirm before running in another environment.
df = pd.read_csv("/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv")

# Map the categories to numerical labels
# Series.map yields NaN for any category string that is not a key of label_dict.
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Split the data into train and test sets
# Same 80/20 split and seed as the other model cells; not stratified by class.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

# Load the tokenizer and tokenize the data using mBERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Each split is tokenized in one call: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset class
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (the tokenizer output in this notebook).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not alias the source data.

        Existing tensors are copied with ``clone().detach()``; feeding them to
        ``torch.tensor`` again emits a copy-construct UserWarning on every item
        fetched. Any other value goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature tensors plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Create datasets
# `.values` hands the labels over as a NumPy array, so NewsDataset indexes them
# by position rather than by the shuffled pandas index left by the split.
train_dataset = NewsDataset(train_encodings, train_labels.values)
test_dataset = NewsDataset(test_encodings, test_labels.values)

# Load the pre-trained mBERT model for sequence classification
# num_labels=4 matches the four entries of label_dict.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=4)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for the Trainer.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values scikit-learn already falls back to
    # when a class has no predicted (or true) samples, but without raising an
    # UndefinedMetricWarning on every evaluation pass.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# Define training arguments
# NOTE(review): the other model cells in this notebook also write to
# './results' and './logs', so checkpoints and logs of different models share
# one directory.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=5,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate every epoch
    save_strategy="epoch",           # Save the model every epoch
    load_best_model_at_end=True,     # Load the best model when finished training
    metric_for_best_model="accuracy" # Use accuracy to select the best model
)

# Initialize the Trainer
# NOTE(review): the test split is passed as eval_dataset and also drives
# best-checkpoint selection, so the reported "Test" metrics are not from data
# unseen during model selection. Consider a separate validation split.
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # The callback that computes metrics of interest
)

# Train the model
trainer.train()

# Evaluate the model on the test set
# No dataset is passed, so this runs on the Trainer's eval_dataset (test_dataset).
results = trainer.evaluate()

# Print the results
# The keys returned by compute_metrics are read back with an "eval_" prefix.
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.581,0.586382,0.69057,0.603441,0.606922,0.69057
2,0.625,0.608179,0.693517,0.604316,0.560249,0.693517
3,0.5017,0.669843,0.69057,0.60877,0.746893,0.69057
4,0.5042,0.571384,0.701375,0.614349,0.575132,0.701375
5,0.5065,0.582963,0.688605,0.656041,0.680518,0.688605


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.7014
Test F1 Score: 0.6143
Test Precision: 0.5751
Test Recall: 0.7014


  _warn_prf(average, modifier, msg_start, len(result))


## mDeBERTa

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the dataset
# NOTE(review): absolute path under /content — presumably the directory the
# repository was cloned into; confirm before running in another environment.
df = pd.read_csv("/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv")

# Map the categories to numerical labels
# Series.map yields NaN for any category string that is not a key of label_dict.
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Split the data into train and test sets
# Same 80/20 split and seed as the other model cells; not stratified by class.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

# Load the tokenizer and tokenize the data using mDeBERTa-v3-base-mnli-xnli
tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

# Each split is tokenized in one call: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset class
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (the tokenizer output in this notebook).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not alias the source data.

        Existing tensors are copied with ``clone().detach()``; feeding them to
        ``torch.tensor`` again emits a copy-construct UserWarning on every item
        fetched. Any other value goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature tensors plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Create datasets
# `.values` hands the labels over as a NumPy array, so NewsDataset indexes them
# by position rather than by the shuffled pandas index left by the split.
train_dataset = NewsDataset(train_encodings, train_labels.values)
test_dataset = NewsDataset(test_encodings, test_labels.values)

# Load the pre-trained mDeBERTa-v3 model for sequence classification
# The checkpoint's classifier has 3 outputs (see the load message in this cell's
# output), so ignore_mismatched_sizes=True lets a freshly initialized 4-way
# head replace it instead of failing on the shape mismatch.
model = AutoModelForSequenceClassification.from_pretrained(
    "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    num_labels=4,
    ignore_mismatched_sizes=True
)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for the Trainer.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values scikit-learn already falls back to
    # when a class has no predicted (or true) samples, but without raising an
    # UndefinedMetricWarning on every evaluation pass.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# Define training arguments
# NOTE(review): the other model cells in this notebook also write to
# './results' and './logs', so checkpoints and logs of different models share
# one directory.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=5,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate every epoch
    save_strategy="epoch",           # Save the model every epoch
    load_best_model_at_end=True,     # Load the best model when finished training
    metric_for_best_model="accuracy" # Use accuracy to select the best model
)

# Initialize the Trainer
# NOTE(review): the test split is passed as eval_dataset and also drives
# best-checkpoint selection, so the reported "Test" metrics are not from data
# unseen during model selection. Consider a separate validation split.
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # The callback that computes metrics of interest
)

# Train the model
trainer.train()

# Evaluate the model on the test set
# No dataset is passed, so this runs on the Trainer's eval_dataset (test_dataset).
results = trainer.evaluate()

# Print the results
# The keys returned by compute_metrics are read back with an "eval_" prefix.
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/mDeBERTa-v3-base-mnli-xnli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5311,0.59541,0.684676,0.64178,0.664565,0.684676
2,0.6287,0.591808,0.688605,0.621462,0.649461,0.688605
3,0.5184,0.640156,0.695481,0.615003,0.693945,0.695481
4,0.4797,0.595733,0.696464,0.653043,0.68649,0.696464
5,0.4583,0.621349,0.691552,0.690368,0.693384,0.691552


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.6965
Test F1 Score: 0.6530
Test Precision: 0.6865
Test Recall: 0.6965


## Tamil BERT

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Load the dataset
# NOTE(review): absolute path under /content — presumably the directory the
# repository was cloned into; confirm before running in another environment.
df = pd.read_csv("/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv")

# Map the categories to numerical labels
# Series.map yields NaN for any category string that is not a key of label_dict.
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Split the data into train and test sets
# Same 80/20 split and seed as the other model cells; not stratified by class.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

# Load the tokenizer and tokenize the data using the Tamil BERT model
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/tamil-bert")

# Each split is tokenized in one call: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset class
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (the tokenizer output in this notebook).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not alias the source data.

        Existing tensors are copied with ``clone().detach()``; feeding them to
        ``torch.tensor`` again emits a copy-construct UserWarning on every item
        fetched. Any other value goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature tensors plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Create datasets
# `.values` hands the labels over as a NumPy array, so NewsDataset indexes them
# by position rather than by the shuffled pandas index left by the split.
train_dataset = NewsDataset(train_encodings, train_labels.values)
test_dataset = NewsDataset(test_encodings, test_labels.values)

# Load the pre-trained Tamil BERT model
# Loaded with its masked-LM head; the wrapper class defined in this cell only
# takes the `.bert` submodule from it.
model = AutoModelForMaskedLM.from_pretrained("l3cube-pune/tamil-bert")

# Modify the model to perform sequence classification
class CustomBERTForSequenceClassification(nn.Module):
    """Sequence classifier: a BERT encoder followed by a single linear layer.

    Args:
        pretrained_model: Loaded model whose ``.bert`` attribute is the encoder
            and whose ``.config.hidden_size`` is the encoder output width.
        num_labels: Number of target classes (output size of the linear head).
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        hidden_size = pretrained_model.config.hidden_size
        # Only the encoder is kept from the wrapped model.
        self.bert = pretrained_model.bert
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when no labels are given."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Classify from the hidden state at sequence position 0.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token)
        if labels is None:
            return (None, logits)
        criterion = nn.CrossEntropyLoss()
        return (criterion(logits, labels), logits)

# Initialize the custom model for sequence classification
# Rebinds `model`: the masked-LM object loaded above is replaced by the
# 4-class wrapper built around it.
model = CustomBERTForSequenceClassification(model, num_labels=4)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for the Trainer.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values scikit-learn already falls back to
    # when a class has no predicted (or true) samples, but without raising an
    # UndefinedMetricWarning on every evaluation pass.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# Define training arguments
# This cell trains for 10 epochs, whereas the XLM-RoBERTa, mBERT and mDeBERTa
# cells use 5.
# NOTE(review): the other model cells in this notebook also write to
# './results' and './logs', so checkpoints and logs of different models share
# one directory.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=10,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate every epoch
    save_strategy="epoch",           # Save the model every epoch
    load_best_model_at_end=True,     # Load the best model when finished training
    metric_for_best_model="accuracy" # Use accuracy to select the best model
)

# Initialize the Trainer
# NOTE(review): the test split is passed as eval_dataset and also drives
# best-checkpoint selection, so the reported "Test" metrics are not from data
# unseen during model selection. Consider a separate validation split.
trainer = Trainer(
    model=model,                         # The instantiated model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # The callback that computes metrics of interest
)

# Train the model
trainer.train()

# Evaluate the model on the test set
# No dataset is passed, so this runs on the Trainer's eval_dataset (test_dataset).
results = trainer.evaluate()

# Print the results
# The keys returned by compute_metrics are read back with an "eval_" prefix.
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.168,1.156668,0.663065,0.59193,0.706295,0.663065
2,0.941,0.922954,0.696464,0.608458,0.568387,0.696464
3,0.684,0.705487,0.698428,0.611289,0.571161,0.698428
4,0.609,0.610287,0.70334,0.614819,0.57155,0.70334
5,0.5794,0.612035,0.694499,0.607054,0.562512,0.694499
6,0.4659,0.629887,0.696464,0.636842,0.691153,0.696464
7,0.4415,0.616324,0.701375,0.702532,0.708369,0.701375
8,0.3764,0.640118,0.68664,0.692224,0.699055,0.68664
9,0.3308,0.671855,0.695481,0.698387,0.703396,0.695481
10,0.3206,0.680175,0.696464,0.698912,0.703814,0.696464


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in s

Test Accuracy: 0.7033
Test F1 Score: 0.6148
Test Precision: 0.5716
Test Recall: 0.7033


  _warn_prf(average, modifier, msg_start, len(result))


## Indic BERT

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Load the dataset
# NOTE(review): absolute path under /content — presumably the directory the
# repository was cloned into; confirm before running in another environment.
df = pd.read_csv("/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv")

# Map the categories to numerical labels
# Series.map yields NaN for any category string that is not a key of label_dict.
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Split the data into train and test sets
# Same 80/20 split and seed as the other model cells; not stratified by class.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

# Load the tokenizer and tokenize the data using IndicBERT
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

# Each split is tokenized in one call: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Custom Dataset class
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (the tokenizer output in this notebook).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not alias the source data.

        Existing tensors are copied with ``clone().detach()``; feeding them to
        ``torch.tensor`` again emits a copy-construct UserWarning on every item
        fetched. Any other value goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature tensors plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Create datasets
# `.values` hands the labels over as a NumPy array, so NewsDataset indexes them
# by position rather than by the shuffled pandas index left by the split.
train_dataset = NewsDataset(train_encodings, train_labels.values)
test_dataset = NewsDataset(test_encodings, test_labels.values)

# Load the pre-trained IndicBERT model
# Loaded via AutoModel (no task head); the wrapper class defined in this cell
# adds the classification layer.
model = AutoModel.from_pretrained("ai4bharat/indic-bert")

# Modify the model to perform sequence classification
class CustomIndicBERTForSequenceClassification(nn.Module):
    """Sequence classifier: the given encoder followed by a single linear layer.

    Args:
        pretrained_model: Encoder module, used as-is; its
            ``.config.hidden_size`` sets the input width of the linear head.
        num_labels: Number of target classes (output size of the linear head).
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        hidden_size = pretrained_model.config.hidden_size
        # The whole pretrained model serves as the encoder.
        self.bert = pretrained_model
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when no labels are given."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Classify from the hidden state at sequence position 0.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token)
        if labels is None:
            return (None, logits)
        criterion = nn.CrossEntropyLoss()
        return (criterion(logits, labels), logits)

# Initialize the custom model for sequence classification
# Rebinds `model`: the bare encoder loaded above is replaced by the 4-class
# wrapper built around it.
model = CustomIndicBERTForSequenceClassification(model, num_labels=4)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for the Trainer.

    Args:
        pred: Evaluation output exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values scikit-learn already falls back to
    # when a class has no predicted (or true) samples, but without raising an
    # UndefinedMetricWarning on every evaluation pass.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# Define training arguments
# This cell trains for 10 epochs, whereas the XLM-RoBERTa, mBERT and mDeBERTa
# cells use 5.
# NOTE(review): the other model cells in this notebook also write to
# './results' and './logs', so checkpoints and logs of different models share
# one directory.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=10,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate every epoch
    save_strategy="epoch",           # Save the model every epoch
    load_best_model_at_end=True,     # Load the best model when finished training
    metric_for_best_model="accuracy" # Use accuracy to select the best model
)

# Initialize the Trainer
# NOTE(review): the test split is passed as eval_dataset and also drives
# best-checkpoint selection, so the reported "Test" metrics are not from data
# unseen during model selection. Consider a separate validation split.
trainer = Trainer(
    model=model,                         # The instantiated model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # The callback that computes metrics of interest
)

# Train the model
trainer.train()

# Evaluate the model on the test set
# No dataset is passed, so this runs on the Trainer's eval_dataset (test_dataset).
results = trainer.evaluate()

# Print the results
# The keys returned by compute_metrics are read back with an "eval_" prefix.
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0733,1.116384,0.448919,0.288827,0.214191,0.448919
2,0.9482,0.981451,0.493124,0.361434,0.36404,0.493124
3,0.9893,0.875509,0.5,0.355279,0.438059,0.5
4,0.7659,0.728331,0.644401,0.562625,0.586224,0.644401
5,0.6364,0.65896,0.672888,0.589336,0.55715,0.672888
6,0.5804,0.725412,0.6611,0.6053,0.645362,0.6611
7,0.5065,0.673628,0.664047,0.589606,0.589326,0.664047
8,0.4801,0.69786,0.668959,0.668915,0.67694,0.668959
9,0.501,0.674952,0.6611,0.662586,0.669481,0.6611
10,0.4528,0.695425,0.67387,0.674037,0.678981,0.67387


  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  _warn_prf(average, modifier, msg_start, len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.6739
Test F1 Score: 0.6740
Test Precision: 0.6790
Test Recall: 0.6739


## Indic BART

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Load the dataset
# NOTE(review): absolute path under /content — presumably the directory the
# repository was cloned into; confirm before running in another environment.
df = pd.read_csv("/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv")

# Map the categories to numerical labels
# Series.map yields NaN for any category string that is not a key of label_dict.
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Split the data into train and test sets
# Same 80/20 split and seed as the other model cells; not stratified by class.
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

# Load the tokenizer and tokenize the data using IndicBART
# NOTE(review): loaded with default tokenizer options and raw dataset text —
# confirm against the IndicBART model card whether extra tokenizer flags or
# input formatting are expected.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART")

# Each split is tokenized in one call: truncated to at most 128 tokens, padded,
# and returned as PyTorch tensors.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Map-style dataset over pre-tokenized text
class NewsDataset(Dataset):
    """Pairs tokenizer output with one class label per example.

    Args:
        encodings: mapping from field name to a tensor indexed by example.
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding a copy of every encoding row plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            # clone().detach() yields an independent copy of the stored row
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits; `.values` passes the labels as positional NumPy arrays
# instead of pandas Series that still carry the original row index
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels.values)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels.values)

# Fetch the pre-trained IndicBART seq2seq checkpoint
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")

# Classification wrapper: IndicBART encoder followed by a linear head
class CustomIndicBARTForSequenceClassification(nn.Module):
    """Sequence classifier built on the encoder of a pre-trained IndicBART model.

    Args:
        pretrained_model: object exposing ``config.d_model`` and ``model.encoder``.
        num_labels: number of output classes.
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        backbone_config = pretrained_model.config
        self.config = backbone_config                  # keep the backbone configuration on the wrapper
        self.encoder = pretrained_model.model.encoder  # only the encoder is used
        self.classifier = nn.Linear(backbone_config.d_model, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is not given."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 serves as the sentence summary
        first_position = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_position)
        if labels is None:
            return (None, logits)
        criterion = nn.CrossEntropyLoss()
        return (criterion(logits, labels), logits)

# Build the four-class classifier on top of the loaded IndicBART checkpoint
model = CustomIndicBARTForSequenceClassification(
    pretrained_model=pretrained_model,
    num_labels=4,
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        # zero_division=0: a class that is never predicted contributes precision 0
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted'),
    }

# Training configuration
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=10,             # passes over the training split
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    warmup_steps=500,                # warm-up steps
    weight_decay=0.01,               # weight-decay strength
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                # one log entry every 10 steps
    # --- evaluation and checkpointing ---
    eval_strategy="epoch",           # evaluate once per epoch
    save_strategy="no",              # never write checkpoints
    load_best_model_at_end=False,    # keep the final-epoch weights
    greater_is_better=True,          # NOTE(review): presumably unused while best-model loading is off — confirm
)

# The held-out split doubles as the per-epoch evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()

# Score the final weights on the held-out split
results = trainer.evaluate()

# Report the headline metrics
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3621,1.360925,0.311395,0.147883,0.096967,0.311395
2,1.2602,1.249939,0.429273,0.397749,0.533235,0.429273
3,1.131,1.079352,0.499018,0.462475,0.555887,0.499018
4,0.9517,0.996753,0.533399,0.446726,0.530718,0.533399
5,0.9402,1.044599,0.527505,0.44212,0.476117,0.527505
6,0.8642,1.025457,0.546169,0.464323,0.480966,0.546169
7,0.8907,0.921151,0.570727,0.507129,0.547757,0.570727
8,0.7815,0.919766,0.571709,0.50177,0.526733,0.571709
9,0.8477,0.904384,0.586444,0.511074,0.550808,0.586444
10,0.8539,0.921607,0.578585,0.502463,0.553074,0.578585


Test Accuracy: 0.5786
Test F1 Score: 0.5025
Test Precision: 0.5531
Test Recall: 0.5786


## MuRIL

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Read the augmented corpus from the repository cloned at the top of the notebook
df = pd.read_csv(
    "/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv"
)

# Swap every category name for its integer class id
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# MuRIL tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

# Each split is tokenized in one call: truncated at 128 tokens, padded to the
# longest sequence of that split, returned as PyTorch tensors
train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Map-style dataset over pre-tokenized text
class NewsDataset(Dataset):
    """Pairs tokenizer output with one class label per example.

    Args:
        encodings: mapping from field name to a tensor indexed by example.
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding a copy of every encoding row plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            # clone().detach() yields an independent copy of the stored row
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits; `.values` passes the labels as positional NumPy arrays
# instead of pandas Series that still carry the original row index
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels.values)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels.values)

# Fetch the pre-trained MuRIL checkpoint (loaded with its masked-LM head;
# only the `.bert` backbone is used by the classifier defined below)
model = AutoModelForMaskedLM.from_pretrained("google/muril-base-cased")

# Modify the model to perform sequence classification
class CustomMuRILForSequenceClassification(nn.Module):
    """Sequence classifier: MuRIL (BERT) backbone followed by a linear head.

    Args:
        pretrained_model: either a head model that exposes its backbone as
            ``.bert`` (e.g. the masked-LM model loaded in this notebook) or the
            bare backbone itself. Must expose ``config.hidden_size``.
        num_labels: number of output classes.
    """

    def __init__(self, pretrained_model, num_labels):
        super(CustomMuRILForSequenceClassification, self).__init__()
        # Accept both a head model (use its `.bert`) and a bare backbone, so this
        # wrapper takes the same kind of input as the other wrappers in the notebook
        self.muril = getattr(pretrained_model, "bert", pretrained_model)
        self.classifier = nn.Linear(pretrained_model.config.hidden_size, num_labels)  # classification head

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is not given."""
        outputs = self.muril(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Hidden state at sequence position 0 is used as the sentence representation
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return (loss, logits)

# Rebind `model` from the raw checkpoint to the four-class classification wrapper
model = CustomMuRILForSequenceClassification(
    pretrained_model=model,
    num_labels=4,
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        # zero_division=0: a class that is never predicted contributes precision 0
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted'),
    }

# Training configuration
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=10,              # passes over the training split
    per_device_train_batch_size=16,   # training batch size per device
    per_device_eval_batch_size=16,    # evaluation batch size per device
    warmup_steps=500,                 # warm-up steps
    weight_decay=0.01,                # weight-decay strength
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                 # one log entry every 10 steps
    # --- evaluation and checkpointing ---
    eval_strategy="epoch",            # evaluate once per epoch
    save_strategy="epoch",            # checkpoint once per epoch
    save_total_limit=1,               # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,      # reload the best checkpoint after training
    metric_for_best_model="accuracy"  # "best" = highest evaluation accuracy
)

# NOTE(review): eval_dataset is the held-out test split, so the checkpoint that
# load_best_model_at_end restores is selected on the same data the final
# metrics are reported on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune
trainer.train()

# Score the restored best checkpoint on the held-out split
results = trainer.evaluate()

# Report the headline metrics
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2085,1.178967,0.697446,0.60894,0.566621,0.697446
2,0.9345,0.915509,0.702358,0.612818,0.567974,0.702358
3,0.7077,0.707345,0.698428,0.609966,0.565079,0.698428
4,0.6346,0.620293,0.700393,0.612234,0.570024,0.700393
5,0.6036,0.622151,0.695481,0.63006,0.685741,0.695481
6,0.4443,0.680799,0.683694,0.672722,0.685123,0.683694
7,0.3841,0.681275,0.676817,0.681215,0.687154,0.676817
8,0.3824,0.699431,0.680747,0.685979,0.695829,0.680747
9,0.3513,0.728547,0.688605,0.69118,0.694778,0.688605
10,0.2804,0.742488,0.685658,0.689374,0.69355,0.685658


Test Accuracy: 0.7024
Test F1 Score: 0.6128
Test Precision: 0.5680
Test Recall: 0.7024


## LaBSE

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Read the augmented corpus from the repository cloned at the top of the notebook
df = pd.read_csv(
    "/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv"
)

# Swap every category name for its integer class id
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# LaBSE tokenizer
tokenizer = AutoTokenizer.from_pretrained("setu4993/LaBSE")

# Each split is tokenized in one call: truncated at 128 tokens, padded to the
# longest sequence of that split, returned as PyTorch tensors
train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Custom Dataset class
class NewsDataset(Dataset):
    """Pairs tokenizer output with one class label per example.

    Args:
        encodings: mapping from field name to per-example rows (tensors here,
            because the tokenizer is called with ``return_tensors="pt"``).
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding a copy of every encoding row plus ``'labels'``."""
        # torch.tensor(<tensor>) emits a copy-construct UserWarning on every item;
        # as_tensor leaves an existing tensor untouched (and still converts plain
        # lists), and clone().detach() then makes the independent copy.
        item = {key: torch.as_tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap both splits; `.values` passes the labels as positional NumPy arrays
# instead of pandas Series that still carry the original row index
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels.values)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels.values)

# Fetch the pre-trained LaBSE backbone
model = AutoModel.from_pretrained("setu4993/LaBSE")

# Classification wrapper: LaBSE backbone followed by a linear head
class CustomLaBSEForSequenceClassification(nn.Module):
    """Sequence classifier built on a pre-trained LaBSE backbone.

    Args:
        pretrained_model: backbone module exposing ``config.hidden_size``; its
            output must provide ``last_hidden_state``.
        num_labels: number of output classes.
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        self.labse = pretrained_model
        hidden_size = pretrained_model.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is not given."""
        encoded = self.labse(input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 serves as the sentence summary
        first_position = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_position)
        if labels is None:
            return (None, logits)
        criterion = nn.CrossEntropyLoss()
        return (criterion(logits, labels), logits)

# Rebind `model` from the raw backbone to the four-class classification wrapper
model = CustomLaBSEForSequenceClassification(
    pretrained_model=model,
    num_labels=4,
)

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    # zero_division=0 matches the other cells of this notebook: a class that is
    # never predicted scores precision 0 without raising a warning each epoch
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',           # Output directory
    num_train_epochs=10,              # Number of training epochs
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    warmup_steps=500,                 # Number of warmup steps
    weight_decay=0.01,                # Strength of weight decay
    logging_dir='./logs',             # Directory for storing logs
    logging_steps=10,                 # Log every 10 steps
    eval_strategy="epoch",            # Evaluate every epoch (same keyword as the other cells; `evaluation_strategy` is the deprecated spelling)
    save_strategy="epoch",            # Save the model every epoch
    load_best_model_at_end=True,      # Load the best model when finished training
    metric_for_best_model="accuracy", # Use accuracy to select the best model
    save_total_limit=1                # Cap the number of checkpoints kept on disk (the LaBSE weights alone are a 1.88 GB download)
)

# Initialize the Trainer
# NOTE(review): eval_dataset is the held-out test split, so the checkpoint that
# load_best_model_at_end restores is selected on the same data the final
# metrics are reported on.
trainer = Trainer(
    model=model,                         # The instantiated model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    compute_metrics=compute_metrics      # The callback that computes metrics of interest
)

# Train the model
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate()

# Print the results
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


model.safetensors:  41%|####      | 765M/1.88G [00:00<?, ?B/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4913,0.567466,0.682711,0.684584,0.699746,0.682711
2,0.5368,0.570582,0.654224,0.600185,0.673787,0.654224
3,0.5028,0.580804,0.682711,0.656887,0.672514,0.682711
4,0.43,0.668711,0.682711,0.676849,0.678931,0.682711
5,0.398,0.737782,0.69057,0.682318,0.687839,0.69057
6,0.1944,1.125469,0.682711,0.676293,0.678122,0.682711
7,0.1706,1.361434,0.676817,0.67991,0.684428,0.676817
8,0.1907,1.589746,0.672888,0.673891,0.677122,0.672888
9,0.0786,1.877247,0.674853,0.676696,0.679096,0.674853
10,0.1191,2.111257,0.674853,0.67478,0.674801,0.674853


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.6906
Test F1 Score: 0.6823
Test Precision: 0.6878
Test Recall: 0.6906


## mT5

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Read the augmented corpus from the repository cloned at the top of the notebook
df = pd.read_csv(
    "/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv"
)

# Swap every category name for its integer class id
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# mT5 tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

# Each split is tokenized in one call: truncated at 128 tokens, padded to the
# longest sequence of that split, returned as PyTorch tensors
train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Map-style dataset over pre-tokenized text
class NewsDataset(Dataset):
    """Pairs tokenizer output with one class label per example.

    Args:
        encodings: mapping from field name to a tensor indexed by example.
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding a copy of every encoding row plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            # clone().detach() yields an independent copy of the stored row
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits; `.values` passes the labels as positional NumPy arrays
# instead of pandas Series that still carry the original row index
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels.values)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels.values)

# Fetch the pre-trained mT5 seq2seq checkpoint
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

# Modify the model to perform sequence classification
class CustomMT5ForSequenceClassification(nn.Module):
    """Sequence classifier: mT5 encoder, masked mean pooling, linear head.

    mT5 inputs carry no [CLS]-style token at position 0, so the hidden state at
    that position only represents the first subword of the text. The sentence
    representation is therefore the average of the encoder states over the
    non-padding positions.

    Args:
        pretrained_model: object exposing ``config.d_model``, ``shared`` and ``encoder``.
        num_labels: number of output classes.
    """

    def __init__(self, pretrained_model, num_labels):
        super(CustomMT5ForSequenceClassification, self).__init__()
        self.config = pretrained_model.config  # Inherit the configuration from the pre-trained model
        self.shared = pretrained_model.shared  # The shared embedding layer
        self.encoder = pretrained_model.encoder  # Encoder layer from the mT5 model
        self.classifier = nn.Linear(pretrained_model.config.d_model, num_labels)  # Add a classification head

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is not given."""
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state
        if attention_mask is None:
            # Without a mask every position counts equally
            pooled = hidden.mean(dim=1)
        else:
            # Zero out padded positions, then divide by the number of real tokens
            # (clamped so an all-padding row cannot divide by zero)
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        logits = self.classifier(pooled)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return (loss, logits)

# Build the four-class classifier on top of the loaded mT5 checkpoint
model = CustomMT5ForSequenceClassification(
    pretrained_model=pretrained_model,
    num_labels=4,
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        # zero_division=0: a class that is never predicted contributes precision 0
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted'),
    }

# Training configuration (no checkpoints are written for this model)
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=10,             # passes over the training split
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    warmup_steps=500,                # warm-up steps
    weight_decay=0.01,               # weight-decay strength
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                # one log entry every 10 steps
    # --- evaluation and checkpointing ---
    eval_strategy="epoch",           # evaluate once per epoch
    save_strategy="no",              # never write checkpoints
    load_best_model_at_end=False,    # keep the final-epoch weights
    greater_is_better=True,          # NOTE(review): presumably unused while best-model loading is off — confirm
)

# The held-out split doubles as the per-epoch evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()

# Score the final weights on the held-out split
results = trainer.evaluate()

# Report the headline metrics
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3668,1.353159,0.310413,0.184793,0.248933,0.310413
2,1.306,1.243045,0.347741,0.236418,0.483265,0.347741
3,1.0979,1.065156,0.515717,0.479658,0.540353,0.515717
4,0.8726,0.875763,0.59725,0.521643,0.611483,0.59725
5,0.8688,0.774007,0.630648,0.566133,0.61205,0.630648
6,0.702,0.75413,0.655206,0.589242,0.640567,0.655206
7,0.6723,0.761725,0.649312,0.58931,0.650324,0.649312
8,0.6947,0.758878,0.628684,0.623875,0.629764,0.628684
9,0.74,0.749909,0.638507,0.580375,0.627326,0.638507
10,0.727,0.748362,0.637525,0.583396,0.63756,0.637525


Test Accuracy: 0.6375
Test F1 Score: 0.5834
Test Precision: 0.6376
Test Recall: 0.6375


## Tamillion

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Read the augmented corpus from the repository cloned at the top of the notebook
df = pd.read_csv(
    "/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv"
)

# Swap every category name for its integer class id
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# Tamillion tokenizer
tokenizer = AutoTokenizer.from_pretrained("monsoon-nlp/tamillion")

# Each split is tokenized in one call: truncated at 128 tokens, padded to the
# longest sequence of that split, returned as PyTorch tensors
train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Map-style dataset over pre-tokenized text
class NewsDataset(Dataset):
    """Pairs tokenizer output with one class label per example.

    Args:
        encodings: mapping from field name to a tensor indexed by example.
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding a copy of every encoding row plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            # clone().detach() yields an independent copy of the stored row
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits; `.values` passes the labels as positional NumPy arrays
# instead of pandas Series that still carry the original row index
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels.values)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels.values)

# Fetch the pre-trained Tamillion backbone
model = AutoModel.from_pretrained("monsoon-nlp/tamillion")

# Classification wrapper: Tamillion backbone followed by a linear head
class CustomTamillionForSequenceClassification(nn.Module):
    """Sequence classifier built on a pre-trained Tamillion backbone.

    Args:
        pretrained_model: backbone module exposing ``config.hidden_size``; its
            output must provide ``last_hidden_state``.
        num_labels: number of output classes.
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        self.tamillion = pretrained_model
        hidden_size = pretrained_model.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is not given."""
        encoded = self.tamillion(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The hidden state at sequence position 0 serves as the sentence summary
        first_position = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_position)
        if labels is None:
            return (None, logits)
        criterion = nn.CrossEntropyLoss()
        return (criterion(logits, labels), logits)

# Rebind `model` from the raw backbone to the four-class classification wrapper
model = CustomTamillionForSequenceClassification(
    pretrained_model=model,
    num_labels=4,
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        # zero_division=0: a class that is never predicted contributes precision 0
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted'),
    }

# Training configuration
# NOTE(review): in the recorded run of this cell the evaluation metrics are
# identical in all 10 epochs (accuracy 0.20334) — the model appears to predict a
# single class; investigate before relying on these numbers.
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=10,              # passes over the training split
    per_device_train_batch_size=16,   # training batch size per device
    per_device_eval_batch_size=16,    # evaluation batch size per device
    warmup_steps=500,                 # warm-up steps
    weight_decay=0.01,                # weight-decay strength
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                 # one log entry every 10 steps
    # --- evaluation and checkpointing ---
    eval_strategy="epoch",            # evaluate once per epoch
    save_strategy="epoch",            # checkpoint once per epoch
    save_total_limit=1,               # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,      # reload the best checkpoint after training
    metric_for_best_model="accuracy"  # "best" = highest evaluation accuracy
)

# NOTE(review): eval_dataset is the held-out test split, so the checkpoint that
# load_best_model_at_end restores is selected on the same data the final
# metrics are reported on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune
trainer.train()

# Score the restored best checkpoint on the held-out split
results = trainer.evaluate()

# Report the headline metrics
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {results['eval_f1']:.4f}")
print(f"Test Precision: {results['eval_precision']:.4f}")
print(f"Test Recall: {results['eval_recall']:.4f}")


tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/837k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2861,1.540984,0.20334,0.068721,0.041347,0.20334
2,1.2452,1.90113,0.20334,0.068721,0.041347,0.20334
3,1.2841,1.938458,0.20334,0.068721,0.041347,0.20334
4,1.2501,1.987072,0.20334,0.068721,0.041347,0.20334
5,1.295,2.023666,0.20334,0.068721,0.041347,0.20334
6,1.2201,2.078246,0.20334,0.068721,0.041347,0.20334
7,1.1798,2.132212,0.20334,0.068721,0.041347,0.20334
8,1.207,2.178213,0.20334,0.068721,0.041347,0.20334
9,1.2196,2.19772,0.20334,0.068721,0.041347,0.20334
10,1.2632,2.200146,0.20334,0.068721,0.041347,0.20334


Test Accuracy: 0.2033
Test F1 Score: 0.0687
Test Precision: 0.0413
Test Recall: 0.2033


## LEALLA

In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Read the augmented corpus from the repository cloned at the top of the notebook
df = pd.read_csv(
    "/content/SPELLL-2024-Fine-Grained-Dataset-with-Expert-Annotations-and-LLMs-for-Fake-News-in-Tamil/Data/Augmented Dataset.csv"
)

# Swap every category name for its integer class id
label_dict = {'Clickbait': 0, 'Misleading': 1, 'Biased': 2, 'Humor': 3}
df['Category'] = df['Category'].map(label_dict)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# LEALLA-base tokenizer
tokenizer = AutoTokenizer.from_pretrained("setu4993/LEALLA-base")

# Each split is tokenized in one call: truncated at 128 tokens, padded to the
# longest sequence of that split, returned as PyTorch tensors
train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)
test_encodings = tokenizer(
    list(test_texts),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Map-style dataset over pre-tokenized text
class NewsDataset(Dataset):
    """Pairs tokenizer output with one class label per example.

    Args:
        encodings: mapping from field name to a tensor indexed by example.
        labels: indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding a copy of every encoding row plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            # clone().detach() yields an independent copy of the stored row
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits; `.values` passes the labels as positional NumPy arrays
# instead of pandas Series that still carry the original row index
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels.values)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels.values)

# Fetch the pre-trained LEALLA-base backbone
pretrained_model = AutoModel.from_pretrained("setu4993/LEALLA-base")

# Modify the model to perform sequence classification
class CustomLEALLAForSequenceClassification(nn.Module):
    """Pre-trained encoder followed by one linear classification layer.

    Args:
        pretrained_model: encoder module; must expose ``config.hidden_size``
            and return an object with a ``last_hidden_state`` attribute.
        num_labels: number of output classes of the linear head.
    """

    def __init__(self, pretrained_model, num_labels):
        super().__init__()
        encoder_config = pretrained_model.config
        self.config = encoder_config  # re-expose the encoder's configuration on the wrapper
        self.bert = pretrained_model  # the wrapped LEALLA encoder
        self.classifier = nn.Linear(encoder_config.hidden_size, num_labels)  # classification head

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return ``(loss, logits)``; ``loss`` is ``None`` when no labels are given."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 (the CLS token) feeds the head.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)
        if labels is None:
            return (None, logits)
        criterion = nn.CrossEntropyLoss()
        return (criterion(logits, labels), logits)

# Initialize the custom model for sequence classification.
# The head size is derived from label_dict so it cannot drift from the label mapping.
model = CustomLEALLAForSequenceClassification(pretrained_model, num_labels=len(label_dict))

# Define metrics for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Use the same averaging and zero-division policy for all three scores, so
    # an undefined per-class value is scored 0 everywhere instead of silently
    # for precision and with a warning for F1 and recall.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, **weighted),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
    }

# Training configuration
training_args = TrainingArguments(
    # Where outputs and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,                # log every 10 steps
    # Optimisation schedule
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate after every epoch; never write checkpoints
    eval_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    # NOTE(review): no metric_for_best_model is set and best-model loading is
    # off, so this flag looks inert here - confirm before relying on it.
    greater_is_better=True,
)

# Build the Trainer; the held-out test split doubles as the evaluation set
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Score the fine-tuned model on the evaluation (test) split
results = trainer.evaluate()

# Report the final metrics, one per line
print(
    f"Test Accuracy: {results['eval_accuracy']:.4f}",
    f"Test F1 Score: {results['eval_f1']:.4f}",
    f"Test Precision: {results['eval_precision']:.4f}",
    f"Test Recall: {results['eval_recall']:.4f}",
    sep="\n",
)


tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/428M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2726,1.208828,0.51277,0.459861,0.557246,0.51277
2,0.7839,0.671674,0.641454,0.628547,0.65674,0.641454
3,0.6754,0.576308,0.691552,0.604068,0.622468,0.691552
4,0.5352,0.572379,0.695481,0.60676,0.561673,0.695481
5,0.5602,0.569583,0.699411,0.628059,0.683705,0.699411
6,0.4516,0.57136,0.699411,0.614758,0.656584,0.699411
7,0.459,0.572759,0.696464,0.63194,0.668963,0.696464
8,0.4604,0.577347,0.684676,0.687569,0.690802,0.684676
9,0.5088,0.584552,0.688605,0.63744,0.663007,0.688605
10,0.4772,0.580558,0.680747,0.65508,0.667686,0.680747


Test Accuracy: 0.6807
Test F1 Score: 0.6551
Test Precision: 0.6677
Test Recall: 0.6807
