In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn



# Load the datasets
fake_data = pd.read_csv('data/Fake.csv')
true_data = pd.read_csv('data/True.csv')

# Preprocess the data
fake_data['label'] = 1  # Fake news label
true_data['label'] = 0  # Real news label

# Concatenate the datasets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source frame keeps its own 0-based index from read_csv, so every label
# would appear twice and label-based lookups (.loc) would be ambiguous.
combined_data = pd.concat([fake_data, true_data], ignore_index=True)

# Split the data into training and testing sets (80% / 20%, fixed seed so the
# split is reproducible across runs)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    combined_data['text'], combined_data['label'], test_size=0.2, random_state=42)


class WeightedTrainer(Trainer):
    """`Trainer` subclass whose loss is class-weighted cross-entropy.

    Args:
        class_weight: Tensor holding one weight per class; it is passed as the
            `weight` argument of `nn.CrossEntropyLoss` on every loss call.
        *args, **kwargs: Forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, class_weight, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weight = class_weight

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted cross-entropy between the model logits and labels.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; its "labels" entry is removed (popped)
                before the remaining entries are passed to `model`.
            return_outputs: When true, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: Accepted so the override stays callable from
                `transformers` releases that pass this keyword to
                `compute_loss`; it is not used by this loss.

        Returns:
            The scalar loss, or a `(loss, outputs)` tuple when
            `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weight tensor must live on the same device as the logits and be
        # float32; move/cast it explicitly on every call.
        weight = self.class_weight.to(logits.device, dtype=torch.float32)
        loss_fn = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Define a custom dataset class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes a single text on each lookup.

    Args:
        texts: Collection of texts; must support `len()` and positional
            access through `.iloc`.
        labels: Collection of labels aligned with `texts`; must support
            positional access through `.iloc`.
        tokenizer: Object exposing `encode_plus`.
        max_length: Length every text is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position `idx`.

        The result is a dict with the flattened `input_ids` and
        `attention_mask` tensors from the tokenizer plus the label as a
        `torch.long` tensor under `labels`.
        """
        raw_text = str(self.texts.iloc[idx])
        target = self.labels.iloc[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output is flattened so each field is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Initialize the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Sequence length every article is padded / truncated to
MAX_SEQ_LENGTH = 512

# Calculate class weights
class_counts = combined_data['label'].value_counts()
# Weight of class 0 is count(class 1) / count(class 0); weight of class 1 is 1.0.
# This is inverse-frequency weighting: whichever class has fewer samples gets
# the larger weight. (The ratio was previously inverted, which up-weighted the
# more frequent class.) float32 matches the dtype of the model logits.
class_weights = torch.tensor(
    [class_counts[1] / class_counts[0], 1.0], dtype=torch.float32)
# Stand-alone loss function with the same class weights. WeightedTrainer does
# not use this object; it builds its own loss from `class_weights`.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

training_args = TrainingArguments(
    output_dir="./results",  # Provide the output directory path here
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Must match the evaluation strategy for load_best_model_at_end
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    load_best_model_at_end=True,
    # No compute_metrics is supplied, so the evaluation loss is the only metric
    # available for checkpoint selection. Lower loss is better, therefore
    # greater_is_better must be False; with True the checkpoint with the
    # highest evaluation loss would be restored at the end of training.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


# Create instances of the datasets
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_length=MAX_SEQ_LENGTH)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer, max_length=MAX_SEQ_LENGTH)


# Initialize WeightedTrainer with custom loss function and class weights
trainer = WeightedTrainer(
    class_weight=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=None,  # We'll compute metrics manually
)

# Fine-tune the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(test_dataset)
# Predicted class = index of the largest logit for each sample
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_labels.to_numpy()

# Calculate evaluation metrics (average='binary' scores the positive class,
# label 1 = fake news)
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 1/8763 [00:04<9:51:09,  4.05s/it]

{'loss': 0.6721, 'grad_norm': 3.92455792427063, 'learning_rate': 1.0000000000000001e-07, 'epoch': 0.0}


  0%|          | 8/8763 [00:17<4:37:03,  1.90s/it]

KeyboardInterrupt: 