In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer

In [None]:
# Checkpoint that provides both the tokenizer and the pretrained encoder weights.
model_path = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Class-id -> label-name mapping for the three classes; the reverse mapping is
# derived from it so the two can never drift apart.
# NOTE(review): confirm this id order matches the "label" column in the CSV files.
id2label = {0: 'entailment', 1: 'contradiction', 2: 'neutral'}
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

# Sequence-classification model with one output per entry in id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Declare the task type on the model config, then move the model to the device.
model.config.problem_type = "single_label_classification"
model = model.to(device)

In [None]:
# Freeze every base-model parameter except the pooler: a parameter stays
# trainable only when its name contains "pooler". Parameters that live outside
# model.base_model are not touched here.
for param_name, weight in model.base_model.named_parameters():
    weight.requires_grad = "pooler" in param_name

In [None]:
# Sanity check: list each model parameter together with its requires_grad flag.
for param_name, weight in model.named_parameters():
    print(param_name, weight.requires_grad)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over pre-tokenized encodings and optional labels.

    NOTE: this class reuses the name ``Dataset`` and therefore shadows the
    ``datasets.Dataset`` imported at the top of the notebook from this cell on.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It must
            contain an ``"input_ids"`` entry, which defines the dataset length.
        labels: Optional sequence of integer class ids, one per example. When
            omitted (``None``) the dataset is unlabeled and the items it
            returns carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        # torch.tensor(None) raises, so the label tensor is only built when
        # labels were actually supplied.
        self.labels = None if labels is None else torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the features (and label, if any) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, taken from the length of ``encodings["input_ids"]``."""
        return len(self.encodings["input_ids"])

In [None]:
# Read the train / validation / test CSV files from the working directory.
transformed_train_df, transformed_val_df, transformed_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "train_transformed.csv",
        "val_transformed.csv",
        "test_transformed.csv",
    )
)

In [None]:
# Pull the "text" and "label" columns of the train and validation frames out
# as plain Python lists.
X_train, y_train = (
    transformed_train_df[column].tolist() for column in ("text", "label")
)
X_val, y_val = (
    transformed_val_df[column].tolist() for column in ("text", "label")
)

In [None]:
def tokenize_data(texts, labels, tokenizer, max_len=512):
    """Tokenize ``texts`` and return the model inputs together with ``labels``.

    Args:
        texts: Texts handed to ``tokenizer`` as its positional argument.
        labels: Labels for ``texts``; returned unchanged.
        tokenizer: Callable invoked with ``padding=True``, ``truncation=True``
            and ``max_length=max_len``; its result must be indexable by
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is a dict holding
        only the ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    tokenized = tokenizer(texts, padding=True, truncation=True, max_length=max_len)
    # Keep only the two fields used downstream; anything else is dropped.
    features = {field: tokenized[field] for field in ('input_ids', 'attention_mask')}
    return features, labels

# Tokenize the train split first, then the validation split, with the shared
# tokenizer and the default max_len.
(train_encodings, train_labels), (val_encodings, val_labels) = (
    tokenize_data(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# Wrap each split's encodings and labels in the torch Dataset defined above.
train_dataset, val_dataset = (
    Dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [None]:
# Sanity check: show the shape of the label tensor of the first item in each split.
for caption, split_dataset in (
    ("Train labels shape:", train_dataset),
    ("Validation labels shape:", val_dataset),
):
    print(caption, split_dataset[0]["labels"].shape)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``; the predicted
            class is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, references = eval_pred
    predictions = np.argmax(scores, axis=-1)

    return {
        "accuracy": accuracy_score(references, predictions),
        "macro_f1": f1_score(references, predictions, average="macro"),
    }

In [None]:
training_args = TrainingArguments(
    # --- outputs, logging and checkpoints ---
    output_dir="./output",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    save_strategy="epoch",
    save_total_limit=3,
    # --- evaluation and best-model selection ---
    # "macro_f1" is one of the keys returned by compute_metrics.
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batches of 32, with gradients accumulated over 2 steps.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
)

# Train on the training split, evaluate on the validation split, and score
# each evaluation with compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("<Starting Training>")
trainer.train()

In [None]:
# Directory that receives both the trained model and its tokenizer.
model_dir = "./bert-finetuned-model"

print("Saving the fine-tuned model...")
# Save the model first, then the tokenizer, into the same directory.
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_dir)