**Imports**

In [None]:
# Core data libraries, plus Google Drive access for the Colab runtime
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Drive so the '/content/gdrive/...' paths used by the notebook resolve
drive.mount('/content/gdrive')

# Install the Hugging Face libraries into the runtime (no versions are pinned here)
!pip install transformers
!pip install -U datasets

# General-purpose helpers and PyTorch
from collections import defaultdict, Counter
import json
import torch

# Plotting
from matplotlib import pyplot as plt

# Hugging Face tokenizer/model loaders, dataset containers, and the PyTorch batch loader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the BankTrack summary table from the shared Drive folder
banktrak_df = pd.read_csv(
    '/content/gdrive/MyDrive/Group 1: DSSI Summer 2025/Data/summary_banktrak.csv'
)

# Drive location used as the prefix for saved model checkpoints
model_path = "/content/gdrive/MyDrive/facebookAI_roberta_base_loop"

In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible
train_df, val_df = train_test_split(banktrak_df, random_state=42, test_size=0.2)

# Wrap both pandas frames as Hugging Face datasets under the 'train' / 'validation' keys
dataset = DatasetDict(
    {
        'train': Dataset.from_pandas(train_df),
        'validation': Dataset.from_pandas(val_df),
    }
)

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer  # masked-LM and tokenizer loaders

# Load the pretrained RoBERTa tokenizer together with the masked-language-model weights.
# NOTE(review): `model` is rebound to a sequence-classification model in the training cell.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")

# Smoke test: run the tokenizer on one sentence to confirm it loaded properly
sample_input = "We want to use a pretrained tokenizer."
tokenized_inputs = tokenizer(sample_input, return_tensors="pt")  # "pt" -> PyTorch tensors

# Display the token ids produced for the sample sentence
print(tokenized_inputs["input_ids"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tensor([[    0,   170,   236,     7,   304,    10, 11857, 26492, 19233,  6315,
             4,     2]])


**DataLoader To Batch**

In [None]:
# Tokenize the 'text' field of every row; each sequence is padded or truncated to 64 tokens
tokenized_dataset = dataset.map(
    lambda example: tokenizer(
        example['text'],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
)

# The training loop reads the targets from a column called "labels", so rename it
tokenized_dataset = tokenized_dataset.rename_column(
    "contains_debt_instrument_information",
    "labels",
)

# Have the dataset hand back PyTorch tensors when indexed
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

In [None]:
# Pick out the two splits; the training split is shuffled once with a fixed seed
eval_dataset = tokenized_dataset['validation']
train_dataset = tokenized_dataset['train'].shuffle(seed=1111)

# Batch both splits (8 examples per batch); only the training loader reshuffles
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=8,
    shuffle=False,
)

**Training & Validation Model**

In [None]:
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from transformers import set_seed
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import DistilBertConfig
from transformers import RobertaConfig

# Checkpoint filename prefix. The save call below appends "epoch_<n>.pt" directly to this
# string, with no path separator in between.
model_path = "/content/gdrive/MyDrive/facebookAI_roberta_base_loop"

# Seed the RNGs for reproducibility and use the GPU when one is available
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-class RoBERTa classifier with custom dropout rates
config = RobertaConfig.from_pretrained("facebookAI/roberta-base")
config.num_labels = 2
config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.3
model = AutoModelForSequenceClassification.from_pretrained("facebookAI/roberta-base", config=config)
# Every batch is moved to `device` inside the loops, so the weights have to live there too;
# without this the forward pass fails with a device mismatch on a CUDA runtime.
model.to(device)

# hyperparameters
num_epochs = 1
# The scheduler and the progress bar advance once per batch in *every* epoch, so the total
# step count must cover all epochs (otherwise the LR decays to zero after the first one).
num_training_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
best_val_loss = float("inf")
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # ---- training ----
    model.train()  # training mode (dropout active)
    training_losses = []  # per-batch losses for this epoch
    for batch in train_dataloader:

        optimizer.zero_grad()  # reset gradients so they do not accumulate across batches

        # copy the batch to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).long()

        # forward pass; passing labels makes the model return the loss
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        training_loss = output.loss
        training_losses.append(training_loss.item())

        # backprop, parameter update, then advance the learning-rate schedule
        training_loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
    print("Mean Training Loss", np.mean(training_losses))

    # ---- validation ----
    model.eval()  # evaluation mode (dropout disabled)
    # Accumulate plain Python floats so the average, best_val_loss and the value stored in
    # the checkpoint are device-independent numbers rather than tensors.
    val_loss = 0.0
    with torch.no_grad():  # no gradient tracking is needed while validating
        for batch in eval_dataloader:
            # copy the batch to the same device as the model
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).long()

            # forward pass on the validation batch
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # add this batch's mean loss to the running sum
            val_loss += output.loss.item()

    # average validation loss across all batches
    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")

    # keep a checkpoint whenever the validation loss improves
    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        best_val_loss = avg_val_loss
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(),'val_loss': best_val_loss,}, f"{model_path}epoch_{epoch}.pt")
print(f"The best validation loss after {num_epochs} epochs is: {best_val_loss}")

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at facebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/22 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Mean Training Loss 0.6805517456748269
Validation loss: 0.646730899810791
Saving checkpoint!
The best validation loss after 1 epochs is: 0.646730899810791


**Evaluate**

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Evaluate the whole validation split in a single batch
eval_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=len(tokenized_dataset['validation']))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The inputs below are moved to `device`, so make sure the model is there as well
# (a no-op when it already is).
# NOTE(review): this scores the model currently in memory, not a checkpoint reloaded from disk.
model.to(device)
model.eval()
test_batch_logits = []  # one logits tensor per batch
y_true = []             # gold labels, in dataset order
with torch.no_grad():  # inference only, no gradient tracking
    for batch in eval_dataloader:
        # copy input to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # labels stay on the CPU as numpy values; they are only used for the metrics
        labels = batch['labels'].cpu().detach().numpy()

        # forward pass without labels: only the logits are needed
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=None)
        test_batch_logits.append(output.logits)
        y_true.extend(labels)
print(len(test_batch_logits),len(eval_dataloader))
test_logits = torch.cat(test_batch_logits, dim=0)

# sanity check -> dimension 0 of the logits tensor should match the size of the validation set
print(test_logits.shape,len(tokenized_dataset['validation']),len(y_true))

In [None]:
# Predicted class = index of the largest logit in each row, returned as a CPU numpy array
y_pred = test_logits.argmax(dim=1).cpu().numpy()

# Eyeball the first ten gold labels against the first ten predictions
print(y_true[:10])
print(y_pred[:10])

# sanity check: there must be exactly one prediction per label
assert len(y_true) == len(y_pred)

In [None]:
# Positions in the validation split where the prediction differs from the gold label
misclassified_indices = np.nonzero(np.asarray(y_true) != y_pred)[0]

# Pull those rows from the untokenized validation split and write them to disk
misclassified_rows = dataset["validation"].select(misclassified_indices)
misclassified_rows.to_pandas().to_csv("misclassified.csv", index=False)

# Predicted and gold labels for the misclassified rows, in the same order as the rows
misclassified_preds = []
misclassified_trues = []
for i in misclassified_indices:
    misclassified_preds.append(y_pred[i])
    misclassified_trues.append(y_true[i])

# Read the saved rows back and attach both label columns for inspection
csv_file = pd.read_csv('misclassified.csv')
csv_file["y_true"] = misclassified_trues
csv_file["y_pred"] = misclassified_preds
print(csv_file[["text", "y_true", "y_pred"]])

# Summary: which rows were wrong, and how many
print("Misclassified row indices:", misclassified_indices)
print(len(misclassified_rows))

In [None]:
from sklearn.metrics import precision_score, recall_score

# Binary F1 on the validation predictions
f1_value = f1_score(y_true, y_pred, average='binary')
print('F1 Score:', f1_value)

# Fraction of validation examples predicted correctly
accuracy_value = accuracy_score(y_true, y_pred)
print('Accuracy Score:', accuracy_value)

# Precision and recall
precision_value = precision_score(y_true, y_pred)
print('Precision Score:', precision_value)
recall_value = recall_score(y_true, y_pred)
print('Recall Score:', recall_value)