# **Import**

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

!pip install transformers
!pip install -U datasets

from collections import defaultdict, Counter
import json
import numpy as np
import torch


from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

Mounted at /content/gdrive


In [2]:
# Load the hybrid dataset: the original dataset together with the OpenAI
# summaries of the false rows.
banktrack_df = pd.read_csv(
    '/content/gdrive/MyDrive/Group 1: DSSI Summer 2025/Data/summary_banktrak.csv'
)

# Cast the label column to integers (the model expects integer class ids,
# not booleans); every other column keeps its dtype.
banktrack_df = banktrack_df.astype({"contains_debt_instrument_information": int})
banktrack_df.head()

Unnamed: 0,text,company,cik,item,contains_debt_instrument_information
0,"Senior Secured Credit Facility On May 3, ...",LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
1,"On January 9, 2019, Lifelogger Technologies...",LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
2,Promissory Note and Warrants to SBI Investm...,LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
3,"On March 1, 2016, Lifelogger Technologies C...",LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
4,Acquisition of Pixorial Assets In further...,LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1


In [3]:
from datasets import Dataset, DatasetDict

# Wrap the DataFrame in a DatasetDict that holds a single 'train' split.
dataset = DatasetDict(train=Dataset.from_pandas(banktrack_df))
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'company', 'cik', 'item', 'contains_debt_instrument_information'],
        num_rows: 220
    })
})

In [4]:
# Split the data into training and validation sets.
#
# The split is taken from a fresh Dataset built from `banktrack_df` rather than
# from `dataset["train"]`: this cell rebinds `dataset`, so reading the split
# source from `dataset` would, on a re-run, split the already-reduced 80%
# training split a second time and silently shrink the data.
split_dataset = Dataset.from_pandas(banktrack_df).train_test_split(
    test_size=0.2,      # 20% for validation
    seed=42,            # For reproducibility
    shuffle=True        # Shuffle before splitting
)

# Create the final DatasetDict with both splits
dataset = DatasetDict({
    "train": split_dataset["train"],          # 80%
    "validation": split_dataset["test"]       # 20%
})

# Sanity check: the two splits together account for every row of the source frame.
assert dataset["train"].num_rows + dataset["validation"].num_rows == len(banktrack_df)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'company', 'cik', 'item', 'contains_debt_instrument_information'],
        num_rows: 176
    })
    validation: Dataset({
        features: ['text', 'company', 'cik', 'item', 'contains_debt_instrument_information'],
        num_rows: 44
    })
})


# **Data Preprocessing**

In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# Name of the pretrained checkpoint to load from the Hugging Face hub.
name = "roberta-base"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(name)

# Smoke test: tokenize one sentence, padded/truncated to 64 tokens, and print
# the resulting token ids to confirm the tokenizer loaded correctly.
sample_input = "We want to use a pretrained tokenizer."
tokenized_inputs = tokenizer(
    sample_input,
    return_tensors="pt",
    max_length=64,
    truncation=True,
    padding="max_length",
)
print(tokenized_inputs["input_ids"])

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

tensor([[    0,   170,   236,     7,   304,    10, 11857, 26492, 19233,  6315,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]])


In [6]:
def _tokenize(example):
    """Tokenize the `text` field of one example, padded/truncated to 64 tokens."""
    return tokenizer(
        example['text'],
        padding="max_length",
        truncation=True,
        max_length=64,
    )


# Apply the tokenizer to every example of every split via `map`, then keep only
# what is passed on to the model: drop the raw text and metadata columns and
# rename the target column to `labels`.
tokenized_dataset = (
    dataset
    .map(_tokenize)
    .remove_columns(['item', 'text', 'company', 'cik'])
    .rename_column("contains_debt_instrument_information", "labels")
)

# Return torch tensors when the dataset is indexed.
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

# **Training and Validation**

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Base directory on Drive under which training output is written.
# NOTE(review): the folder is called "distilbert-model" although the checkpoint
# loaded below is `name` (RoBERTa); the path is left unchanged here.
model_path = "/content/gdrive/MyDrive/DSSI/distilbert-model"

model = RobertaForSequenceClassification.from_pretrained(name, num_labels=2).to(device)  # Adjust for your task (e.g., 2 for binary classification)

## Freeze embedding layers
# for param in model.roberta.embeddings.parameters():
#     param.requires_grad = False

arguments = TrainingArguments(
    # Join with an explicit separator: plain concatenation produced
    # ".../distilbert-modelsample_hf_trainer" instead of a sub-directory.
    output_dir=f"{model_path}/sample_hf_trainer",
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=4,
    eval_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=3e-5,
    logging_dir="./logs",
    logging_steps=10,
    metric_for_best_model="f1",                   # or "accuracy", etc. — must match your `compute_metrics`
    greater_is_better=True,
    load_best_model_at_end=True,
    seed=42
)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for a validation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``average="binary"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 reports an undefined precision/recall (e.g. no positive
    # predictions in this pass) as 0 without emitting a warning each time.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Bundle the model, training arguments, data splits and metric function.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'], # change to test when you do your final evaluation
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Train the model with the configured Trainer; the value returned by train()
# is displayed as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6791,0.514368,0.775,0.741935,0.958333,0.836364
2,0.4168,0.671802,0.725,0.69697,0.958333,0.807018


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6791,0.514368,0.775,0.741935,0.958333,0.836364
2,0.4168,0.671802,0.725,0.69697,0.958333,0.807018
3,0.4204,0.365374,0.825,0.814815,0.916667,0.862745
4,0.2944,0.371934,0.85,0.821429,0.958333,0.884615


TrainOutput(global_step=72, training_loss=0.4319485721902715, metrics={'train_runtime': 882.7721, 'train_samples_per_second': 0.779, 'train_steps_per_second': 0.082, 'total_flos': 22627550760960.0, 'train_loss': 0.4319485721902715, 'epoch': 4.0})