# **Import**

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV and model paths used later in
# this notebook live under that mount point.
drive.mount('/content/gdrive')

# Install the Hugging Face libraries imported below (`-U` upgrades `datasets`).
# NOTE(review): neither install pins a version, so a fresh run may resolve to
# different releases than the ones recorded in the output — consider pinning.
!pip install transformers
!pip install -U datasets

from collections import defaultdict, Counter
import json
import numpy as np
import torch

from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

Mounted at /content/gdrive
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31m

In [None]:
# Load the hybrid dataset: the original rows together with the OpenAI-written
# summaries of the rows labelled false.
banktrack_df = pd.read_csv('/content/gdrive/MyDrive/Group 1: DSSI Summer 2025/Data/summary_banktrak.csv')

# Cast the label column to integers (not booleans) so it can be used directly
# as a class id further down.
label_column = "contains_debt_instrument_information"
banktrack_df[label_column] = banktrack_df[label_column].astype(int)
banktrack_df.head()

Unnamed: 0,text,company,cik,item,contains_debt_instrument_information
0,"Senior Secured Credit Facility On May 3, ...",LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
1,"On January 9, 2019, Lifelogger Technologies...",LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
2,Promissory Note and Warrants to SBI Investm...,LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
3,"On March 1, 2016, Lifelogger Technologies C...",LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
4,Acquisition of Pixorial Assets In further...,LIEFLOGGER TECHNOLOGIES CORP.,1567771,1.01,1
...,...,...,...,...,...
215,"On April 21, 2014, the Company issued a press ...",ADVANCED MEDICINE INC,1080014,8.01,0
216,"Theravance, Inc. entered into a Common Stock P...",ADVANCED MEDICINE INC,1080014,1.01,0
217,"On January 24, 2013, Theravance closed its off...",ADVANCED MEDICINE INC,1080014,8.01,0
218,"On January 17, 2013, Theravance announced the ...",ADVANCED MEDICINE INC,1080014,8.01,0


In [None]:
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel

# Tokenizer for the *uncased* DistilBERT checkpoint.
# NOTE(review): despite the name this is a DistilBERT tokenizer, not BART, and
# the preprocessing section builds a separate tokenizer from a cased
# checkpoint — confirm whether this object is still needed.
uncased_checkpoint = "distilbert-base-uncased"
bart_tokenizer = DistilBertTokenizer.from_pretrained(uncased_checkpoint)

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the DataFrame in a DatasetDict holding a single 'train' split.
# preserve_index=False keeps the pandas index from being stored as an extra
# `__index_level_0__` column when the frame's index is not a plain RangeIndex;
# for a default RangeIndex the resulting features are unchanged.
dataset = DatasetDict({
    'train': Dataset.from_pandas(banktrack_df, preserve_index=False)
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'company', 'cik', 'item', 'contains_debt_instrument_information'],
        num_rows: 172
    })
    validation: Dataset({
        features: ['text', 'company', 'cik', 'item', 'contains_debt_instrument_information', '__index_level_0__'],
        num_rows: 40
    })
})

In [None]:
# Split the dataset into training and validation subsets.
# NOTE: this cell reads dataset["train"] and then rebinds `dataset`, so running
# it a second time without rebuilding `dataset` splits the already-reduced
# training subset again.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42, shuffle=True)

# Re-key the two halves: the held-out 20% ("test") becomes "validation",
# the remaining 80% stays "train".
splits = {
    "train": split_dataset["train"],
    "validation": split_dataset["test"],
}
dataset = DatasetDict(splits)
print(dataset)

# **Data Preprocessing**

In [None]:
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel
# Checkpoint identifier; reused later in the notebook when the model is loaded.
name = "distilbert/distilbert-base-cased"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(name)

sample_input = "We want to use a pretrained tokenizer."

# Smoke test: encode one sentence, padded/truncated to 64 tokens and returned as
# PyTorch tensors, then print the token ids to confirm the tokenizer loaded.
encode_options = dict(
    padding="max_length",
    truncation=True,
    max_length=64,
    return_tensors="pt",
)
tokenized_inputs = tokenizer(sample_input, **encode_options)
print(tokenized_inputs["input_ids"])

tensor([[  101,  1284,  1328,  1106,  1329,   170,  3073,  4487,  9044, 22559,
         17260,   119,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])


In [None]:
# With the tokenizer loaded, tokenize the `text` field of every example in every
# split. `map` applies the function across the dataset; `batched=True` passes a
# list of texts per call instead of a single example, which yields the same
# columns with far fewer tokenizer invocations.
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(
        batch['text'],
        padding="max_length",
        truncation=True,
        max_length=64,
    ),
    batched=True,
)

# Remove the extra columns before the dataset is sent to the model, and rename
# the target column so that it is called `labels`.
tokenized_dataset = tokenized_dataset.remove_columns(['item', 'text', 'company','cik'])
tokenized_dataset = tokenized_dataset.rename_column("contains_debt_instrument_information", "labels")

# Return torch tensors when examples are accessed.
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [None]:
# train_dataset = tokenized_dataset['train'].shuffle(seed=1111).select(range(174))
# batch_size=10
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle = True)
# eval_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=batch_size, shuffle = True)

# **Training and Validation**

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Base directory on the mounted Drive for everything this model writes.
model_path = "/content/gdrive/MyDrive/DSSI/distilbert-model"

# Load the pretrained checkpoint with a 2-class sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(name, num_labels=2).to(device)

## Freeze embeddings
# (the backbone of a DistilBertForSequenceClassification is exposed as
#  `model.distilbert`, not `model.roberta`)
# for param in model.distilbert.embeddings.parameters():
#     param.requires_grad = False

# NOTE(review): `report_to` is left at its default; confirm which logging
# integrations are expected to be active when training starts.
arguments = TrainingArguments(
    # Checkpoints go in a sub-directory of model_path. The previous
    # `model_path+"sample_hf_trainer"` had no separator and produced a sibling
    # directory named ".../distilbert-modelsample_hf_trainer".
    output_dir=f"{model_path}/sample_hf_trainer",
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=4,
    eval_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch", # must match eval_strategy for load_best_model_at_end
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=10,
    metric_for_best_model="f1",                   # or "accuracy", etc. — must match your `compute_metrics`
    greater_is_better=True,
    load_best_model_at_end=True,
    seed=42
)

def compute_metrics(eval_pred):
    """Compute validation metrics from an evaluation prediction pair.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)`` — the raw model
            scores and the reference labels.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 returns 0 for an undefined precision/recall (e.g. no
    # positive predictions) — the same value as the default, minus the warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Bundle the model, training arguments, tokenized splits and metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'], # change to test when you do your final evaluation
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # triggers a warning at construction time.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Fine-tune the model with the `trainer` object built in the previous cell.
trainer.train()

KeyboardInterrupt: 

# **Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Run inference on the validation split and keep the returned object; the next
# cell unpacks it into logits, reference labels and metrics.
results = trainer.predict(tokenized_dataset['validation']) # also gives you predictions
results

In [None]:
# Pull the three parts of the prediction output apart by field name.
test_logits = results.predictions
y_true = results.label_ids
metrics = results.metrics
print(len(test_logits), len(y_true))

# Turn the logits into predicted labels (for a torch tensor use
# torch.argmax(test_logits, dim = 1) instead).
y_pred = np.argmax(test_logits, axis = 1)

# Show the first ten reference labels, then the first ten predictions.
for label_values in (y_true, y_pred):
    print(label_values[:10])

# Sanity check: there must be exactly one prediction per label.
assert len(y_true) == len(y_pred)

In [None]:
# Report F1 first, then accuracy, on the validation predictions.
for metric_label, metric_fn in (('F1 Score:', f1_score), ('Accuracy Score:', accuracy_score)):
    print(metric_label, metric_fn(y_true, y_pred))