**Import**

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive')

!pip install transformers
!pip install -U datasets

from collections import defaultdict, Counter
import json
import torch

from matplotlib import pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

ValueError: mount failed

In [None]:
# NOTE(review): `banktrak_df` is consumed by the train/validation split cell
# below, but every statement that defines it (the two loads here and the
# sampler cells) is commented out. On a fresh kernel the split cell will raise
# NameError until one of them is re-enabled.
#banktrak_df = pd.read_csv('/content/gdrive/MyDrive/Group 1: DSSI Summer 2025/Data/summary_banktrak.csv')
#banktrak_df_1 = pd.read_csv('/content/gdrive/MyDrive/Group 1: DSSI Summer 2025/Data/banktrak-8K-20230501-annotated.csv')
#banktrak_df.head(10)
# Base path for the checkpoints written by the training cell.
# NOTE(review): the path says "distilbert" but the model trained below is
# "KISTI-AI/scideberta" -- confirm this is the intended location.
model_path = "/content/gdrive/MyDrive/DSSI/distilbert-model"

1. USING SAMPLERS FOR CLASS IMBALANCE

In [None]:
# #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html#imblearn.over_sampling.RandomOverSampler
# from imblearn.over_sampling import RandomOverSampler

# X = banktrak_df_1[['text']]
# y = banktrak_df_1[['contains_debt_instrument_information']]

# #oversampling minority data
# ros = RandomOverSampler(random_state=42)                 # random oversampler; fixed seed for reproducibility
# X_resampled, y_resampled = ros.fit_resample(X, y)        # resamples the data (no model is fitted here): minority-class rows are duplicated at random until the classes are balanced
# y_resampled.contains_debt_instrument_information.value_counts()

# X_resampled = X_resampled.reset_index(drop=True)
# y_resampled = y_resampled.reset_index(drop=True)

# banktrak_df = pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# #https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html
# from imblearn.under_sampling import RandomUnderSampler

# X = banktrak_df_1[['text']]
# y = banktrak_df_1[['contains_debt_instrument_information']]

# #undersampling majority data
# rus = RandomUnderSampler(random_state=42)                 # random undersampler; fixed seed for reproducibility
# X_resampled, y_resampled = rus.fit_resample(X, y)        # resamples the data (no model is fitted here): majority-class rows are dropped at random until the classes are balanced
# y_resampled.contains_debt_instrument_information.value_counts()

# X_resampled = X_resampled.reset_index(drop=True)
# y_resampled = y_resampled.reset_index(drop=True)

# banktrak_df = pd.concat([X_resampled, y_resampled], axis=1)

2. SPLIT INTO TRAINING AND VALIDATION THEN TURNING INTO DICTIONARY



In [None]:
# Split the data into train and validation sets.
from sklearn.model_selection import train_test_split

# 20% of the rows go to validation; random_state fixes the split for
# reproducibility. The label is imbalanced, so stratify on it to keep the same
# positive/negative ratio in both splits (an unstratified split can leave the
# validation set with very few positives).
train_df, val_df = train_test_split(
    banktrak_df,
    test_size=0.2,
    random_state=42,
    stratify=banktrak_df['contains_debt_instrument_information'],
)

# Wrap both DataFrames in a Hugging Face DatasetDict.
from datasets import Dataset, DatasetDict

# preserve_index=False: after the split the pandas index is a shuffled subset
# of row numbers; without this flag it is carried along as an extra
# `__index_level_0__` column that the model never uses.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'validation': Dataset.from_pandas(val_df, preserve_index=False),
})

3. Loading the model & tokenizer in

In [None]:
# Load the tokenizer.
# (Import line kept unchanged so names other cells may rely on stay available.)
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("KISTI-AI/scideberta")

# The masked-LM model that used to be loaded here was never used: the training
# cell builds its own AutoModelForSequenceClassification and rebinds `model`.
# Loading it only cost download time and memory, so it has been removed.

**Dataset Preprocessing**

In [None]:
def tokenize_batch(examples):
    """Tokenize a batch of rows.

    Every entry of the 'text' column is padded or truncated to exactly 64 tokens.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=64)


# Apply the tokenizer to both splits. batched=True hands the tokenizer a list of
# texts per call instead of one text at a time; the resulting columns are the
# same, it is just faster.
tokenized_dataset = dataset.map(tokenize_batch, batched=True)

#clean dataset
#tokenized_dataset = tokenized_dataset.remove_columns(['item', 'text', 'company','cik'])
# The model's forward() expects the target column to be called "labels".
tokenized_dataset = tokenized_dataset.rename_column("contains_debt_instrument_information", "labels")

# Return PyTorch tensors, and only for the columns the training/evaluation loops
# read. Leaving raw text/metadata columns exposed makes the DataLoader try to
# collate them too, which is wasted work and fails if any of them holds None.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

**Using DataLoader to batchify data**

In [None]:
# Pick the two splits; the training split is shuffled once with a fixed seed.
train_dataset, eval_dataset = (
    tokenized_dataset['train'].shuffle(seed=1111),
    tokenized_dataset['validation'],
)

# Batch both splits in groups of 8. Training batches are reshuffled on every
# pass; validation keeps its order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(dataset=eval_dataset, shuffle=False, batch_size=8)

**Training and Validation**

In [None]:
import os

from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from transformers import set_seed
from torch.optim import AdamW

# Fine-tune the sequence classifier, validate after each epoch, and save a
# checkpoint whenever the validation loss improves.

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fresh 2-class classification head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained("KISTI-AI/scideberta", num_labels=2).to(device)

num_epochs = 1
# Total number of optimizer steps over ALL epochs. The scheduler decays the
# learning rate linearly to 0 over this many steps, so it must include
# num_epochs; otherwise every epoch after the first would train with lr == 0.
num_training_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01) #2e-5 to 5e-5

lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Checkpoints are written inside the `model_path` directory; create it up front.
os.makedirs(model_path, exist_ok=True)

best_val_loss = float("inf")  # starts at infinity so any real loss is smaller
progress_bar = tqdm(range(num_training_steps))  # one tick per training batch

for epoch in range(num_epochs):
    # ---- training ----
    model.train()  # enables dropout
    training_losses = []  # per-batch losses, as Python floats
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The loss used for classification needs integer class indices.
        labels = batch['labels'].to(device).long()

        # Forward pass; passing labels makes the model return the loss as well.
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        training_loss = output.loss
        training_losses.append(training_loss.item())

        # Backpropagate, update the weights, then advance the LR schedule.
        training_loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
    print("Mean Training Loss", np.mean(training_losses))

    # ---- validation ----
    model.eval()  # disables dropout
    val_loss = 0.0
    # no_grad() turns off gradient tracking (it does not "clear" gradients).
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).long()

            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # .item() converts to a Python float, so the running sum, the
            # comparison below and the saved checkpoint do not hold device tensors.
            val_loss += output.loss.item()

    # Mean of the per-batch validation losses.
    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")

    # Save a checkpoint only when this epoch beats the best validation loss so far.
    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        best_val_loss = avg_val_loss
        # Previously the file name was concatenated onto model_path with no
        # separator ("...distilbert-modelepoch_0.pt"); join the parts properly.
        checkpoint_path = os.path.join(model_path, f"epoch_{epoch}.pt")
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(),'val_loss': best_val_loss,}, checkpoint_path)
    print()

print(f"The best validation loss after {num_epochs} epochs is: {best_val_loss}")

**Evaluate the model on the validation split** (this notebook has no separate test set)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Run the trained model over the validation split and collect raw logits plus
# ground-truth labels for the metric cells below.
# Use a modest batch size: pushing the whole split through the model as a single
# batch can exhaust GPU memory. The logits do not depend on the batch size.
eval_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=32, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.eval()              # evaluation mode (dropout off)
test_batch_logits = []    # raw model outputs (logits), one tensor per batch
y_true = []               # ground-truth labels, in the same order as the logits
with torch.no_grad():     # inference only: do not track gradients
    for batch in eval_dataloader:
        # Move the inputs to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # No labels are passed, so the model returns logits only (no loss).
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=None)

        # Bring the logits back to the CPU right away so they do not pile up on the GPU.
        test_batch_logits.append(output.logits.cpu())
        # Labels are only needed for scoring; store them as NumPy values.
        y_true.extend(batch['labels'].cpu().detach().numpy())

# Sanity check: number of collected logit tensors vs. number of batches.
print(len(test_batch_logits),len(eval_dataloader))
# Concatenate the per-batch logits along dimension 0 into one tensor.
test_logits = torch.cat(test_batch_logits, dim=0)

# Sanity check: dimension 0 of the logits should equal the size of the
# validation split and the number of collected labels.
print(test_logits.shape,len(tokenized_dataset['validation']),len(y_true))

In [None]:
# Predicted class for each row = index of its largest logit.
y_pred = test_logits.argmax(dim=1).cpu().numpy()

# Show the first ten true labels, then the first ten predictions.
print(y_true[:10], y_pred[:10], sep="\n")

# Sanity check: exactly one prediction per label.
assert len(y_pred) == len(y_true)

In [None]:
# F1 for the positive class (binary averaging).
f1 = f1_score(y_true, y_pred, average='binary')
print('F1 Score:', f1)

# Fraction of predictions that match the true label.
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy Score:', accuracy)

**Confusion matrix**

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

# Count matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# Draw the same matrix as a figure.
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

In [None]:
# pip install transformer-ranker

In [None]:
# from datasets import load_dataset
# from transformer_ranker import TransformerRanker, prepare_popular_models

# # Step 1: Load the CoNLL-03 dataset from HuggingFace
# dataset = load_dataset('conll2003')

# # Step 2: Use our list of 17 'base' LMs as candidates
# language_models = prepare_popular_models('base')

# # Step 3: Initialize the ranker with the dataset
# ranker = TransformerRanker(dataset, dataset_downsample=0.2)

# # ... and run the ranker to obtain the ranking
# results = ranker.run(language_models, batch_size=64)

# # print the ranking
# print(results)