In [1]:
import numpy as np
import torch
from torch import nn
import pandas as pd
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the three wide CSVs read by this notebook.
# NOTE(review): these are absolute paths on one machine; anyone else running
# the notebook has to edit them first.
TEXT_CSV_PATH = '/Users/pranavturlapati/COMP-560-Deepseekers/Data/Classification Data/inputs_text.csv'
JSON_CSV_PATH = '/Users/pranavturlapati/COMP-560-Deepseekers/Data/Classification Data/inputs_json.csv'
LABEL_CSV_PATH = '/Users/pranavturlapati/COMP-560-Deepseekers/Data/Classification Data/outputs.csv'

# Read all three files with pandas' default CSV options.
text_df, json_df, label_df = (
    pd.read_csv(csv_path)
    for csv_path in (TEXT_CSV_PATH, JSON_CSV_PATH, LABEL_CSV_PATH)
)


In [3]:
def _to_long(wide_df, value_name):
    """Unpivot `wide_df` so each original column name becomes a `game_id`
    value and each cell lands in a column called `value_name`."""
    return wide_df.melt(var_name='game_id', value_name=value_name)


# One long frame per source file, all sharing the `game_id` key column.
df_text_long = _to_long(text_df, 'text')
df_json_long = _to_long(json_df, 'json_output')
df_label_long = _to_long(label_df, 'label')



In [4]:
# Number the rows *within* each game so the three long frames can be aligned
# on (game_id, row).  The position in the melted frame (df.index) is only a
# valid key when all three CSVs have identical column order and row counts;
# otherwise rows are silently mis-paired or dropped by the inner merge.
# A per-game counter lines up "n-th entry of game X" regardless of layout.
for df in [df_text_long, df_json_long, df_label_long]:
    df['row'] = df.groupby('game_id', sort=False).cumcount()

# (game_id, row) is unique in every frame by construction; validate= makes
# pandas check that assumption instead of fanning rows out if it ever breaks.
merge_keys = ['game_id', 'row']
merged = (
    df_text_long
    .merge(df_json_long, on=merge_keys, validate='one_to_one')
    .merge(df_label_long, on=merge_keys, validate='one_to_one')
)

# The helper `row` key is only needed for alignment; keep the payload columns.
merged = merged[['game_id', 'text', 'json_output', 'label']]


In [5]:
# Build one string per example: the free text, a literal ' [SEP] ' marker,
# then the JSON rendering.
merged['input'] = merged['text'] + ' [SEP] ' + merged['json_output']

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pad the batch, truncate anything longer than 512 tokens, and return NumPy
# arrays rather than framework tensors.
encode_options = dict(
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='np',
)
tokens = tokenizer(merged['input'].tolist(), **encode_options)

In [6]:
# Columns for the Hugging Face Dataset: the two tokenizer outputs used for
# training plus the label column taken from the merged frame.
dataset_columns = {
    'input_ids': tokens['input_ids'],
    'attention_mask': tokens['attention_mask'],
    'label': merged['label'].values,
}
full_dataset = Dataset.from_dict(dataset_columns)

In [7]:
# Split row positions (not rows) 80/20, stratified on the label column and
# seeded so the partition is the same on every run.
all_indices = np.arange(len(full_dataset))
train_idx, val_idx = train_test_split(
    all_indices,
    stratify=full_dataset['label'],
    test_size=0.2,
    random_state=42,
)


In [8]:
# Materialise the train / validation Dataset objects from the index arrays
# produced by the split step.
train_dataset, val_dataset = (
    full_dataset.select(split_idx) for split_idx in (train_idx, val_idx)
)


In [9]:
def oversample_dataset(dataset, minority_label=0, factor=3):
    """Replicate the examples of one class so they appear `factor` times.

    Args:
        dataset: Dataset exposing a 'label' column, `select(indices)`, and
            accepted by `concatenate_datasets`.
        minority_label: Label value whose examples are duplicated
            (default 0, the class the original code oversampled).
        factor: Integer number of times each minority example appears in the
            result, including the original copy (default 3, i.e. two extra
            copies are appended).

    Returns:
        The input dataset followed by `factor - 1` copies of its minority
        examples.  The input itself is returned unchanged when there is
        nothing to add (no minority examples, or `factor == 1`).

    Raises:
        ValueError: if `factor` is smaller than 1.
    """
    if factor < 1:
        raise ValueError(f"factor must be >= 1, got {factor}")

    # Read only the label column; iterating the dataset itself would build
    # every full row (token ids included) just to look at one field.
    minority_idx = [
        i for i, label in enumerate(dataset['label']) if label == minority_label
    ]
    if not minority_idx or factor == 1:
        return dataset

    minority = dataset.select(minority_idx)
    return concatenate_datasets([dataset] + [minority] * (factor - 1))

In [10]:
# Rebuild the training split from `full_dataset` / `train_idx` instead of
# feeding `train_dataset` back into itself: re-running this cell then always
# yields the same oversampled set rather than oversampling a second time.
# The shuffle is seeded (same value as the split's random_state) so the
# training order is reproducible.
train_dataset = oversample_dataset(full_dataset.select(train_idx)).shuffle(seed=42)

In [11]:
# Per-class weight = n_samples / (2 * class_count), computed on the
# (already oversampled) training labels; rarer classes get larger weights.
class_counts = np.bincount(train_dataset['label'])
n_train = len(train_dataset)
neg_weight, pos_weight = (
    n_train / (2 * class_counts[label]) for label in (0, 1)
)

# Index 0 -> class 0 weight, index 1 -> class 1 weight.
class_weights = torch.tensor([neg_weight, pos_weight], dtype=torch.float32)


In [12]:
# Config overrides passed to from_pretrained: both dropout probabilities are
# set to 0.3 for this fine-tuning run.
dropout_overrides = {
    "hidden_dropout_prob": 0.3,
    "attention_probs_dropout_prob": 0.3,
}

# Binary sequence classifier on top of the bert-base-uncased checkpoint.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    **dropout_overrides,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
class WeightedLoss(nn.Module):
    """Cross-entropy loss with per-class weights.

    Args:
        weight: 1-D tensor with one weight per class.  When omitted, the
            notebook-level `class_weights` tensor is used (looked up when the
            instance is created).
    """

    def __init__(self, weight=None):
        super().__init__()
        if weight is None:
            weight = class_weights
        self.loss_fct = nn.CrossEntropyLoss(weight=weight)

    def forward(self, logits, labels):
        """Return the weighted cross-entropy of `logits` against `labels`.

        `logits` is flattened to (-1, num_classes) using its last dimension
        and `labels` to (-1,), so any number of classes is supported.
        """
        # The weight buffer must live on the same device as the logits.
        weight = self.loss_fct.weight
        if weight is not None and weight.device != logits.device:
            self.loss_fct.to(logits.device)
        return self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))


# NOTE(review): the loss is deliberately NOT stored on `model.config`.
# Nothing in this notebook reads `config.loss_fct`, and the config is written
# out as JSON when a checkpoint is saved, which an nn.Module attribute would
# break.  For the weights to affect training the loss has to be applied where
# the Trainer computes its loss (e.g. a Trainer subclass overriding
# `compute_loss`).

In [14]:
def compute_metrics(eval_pred):
    """Turn an evaluation `(logits, labels)` pair into per-class metrics.

    The predicted class is the arg-max over axis 1 of the logits.  Returns a
    dict with precision / recall / F1 for class 0, F1 for class 1, and the
    fraction of examples predicted as class 0.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # average=None yields one value per entry of `labels`; zero_division=0
    # reports 0 when a class is never predicted or never present.
    per_class = precision_recall_fscore_support(
        gold, predicted, labels=[0, 1], average=None, zero_division=0
    )
    prec, rec, f_score = per_class[0], per_class[1], per_class[2]

    metrics = {}
    metrics["precision_0"] = prec[0]
    metrics["recall_0"] = rec[0]
    metrics["f1_0"] = f_score[0]
    metrics["f1_1"] = f_score[1]
    metrics["pred_0_ratio"] = np.mean(predicted == 0)
    return metrics


In [15]:
training_args = TrainingArguments(
    # Output location and logging; external experiment reporting is off.
    output_dir="./results",
    report_to="none",
    logging_steps=20,
    # Evaluate and checkpoint on the same per-epoch cadence.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Model selection: reload the checkpoint with the highest class-0 F1
    # ("f1_0" is one of the keys returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="f1_0",
    greater_is_better=True,
)

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    A stock Trainer uses the loss returned by the model's own forward pass,
    so `class_weights` would never influence training.  This subclass
    computes the loss itself from the logits.

    Args:
        class_weights: optional 1-D tensor with one weight per class; when
            None the loss is plain (unweighted) cross-entropy.
        *args, **kwargs: forwarded unchanged to `Trainer`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score the logits ourselves.

        Extra keyword arguments that some library versions pass to
        `compute_loss` are accepted and ignored.
        """
        labels = inputs["labels"]
        # Leave `inputs` untouched; just don't hand the labels to the model
        # so it skips its built-in unweighted loss.
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            # Match the logits' device/dtype (GPU, mixed precision, ...).
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)), labels.view(-1), weight=weight
        )
        return (loss, outputs) if return_outputs else loss


# An nn.Module stored on the config cannot be written to JSON when a
# checkpoint is saved; drop it if an earlier cell put one there.
if isinstance(getattr(model.config, "loss_fct", None), nn.Module):
    del model.config.loss_fct

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    # Stop when the selection metric fails to improve for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)


In [None]:
# Run fine-tuning with the `trainer` object built in the previous cell
# (up to 10 epochs, with early stopping after 2 evaluations without improvement).
trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate on the trainer's evaluation dataset; the metric keys are the ones
# returned by compute_metrics, prefixed with "eval_".
results = trainer.evaluate()

report_lines = [
    "\nFinal Evaluation Results:",
    f"Class 0 F1: {results['eval_f1_0']:.4f}",
    f"Class 1 F1: {results['eval_f1_1']:.4f}",
    f"Class 0 Prediction Ratio: {results['eval_pred_0_ratio']:.2%}",
]
print(*report_lines, sep="\n")