# E2E Runner

In [None]:
# Standard library imports
import sys
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split


# classifier = pipeline("text-classification", model="sumit2603/bert-sports-interview-classifier")
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForQuestionAnswering, AutoConfig
from transformers import pipeline, TrainerCallback, TrainingArguments, TrainerControl, TrainerState, Trainer
from datasets import Dataset
import torch
# from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, log_loss


# Report whether PyTorch can see a CUDA device.
print("CUDA available:", torch.cuda.is_available())

import os
# Switch off Weights & Biases through both of its environment variables and
# disable Hugging Face Hub telemetry, in a single update.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
    "HF_HUB_DISABLE_TELEMETRY": "1",
})

CUDA available: False


In [None]:
# Alternative inputs, currently disabled:
# transcripts_df = pd.read_csv('data/transcripts.csv').drop_duplicates(ignore_index=True)
# transcripts_df.insert(0, 'sample_id', range(1, len(transcripts_df) + 1))
# ACTIONS = list(set(pd.read_csv("data/actions.csv")["parameter"]))
# pseudo_df = pd.read_csv("data/pseudo_actions_labels_with_id.csv")[['sample_id', 'action_detected']]

# Processed transcripts with exact duplicate rows removed.
# NOTE(review): 'transcrpits' looks like a misspelling of 'transcripts'; it is left
# untouched because it has to match the file name on disk.
transcripts_df = pd.read_csv('data/transcrpits_processed.csv').drop_duplicates(ignore_index=True)

# Only these four columns of the augmented-text table are kept.
augmented_columns = ['sample_id', 'augmented_text', 'tokenized_augmented_text', 'action']
augmented_texts_df = pd.read_csv('data/augmented_texts_processed.csv')[augmented_columns]

# Per-sample fold assignment; only the 'fold1' column is used here.
transcripts_folds_df = pd.read_csv('data/transcripts_folds.csv')
fold1_assignment = transcripts_folds_df['fold1']

# Deliberate swap: rows tagged 'val' in fold1 are used for TRAINING and rows tagged
# 'train' are used for VALIDATION. The original notes give the sizes of these two
# groups as 40% and 20% of the data respectively — TODO confirm against the folds file.
train_ids = transcripts_folds_df.loc[fold1_assignment == 'val', ['sample_id']]
val_ids = transcripts_folds_df.loc[fold1_assignment == 'train', ['sample_id']]
test_ids = transcripts_folds_df.loc[fold1_assignment == 'test', ['sample_id']]

In [None]:


# Random-split alternative, currently disabled in favour of the precomputed folds:
# transcripts_train, transcripts_test = train_test_split(transcripts_df, test_size=0.6, random_state=42)
# transcripts_val, transcripts_test = train_test_split(transcripts_test, test_size=0.66, random_state=42)

model_columns = ['Text', 'events', 'Label']

# Materialise each split by inner-joining the transcripts with that split's sample ids.
transcripts_train = transcripts_df.merge(train_ids, on='sample_id')
transcripts_val = transcripts_df.merge(val_ids, on='sample_id')
transcripts_test = transcripts_df.merge(test_ids, on='sample_id')

# One row per augmented sample whose sample_id belongs to the training split.
# NOTE(review): the 'Text' kept here comes from transcripts_df, not from
# 'augmented_text', so these rows repeat the original transcript text once per
# augmentation instead of contributing the augmented wording — confirm this is intended.
train_sample_ids = transcripts_train[['sample_id']]
transcripts_train_aug = (
    augmented_texts_df
    .merge(train_sample_ids, on='sample_id', how='inner')
    .merge(transcripts_df, on='sample_id', how='inner')
)[model_columns]

# Drop everything except the modelling columns (this must happen after the
# sample_id lookup above, which still needs the id column).
transcripts_train = transcripts_train[model_columns]
transcripts_test = transcripts_test[model_columns]
transcripts_val = transcripts_val[model_columns]

# Negative class (Label == 0): three copies of the plain rows plus two copies of the
# augmentation-derived rows.
train_neg = transcripts_train.loc[transcripts_train['Label'] == 0]
train_neg_aug = transcripts_train_aug.loc[transcripts_train_aug['Label'] == 0]
train_neg = pd.concat([train_neg] * 3 + [train_neg_aug] * 2, ignore_index=True)

# Positive class (Label == 1): the plain rows plus a seeded 33% sample of the
# augmentation-derived rows.
train_pos = transcripts_train.loc[transcripts_train['Label'] == 1]
train_pos_aug = (
    transcripts_train_aug
    .loc[transcripts_train_aug['Label'] == 1]
    .sample(frac=0.33, random_state=42)
)
train_pos = pd.concat([train_pos, train_pos_aug], ignore_index=True)

# Stack both classes and shuffle reproducibly.
train_df_balanced = (
    pd.concat([train_pos, train_neg], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)



In [None]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour.

    Args:
        seed: Integer seed applied to every generator (default 42). Its string form
            is also written to the ``PYTHONHASHSEED`` environment variable.
    """
    import os
    import random

    import numpy as np
    import torch

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Apply the same seed to every generator, CPU and CUDA alike.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Turn on deterministic cuDNN mode and turn off its benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def print_metrics(y_true, y_pred):
    """Summarise binary-classification quality as a one-row DataFrame.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1), aligned with ``y_true``.

    Returns:
        pandas.DataFrame with a single row holding per-class precision and recall,
        micro and macro F1 (all rounded to 3 decimals) and the four confusion-matrix
        counts as integers.
    """
    # These three scorers are not imported at notebook level (only inside
    # compute_metrics), so import them here to avoid a NameError on a fresh kernel.
    from sklearn.metrics import confusion_matrix, precision_score, recall_score

    def per_class(scorer, class_id):
        # Restricting `labels` to a single class makes the macro average equal to
        # that class's own score; zero_division=0 yields 0 when it is undefined.
        return round(
            scorer(y_true, y_pred, labels=[class_id], average='macro', zero_division=0),
            3,
        )

    micro_f1 = f1_score(y_true, y_pred, average='micro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return pd.DataFrame({
        "precision_label_0": [per_class(precision_score, 0)],
        "precision_label_1": [per_class(precision_score, 1)],
        "recall_label_0": [per_class(recall_score, 0)],
        "recall_label_1": [per_class(recall_score, 1)],
        "micro_f1": [round(micro_f1, 3)],
        "macro_f1": [round(macro_f1, 3)],
        "true_negative": [int(tn)],
        "false_positive": [int(fp)],
        "false_negative": [int(fn)],
        "true_positive": [int(tp)]
    })

In [None]:
# Alternative checkpoints, currently disabled. Where a `grads` list follows a
# checkpoint, it is the matching list of parameter names for that model.
# model_name = 'RajSang/pegasus-sports-titles'
# model_name = "Chrisneverdie/OnlySportsLM_196M"
# model_name = 'Chrisneverdie/OnlySports_Classifier'
#
# model_name = 'emeraldgoose/bert-base-v1-sports'
# grads = ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
#
# model_name = 'leomaurodesenv/bert-basketball-qa'
# grads = ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
#
# model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
#
# model_name = 'SushantGautam/SportsSum'
# grads = ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']

# Active checkpoint.
model_name = "microsoft/SportsBERT"

# Parameter names of the pooler and the classification head for this checkpoint.
grads = [
    'bert.pooler.dense.bias',
    'bert.pooler.dense.weight',
    'classifier.bias',
    'classifier.weight',
]

# Use the GPU when one is available; hard-code "cpu" here to force CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Load the checkpoint with a two-class classification head.
# NOTE(review): the training cell loads the same checkpoint again after seeding,
# which replaces this instance.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/SportsBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Seed first so the freshly initialised head weights are reproducible.
seed_everything(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(DEVICE)

# LoRA alternative, currently disabled:
# config = LoraConfig(task_type=TaskType.SEQ_CLS, r=32, lora_alpha=32, lora_dropout=0.1)
# model = get_peft_model(model, config)

# Freeze every parameter except those named in `grads` and those whose name starts
# with "bert.encoder.layer.11".
# Head-only alternative: param.requires_grad = param_name.startswith(("classifier", "score"))
for param_name, param in model.named_parameters():
    in_unfrozen_layer = param_name.startswith("bert.encoder.layer.11")
    param.requires_grad = param_name in grads or in_unfrozen_layer

# Report how much of the network is actually being trained.
param_counts = [(param.numel(), param.requires_grad) for param in model.parameters()]
trainable = sum(count for count, is_trainable in param_counts if is_trainable)
total = sum(count for count, _ in param_counts)
print(f"Trainable params: {trainable}/{total} ({100*trainable/total:.2f}%)")

# Training uses the balanced frame; slice it (e.g. train_df_balanced[:400]) for a quick run.
train_df_union = train_df_balanced

# Both datasets expose the columns as 'text' / 'label'.
hf_column_names = {'Text': 'text', 'Label': 'label'}
source_columns = list(hf_column_names)
train_dataset = Dataset.from_pandas(
    train_df_union[source_columns].rename(columns=hf_column_names)
)
# Despite the name, `test_dataset` here is built from the validation split
# (transcripts_val); it is the evaluation set used during training.
test_dataset = Dataset.from_pandas(
    transcripts_val[source_columns].rename(columns=hf_column_names)
)

def preprocess_function(examples):
    """Tokenise the 'text' field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a 'text' entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for those texts, truncated and padded to 128 tokens.
    """
    texts = examples['text']
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(texts, **tokenizer_options)

# Tokenise both datasets in batches.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Training configuration.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="data/validator_checkpoint",
    logging_dir="./logs",
    report_to="none",
    # Schedule and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Log, evaluate and checkpoint every 2 optimisation steps, keeping at most 2 checkpoints
    logging_steps=2,
    eval_strategy="steps",
    eval_steps=2,
    save_strategy="steps",
    save_steps=2,
    save_total_limit=2,
    # Optimisation
    learning_rate=1e-4,
    weight_decay=0.2,  # Further increased regularization
    lr_scheduler_type='polynomial',
    # Model selection: reload the checkpoint with the highest macro F1 when training ends
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
)

# Define compute_metrics function


def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into a dictionary of metrics.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``; the predicted class
            is the argmax of the logits over the last axis.

    Returns:
        dict with per-class precision and recall, accuracy, micro / macro / weighted
        F1 (all rounded to 3 decimals) and the integer confusion-matrix counts.
    """
    from sklearn.metrics import confusion_matrix, precision_score, recall_score

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def per_class(scorer, class_id):
        # Score one class only; zero_division=0 gives 0 when the score is undefined.
        raw = scorer(labels, predictions, labels=[class_id], average='macro', zero_division=0)
        return round(raw, 3)

    def f1_for(averaging):
        return round(f1_score(labels, predictions, average=averaging), 3)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing.
    tn, fp, fn, tp = (
        int(cell) for cell in confusion_matrix(labels, predictions, labels=[0, 1]).ravel()
    )

    return {
        "precision_label_0": per_class(precision_score, 0),
        "precision_label_1": per_class(precision_score, 1),
        "recall_label_0": per_class(recall_score, 0),
        "recall_label_1": per_class(recall_score, 1),
        "accuracy": round(accuracy_score(labels, predictions), 3),
        "micro_f1": f1_for('micro'),
        "macro_f1": f1_for('macro'),
        "weighted_f1": f1_for('weighted'),
        "true_negative": tn,
        "false_positive": fp,
        "false_negative": fn,
        "true_positive": tp
    }


class PrintLossCallback(TrainerCallback):
    """Trainer callback that prints the latest train and eval loss after each epoch."""

    @staticmethod
    def _latest(log_history, key):
        """Return the most recently logged value for ``key``, or None if absent."""
        for entry in reversed(log_history):
            if key in entry:
                return entry[key]
        return None

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the newest 'loss' and 'eval_loss' found in ``state.log_history``.

        The history is searched from newest to oldest for each key rather than
        assuming the last two entries are exactly (train log, eval log). That
        assumption breaks whenever logging and evaluation are not emitted
        back-to-back, and the losses would then be reported as None.
        """
        train_loss = self._latest(state.log_history, 'loss')
        eval_loss = self._latest(state.log_history, 'eval_loss')
        print(f"Epoch {int(state.epoch) if state.epoch is not None else '?'}: train_loss = {train_loss}, eval_loss = {eval_loss}")

# Assemble the Trainer: `tokenized_test` is the evaluation set used during training.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    callbacks=[PrintLossCallback()],
)
trainer = Trainer(**trainer_options)

# Fine-tune.
train_result = trainer.train()

# Report which checkpoint was selected as best, if any.
best_checkpoint = getattr(getattr(trainer, 'state', None), 'best_model_checkpoint', None)
if best_checkpoint is None:
    print("No best model checkpoint found.")
else:
    print(f"Best model checkpoint: {best_checkpoint}")


In [None]:
# Wrap the fine-tuned model in an inference pipeline on the device used for training
# (pipeline convention: 0 = first GPU, -1 = CPU).
classifier = pipeline("text-classification", model=trainer.model, tokenizer=tokenizer, device=0 if DEVICE == "cuda" else -1)

# Held-out test split, with the 'text' / 'label' column names used elsewhere.
test_dataset = Dataset.from_pandas(transcripts_test[['Text', 'Label']].rename(columns={'Text': 'text', 'Label': 'label'}))

y_true = test_dataset['label']

# Truncate to the same 128-token limit that preprocess_function applies during
# training and validation. Without it the pipeline feeds un-truncated transcripts,
# so test inputs differ from what the model was tuned and selected on, and
# sufficiently long texts exceed the model's maximum sequence length.
raw_predictions = classifier(
    list(test_dataset['text']),
    batch_size=32,
    truncation=True,
    max_length=128,
)

# Each prediction's 'label' is either already an int or a string whose part after
# the last '_' is the class id; normalise both to int.
y_pred = [
    int(d['label']) if isinstance(d['label'], int) else int(d['label'].split('_')[-1])
    for d in raw_predictions
]
print_metrics(y_true, y_pred)
