# WSC Project - Data Analysis & NLP

In [14]:
# Standard library imports
import warnings
warnings.filterwarnings('ignore')
import ast
import re
import pickle

# Data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP imports
import nltk
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import json
import os 
import random

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import LinearSVC
import tiktoken
import pickle
# import xgboost as xgb
from lightgbm import LGBMClassifier

# Jupyter settings
%matplotlib inline
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display settings for full text
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

# load and utils

In [15]:
ACTIONS = list(set(pd.read_csv("data/actions.csv")["parameter"]))
pseudo_df = pd.read_csv("data/pseudo_actions_labels_with_id.csv")[['sample_id', 'action_detected']]
transcripts_folds_df = pd.read_csv('data/transcripts_folds.csv')
features_df = transcripts_folds_df[['sample_id', 'Text', 'tokenized_text', 'events', 'actions_in_text', 'fold1', 'fold2', 'fold3', 'fold4', 'fold5']]
# features_df = transcripts_folds_df[['sample_id', 'Text', 'tokenized_text', 'events', 'actions_in_text']]

expr_df = pseudo_df.merge(features_df, on='sample_id', how='inner').rename(columns={'action_detected': 'actions_pseudo_label', 'actions_in_text': 'actions_str_detected', 'Text': 'transcript_text'})
expr_df['actions_pseudo_label'] =  expr_df['actions_pseudo_label'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

with open('data/actions_processed_to_action.json', 'r') as f:
    actions_map = json.load(f)

inverse_actions_map = {v: k for k, v in actions_map.items()}
expr_df['actions_str_detected'] = expr_df['actions_str_detected'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
expr_df['actions_str_detected'] = expr_df['actions_str_detected'].apply(lambda actions: [inverse_actions_map.get(a, a) for a in actions])

# Transformers

In [16]:

augmented_texts_df = pd.read_csv('data/augmented_texts_processed.csv')[['sample_id', 'augmented_text', 'tokenized_augmented_text', 'action']]

def concat_augmentations_to_fold_df(fold_train_df):
    augmented_texts_train = augmented_texts_df[augmented_texts_df['sample_id'].isin(fold_train_df['sample_id'])]
    augmented_texts_train = augmented_texts_train.rename(columns={'augmented_text': 'transcript_text', 'action': 'actions_pseudo_label', 'tokenized_augmented_text': 'tokenized_text'})
    augmented_texts_train['actions_pseudo_label'] = augmented_texts_train['actions_pseudo_label'].apply(lambda x: [x])
    augmented_texts_train = augmented_texts_train.merge(
        fold_train_df.drop(['actions_pseudo_label', 'transcript_text', 'tokenized_text'], axis=1), 
        on='sample_id', 
        how='inner'
    )
    return pd.concat([fold_train_df, augmented_texts_train], ignore_index=True)

In [17]:
AUGMENT_DATA = True

df = expr_df.copy()
action_counts = df['actions_pseudo_label'].explode().value_counts()

fold_idx = 1
fold_col = f'fold{fold_idx}'
fold_train_df = df[df[fold_col] == 'train']
if AUGMENT_DATA:
    fold_train_df = concat_augmentations_to_fold_df(fold_train_df)
    fold_val_df = df[df[fold_col] == 'val']

In [None]:
train_df = fold_train_df.copy()
train_df['events_str'] = ['Events: ' + ', '.join(l) for l in [[] if l == '[None]' else eval(l) for l in train_df['events'].tolist()]]
train_df['events_and_transcript'] = train_df['events_str'] + '\n Transcript: ' + train_df['transcript_text']
# train_df['events_and_transcript'] =  train_df['transcript_text']

val_df = fold_val_df.copy()
# val_df['events_str'] = ['Events: ' + ', '.join(l) for l in [[] if l == '[None]' else eval(l) for l in val_df['events'].tolist()]]
# val_df['events_and_transcript'] = val_df['events_str'] + '\n Transcript: ' + val_df['transcript_text']
val_df['events_and_transcript'] =  val_df['transcript_text']

X_train = train_df['events_and_transcript'].tolist()
y_train = fold_train_df['actions_pseudo_label'].tolist()

X_val = val_df['events_and_transcript'].tolist()
y_val = val_df['actions_pseudo_label'].tolist()

In [19]:
INFERENCE_COLLECTION = 'train'
# INFERENCE_COLLECTION = 'val'

EMBEDDINGS_TASK = 'detector'
# EMBEDDINGS_TASK = 'validator'


## Action detection

In [None]:
all_df = df.copy()
all_df['events_str'] = ['Events: ' + ', '.join(l) for l in [[] if l == '[None]' else eval(l) for l in all_df['events'].tolist()]]
all_df['events_and_transcript'] = all_df['events_str'] + '\n Transcript: ' + all_df['transcript_text']
all_df_text = all_df['transcript_text'].tolist()

In [None]:
from transformers import pipeline
import torch
import json



MODEL = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

classifier = pipeline("zero-shot-classification", model=MODEL, device=-1)

batch_size = 16 # or any batch size you prefer
actions_preds_all = []
for i in tqdm(range(0, len(all_df_text), batch_size)):
    batch = all_df_text[i:i+batch_size]
    actions_preds_all.extend(classifier(batch, ACTIONS, multi_label=False))


# Convert the list to a dict, using index as key
actions_preds_val_dict = {i: v for i, v in enumerate(actions_preds_all)}
with open('data/all_df_preds_DeBERTa-v3-base-mnli-fever-anli.json', 'w') as f:
    json.dump(actions_preds_val_dict, f)


  0%|          | 0/70 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 70/70 [4:47:52<00:00, 246.74s/it]    


## Action Validator

In [73]:
from transformers import AutoModelForSequenceClassification

In [None]:
filtered = [(x, y, label) for x, y, label in zip(X_val, y_val, val_df['Label']) if len(y) == 1]

In [103]:
premise = "John Collins can't convert the lob inside, yesterday he slam dunked it"
# text = "Angela Merkel is a politician in Germany and leader of the CDU"
hypothesis_template = "This action mentioned in the commentary is {}"
classes_verbalized = ACTIONS
zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0")  # change the model identifier here
output = zeroshot_classifier(text, classes_verbalized, hypothesis_template=hypothesis_template, multi_label=True)
output

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'sequence': 'Angela Merkel is a politician in Germany and leader of the CDU',
 'labels': ['side step',
  'floater',
  'post up',
  'no look pass',
  'alley oop',
  'splash',
  'poster',
  'bank shot',
  'slam dunk',
  'tip in',
  'fake',
  'dime',
  'nothing but net',
  'lob',
  'outlet pass',
  'behind the back',
  'step back',
  'pick and roll',
  'swish',
  'give and go',
  'reverse dunk',
  'take it to the rack',
  'fadeaway',
  'pump fake',
  'euro step',
  'double team',
  'between the legs',
  'jab step',
  'flop',
  'backdoor',
  'shake and bake',
  'coast to coast',
  'teardrop',
  'baseball pass',
  'rainbow shot',
  'jam',
  'finger roll',
  'tomahawk'],
 'scores': [0.02935909666121006,
  0.027067378163337708,
  0.023653825744986534,
  0.014645302668213844,
  0.013345428742468357,
  0.010482476092875004,
  0.0086736548691988,
  0.0067649600096046925,
  0.006156327668577433,
  0.005655469838529825,
  0.003958581481128931,
  0.0034659183584153652,
  0.0034551972057670355,
  0

In [99]:
model_name = "MoritzLaurer/DeBERTa-v3-small-mnli-fever-docnli-ling-2c"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

# premise = "I first thought that I liked the movie, but upon second thought it was actually disappointing."
# hypothesis = "The movie was good."



# model_name = "MoritzLaurer/DeBERTa-v3-xsmall-mnli-fever-anli-ling-binary"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# premise = "I first thought that I liked the movie, but upon second thought it was actually disappointing."
# hypothesis = "The movie was good."

# premise = "'Jordan Posterized Oxymora!'"
premise = "John Collins can't convert the lob inside, yesterday he slam dunked it"
# hypothesis = "A poster action is mentioned and executed in this current play."
# hypothesis = "A slam dunked action is mentioned but executed sometime else" 
hypothesis = "A slam dunked action is mentioned and executed" 
# hypothesis = "A poster action is mentioned and executed" 


input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
output = model(input["input_ids"])  # device = "cuda:0" or "cpu"
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "not_entailment"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)

{'entailment': 90.8, 'not_entailment': 9.2}


# Deroberta

In [32]:
# MODEL = "microsoft/deberta-v3-base"
MODEL = "MoritzLaurer/DeBERTa‑v3‑base‑mnli"

# MODEL = "microsoft/SportsBERT"

In [33]:
DEVICE = 'cpu'

def set_deterministic(s=42):
    os.environ["PYTHONHASHSEED"]=str(s); random.seed(s); np.random.seed(s); torch.manual_seed(s)
    try: torch.use_deterministic_algorithms(True)
    except: pass

rng = np.random.default_rng(42)

def build_train_pairs(X,Y,A):
    T,A2,L=[],[],[]
    for t,ys in zip(X,Y):
        P=list(set(ys)); N=[a for a in A if a not in P]; k=max(len(ys),len(P))
        if N: N=rng.choice(N,size=min(k,len(N)),replace=False).tolist()
        for a in P: T.append(t); A2.append(H(a)); L.append(1.)
        for a in N: T.append(t); A2.append(H(a)); L.append(0.)
    return T,A2,torch.tensor(L,dtype=torch.float32)

def build_val_pairs(X,Y,A):
    T,A2,L=[],[],[]
    for t,ys in zip(X,Y):
        S=set(ys)
        for a in A:
            T.append(t)
            A2.append(H(a))
            L.append(1. if a in S else 0.)
    return T,A2,torch.tensor(L,dtype=torch.float32)


class Pairs(torch.utils.data.Dataset):
    def __init__(self,tok,T,A,L,m=160): self.tok,tok.model_max_length,self.T,self.A,self.L,self.m=tok,m,T,A,L,m
    def __len__(self): return len(self.T)
    def __getitem__(self,i):
        enc=self.tok(self.T[i], self.A[i], truncation=True, padding="max_length", max_length=self.m)
        # DeBERTa does not use token_type_ids, so always remove if present
        enc.pop("token_type_ids", None)
        enc={k:torch.tensor(v) for k,v in enc.items()}; enc["labels"]=self.L[i]; return enc


if EMBEDDINGS_TASK == 'detector':
    H = lambda action: f"This commentary mentions {action}."
elif EMBEDDINGS_TASK == 'validator':
    H = lambda action: f"A {action} occurs in this play."

set_deterministic(42)
tok=AutoTokenizer.from_pretrained(MODEL, use_fast=True)
# DeBERTa does not have .to() for tokenizer, so skip moving tokenizer to device
if tok.pad_token is None: tok.pad_token = tok.eos_token or tok.unk_token

Ttr,Atr,Ltr=build_train_pairs(X_train,y_train,ACTIONS)
Tva,Ava,Lva=build_val_pairs(X_val,y_val,ACTIONS)
ds_tr,ds_va=Pairs(tok,Ttr,Atr,Ltr),Pairs(tok,Tva,Ava,Lva)

enc=AutoModel.from_pretrained(MODEL)
for p in enc.parameters(): p.requires_grad=False
enc = enc.to(DEVICE)

print('Train trainscripts data', len(X_train))
print('Val trainscripts data', len(X_val))
print('Train data points', len(Ttr))
print('Val data points', len(Tva))


if INFERENCE_COLLECTION =='train':
    x_text = Ttr
    x_acts = Atr
    X_binary_label = Ltr

if INFERENCE_COLLECTION =='val':
    x_text = Tva
    x_acts = Ava
    X_binary_label = Lva

print(INFERENCE_COLLECTION)
print(len(x_text))
mean_pooling_embs = []
cls_embs = []
bs = 64
enc.eval()
with torch.no_grad():
    for i in tqdm(range(0, len(x_text), bs)):
        print(f"{(i/len(x_text)*100):.1f}%")
        t_batch = x_text[i:i+bs]
        a_batch = x_acts[i:i+bs]
        batch = tok(t_batch, a_batch, truncation=True, padding="max_length", max_length=160, return_tensors="pt")
        # DeBERTa does not use token_type_ids, so always remove if present
        batch.pop("token_type_ids", None)
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        out = enc(**batch)
        
        attn_mask = batch['attention_mask'].unsqueeze(-1)
        sum_hidden = (out.last_hidden_state * attn_mask).sum(1)
        lens = attn_mask.sum(1).clamp(min=1)
        mean_pooled = sum_hidden / lens
        mean_pooling_embs.append(mean_pooled.detach().cpu())
        cls_embs.append(out.last_hidden_state[:,0].detach().cpu())

cls_embs = torch.cat(cls_embs)
mean_pooling_embs = torch.cat(mean_pooling_embs)

OSError: Incorrect path_or_model_id: 'MoritzLaurer/DeBERTa‑v3‑base‑mnli'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

## SportsBERT

### create embeddings

In [None]:
# DEVICE = 'cpu'

# def set_deterministic(s=42):
#     os.environ["PYTHONHASHSEED"]=str(s); random.seed(s); np.random.seed(s); torch.manual_seed(s)
#     try: torch.use_deterministic_algorithms(True)
#     except: pass

# rng = np.random.default_rng(42)

# def build_train_pairs(X,Y,A):
#     T,A2,L=[],[],[]
#     for t,ys in zip(X,Y):
#         P=list(set(ys)); N=[a for a in A if a not in P]; k=max(len(ys),len(P))
#         if N: N=rng.choice(N,size=min(k,len(N)),replace=False).tolist()
#         for a in P: T.append(t); A2.append(H(a)); L.append(1.)
#         for a in N: T.append(t); A2.append(H(a)); L.append(0.)
#     return T,A2,torch.tensor(L,dtype=torch.float32)

# def build_val_pairs(X,Y,A):
#     T,A2,L=[],[],[]
#     for t,ys in zip(X,Y):
#         S=set(ys)
#         for a in A:
#             T.append(t)
#             A2.append(H(a))
#             L.append(1. if a in S else 0.)
#     return T,A2,torch.tensor(L,dtype=torch.float32)


# class Pairs(torch.utils.data.Dataset):
#     def __init__(self,tok,T,A,L,m=160): self.tok,tok.model_max_length,self.T,self.A,self.L,self.m=tok,m,T,A,L,m
#     def __len__(self): return len(self.T)
#     def __getitem__(self,i):
#         enc=self.tok(self.T[i], self.A[i], truncation=True, padding="max_length", max_length=self.m)
#         # --- drop token_type_ids for single-segment models ---
#         enc.pop("token_type_ids", None)  # <<< changed
#         enc={k:torch.tensor(v) for k,v in enc.items()}; enc["labels"]=self.L[i]; return enc

# MODEL, REV = "microsoft/SportsBERT", "refs/pr/4"
# if EMBEDDINGS_TASK == 'detector':
#     H = lambda action: f"This commentary mentions {action}."
# elif EMBEDDINGS_TASK == 'validator':
#     H = lambda action: f"A {action} occurs in this play."

# set_deterministic(42)
# tok=AutoTokenizer.from_pretrained(MODEL, revision=REV, use_fast=True)
# tok = tok.to(DEVICE) if hasattr(tok, 'to') else tok
# if tok.pad_token is None: tok.pad_token = tok.eos_token or tok.unk_token

# Ttr,Atr,Ltr=build_train_pairs(X_train,y_train,ACTIONS)
# Tva,Ava,Lva=build_val_pairs(X_val,y_val,ACTIONS)
# ds_tr,ds_va=Pairs(tok,Ttr,Atr,Ltr),Pairs(tok,Tva,Ava,Lva)

# enc=AutoModel.from_pretrained(MODEL, revision=REV)
# for p in enc.parameters(): p.requires_grad=False
# enc = enc.to(DEVICE)

# print('Train trainscripts data', len(X_train))
# print('Val trainscripts data', len(X_val_sample))
# print('Train data points', len(Ttr))
# print('Val data points', len(Tva))


# if INFERENCE_COLLECTION =='train':
#     x_text = Ttr
#     x_acts = Atr
#     X_binary_label = Ltr

# if INFERENCE_COLLECTION =='val':
#     x_text = Tva
#     x_acts = Ava
#     X_binary_label = Lva

# print(INFERENCE_COLLECTION)
# print(len(x_text))
# mean_pooling_embs = []
# cls_embs = []
# bs = 64
# enc.eval()
# with torch.no_grad():
#     for i in tqdm(range(0, len(x_text), bs)):
#         print(f"{(i/len(x_text)*100):.1f}%")
#         t_batch = x_text[i:i+bs]
#         a_batch = x_acts[i:i+bs]
#         batch = tok(t_batch, a_batch, truncation=True, padding="max_length", max_length=160, return_tensors="pt")
#         # --- drop token_type_ids when the model can't use them (avoids IndexError) ---
#         if getattr(enc.config, "type_vocab_size", 2) <= 1 and "token_type_ids" in batch:  # <<< changed
#             batch.pop("token_type_ids")                                             # <<< changed
#         batch = {k: v.to(DEVICE) for k, v in batch.items()}
#         out = enc(**batch)
        
#         attn_mask = batch['attention_mask'].unsqueeze(-1)
#         sum_hidden = (out.last_hidden_state * attn_mask).sum(1)
#         lens = attn_mask.sum(1).clamp(min=1)  # <<< changed (safety vs. division by zero)
#         mean_pooled = sum_hidden / lens
#         mean_pooling_embs.append(mean_pooled.detach().cpu())
#         cls_embs.append(out.last_hidden_state[:,0].detach().cpu())

# cls_embs = torch.cat(cls_embs)
# mean_pooling_embs = torch.cat(mean_pooling_embs)

In [31]:
## Save Embds

In [None]:
WRITE = False

def save_embeddings(INFERENCE_COLLECTION, x_acts, cls_embs, mean_pooling_embs):
    # Ensure the output directory exists
    out_dir = "data/embds"
    os.makedirs(out_dir, exist_ok=True)

    # Prepare data to save
    embd_data = {
        "INFERENCE_COLLECTION": INFERENCE_COLLECTION,
        "actions_labels": [s.replace("This commentary mentions ", "").rstrip(".") for s in x_acts],
        "cls_embs": cls_embs.cpu(),  # ensure on CPU
        "mean_pooling_embs": mean_pooling_embs.cpu(),
        "sample_label": X_binary_label.cpu()


    }

    # Save as .pt (PyTorch native)
    torch.save(embd_data, os.path.join(out_dir, f"{INFERENCE_COLLECTION}_embds_{EMBEDDINGS_TASK}.pt"))

    # For pickle, better to convert to numpy for portability
    embd_data_np = {**embd_data,
                    "cls_embs": cls_embs.cpu().numpy(),
                    "mean_pooling_embs": mean_pooling_embs.cpu().numpy()}

    with open(os.path.join(out_dir, f"{INFERENCE_COLLECTION}_embds_{EMBEDDINGS_TASK}.pkl"), "wb") as f:
        pickle.dump(embd_data_np, f)

if WRITE:
    save_embeddings(INFERENCE_COLLECTION, x_acts, cls_embs, mean_pooling_embs)

In [15]:
def load_embeddings(INFERENCE_COLLECTION, embeddings_task, filetype="pt"):
    p = f"data/embds/{INFERENCE_COLLECTION}_embds_{embeddings_task}.{filetype}"
    if filetype == "pt":
        return torch.load(p)
    if filetype == "pkl":
        with open(p, "rb") as f:
            return pickle.load(f)
    raise ValueError