# WSC Project - Data Analysis & NLP

In [18]:
# Standard library imports
import warnings
warnings.filterwarnings('ignore')
import ast
import re
import pickle

# Data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP imports
import nltk
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import json
import os 
import random
from tqdm import tqdm

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import LinearSVC
import tiktoken
import pickle
# import xgboost as xgb
from lightgbm import LGBMClassifier

# Jupyter settings
%matplotlib inline
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display settings for full text
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

# load and utils

In [19]:
ACTIONS = list(set(pd.read_csv("data/actions.csv")["parameter"]))
pseudo_df = pd.read_csv("data/pseudo_actions_labels_with_id.csv")[['sample_id', 'action_detected']]
transcripts_folds_df = pd.read_csv('data/transcripts_folds.csv')
features_df = transcripts_folds_df[['sample_id', 'Text', 'tokenized_text', 'events', 'actions_in_text', 'fold1', 'fold2', 'fold3', 'fold4', 'fold5']]
# features_df = transcripts_folds_df[['sample_id', 'Text', 'tokenized_text', 'events', 'actions_in_text']]

expr_df = pseudo_df.merge(features_df, on='sample_id', how='inner').rename(columns={'action_detected': 'actions_pseudo_label', 'actions_in_text': 'actions_str_detected', 'Text': 'transcript_text'})
expr_df['actions_pseudo_label'] =  expr_df['actions_pseudo_label'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

with open('data/actions_processed_to_action.json', 'r') as f:
    actions_map = json.load(f)

inverse_actions_map = {v: k for k, v in actions_map.items()}
expr_df['actions_str_detected'] = expr_df['actions_str_detected'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
expr_df['actions_str_detected'] = expr_df['actions_str_detected'].apply(lambda actions: [inverse_actions_map.get(a, a) for a in actions])

In [20]:

augmented_texts_df = pd.read_csv('data/augmented_texts_processed.csv')[['sample_id', 'augmented_text', 'tokenized_augmented_text', 'action']]

def concat_augmentations_to_fold_df(fold_train_df):
    augmented_texts_train = augmented_texts_df[augmented_texts_df['sample_id'].isin(fold_train_df['sample_id'])]
    augmented_texts_train = augmented_texts_train.rename(columns={'augmented_text': 'transcript_text', 'action': 'actions_pseudo_label', 'tokenized_augmented_text': 'tokenized_text'})
    augmented_texts_train['actions_pseudo_label'] = augmented_texts_train['actions_pseudo_label'].apply(lambda x: [x])
    augmented_texts_train = augmented_texts_train.merge(
        fold_train_df.drop(['actions_pseudo_label', 'transcript_text', 'tokenized_text'], axis=1), 
        on='sample_id', 
        how='inner'
    )
    return pd.concat([fold_train_df, augmented_texts_train], ignore_index=True)

In [32]:
AUGMENT_DATA = False
INCLUDE_EVENTS = False

df = expr_df.copy()
action_counts = df['actions_pseudo_label'].explode().value_counts()

fold_col = f'fold1'
train_df = df[df[fold_col] == 'train']
if AUGMENT_DATA:
    train_df = concat_augmentations_to_fold_df(train_df)
    val_df = df[df[fold_col] == 'val']

if INCLUDE_EVENTS:
    train_df['events_str'] = ['Events: ' + ', '.join(l) for l in [[] if l == '[None]' else eval(l) for l in train_df['events'].tolist()]]
    train_df['events_and_transcript'] = train_df['events_str'] + '\n Transcript: ' + train_df['transcript_text']
else:
    train_df['events_and_transcript'] =  train_df['transcript_text']

if INCLUDE_EVENTS:
    val_df['events_str'] = ['Events: ' + ', '.join(l) for l in [[] if l == '[None]' else eval(l) for l in val_df['events'].tolist()]]
    val_df['events_and_transcript'] = val_df['events_str'] + '\n Transcript: ' + val_df['transcript_text']
else:
    val_df['events_and_transcript'] =  val_df['transcript_text']

val_df['events_and_transcript'] =  val_df['transcript_text']

X_train = train_df['events_and_transcript'].tolist()
y_train = train_df['actions_pseudo_label'].tolist()

X_val = val_df['events_and_transcript'].tolist()
y_val = val_df['actions_pseudo_label'].tolist()

### create embeddings

In [197]:
# INFERENCE_COLLECTION = 'train'
INFERENCE_COLLECTION = 'val'

EMBEDDINGS_TASK = 'detector'
# EMBEDDINGS_TASK = 'validator'


In [198]:
# MODEL_SPORTSBERT_NAME = 'SportsBERT'
# MODEL_SPORTSBERT = f'microsoft/{MODEL_SPORTSBERT_NAME}'

MODEL_BGE_NAME = 'bge-small-en-v1.5'
MODEL_BGE = f'BAAI/{MODEL_BGE_NAME}'

# MODEL, REV = MODEL_SPORTSBERT, "refs/pr/4"
MODEL, MODEL_NAME, REV = MODEL_BGE, MODEL_BGE_NAME, None

In [199]:
def set_deterministic(s=42):
    os.environ["PYTHONHASHSEED"]=str(s); random.seed(s); np.random.seed(s); torch.manual_seed(s)
    try: torch.use_deterministic_algorithms(True)
    except: pass
set_deterministic(42)

def build_train_pairs(X,Y,A):
    T,A2,L=[],[],[]
    for t,ys in zip(X,Y):
        P=list(set(ys)); N=[a for a in A if a not in P]; k=max(len(ys),len(P))
        if N: N=rng.choice(N,size=min(k,len(N)),replace=False).tolist()
        for a in P: T.append(PREFIX + t); A2.append(H(a)); L.append(1.)
        for a in N: T.append(PREFIX + t); A2.append(H(a)); L.append(0.)
    return T,A2,torch.tensor(L,dtype=torch.float16)

def build_val_pairs(X,Y,A):
    T,A2,L=[],[],[]
    for t,ys in zip(X,Y):
        S=set(ys)
        for a in A:
            T.append(PREFIX + t)
            A2.append(H(a))
            L.append(1. if a in S else 0.)
    return T,A2,torch.tensor(L,dtype=torch.float16)


class Pairs(torch.utils.data.Dataset):
    def __init__(self,tok,T,A,L,m=160): self.tok,tok.model_max_length,self.T,self.A,self.L,self.m=tok,m,T,A,L,m
    def __len__(self): return len(self.T)
    def __getitem__(self,i):
        enc=self.tok(self.T[i], self.A[i], truncation=True, padding="max_length", max_length=self.m)
        # --- drop token_type_ids for single-segment models ---
        enc.pop("token_type_ids", None)  # <<< changed
        enc={k:torch.tensor(v) for k,v in enc.items()}; enc["labels"]=self.L[i]; return enc

In [200]:
DEVICE = 'cpu'

if EMBEDDINGS_TASK == 'detector':
    if MODEL == MODEL_BGE:
        H = lambda action: f". This commentary mentioned {action}."
        PREFIX = 'Represent this sentence for classification: '
    if MODEL == MODEL_SPORTSBERT:
        H = lambda action: f"This commentary mentions {action}."
        PREFIX = ''
    
    H = lambda action: f"This commentary mentions {action}."
elif EMBEDDINGS_TASK == 'validator':
    H = lambda action: f"The {action} executed in this current play."


tok = AutoTokenizer.from_pretrained(MODEL, revision=REV, use_fast=True)
enc = AutoModel.from_pretrained(MODEL, revision=REV)
enc.eval()


Ttr,Atr,Ltr=build_train_pairs(X_train,y_train,ACTIONS)
Tva,Ava,Lva=build_val_pairs(X_val,y_val,ACTIONS)
ds_tr,ds_va=Pairs(tok,Ttr,Atr,Ltr),Pairs(tok,Tva,Ava,Lva)
print('Train trainscripts data', len(X_train))
print('Val trainscripts data', len(X_val))
print('Train data points', len(Ttr))
print('Val data points', len(Tva))

if INFERENCE_COLLECTION =='train':
    x_text = Ttr
    x_acts = Atr
    X_binary_label = Ltr

if INFERENCE_COLLECTION =='val':
    x_text = Tva
    x_acts = Ava
    X_binary_label = Lva
print(len(x_text))

print(INFERENCE_COLLECTION)


def get_mean_pooled(out, encoded_input, model_type=MODEL):
    if model_type == MODEL_SPORTSBERT:
        attn_mask = encoded_input['attention_mask'].unsqueeze(-1)
        hidden_attention = (out.last_hidden_state * attn_mask)
    else:
        hidden_attention = out.last_hidden_state
        attn_mask = encoded_input['attention_mask'].unsqueeze(-1)
    sum_hidden = hidden_attention.sum(1)
    lens = attn_mask.sum(1).clamp(min=1)
    mean_pooled = sum_hidden / lens
    return mean_pooled

mean_pooling_embs = []
cls_embs = []
bs = 256
for i in tqdm(range(0, len(x_text), bs)):
    
    print(f"{(i/len(x_text)*100):.1f}%")
    t_batch = x_text[i:i+bs]
    a_batch = x_acts[i:i+bs]
    
    encoded_input = tok(t_batch, a_batch, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = enc(**encoded_input)
        mean_pooled = get_mean_pooled(model_output, encoded_input, MODEL)
        cls = model_output.last_hidden_state[:,0]
        
    mean_pooled = torch.nn.functional.normalize(mean_pooled, p=2, dim=1)
    cls = torch.nn.functional.normalize(cls, p=2, dim=1)
        
    mean_pooling_embs.append(mean_pooled.detach().cpu())
    cls_embs.append(cls.detach().cpu())
cls_embs = torch.cat(cls_embs)
mean_pooling_embs = torch.cat(mean_pooling_embs)



Train trainscripts data 222
Val trainscripts data 385
Train data points 450
Val data points 14630
14630
val


  0%|          | 0/58 [00:00<?, ?it/s]

0.0%


  2%|▏         | 1/58 [00:02<02:01,  2.14s/it]

1.7%


  3%|▎         | 2/58 [00:04<02:06,  2.25s/it]

3.5%


  5%|▌         | 3/58 [00:07<02:11,  2.40s/it]

5.2%


  7%|▋         | 4/58 [00:09<02:14,  2.48s/it]

7.0%


  9%|▊         | 5/58 [00:12<02:09,  2.44s/it]

8.7%


 10%|█         | 6/58 [00:14<02:01,  2.34s/it]

10.5%


 12%|█▏        | 7/58 [00:16<01:54,  2.24s/it]

12.2%


 14%|█▍        | 8/58 [00:18<01:57,  2.34s/it]

14.0%


 16%|█▌        | 9/58 [00:21<01:59,  2.44s/it]

15.7%


 17%|█▋        | 10/58 [00:23<01:56,  2.42s/it]

17.5%


 19%|█▉        | 11/58 [00:26<01:54,  2.44s/it]

19.2%


 21%|██        | 12/58 [00:29<02:06,  2.74s/it]

21.0%


 22%|██▏       | 13/58 [00:33<02:13,  2.96s/it]

22.7%


 24%|██▍       | 14/58 [00:35<02:04,  2.83s/it]

24.5%


 26%|██▌       | 15/58 [00:38<02:05,  2.91s/it]

26.2%


 28%|██▊       | 16/58 [00:41<01:56,  2.77s/it]

28.0%


 29%|██▉       | 17/58 [00:43<01:49,  2.67s/it]

29.7%


 31%|███       | 18/58 [00:45<01:36,  2.40s/it]

31.5%


 33%|███▎      | 19/58 [00:47<01:33,  2.41s/it]

33.2%


 34%|███▍      | 20/58 [00:49<01:28,  2.32s/it]

35.0%


 36%|███▌      | 21/58 [00:52<01:27,  2.36s/it]

36.7%


 38%|███▊      | 22/58 [00:55<01:30,  2.52s/it]

38.5%


 40%|███▉      | 23/58 [00:57<01:27,  2.49s/it]

40.2%


 41%|████▏     | 24/58 [01:00<01:24,  2.48s/it]

42.0%


 43%|████▎     | 25/58 [01:02<01:19,  2.40s/it]

43.7%


 45%|████▍     | 26/58 [01:04<01:13,  2.31s/it]

45.5%


 47%|████▋     | 27/58 [01:06<01:07,  2.18s/it]

47.2%


 48%|████▊     | 28/58 [01:08<01:04,  2.16s/it]

49.0%


 50%|█████     | 29/58 [01:11<01:07,  2.34s/it]

50.7%


 52%|█████▏    | 30/58 [01:13<01:07,  2.39s/it]

52.5%


 53%|█████▎    | 31/58 [01:15<00:59,  2.22s/it]

54.2%


 55%|█████▌    | 32/58 [01:17<00:58,  2.26s/it]

56.0%


 57%|█████▋    | 33/58 [01:21<01:03,  2.54s/it]

57.7%


 59%|█████▊    | 34/58 [01:23<00:56,  2.37s/it]

59.5%


 60%|██████    | 35/58 [01:24<00:50,  2.18s/it]

61.2%


 62%|██████▏   | 36/58 [01:27<00:48,  2.19s/it]

63.0%


 64%|██████▍   | 37/58 [01:29<00:47,  2.25s/it]

64.7%


 66%|██████▌   | 38/58 [01:31<00:43,  2.18s/it]

66.5%


 67%|██████▋   | 39/58 [01:33<00:40,  2.15s/it]

68.2%


 69%|██████▉   | 40/58 [01:35<00:36,  2.04s/it]

70.0%


 71%|███████   | 41/58 [01:37<00:34,  2.03s/it]

71.7%


 72%|███████▏  | 42/58 [01:40<00:35,  2.23s/it]

73.5%


 74%|███████▍  | 43/58 [01:42<00:33,  2.23s/it]

75.2%


 76%|███████▌  | 44/58 [01:44<00:31,  2.23s/it]

77.0%


 78%|███████▊  | 45/58 [01:47<00:30,  2.38s/it]

78.7%


 79%|███████▉  | 46/58 [01:49<00:28,  2.39s/it]

80.5%


 81%|████████  | 47/58 [01:53<00:29,  2.69s/it]

82.2%


 83%|████████▎ | 48/58 [01:55<00:27,  2.74s/it]

84.0%


 84%|████████▍ | 49/58 [01:58<00:24,  2.72s/it]

85.7%


 86%|████████▌ | 50/58 [02:02<00:23,  2.97s/it]

87.5%


 88%|████████▊ | 51/58 [02:05<00:22,  3.23s/it]

89.2%


 90%|████████▉ | 52/58 [02:08<00:17,  2.97s/it]

91.0%


 91%|█████████▏| 53/58 [02:10<00:14,  2.81s/it]

92.7%


 93%|█████████▎| 54/58 [02:13<00:10,  2.70s/it]

94.5%


 95%|█████████▍| 55/58 [02:16<00:08,  2.76s/it]

96.2%


 97%|█████████▋| 56/58 [02:18<00:05,  2.71s/it]

98.0%


 98%|█████████▊| 57/58 [02:21<00:02,  2.78s/it]

99.7%


100%|██████████| 58/58 [02:22<00:00,  2.45s/it]


In [201]:
out_dir = f'data/embds_{MODEL_NAME}'
out_path = out_dir, f"{INFERENCE_COLLECTION}_embds_{EMBEDDINGS_TASK}.pt"
out_path

('data/embds_bge-small-en-v1.5', 'val_embds_detector.pt')

In [None]:
WRITE = True


def save_embeddings(INFERENCE_COLLECTION, x_acts, cls_embs, mean_pooling_embs):

    out_dir = f'data/embds_{MODEL_NAME}'
    out_path = os.path.join(out_dir, f"{INFERENCE_COLLECTION}_embds_{EMBEDDINGS_TASK}.pt")
    # Ensure the output directory exists
    
    os.makedirs(out_dir, exist_ok=True)

    # Prepare data to save
    embd_data = {
        "INFERENCE_COLLECTION": INFERENCE_COLLECTION,
        "actions_labels": [s.replace("This commentary mentions ", "").rstrip(".") for s in x_acts],
        "cls_embs": cls_embs.cpu(),  # ensure on CPU
        "mean_pooling_embs": mean_pooling_embs.cpu(),
        "sample_label": X_binary_label.cpu()


    }

    # Save as .pt (PyTorch native)
    torch.save(embd_data, out_path)

    # For pickle, better to convert to numpy for portability
    embd_data_np = {**embd_data,
                    "cls_embs": cls_embs.cpu().numpy(),
                    "mean_pooling_embs": mean_pooling_embs.cpu().numpy()}

    with open(os.path.join(out_dir, f"{INFERENCE_COLLECTION}_embds_{EMBEDDINGS_TASK}.pkl"), "wb") as f:
        pickle.dump(embd_data_np, f)

if WRITE:
    save_embeddings(INFERENCE_COLLECTION, x_acts, cls_embs, mean_pooling_embs)

data/embds_bge-small-en-v1.5/val_embds_detector.pt


In [206]:
def load_embeddings(INFERENCE_COLLECTION, EMBEDDINGS_TASK):
    out_dir = f'data/embds_{MODEL_NAME}'
    out_path = os.path.join(out_dir, f"{INFERENCE_COLLECTION}_embds_{EMBEDDINGS_TASK}.pt")
    return torch.load(out_path)

### predict

In [207]:
train_embeddings = load_embeddings('train', EMBEDDINGS_TASK)
val_embeddings = load_embeddings('val', EMBEDDINGS_TASK)

In [211]:
# EMBDS_COL = 'cls_embs'
EMBDS_COL = 'mean_pooling_embs'
# EMBDS_COL = 'both'

train_binary_labels = train_embeddings['sample_label']
if EMBDS_COL == 'both':
    train_embds = np.concatenate(
        [train_embeddings['cls_embs'], train_embeddings['mean_pooling_embs']], axis=1
    )
    test_embds = np.concatenate(
        [val_embeddings['cls_embs'], val_embeddings['mean_pooling_embs']], axis=1
    )
else:
    train_embds = train_embeddings[EMBDS_COL]
    test_embds = val_embeddings[EMBDS_COL]

test_binary_labels = val_embeddings['sample_label']

train_actions_labels = train_embeddings['actions_labels']
test_actions_labels = val_embeddings['actions_labels']


In [214]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score

def embd_action_prediction(train_embds, train_binary_labels, test_embds, predict_threshold=0.7, C=1.0):
    # Explicitly set penalty to 'l2' for L2 regularization
    
    pca = PCA(n_components=min(100, train_embds.shape[1]))
    train_embds = pca.fit_transform(train_embds)
    test_embds = pca.transform(test_embds)
    # clf = LGBMClassifier(
    #     n_estimators=50,
    #     max_depth=4,
    #     num_threads=1,
    #     random_state=42,
    #     verbose=-1,
    #     reg_alpha=5.0,   # Increased L1 regularization
    #     reg_lambda=5.0,  # Increased L2 regularization
    #     subsample=0.2    # Subsample 70% of data for each tree
    # )
    clf = LogisticRegression(penalty='l2', C=0.01, solver='liblinear', random_state=42)  # Increased regularization (smaller C)
    clf.fit(train_embds, train_binary_labels)
    train_proba = clf.predict_proba(train_embds)[:,1]
    test_proba = clf.predict_proba(test_embds)[:,1]

    train_preds = (train_proba > predict_threshold).astype(int)
    test_preds = (test_proba > predict_threshold).astype(int)
    return train_preds, test_preds



from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(np.concatenate([train_embds, test_embds], axis=0))
train_embds = scaler.transform(train_embds)
test_embds = scaler.transform(test_embds)

train_preds, test_preds = embd_action_prediction(train_embds, train_binary_labels, test_embds)


precision_train = precision_score(train_binary_labels, train_preds)
recall_train = recall_score(train_binary_labels, train_preds)
f1_train = f1_score(train_binary_labels, train_preds)
print('train', round(precision_train, 3), round(recall_train, 3), round(f1_train, 3))
print('Train metrics: \t\t Precision: {} \t Recall: {} \t F1: {}'.format(round(precision_train, 3), round(recall_train, 3), round(f1_train, 3)))

# np.random.shuffle(test_binary_labels)
precision_test = precision_score(test_binary_labels, test_preds)
recall_test = recall_score(test_binary_labels, test_preds)
f1_test = f1_score(test_binary_labels, test_preds)
print('Validation metrics: \t Precision: {} \t Recall: {} \t F1: {}'.format(round(precision_test, 3), round(recall_test, 3), round(f1_test, 3)))



# Calculate precision and recall per action label in train set
def per_action_metrics(actions_labels, binary_labels, preds):
    actions = set(actions_labels)
    action_metrics = {}
    for action in actions:
        mask = [a == action for a in actions_labels]
        y_true = [binary_labels[i] for i, m in enumerate(mask) if m]
        y_pred = [preds[i] for i, m in enumerate(mask) if m]
        if len(y_true) == 0:
            continue
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)
        action_metrics[action] = (round(prec, 3), round(rec, 3))
    for action, (prec, rec) in action_metrics.items():
        print(f"Action: {action:20s} Precision: {prec:.3f} Recall: {rec:.3f}")

# per_action_metrics(train_actions_labels, train_binary_labels, train_preds)
# per_action_metrics(test_actions_labels, test_binary_labels, test_preds)






train 0.953 0.547 0.695
Train metrics: 		 Precision: 0.953 	 Recall: 0.547 	 F1: 0.695
Validation metrics: 	 Precision: 0.235 	 Recall: 0.518 	 F1: 0.323


In [88]:
print('SportsBERT Action detection results')
print("""
train 0.858 0.864 0.861
Train metrics: 		 Precision: 0.858 	 Recall: 0.864 	 F1: 0.861
Validation metrics: 	 Precision: 0.102 	 Recall: 0.81 	 F1: 0.181
""")

SportsBERT Action detection results

train 0.858 0.864 0.861
Train metrics: 		 Precision: 0.858 	 Recall: 0.864 	 F1: 0.861
Validation metrics: 	 Precision: 0.102 	 Recall: 0.81 	 F1: 0.181

