# E2E Runner

In [1]:
# Standard library imports
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pandas as pd
import ast
import spacy
import json

# NLP imports
import nltk

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import FunctionTransformer
import pickle

In [2]:
# Load the raw transcripts, drop exact duplicate rows, and give every remaining
# row a 1-based `sample_id` as the first column.
transcripts_df = pd.read_csv('data/transcripts.csv').drop_duplicates(ignore_index=True)
transcripts_df.insert(0, 'sample_id', range(1, len(transcripts_df) + 1))

# Unique values of the `parameter` column, in first-seen file order.
# A plain `list(set(...))` yields an order that depends on string hash
# randomization, so it can differ between kernel restarts; code that later picks
# "the first matching action" would then be non-reproducible.
ACTIONS = pd.read_csv("data/actions.csv")["parameter"].drop_duplicates().tolist()

In [3]:

# Pseudo-label table: only the sample id and the detected-action column are kept.
pseudo_df = pd.read_csv(
    "data/pseudo_actions_labels_with_id.csv"
)[['sample_id', 'action_detected']]

# Augmented-text table: sample id, raw and tokenized augmented text, and the
# action attached to each augmented row.
augmented_texts_df = pd.read_csv(
    'data/augmented_texts_processed.csv'
)[[
    'sample_id',
    'augmented_text',
    'tokenized_augmented_text',
    'action',
]]

# preprocess

In [4]:
# Event-type names recognised inside an `EventName` string.
EVENT_TYPES = [
    'Missed Shot', '2-pointer', '3-pointer', 'Turnover',
    'Rebound', 'Dunk', 'Foul', 'Assist',
    'Steal', 'Jump Ball', '2-pts Made', 'FT-Made',
    '3-pts Made', 'Quarter End', 'FT-Missed', 'Block',
]

# Regex alternation of the escaped names, longest first so that a longer name
# is tried before any shorter alternative.
_event_alternation = "|".join(
    re.escape(name) for name in sorted(EVENT_TYPES, key=len, reverse=True)
)

# "<event type> by " anywhere in a string (case-insensitive); used to locate
# where each event starts.
event_types_pattern = rf"(?i)({_event_alternation})\s+by\s+"
START = re.compile(event_types_pattern)

# A whole "<event type> by <player>" string, with named groups `a` (event type)
# and `p` (player).
PAIR = re.compile(rf"(?i)^\s*(?P<a>{_event_alternation})\s+by\s+(?P<p>.+?)\s*$")

def split_events(s):
    """Split a concatenated event string into one string per event.

    Whitespace runs are collapsed to single spaces first. Each event begins
    where ``START`` matches ("<event type> by "); any text before the first
    match is not returned.

    Args:
        s: Raw event string, or a missing value (``None`` / float NaN).

    Returns:
        list[str]: One stripped segment per ``START`` match. If nothing
        matches, the whole normalized string as a single item, or ``[]`` when
        the input is empty or missing.
    """
    # pandas represents a missing cell as float NaN, which is truthy: `s or ""`
    # would let it through to re.sub and raise TypeError.
    if s is None or (isinstance(s, float) and s != s):
        return []

    s = re.sub(r"\s+", " ", s or "").strip()
    starts = [m.start() for m in START.finditer(s)]
    if not starts:
        return [s] if s else []

    # Sentinel end offset so consecutive pairs delimit every segment.
    starts.append(len(s))
    # `s` is already whitespace-normalized, so stripping each slice is enough.
    return [s[begin:end].strip() for begin, end in zip(starts, starts[1:])]

def split_action_player(e):
    """Separate one "<event type> by <player>" string into its two parts.

    Args:
        e: A single event string.

    Returns:
        tuple: ``(event_type, player)``, both stripped, when the
        whitespace-normalized string matches ``PAIR``; otherwise
        ``(None, None)``.
    """
    normalized = re.sub(r"\s+", " ", e).strip()
    match = PAIR.match(normalized)
    if match is None:
        return (None, None)
    event_type = match.group("a").strip()
    player = match.group("p").strip()
    return (event_type, player)

def clean_events(transcripts_df):
    """Derive per-event columns from the ``EventName`` column.

    Works on a copy; the input frame is not modified.

    Args:
        transcripts_df: DataFrame with an ``EventName`` column.

    Returns:
        pandas.DataFrame: The copy with three added columns:
        ``players`` (list of player strings, one per split event),
        ``events`` (list of event-type strings, one per split event), and
        ``EventNameCleaned`` (the split events joined with ", ").
        Entries in ``players`` / ``events`` are ``None`` for segments that
        ``split_action_player`` could not parse.
    """
    transcripts_df = transcripts_df.copy()

    # Temporary column holding the list of individual event strings per row.
    transcripts_df["event_list"] = transcripts_df["EventName"].apply(split_events)
    events_players = transcripts_df["event_list"].apply(
        lambda lst: [split_action_player(e) for e in lst]
    )

    # Each parsed item is an (event_type, player) pair.
    transcripts_df["players"] = events_players.apply(lambda lst: [ap[1] for ap in lst])
    transcripts_df["events"] = events_players.apply(lambda lst: [ap[0] for ap in lst])
    transcripts_df["EventNameCleaned"] = transcripts_df["event_list"].apply(lambda x: ', '.join(x))
    return transcripts_df.drop(columns=['event_list'])


# spaCy pipeline used for lemmatization, plus its default stop-word set.
nlp = spacy.load("en_core_web_lg")
STOPWORDS = nlp.Defaults.stop_words

# Candidate temporal vocabulary (connectives, time references, ordering terms).
temporal_candidates = {
    # basic temporal connectives
    "after", "before", "until", "till", "since", "when", "while", "once",
    "then", "later", "earlier", "eventually", "soon", "previously",
    "recently", "now",
    # specific time references
    "today", "tomorrow", "yesterday", "tonight",
    "morning", "afternoon", "evening",
    "day", "week", "month", "year", "season", "period", "half", "quarter",
    # sequence/order terms
    "final", "first", "second", "third", "last", "next",
}

# Candidate negation vocabulary.
negation_candidates = {
    "no", "not", "n't", "never", "cannot", "can't",
    "nobody", "none", "nothing", "nowhere",
    "neither", "nor", "without", "minus",
}

# Only the candidates that spaCy actually treats as stop words, as sorted lists.
TEMPORAL_STOPWORDS = sorted(temporal_candidates.intersection(STOPWORDS))
NEGATION_STOPWORDS = sorted(negation_candidates.intersection(STOPWORDS))



def preprocess_text(text):
    """Return the lower-cased lemmas of `text`, skipping punctuation and whitespace tokens.

    The spaCy pipeline is run with the "parser" and "ner" components disabled.
    """
    doc = nlp(text, disable=["parser", "ner"])
    lemmas = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

def lemmatize(text, phrases_patterns):
    """Tokenize `text`, merge known multi-word phrases, and drop stop words.

    Args:
        text: Raw text to process.
        phrases_patterns: Iterable of phrase strings whose words are joined by
            "_" (e.g. "a_b"). Patterns with a single word are ignored.

    Returns:
        list[str]: Lower-cased lemmas in which, for each multi-word pattern,
        the FIRST run of tokens equal to the pattern's words is replaced by the
        pattern itself (later occurrences of the same pattern are left as
        separate tokens). Tokens flagged as stop words by ``nlp.vocab`` are
        removed unless they appear in ``TEMPORAL_STOPWORDS`` or
        ``NEGATION_STOPWORDS``.
    """
    result = preprocess_text(text)

    for pattern in phrases_patterns:
        parts = pattern.split('_')
        width = len(parts)
        if width < 2:
            continue
        # The range is fixed before any replacement; we stop at the first hit,
        # so the in-place shrink below never invalidates the indices used.
        for i in range(len(result) - width + 1):
            if result[i:i + width] == parts:
                result[i:i + width] = [pattern]
                break

    # Build the keep-set once instead of concatenating and scanning the two
    # lists for every token.
    keep = frozenset(TEMPORAL_STOPWORDS).union(NEGATION_STOPWORDS)
    return [t for t in result if not nlp.vocab[t].is_stop or t in keep]


In [5]:
# Distinct event-type strings found in the transcripts (unparsed `None`
# entries skipped). Sorted so the list order is the same on every run; the
# order of a bare `set` depends on string hash randomization, and
# `phrases_patterns` is order-sensitive because phrases are merged one by one.
EVENTS = sorted({
    event
    for sublist in clean_events(transcripts_df)['events']
    for event in sublist
    if event is not None
})

# Underscore-joined lemma form of every action and event name.
ACTIONS_PROCESSED = ['_'.join(preprocess_text(action)) for action in ACTIONS]
EVENTS_PROCESSED = ['_'.join(preprocess_text(event)) for event in EVENTS]

# Phrase patterns passed to `lemmatize` so multi-word names become one token.
phrases_patterns = ACTIONS_PROCESSED + EVENTS_PROCESSED

def extract_actions_from_lemmatized(lemmatized_tokens):
    """Return the entries of ``ACTIONS_PROCESSED`` contained in `lemmatized_tokens`.

    The result follows the order of ``ACTIONS_PROCESSED``, not the order in
    which the actions occur in the tokens.
    """
    found = []
    for candidate in ACTIONS_PROCESSED:
        if candidate in lemmatized_tokens:
            found.append(candidate)
    return found

# Detector code

In [14]:

def concat_augmentations_to_fold_df(transcripts_df):
    """Append the augmented rows that belong to the samples in `transcripts_df`.

    Rows of the module-level ``augmented_texts_df`` whose ``sample_id`` occurs
    in `transcripts_df` are renamed to the training column names, their single
    action is wrapped in a one-element list, and the remaining columns are
    filled in from the matching original row via an inner merge on
    ``sample_id``.

    Args:
        transcripts_df: Frame with at least ``sample_id``,
            ``actions_pseudo_label``, ``transcript_text`` and
            ``tokenized_text`` columns.

    Returns:
        pandas.DataFrame: `transcripts_df` followed by the augmented rows,
        with a fresh 0..n-1 index.
    """
    column_renames = {
        'augmented_text': 'transcript_text',
        'action': 'actions_pseudo_label',
        'tokenized_augmented_text': 'tokenized_text',
    }
    in_fold = augmented_texts_df['sample_id'].isin(transcripts_df['sample_id'])

    augmented_rows = (
        augmented_texts_df[in_fold]
        .rename(columns=column_renames)
        # Wrap the single action so it has the same list shape as the originals.
        .assign(actions_pseudo_label=lambda frame: frame['actions_pseudo_label'].apply(lambda x: [x]))
    )

    # Everything except the three columns the augmented rows supply themselves.
    original_context = transcripts_df.drop(
        columns=['actions_pseudo_label', 'transcript_text', 'tokenized_text']
    )
    augmented_rows = augmented_rows.merge(original_context, on='sample_id', how='inner')

    return pd.concat([transcripts_df, augmented_rows], ignore_index=True)


def tfidf_detector_train(**kwargs):
    """Fit the multi-label TF-IDF + logistic-regression action detector.

    Keyword Args:
        train_df: DataFrame with ``actions_pseudo_label``, ``tokenized_text``
            and ``events`` columns. Two helper columns,
            ``tokenized_text_str`` and ``tokenized_events_str``, are added to
            this frame in place.

    Returns:
        tuple: ``(pipeline, mlb)`` - the fitted scikit-learn pipeline and the
        fitted ``MultiLabelBinarizer`` used to encode the labels.
    """
    def _as_sequence(value):
        # Cells may hold a Python literal as text or an already-built sequence.
        return ast.literal_eval(value) if isinstance(value, str) else value

    train_df = kwargs['train_df']

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_df['actions_pseudo_label'])

    # Space-joined strings are what the TF-IDF vectorizers consume.
    train_df['tokenized_text_str'] = train_df['tokenized_text'].apply(
        lambda tokens: ' '.join(_as_sequence(tokens))
    )
    train_df['tokenized_events_str'] = train_df['events'].apply(
        lambda events: ' '.join([(name or '') for name in _as_sequence(events)])
    )

    # Column selectors feeding each feature branch.
    get_text = FunctionTransformer(lambda X: X['tokenized_text_str'], validate=False)
    get_events = FunctionTransformer(lambda X: X['tokenized_events_str'], validate=False)

    word_text_branch = Pipeline([
        ('sel', get_text),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), analyzer='word', sublinear_tf=True)),
    ])
    char_text_branch = Pipeline([
        ('sel', get_text),
        ('tfidf', TfidfVectorizer(ngram_range=(2, 3), analyzer='char', sublinear_tf=True)),
    ])
    word_events_branch = Pipeline([
        ('sel', get_events),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), analyzer='word', sublinear_tf=True)),
    ])
    features = FeatureUnion([
        ('word_tfidf_text', word_text_branch),
        ('char_tfidf_text', char_text_branch),
        ('word_tfidf_events', word_events_branch),
    ])

    # One balanced logistic regression per label.
    base_estimator = LogisticRegression(
        class_weight='balanced', solver='liblinear', max_iter=2000, C=0.5
    )
    pipeline = Pipeline([
        ('features', features),
        ('classifier', MultiOutputClassifier(base_estimator)),
    ])

    pipeline.fit(train_df[['tokenized_text_str', 'tokenized_events_str']], y)
    return pipeline, mlb


def tfidf_detector_predict(pipeline, mlb, test_df, predict_threshold=0.6):
    """Predict at most one action label per row.

    For every row only the label with the highest positive-class probability
    is considered, and it is kept only if that probability is strictly greater
    than `predict_threshold`.

    Args:
        pipeline: Fitted model whose ``predict_proba`` returns one
            ``(n_rows, 2)`` array per label.
        mlb: Fitted binarizer providing ``inverse_transform``.
        test_df: Input passed unchanged to ``pipeline.predict_proba``.
        predict_threshold: Minimum (exclusive) probability for a label to be
            emitted.

    Returns:
        list: One label per row, or ``None`` where no label passed the
        threshold.
    """
    per_label_proba = pipeline.predict_proba(test_df)
    # Column j holds P(label j is present) for every row.
    positive = np.column_stack([proba[:, 1] for proba in per_label_proba])

    rows = np.arange(positive.shape[0])
    best = positive.argmax(axis=1)

    # One-hot indicator: only the row-wise best label can be switched on.
    indicator = np.zeros(positive.shape, dtype=int)
    indicator[rows, best] = positive[rows, best] > predict_threshold

    labels = []
    for picked in mlb.inverse_transform(indicator):
        labels.append(picked[0] if any(picked) else None)
    return labels

def string_matching_prediction(**kwargs):
    """Return the ``actions_str_detected`` column of ``test_df`` as a plain list.

    Keyword Args:
        test_df: DataFrame containing an ``actions_str_detected`` column.
    """
    detected_column = kwargs['test_df']['actions_str_detected']
    return detected_column.tolist()


class Detector:
    """Predicts the action referred to by each transcript.

    Two signals are combined at prediction time: actions found directly in the
    lemmatized text (string matching) and a TF-IDF / logistic-regression model
    trained on pseudo-labels. The string match wins when it finds something;
    otherwise the model's prediction is used.

    Attributes set by ``fit``:
        inverse_actions_map: The JSON mapping from
            ``data/actions_processed_to_action.json`` with keys and values
            swapped.
        tfidf_pipeline: Fitted scikit-learn pipeline.
        mlb: Fitted ``MultiLabelBinarizer`` for the action labels.
    """

    def prepare_df(self, transcripts_df):
        """Add event, token and string-matched action columns.

        Args:
            transcripts_df: Frame with ``EventName`` and ``Text`` columns.

        Returns:
            pandas.DataFrame: Output of ``clean_events`` plus
            ``tokenized_event_name``, ``tokenized_text`` and
            ``actions_in_text``.
        """
        transcripts_df = clean_events(transcripts_df)
        transcripts_df['tokenized_event_name'] = transcripts_df['EventNameCleaned'].apply(lambda x: lemmatize(x, phrases_patterns))
        transcripts_df['tokenized_text'] = transcripts_df['Text'].apply(lambda x: lemmatize(x, phrases_patterns))
        transcripts_df['actions_in_text'] = transcripts_df['tokenized_text'].apply(extract_actions_from_lemmatized)
        return transcripts_df


    def fit(self, transcripts_df):
        """Train the TF-IDF detector on pseudo-labelled and augmented rows.

        Args:
            transcripts_df: Frame with ``sample_id``, ``EventName`` and
                ``Text`` columns. It is copied, not modified.

        Side effects:
            Reads ``data/actions_processed_to_action.json`` and sets
            ``inverse_actions_map``, ``tfidf_pipeline`` and ``mlb``.
        """
        transcripts_df = transcripts_df.copy()
        with open('data/actions_processed_to_action.json', 'r') as f:
            actions_map = json.load(f)
        self.inverse_actions_map = {v: k for k, v in actions_map.items()}

        transcripts_df = self.prepare_df(transcripts_df)
        # Only samples that have a pseudo-label are kept (inner merge).
        transcripts_df = pseudo_df.merge(transcripts_df, on='sample_id', how='inner').rename(columns={'action_detected': 'actions_pseudo_label', 'actions_in_text': 'actions_str_detected', 'Text': 'transcript_text'})

        # Parse the label cell into a real list IN PLACE. The training code
        # binarizes `actions_pseudo_label`; if that column were left as the raw
        # text of a list, MultiLabelBinarizer would iterate the string and
        # treat every character as a separate label.
        transcripts_df['actions_pseudo_label'] = transcripts_df['actions_pseudo_label'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        transcripts_df['actions_str_detected'] = transcripts_df['actions_str_detected'].apply(
            lambda x: [self.inverse_actions_map.get(a, a) for a in (ast.literal_eval(x) if isinstance(x, str) else x)])
        transcripts_df = concat_augmentations_to_fold_df(transcripts_df)
        self.tfidf_pipeline, self.mlb = tfidf_detector_train(train_df=transcripts_df)

    def predict(self, transcripts_df):
        """Predict one action per row. Requires a prior call to ``fit``.

        Args:
            transcripts_df: Frame with ``EventName`` and ``Text`` columns. It
                is copied, not modified.

        Returns:
            pandas.DataFrame: The prepared frame with, among the helper
            columns, ``action_predicted`` (string-matched action if any, else
            the TF-IDF prediction, else ``None``) and
            ``action_predicted_tokenized`` (the same value passed back through
            the original, un-inverted JSON mapping).
        """
        transcripts_df = transcripts_df.copy()
        transcripts_df = self.prepare_df(transcripts_df)
        transcripts_df = transcripts_df.rename(columns={'actions_in_text': 'actions_str_detected'})

        # Same string features the detector pipeline was trained on.
        transcripts_df['tokenized_text_str'] = transcripts_df['tokenized_text'].apply(lambda x: ' '.join(ast.literal_eval(x)) if isinstance(x, str) else ' '.join(x))
        transcripts_df['tokenized_events_str'] = transcripts_df['events'].apply(lambda x: ' '.join([(i or '') for i in (ast.literal_eval(x) if isinstance(x, str) else x)]))
        transcripts_df['actions_str_detected'] = transcripts_df['actions_str_detected'].apply(
            lambda x: [self.inverse_actions_map.get(a, a) for a in (ast.literal_eval(x) if isinstance(x, str) else x)])

        predicted_labels_tfidf = tfidf_detector_predict(self.tfidf_pipeline, self.mlb, transcripts_df)

        # First string-matched action per row, or None when nothing matched.
        predicted_labels_string_match = transcripts_df['actions_str_detected'].tolist()
        predicted_labels_string_match = [p[0] if p else None for p in predicted_labels_string_match]

        # String match takes precedence; the model is the fallback.
        combined_labels = [
            t if s is None else s
            for s, t in zip(predicted_labels_string_match, predicted_labels_tfidf)]
        transcripts_df['action_predicted'] = combined_labels

        # Swapping keys and values again recovers the mapping as stored on disk.
        origin_mapping = {v: k for k, v in self.inverse_actions_map.items()}
        transcripts_df['action_predicted_tokenized'] = transcripts_df['action_predicted'].apply(lambda a: origin_mapping.get(a, a))
        return transcripts_df



In [7]:

def tfidf_validation_train(train_df):
    """Fit the binary TF-IDF + logistic-regression validity classifier.

    Args:
        train_df: DataFrame with ``Label`` (target), ``tokenized_text_str``
            and ``tokenized_events_str`` columns.

    Returns:
        The fitted scikit-learn ``Pipeline``.
    """
    # Column selectors feeding each feature branch.
    get_text = FunctionTransformer(lambda X: X['tokenized_text_str'], validate=False)
    get_events = FunctionTransformer(lambda X: X['tokenized_events_str'], validate=False)

    word_text_branch = Pipeline([
        ('sel', get_text),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), analyzer='word')),
    ])
    char_text_branch = Pipeline([
        ('sel', get_text),
        ('tfidf', TfidfVectorizer(ngram_range=(2, 4), analyzer='char')),
    ])
    word_events_branch = Pipeline([
        ('sel', get_events),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), analyzer='word')),
    ])
    features = FeatureUnion([
        ('word_tfidf_text', word_text_branch),
        ('char_tfidf_text', char_text_branch),
        ('word_tfidf_events', word_events_branch),
    ])

    classifier = LogisticRegression(
        class_weight='balanced', solver='liblinear', max_iter=2000, C=1
    )
    pipeline = Pipeline([
        ('features', features),
        ('classifier', classifier),
    ])

    targets = train_df['Label'].values
    inputs = train_df[['tokenized_text_str', 'tokenized_events_str']]
    pipeline.fit(inputs, targets)
    return pipeline


def tfidf_validator_predict(pipeline, test_df, predict_threshold=0.6):
    """Return 0/1 predictions from the validity classifier.

    Args:
        pipeline: Fitted model exposing ``predict_proba``.
        test_df: DataFrame with ``tokenized_text_str`` and
            ``tokenized_events_str`` columns; only these two are passed on.
        predict_threshold: A row is labelled 1 only when its positive-class
            probability is strictly greater than this value.

    Returns:
        numpy.ndarray: Integer array of 0s and 1s, one per row.
    """
    feature_columns = ['tokenized_text_str', 'tokenized_events_str']
    positive_proba = pipeline.predict_proba(test_df[feature_columns])[:, 1]
    is_valid = positive_proba > predict_threshold
    return is_valid.astype(int)

class Validator:
    """Binary classifier wrapper operating on tokenized text and event columns.

    ``fit`` sets ``tfidf_pipeline``, which ``predict`` then uses.
    """

    def prepare_df(self, transcripts_df):
        """Add the two string feature columns to `transcripts_df` in place.

        ``tokenized_text_str`` is the space-joined ``tokenized_text``.
        ``tokenized_events_str`` is the space-joined ``events``, where the
        words inside each event are linked with "_" and empty/``None`` events
        become empty strings. Cells holding a Python literal as text are
        parsed first.

        Returns:
            The same DataFrame object that was passed in.
        """
        def _as_sequence(cell):
            return ast.literal_eval(cell) if isinstance(cell, str) else cell

        def _join_tokens(cell):
            return ' '.join(_as_sequence(cell))

        def _join_events(cell):
            return ' '.join(
                ['_'.join(name.split()) if name else '' for name in _as_sequence(cell)]
            )

        transcripts_df['tokenized_text_str'] = transcripts_df['tokenized_text'].apply(_join_tokens)
        transcripts_df['tokenized_events_str'] = transcripts_df['events'].apply(_join_events)
        return transcripts_df

    def fit(self, transcripts_df):
        """Train the validity pipeline on a copy of `transcripts_df`."""
        prepared = self.prepare_df(transcripts_df.copy())
        self.tfidf_pipeline = tfidf_validation_train(train_df=prepared)

    def predict(self, transcripts_df):
        """Return the validity predictions for a copy of `transcripts_df`."""
        prepared = self.prepare_df(transcripts_df.copy())
        return tfidf_validator_predict(self.tfidf_pipeline, prepared)

## train classes

In [16]:

# Fit the action detector on the transcripts, then run it over the same
# transcripts so every row carries a predicted action and the string features.
detector_trainer = Detector()
detector_trainer.fit(transcripts_df)
transcripts_train = detector_trainer.predict(transcripts_df)

# The validator is trained on the detector's output frame.
validator_trainer = Validator()
validator_trainer.fit(transcripts_train)

## predict on new data

In [19]:
# Use the first row of the transcripts file, with its label removed, as a
# stand-in for unseen input.
new_transcripts_df = (
    pd.read_csv('data/transcripts.csv')
    .drop(columns='Label')
    .head(1)
)
display(new_transcripts_df)

# Stage 1: predict the action. Stage 2: predict whether it is valid.
new_transcripts_df = detector_trainer.predict(new_transcripts_df)
predicted_validation = validator_trainer.predict(new_transcripts_df)
new_transcripts_df['action_validity'] = predicted_validation

# Keep only the inputs and the two predictions for display.
new_transcripts_df = new_transcripts_df.loc[
    :, ['EventName', 'Text', 'action_predicted', 'action_validity']
]
display(new_transcripts_df)

Unnamed: 0,EventName,Text
0,Missed Shot by Darren CollisonRebound by Joel ...,If you go into that defensive circle and post ...


Unnamed: 0,EventName,Text,action_predicted,action_validity
0,Missed Shot by Darren CollisonRebound by Joel ...,If you go into that defensive circle and post ...,post up,0
