# WSC Project - Data Analysis & NLP

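This notebook loads the transcript and action data, splits each `EventName` into its events and players, lemmatizes the texts, and tags the actions mentioned in each transcript.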

In [16]:
# Standard library imports
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pandas as pd
import ast
import spacy
import json

# NLP imports
import nltk
from transformers import pipeline, AutoTokenizer, AutoModel
import torch

# Jupyter settings
%matplotlib inline
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display settings for full text
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

# Jupyter settings
%matplotlib inline
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

In [17]:
# Read the two raw input tables (transcripts first, then actions).
transcripts_df, actions_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/transcripts.csv', 'data/actions.csv')
)

## EventName preprocessing + adding `sample_id`



In [18]:

# Drop exact duplicate rows and renumber the index from 0.
transcripts_df = transcripts_df.drop_duplicates(ignore_index=True)

# Quick look at the raw labels: whatever precedes the first "by" substring of
# each EventName is counted as that row's event type.
event_split = transcripts_df['EventName'].str.split('by')
event_types = event_split.str[0].str.strip()
countOccurences = event_types.value_counts().to_dict()
count_df = pd.DataFrame(
    {
        'EventType': list(countOccurences.keys()),
        'Count': list(countOccurences.values()),
    }
)
display(count_df)


# Work on a copy so the deduplicated transcripts stay untouched.
events_df = transcripts_df.copy()

# Event labels recognised inside the raw EventName strings.
EVENT_TYPES = ['Missed Shot', '2-pointer', '3-pointer', 'Turnover', 'Rebound', 'Dunk', 'Foul', 'Assist', 'Steal', 'Jump Ball', '2-pts Made', 'FT-Made', '3-pts Made', 'Quarter End', 'FT-Missed', 'Block']

# Regex alternation of the escaped labels, longest label first. Built once and
# shared by both patterns below so they can never drift apart.
_EVENT_ALTERNATION = "|".join(map(re.escape, sorted(EVENT_TYPES, key=len, reverse=True)))

# START finds "<event> by " anywhere in a string (case-insensitive).
event_types_pattern = r"(?i)(" + _EVENT_ALTERNATION + r")\s+by\s+"
START = re.compile(event_types_pattern)
# PAIR matches one whole "<event> by <player>" chunk: group "a" is the event
# label, group "p" is the player text.
PAIR  = re.compile(r"(?i)^\s*(?P<a>" + _EVENT_ALTERNATION + r")\s+by\s+(?P<p>.+?)\s*$")

def split_events(s):
    """Split a raw event string into one "<event> by <player>" chunk per event.

    Whitespace runs are collapsed to single spaces first. Each chunk starts
    where START matches and runs up to the next match (or the end of the
    string), so text preceding the first recognised event is not returned.

    Args:
        s: Raw event string. Non-string values (None, float NaN from a missing
            CSV cell, ...) are treated as empty.

    Returns:
        A list of chunk strings; ``[s]`` (normalized) when no known event is
        found; ``[]`` for empty or non-string input.
    """
    if not isinstance(s, str):
        # NaN is truthy, so an ``s or ""`` guard lets it through to re.sub,
        # which then raises TypeError. Reject every non-string up front.
        return []
    s = re.sub(r"\s+", " ", s).strip()
    if not s:
        return []
    starts = [m.start() for m in START.finditer(s)]
    if not starts:
        return [s]
    starts.append(len(s))
    # s is already whitespace-normalized, so each slice only needs a strip.
    return [s[begin:end].strip() for begin, end in zip(starts, starts[1:])]

def split_action_player(e):
    """Split one "<event> by <player>" chunk into ``(event, player)``.

    Args:
        e: A single chunk as produced by ``split_events``.

    Returns:
        ``(event, player)`` with surrounding whitespace removed, or
        ``(None, None)`` when ``e`` is not a string or does not match PAIR.
    """
    if not isinstance(e, str):
        return (None, None)
    m = PAIR.match(re.sub(r"\s+", " ", e).strip())
    if m is None:
        return (None, None)
    return (m.group("a").strip(), m.group("p").strip())

# Add new columns: split every EventName into chunks, parse each chunk into an
# (event, player) pair, then store the players, the event labels and a
# comma-separated cleaned name. The helper column is dropped afterwards.
events_df["event_list"] = events_df["EventName"].apply(split_events)
events_players = events_df["event_list"].apply(
    lambda chunks: list(map(split_action_player, chunks))
)
events_df["players"] = events_players.apply(
    lambda pairs: [player for _action, player in pairs]
)
events_df["events"] = events_players.apply(
    lambda pairs: [action for action, _player in pairs]
)
events_df["EventNameCleaned"] = events_df["event_list"].apply(', '.join)
events_df = events_df.drop(columns=['event_list'])

# 1-based row id as the first column, then preview and persist the result.
events_df.insert(0, 'sample_id', range(1, len(events_df) + 1))
display(events_df.head(10))
events_df.to_csv('data/transcrpits_processed.csv', index=False)


Unnamed: 0,EventType,Count
0,Missed Shot,374
1,2-pointer,266
2,3-pointer,75
3,Turnover,70
4,Rebound,63
...,...,...
31,Replay,1
32,Replay 0.069-0.055,1
33,Copy of 2-pointer,1
34,Replay 0.069-0.061,1


Unnamed: 0,sample_id,EventName,Text,Label,players,events,EventNameCleaned
0,1,Missed Shot by Darren CollisonRebound by Joel Embiid,"If you go into that defensive circle and post up, you notice the defensive players behind you and policy and left that jump look a little bit short.",1,"[Darren Collison, Joel Embiid]","[Missed Shot, Rebound]","Missed Shot by Darren Collison, Rebound by Joel Embiid"
1,2,3-pointer by PJ TuckerAssist by Eric Gordon,He's double teamed.,1,"[PJ Tucker, Eric Gordon]","[3-pointer, Assist]","3-pointer by PJ Tucker, Assist by Eric Gordon"
2,3,Dunk by Aaron GordonAssist by Evan Fournier,Jordan trying to post up against Oxymora.,1,"[Aaron Gordon, Evan Fournier]","[Dunk, Assist]","Dunk by Aaron Gordon, Assist by Evan Fournier"
3,4,Missed Shot by Gordon Hayward,They're going to come and help on all of his post ups.,0,[Gordon Hayward],[Missed Shot],Missed Shot by Gordon Hayward
4,5,Rebound by Eric Bledsoe,They're going to come and help on all of his post ups.,0,[Eric Bledsoe],[Rebound],Rebound by Eric Bledsoe
5,6,2-pointer by LaMarcus AldridgeAssist by DeMar DeRozan,Maybe tonight not to double team as much.,0,"[LaMarcus Aldridge, DeMar DeRozan]","[2-pointer, Assist]","2-pointer by LaMarcus Aldridge, Assist by DeMar DeRozan"
6,7,Turnover by Joel EmbiidSteal by Kyle Lowry,"Or for tried to tuck in, but Lowry always takes those post ups personally.",0,"[Joel Embiid, Kyle Lowry]","[Turnover, Steal]","Turnover by Joel Embiid, Steal by Kyle Lowry"
7,8,Missed Shot by Brandon IngramRebound by Andrew Wiggins,Doesn't get the finger roll challenged at the rim.,1,"[Brandon Ingram, Andrew Wiggins]","[Missed Shot, Rebound]","Missed Shot by Brandon Ingram, Rebound by Andrew Wiggins"
8,9,3-pointer by Michael Porter Jr.,"Now over tomorrow good pump, fake jokisch.",1,[Michael Porter Jr.],[3-pointer],3-pointer by Michael Porter Jr.
9,10,Missed Shot by D'Angelo RussellRebound by John Henson,I was three shots and made 0 Russell penetration floater.,1,"[D'Angelo Russell, John Henson]","[Missed Shot, Rebound]","Missed Shot by D'Angelo Russell, Rebound by John Henson"


## Transcripts + actions preprocessing

In [19]:

# Reload the processed transcripts; the list-valued columns were written to CSV
# as their Python repr, so they are parsed back into real lists here.
transcripts_df = pd.read_csv(
    'data/transcrpits_processed.csv',
    converters={
        'players': ast.literal_eval,
        'events': ast.literal_eval
    }
)

# Distinct action names in file order. Going through set() made the order
# depend on string hashing, so ACTIONS (and everything derived from its order)
# could differ between kernel sessions. Missing values are dropped because
# they cannot be tokenized later.
ACTIONS = (
    pd.read_csv("data/actions.csv")["parameter"]
    .dropna()
    .drop_duplicates()
    .tolist()
)

## Normalize text, actions, and events


In [20]:
# Load the spaCy pipeline and take its default stop-word set.
nlp = spacy.load("en_core_web_lg")
STOPWORDS = nlp.Defaults.stop_words

# Candidate temporal words, grouped by kind.
temporal_candidates = {
    # basic temporal connectives
    "after", "before", "until", "till", "since", "when", "while", "once",
    "then", "later", "earlier", "eventually", "soon", "previously",
    "recently", "now",
    # specific time references
    "today", "tomorrow", "yesterday", "tonight", "morning", "afternoon",
    "evening", "day", "week", "month", "year", "season", "period", "half",
    "quarter",
    # sequence/order terms
    "final", "first", "second", "third", "last", "next",
}

# Candidate negation words.
negation_candidates = {
    "no", "not", "n't", "never", "cannot", "can't", "nobody", "none",
    "nothing", "nowhere", "neither", "nor", "without", "minus",
}

# Only the candidates that spaCy actually treats as stop words, alphabetically.
TEMPORAL_STOPWORDS = sorted(temporal_candidates.intersection(STOPWORDS))
NEGATION_STOPWORDS = sorted(negation_candidates.intersection(STOPWORDS))



def preprocess_text(text):
    """Return the lowercase lemma of every token in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp`` with the
    parser and NER components disabled; punctuation and whitespace tokens are
    skipped.
    """
    doc = nlp(text, disable=["parser", "ner"])
    lemmas = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

def _merge_phrases(tokens, phrases_patterns):
    """Replace every occurrence of each multi-word pattern by one joined token."""
    merged = list(tokens)
    for pattern in phrases_patterns:
        parts = pattern.split('_')
        width = len(parts)
        if width < 2:
            # Single-word patterns are already single tokens.
            continue
        rebuilt = []
        i = 0
        while i < len(merged):
            if merged[i:i + width] == parts:
                rebuilt.append(pattern)
                i += width  # continue after the phrase that was just merged
            else:
                rebuilt.append(merged[i])
                i += 1
        merged = rebuilt
    return merged

def lemmatize(text, phrases_patterns):
    """Lemmatize ``text``, merge known phrases, and drop most stop words.

    Args:
        text: Raw text, tokenized and lemmatized by ``preprocess_text``.
        phrases_patterns: Underscore-joined phrases (e.g. ``"post_up"``).
            Whenever the words of a multi-word phrase appear as consecutive
            tokens they are replaced by the joined phrase. Every occurrence is
            merged, not only the first one in the text.

    Returns:
        The token list with phrases merged and spaCy stop words removed,
        except those listed in TEMPORAL_STOPWORDS or NEGATION_STOPWORDS.
    """
    # Built once per call instead of concatenating the two lists per token.
    kept_stopwords = set(TEMPORAL_STOPWORDS) | set(NEGATION_STOPWORDS)
    tokens = _merge_phrases(preprocess_text(text), phrases_patterns)
    return [t for t in tokens if not nlp.vocab[t].is_stop or t in kept_stopwords]


In [21]:

# Distinct event labels in order of first appearance. dict.fromkeys
# de-duplicates while keeping insertion order; list(set(...)) produced an
# order that depends on string hashing and could change between kernel
# sessions, which in turn changed the order of phrases_patterns.
EVENTS = list(dict.fromkeys(
    event
    for row_events in transcripts_df['events']
    for event in row_events
    if event is not None
))

# Underscore-joined lemma form of every action and event name.
ACTIONS_PROCESSED = ['_'.join(preprocess_text(action)) for action in ACTIONS]
EVENTS_PROCESSED = ['_'.join(preprocess_text(event)) for event in EVENTS]

# Phrases that lemmatize() merges into single tokens.
phrases_patterns = ACTIONS_PROCESSED + EVENTS_PROCESSED
transcripts_df['tokenized_event_name'] = transcripts_df['EventNameCleaned'].apply(lambda x: lemmatize(x, phrases_patterns))
transcripts_df['tokenized_text'] = transcripts_df['Text'].apply(lambda x: lemmatize(x, phrases_patterns))

## Augmented data preprocessing

In [26]:
# Tokenize the augmented texts with the same lemmatizer and phrase patterns as
# the transcripts, then write the result next to the input file.
augmented_texts_df = pd.read_csv('data/augmented_texts.csv').assign(
    tokenized_augmented_text=lambda frame: frame['augmented_text'].apply(
        lambda sentence: lemmatize(sentence, phrases_patterns)
    )
)
augmented_texts_df.to_csv('data/augmented_texts_processed.csv', index=False)

## Add detected actions from text

In [None]:
# Lookup from the processed form of an action (key) back to the original
# action string (value), as the variable and file name state. The arguments to
# zip were previously in the opposite order, which produced action -> processed.
# NOTE(review): if any consumer of this JSON relies on the old direction,
# update it together with this change.
actions_processed_to_action = dict(zip(ACTIONS_PROCESSED, ACTIONS))
with open("data/actions_processed_to_action.json", "w") as f:
    json.dump(actions_processed_to_action, f, indent=2)



In [11]:
def extract_actions_from_lemmatized(lemmatized_tokens):
    """Return the entries of ACTIONS_PROCESSED found in ``lemmatized_tokens``.

    The result follows the order of ACTIONS_PROCESSED, not the order in which
    the actions occur in the tokens.
    """
    found = []
    for action in ACTIONS_PROCESSED:
        if action in lemmatized_tokens:
            found.append(action)
    return found

# Tag every transcript with the actions present in its tokenized text.
transcripts_df['actions_in_text'] = (
    transcripts_df['tokenized_text'].apply(extract_actions_from_lemmatized)
)

In [None]:
# Columns written to the tokenized export, in output order.
save_cols = [
    'sample_id',
    'EventNameCleaned',
    'Text',
    'Label',
    'actions_in_text',
    'tokenized_text',
    'tokenized_event_name',
    'players',
    'events',
]
transcripts_df_reduced = transcripts_df[save_cols]
transcripts_df_reduced.to_csv('data/transcripts_tokenized.csv', index=False)

# Show the reduced export first, then the full working frame.
display(transcripts_df_reduced, transcripts_df)

Unnamed: 0,sample_id,EventNameCleaned,Text,Label,actions_in_text,tokenized_text,tokenized_event_name,players,events
0,1,"Missed Shot by Darren Collison, Rebound by Joel Embiid","If you go into that defensive circle and post up, you notice the defensive players behind you and policy and left that jump look a little bit short.",1,[post_up],"[defensive, circle, post_up, notice, defensive, player, policy, leave, jump, look, little, bit, short]","[miss, shot, darren, collison, rebound, joel, embiid]","[Darren Collison, Joel Embiid]","[Missed Shot, Rebound]"
1,2,"3-pointer by PJ Tucker, Assist by Eric Gordon",He's double teamed.,1,[double_team],[double_team],"[3_pointer, pj, tucker, assist, eric, gordon]","[PJ Tucker, Eric Gordon]","[3-pointer, Assist]"
2,3,"Dunk by Aaron Gordon, Assist by Evan Fournier",Jordan trying to post up against Oxymora.,1,[post_up],"[jordan, try, post_up, oxymora]","[dunk, aaron, gordon, assist, evan, fournier]","[Aaron Gordon, Evan Fournier]","[Dunk, Assist]"
3,4,Missed Shot by Gordon Hayward,They're going to come and help on all of his post ups.,0,[post_up],"[come, help, post_up]","[miss, shot, gordon, hayward]",[Gordon Hayward],[Missed Shot]
4,5,Rebound by Eric Bledsoe,They're going to come and help on all of his post ups.,0,[post_up],"[come, help, post_up]","[rebound, eric, bledsoe]",[Eric Bledsoe],[Rebound]
...,...,...,...,...,...,...,...,...,...
1100,1101,Missed Shot by John Collins,John Collins can't convert the lob inside.,1,[lob],"[john, collins, not, convert, lob, inside]","[miss, shot, john, collins]",[John Collins],[Missed Shot]
1101,1102,Turnover by Tomas Satoransky,Backdoor cut watch couldn't find him.,0,[backdoor],"[backdoor, cut, watch, not, find]","[turnover, tomas, satoransky]",[Tomas Satoransky],[Turnover]
1102,1103,Replay - Auto,Fake getting up in the air and complete the play.,1,[fake],"[fake, air, complete, play]","[replay, auto]",[None],[None]
1103,1104,2-pointer by Otto Porter Jr.,Here's Porter with a fake.,1,[fake],"[porter, fake]","[2_pointer, otto, porter, jr.]",[Otto Porter Jr.],[2-pointer]


Unnamed: 0,sample_id,EventName,Text,Label,players,events,EventNameCleaned,tokenized_event_name,tokenized_text,actions_in_text
0,1,Missed Shot by Darren CollisonRebound by Joel Embiid,"If you go into that defensive circle and post up, you notice the defensive players behind you and policy and left that jump look a little bit short.",1,"[Darren Collison, Joel Embiid]","[Missed Shot, Rebound]","Missed Shot by Darren Collison, Rebound by Joel Embiid","[miss, shot, darren, collison, rebound, joel, embiid]","[defensive, circle, post_up, notice, defensive, player, policy, leave, jump, look, little, bit, short]",[post_up]
1,2,3-pointer by PJ TuckerAssist by Eric Gordon,He's double teamed.,1,"[PJ Tucker, Eric Gordon]","[3-pointer, Assist]","3-pointer by PJ Tucker, Assist by Eric Gordon","[3_pointer, pj, tucker, assist, eric, gordon]",[double_team],[double_team]
2,3,Dunk by Aaron GordonAssist by Evan Fournier,Jordan trying to post up against Oxymora.,1,"[Aaron Gordon, Evan Fournier]","[Dunk, Assist]","Dunk by Aaron Gordon, Assist by Evan Fournier","[dunk, aaron, gordon, assist, evan, fournier]","[jordan, try, post_up, oxymora]",[post_up]
3,4,Missed Shot by Gordon Hayward,They're going to come and help on all of his post ups.,0,[Gordon Hayward],[Missed Shot],Missed Shot by Gordon Hayward,"[miss, shot, gordon, hayward]","[come, help, post_up]",[post_up]
4,5,Rebound by Eric Bledsoe,They're going to come and help on all of his post ups.,0,[Eric Bledsoe],[Rebound],Rebound by Eric Bledsoe,"[rebound, eric, bledsoe]","[come, help, post_up]",[post_up]
...,...,...,...,...,...,...,...,...,...,...
1100,1101,Missed Shot by John Collins,John Collins can't convert the lob inside.,1,[John Collins],[Missed Shot],Missed Shot by John Collins,"[miss, shot, john, collins]","[john, collins, not, convert, lob, inside]",[lob]
1101,1102,Turnover by Tomas Satoransky,Backdoor cut watch couldn't find him.,0,[Tomas Satoransky],[Turnover],Turnover by Tomas Satoransky,"[turnover, tomas, satoransky]","[backdoor, cut, watch, not, find]",[backdoor]
1102,1103,Replay - Auto,Fake getting up in the air and complete the play.,1,[None],[None],Replay - Auto,"[replay, auto]","[fake, air, complete, play]",[fake]
1103,1104,2-pointer by Otto Porter Jr.,Here's Porter with a fake.,1,[Otto Porter Jr.],[2-pointer],2-pointer by Otto Porter Jr.,"[2_pointer, otto, porter, jr.]","[porter, fake]",[fake]
