In [1]:
import sys, os
from pathlib import Path
import pandas as pd
import numpy as np

# ensure repo root on sys.path
repo_root = Path('..').resolve()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from src.feature_engineering import (
    fe_body_part_onehot, fe_shot_type, fe_assist_type, fe_game_half, fe_big_chance, calculate_shot_angle
)

RAW_DIR = Path('../data/raw')
PROC_DIR = Path('../data/processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# load latest shots csv (basic header-fix)
files = sorted(RAW_DIR.glob('shots_events_*.csv'), key=os.path.getmtime)
assert files, 'No shots_events_*.csv found'
path = files[-1]
def read_and_fix(path):
    head = pd.read_csv(path, header=None, nrows=2)
    first = head.iloc[0].fillna('').astype(str)
    second = head.iloc[1].fillna('').astype(str)
    if (first.str.strip()!='').sum()<=2 and (second.str.strip()!='').sum()>(first.str.strip()!='').sum():
        return pd.read_csv(path, header=1)
    return pd.read_csv(path, header=0)

df = read_and_fix(path)
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
for col in ['xG','distance']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

print('Loaded rows:', len(df))
df.head()

Loaded rows: 4029


Unnamed: 0,minute,player,team,xG,PSxG,outcome,distance,body_part,notes,SCA 1,SCA 1.1,SCA 2,SCA 2.1
0,,,,,,,,,,player,event,player,event
1,3.0,Hugo Ekitike,Liverpool,0.02,0.04,Saved,26.0,Right Foot,,Cody Gakpo,Pass (Live),Alexis Mac Allister,Pass (Live)
2,4.0,Mohamed Salah,Liverpool,0.02,0.22,Saved,18.0,Left Foot,,Hugo Ekitike,Pass (Live),Virgil van Dijk,Pass (Live)
3,5.0,Virgil van Dijk,Liverpool,0.13,0.0,Off Target,10.0,Head,,Mohamed Salah,Pass (Live),Jeremie Frimpong,Pass (Live)
4,6.0,Antoine Semenyo,Bournemouth,0.24,0.0,Off Target,10.0,Right Foot,,Adrien Truffert,Pass (Live),Marcos Senesi,Pass (Live)


In [3]:
# apply feature transforms
df = fe_body_part_onehot(df)
df = fe_shot_type(df)
# try to infer assist-type from SCA event columns if present
sca_event_cols = [c for c in df.columns if 'SCA' in str(c) and 'event' in str(c).lower()]
if not sca_event_cols:
    # fallback: some exports use repeated column names 'SCA 1.1' etc. try matching 'SCA'
    sca_event_cols = [c for c in df.columns if 'SCA' in str(c)]
df = fe_assist_type(df, sca_cols=sca_event_cols)
df = fe_game_half(df)
# big_chance heuristic requires distance
if 'distance' not in df.columns:
    # create a placeholder numeric distance if missing (NaN)
    df['distance'] = pd.NA
df = fe_big_chance(df)

# show resulting columns and head
print('Columns:', df.columns.tolist())
df.head()

Columns: ['minute', 'player', 'team', 'xG', 'PSxG', 'outcome', 'distance', 'body_part', 'notes', 'SCA 1', 'SCA 1.1', 'SCA 2', 'SCA 2.1', 'body_head', 'body_foot', 'body_other', 'shot_type', 'assist_type', 'minute_num', 'half', 'big_chance']


Unnamed: 0,minute,player,team,xG,PSxG,outcome,distance,body_part,notes,SCA 1,...,SCA 2,SCA 2.1,body_head,body_foot,body_other,shot_type,assist_type,minute_num,half,big_chance
0,,,,,,,,,,player,...,player,event,0,0,0,open_play,unknown,,2,0
1,3.0,Hugo Ekitike,Liverpool,0.02,0.04,Saved,26.0,Right Foot,,Cody Gakpo,...,Alexis Mac Allister,Pass (Live),0,1,0,open_play,pass,3.0,1,0
2,4.0,Mohamed Salah,Liverpool,0.02,0.22,Saved,18.0,Left Foot,,Hugo Ekitike,...,Virgil van Dijk,Pass (Live),0,1,0,open_play,pass,4.0,1,0
3,5.0,Virgil van Dijk,Liverpool,0.13,0.0,Off Target,10.0,Head,,Mohamed Salah,...,Jeremie Frimpong,Pass (Live),1,0,0,open_play,pass,5.0,1,0
4,6.0,Antoine Semenyo,Bournemouth,0.24,0.0,Off Target,10.0,Right Foot,,Adrien Truffert,...,Marcos Senesi,Pass (Live),0,1,0,open_play,pass,6.0,1,0


In [4]:
# save processed dataset
from datetime import datetime
ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
out_path = PROC_DIR / f'processed_shots_{ts}.csv'
df.to_csv(out_path, index=False)
print('Saved processed CSV to', out_path)

Saved processed CSV to ..\data\processed\processed_shots_20251221T175417Z.csv
