In [1]:
# Imports and helpers
import json
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, brier_score_loss, precision_recall_curve, roc_curve, auc, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

def find_repo_root(start=None, markers=('setup.py','requirements.txt','README.md'), max_depth=10):
    """Walk upward from ``start`` to the nearest directory holding a marker file.

    Parameters
    ----------
    start : str or pathlib.Path, optional
        Directory to begin the search from. ``None`` (the default) means the
        current working directory *at call time*; a ``Path.cwd()`` default in
        the signature would be frozen when the function is defined.
    markers : iterable of str
        File names whose presence identifies the repository root.
    max_depth : int
        Maximum number of directories to inspect, the starting one included.

    Returns
    -------
    pathlib.Path
        The resolved path of the first directory containing any marker, or the
        resolved starting directory when no marker is found within
        ``max_depth`` levels (or before the filesystem root is reached).
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    cur = origin
    for _ in range(max_depth):
        if any((cur / m).exists() for m in markers):
            return cur
        if cur.parent == cur:
            # Reached the filesystem root; there is nothing further up to check.
            break
        cur = cur.parent
    return origin

# Resolved once here; later cells build their data/model paths from it.
repo_root = find_repo_root()
print('Repo root:', repo_root)


Repo root: C:\Users\ramse\xg-prediction-model


In [2]:
# Pick the most recently modified processed-shots CSV, derive the label and
# feature matrix from it, and load whichever calibrated models exist on disk.
proc_dir = repo_root / 'data' / 'processed'
if proc_dir.exists():
    files = sorted(proc_dir.glob('processed_shots_*.csv'), key=lambda p: p.stat().st_mtime)
else:
    files = []
if not files:
    raise FileNotFoundError('No processed_shots_*.csv files in data/processed/')
data_path = files[-1]  # last entry == newest modification time
print('Using processed CSV:', data_path)
df = pd.read_csv(data_path)
if 'outcome' not in df.columns:
    raise ValueError('processed data missing outcome column')

# Binary target: 1 when the outcome text equals "goal" (case-insensitive), else 0.
outcome_text = df['outcome'].astype(str).str.lower()
y = outcome_text.eq('goal').astype(int)

# Keep only the candidate feature columns actually present in this file.
def _present_columns(candidates):
    """Return the candidates that exist as columns of ``df``, in the given order."""
    return [col for col in candidates if col in df.columns]

numeric_feats = _present_columns(['distance','minute_num'])
binary_feats = _present_columns(['body_head','body_foot','body_other','big_chance','half'])
cat_feats = _present_columns(['shot_type','assist_type'])
feature_cols = numeric_feats + binary_feats + cat_feats
if feature_cols:
    X = df[feature_cols].copy()
else:
    X = pd.DataFrame()
print('Features inferred:', feature_cols)

# Assemble a preprocessor definition per feature group. It is only constructed
# here; nothing in this cell fits it.
num_pipe = bin_pipe = cat_pipe = None
transformers = []
if numeric_feats:
    num_pipe = Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', StandardScaler())])
    transformers.append(('num', num_pipe, numeric_feats))
if binary_feats:
    bin_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent'))])
    transformers.append(('bin', bin_pipe, binary_feats))
if cat_feats:
    cat_pipe = Pipeline([('imp', SimpleImputer(strategy='constant', fill_value='missing')), ('ohe', OneHotEncoder(handle_unknown='ignore'))])
    transformers.append(('cat', cat_pipe, cat_feats))
if transformers:
    preprocessor = ColumnTransformer(transformers, remainder='drop')
else:
    preprocessor = None

# Expected locations of the saved calibrated models.
metrics_dir = repo_root / 'results' / 'metrics'
model_files = dict(
    logistic=metrics_dir / 'model_logistic_calibrated.joblib',
    random_forest=metrics_dir / 'model_random_forest_calibrated.joblib',
    xgboost=metrics_dir / 'model_xgboost_calibrated.joblib',
    neural_network=metrics_dir / 'model_neural_network_calibrated.joblib',
)

# Load what is available; a missing or unreadable file is reported, not fatal.
models = {}
for model_name, model_path in model_files.items():
    if not model_path.exists():
        print('Model file missing:', model_path)
        continue
    try:
        models[model_name] = joblib.load(model_path)
    except Exception as load_err:
        print('Failed to load', model_name, load_err)
    else:
        print('Loaded', model_name)


Using processed CSV: C:\Users\ramse\xg-prediction-model\data\processed\processed_shots_20251221T175417Z.csv
Features inferred: ['distance', 'minute_num', 'body_head', 'body_foot', 'body_other', 'big_chance', 'half', 'shot_type', 'assist_type']
Loaded logistic
Loaded random_forest
Loaded xgboost
Loaded neural_network


## 1) Model comparison: ROC & Calibration

In [None]:
# Prepare test split (same splitting logic as other notebooks) and plot ROC + calibration
from sklearn.calibration import calibration_curve

# 60/20/20 train/validation/test split, stratified on the label; only the test
# part is used below.
X_train_full, X_hold, y_train_full, y_hold = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_hold, y_hold, test_size=0.5, stratify=y_hold, random_state=42)
if not models:
    print('No models loaded; skipping comparison plots')
else:
    # Score every model exactly once and reuse the probabilities for both
    # panels; a model that cannot be scored is reported a single time.
    test_probas = {}
    for name, mdl in models.items():
        try:
            test_probas[name] = mdl.predict_proba(X_test)[:,1]
        except Exception as e:
            print('Skipping', name, '- predict_proba failed:', e)

    fig, axs = plt.subplots(1,2, figsize=(14,6))
    # ROC on left
    ax = axs[0]
    for name, proba in test_probas.items():
        try:
            fpr, tpr, _thresholds = roc_curve(y_test, proba)
            ax.plot(fpr, tpr, label=f'{name} (AUC={roc_auc_score(y_test, proba):.3f})')
        except Exception as e:
            print('Skipping ROC for', name, e)
    ax.plot([0,1],[0,1],'k--', alpha=0.3)  # chance diagonal
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.set_title('ROC curves (test set)')
    if ax.get_legend_handles_labels()[0]:
        # Only draw a legend when at least one labelled curve exists.
        ax.legend(loc='lower right')
    ax.grid(True)
    # Calibration on right
    ax2 = axs[1]
    for name, proba in test_probas.items():
        try:
            prob_true, prob_pred = calibration_curve(y_test, proba, n_bins=10)
            ax2.plot(prob_pred, prob_true, marker='o', label=f'{name} (Brier={brier_score_loss(y_test, proba):.3f})')
        except Exception as e:
            print('Skipping calibration for', name, e)
    ax2.plot([0,1],[0,1],'k--', alpha=0.3)  # perfect-calibration diagonal
    ax2.set_xlabel('Mean predicted probability')
    ax2.set_ylabel('Fraction of positives')
    ax2.set_title('Calibration (reliability diagram)')
    if ax2.get_legend_handles_labels()[0]:
        ax2.legend()
    ax2.grid(True)
    outdir = repo_root / 'results' / 'metrics' / 'figures'
    outdir.mkdir(parents=True, exist_ok=True)
    savep = outdir / 'model_comparison_roc_calibration.png'
    fig.savefig(savep, dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown
    print('Saved', savep)

## 2) Feature importance (models that expose importances or coefficients)

In [None]:
def get_feature_names_from_preprocessor(preproc):
    """Return the output feature names of a column-transformer-like object.

    The object's own ``get_feature_names_out()`` is preferred. If that call
    fails for any reason, names are rebuilt from ``preproc.transformers_``:
    a transformer with an ``'ohe'`` entry in ``named_steps`` contributes that
    step's ``get_feature_names_out(cols)``; any other transformer contributes
    its input column names unchanged.

    Parameters
    ----------
    preproc : object
        Anything exposing ``get_feature_names_out()`` and/or a
        ``transformers_`` sequence of ``(name, transformer, columns)`` tuples.

    Returns
    -------
    list
        Feature names in transformer order; empty when neither source is
        available (for example when ``transformers_`` does not exist).
    """
    try:
        return list(preproc.get_feature_names_out())
    except Exception:
        pass  # fall through to the manual reconstruction below

    def _as_list(cols):
        # A single column may be given as a bare string; list(str) or
        # list.extend(str) would split it into characters.
        if isinstance(cols, str):
            return [cols]
        try:
            return list(cols)
        except TypeError:
            return [cols]

    names = []
    for _name, trans, cols in getattr(preproc, 'transformers_', ()):
        # Dropped column groups (the remainder or an explicit 'drop'
        # transformer) produce no output features.
        if isinstance(trans, str) and trans == 'drop':
            continue
        col_list = _as_list(cols)
        steps = getattr(trans, 'named_steps', None)
        if steps is not None and 'ohe' in steps:
            try:
                names.extend(list(steps['ohe'].get_feature_names_out(col_list)))
            except Exception:
                # Encoder cannot report names; use the raw column names.
                names.extend(col_list)
        else:
            names.extend(col_list)
    return names

def unwrap_model(m):
    """Peel wrappers off ``m`` until an estimator exposing importances is found.

    Unwrapping stops as soon as the current object has ``coef_`` or
    ``feature_importances_``, so an estimator that already exposes them is
    returned untouched even if it also carries an ``estimator`` or
    ``base_estimator`` attribute. Otherwise one layer is removed at a time:

    * an object with ``named_steps`` yields its ``'clf'`` step, else its
      ``'classifier'`` step, else its last step;
    * an object with a non-empty ``calibrated_classifiers_`` yields the first
      entry of that list (only the first is inspected);
    * otherwise a non-string ``base_estimator`` or ``estimator`` attribute is
      followed.

    Parameters
    ----------
    m : object
        A model, possibly wrapped in a pipeline and/or calibration wrapper.

    Returns
    -------
    object
        The first object reached that exposes ``coef_`` or
        ``feature_importances_``; if none does, the innermost object reached
        (``m`` itself when it has no recognised wrapper attributes).
    """
    def _exposes_importances(est):
        for attr in ('coef_', 'feature_importances_'):
            try:
                if hasattr(est, attr):
                    return True
            except Exception:
                # A property raising something other than AttributeError
                # counts as "not available".
                continue
        return False

    def _inner(est):
        steps = getattr(est, 'named_steps', None)
        if steps is not None:
            try:
                for key in ('clf', 'classifier'):
                    candidate = steps.get(key)
                    # Compare against None explicitly: estimator truthiness
                    # may depend on __len__ and is not a reliable test.
                    if candidate is not None:
                        return candidate
                return list(steps.values())[-1]
            except Exception:
                return None
        fitted = getattr(est, 'calibrated_classifiers_', None)
        if fitted:
            return fitted[0]
        for attr in ('base_estimator', 'estimator'):
            candidate = getattr(est, attr, None)
            # Skip unset values and string placeholders such as 'deprecated'.
            if candidate is not None and not isinstance(candidate, str):
                return candidate
        return None

    current = m
    for _ in range(10):  # bounded so a self-referential wrapper cannot loop forever
        if _exposes_importances(current):
            return current
        nxt = _inner(current)
        if nxt is None or nxt is current:
            return current
        current = nxt
    return current

# Resolve the feature names that importances/coefficients are mapped onto.
# Falls back to the raw feature columns when the preprocessor yields nothing.
feat_names = get_feature_names_from_preprocessor(preprocessor) if preprocessor is not None else feature_cols
if not feat_names:
    feat_names = feature_cols
print('Final feature names used for importance mapping:', feat_names)

# Collect a (model name, top-20 table) pair for every model whose underlying
# estimator exposes coefficients or importances of the expected length.
imp_records = []
for name, mdl in models.items():
    try:
        base = unwrap_model(mdl)
        if hasattr(base, 'coef_'):
            # Magnitude of each coefficient, flattened to one dimension.
            scores = np.abs(np.ravel(base.coef_))
        elif hasattr(base, 'feature_importances_'):
            scores = np.asarray(base.feature_importances_)
        else:
            print('No direct importances for', name, '- skipping (try permutation importance offline)')
            continue
        if len(scores) != len(feat_names):
            # Say why the model is skipped instead of dropping it silently.
            print(f'Skipping {name}: model exposes {len(scores)} importance values '
                  f'but {len(feat_names)} feature names are known')
            continue
        df_imp = (
            pd.DataFrame({'feature': feat_names, 'importance': scores})
            .sort_values('importance', ascending=False)
            .head(20)
        )
        imp_records.append((name, df_imp))
    except Exception as e:
        print('Error extracting importances for', name, e)

if not imp_records:
    print('No feature importances available to plot')
else:
    outdir = repo_root / 'results' / 'metrics' / 'figures'
    outdir.mkdir(parents=True, exist_ok=True)
    for name, df_imp in imp_records:
        imp_fig, imp_ax = plt.subplots(figsize=(6,4))
        sns.barplot(data=df_imp, x='importance', y='feature', ax=imp_ax)
        imp_ax.set_title(f'Top features: {name}')
        p = outdir / f'feature_importance_{name}.png'
        imp_fig.tight_layout()
        imp_fig.savefig(p, dpi=150, bbox_inches='tight')
        plt.show()
        plt.close(imp_fig)  # one figure per model; release each after saving
        print('Saved', p)

## 3) Player / Team xG analysis

In [None]:
# Look for player/team columns and aggregate xG vs actual goals
def _aggregate_xg(frame, key):
    """Per value of ``key``: summed predicted xG, goals scored and shot count.

    The xG sum uses ``min_count=1`` so a group with no valid prediction shows
    NaN instead of a misleading 0.0. Rows are sorted by xG, highest first.
    """
    grouped = frame.groupby(key)
    table = pd.DataFrame({
        'xg': grouped['_pred_xg'].sum(min_count=1),
        'goals': grouped['_is_goal'].sum(),
        'attempts': grouped['_is_goal'].count(),
    })
    return table.sort_values('xg', ascending=False)

player_cols = [c for c in ['player','player_name','shooter','playerId'] if c in df.columns]
team_cols = [c for c in ['team','team_name','teamId'] if c in df.columns]
group_by_player = player_cols[0] if player_cols else None
group_by_team = team_cols[0] if team_cols else None

# Compute predicted probabilities for every shot using a preferred model if available.
if models:
    pref = next((m for m in ['logistic','random_forest','xgboost','neural_network'] if m in models), None)
    mapper = models[pref] if pref else list(models.values())[0]
    try:
        proba_all = mapper.predict_proba(X)[:,1]
    except Exception as feature_err:
        # Second attempt: hand the model the full frame, minus the helper
        # columns this cell adds, so a re-run scores the same input.
        raw_frame = df.drop(columns=['_pred_xg', '_is_goal'], errors='ignore')
        try:
            proba_all = mapper.predict_proba(raw_frame)[:,1]
        except Exception as frame_err:
            print('Could not score shots; xG will be NaN.')
            print('  with feature matrix:', feature_err)
            print('  with full data frame:', frame_err)
            proba_all = np.full(len(df), np.nan)
else:
    print('No models loaded; xG will be NaN')
    proba_all = np.full(len(df), np.nan)

# Helper columns are (re)assigned on every run, so the cell is idempotent.
df['_pred_xg'] = proba_all
df['_is_goal'] = y

metrics_out = repo_root / 'results' / 'metrics'
metrics_out.mkdir(parents=True, exist_ok=True)

if group_by_player:
    gp = _aggregate_xg(df, group_by_player)
    display(gp.head(10))
    outp = metrics_out / 'top_xg_by_player.csv'
    gp.to_csv(outp)
    print('Saved', outp)
else:
    print('No player column found; skipping player xG analysis')
if group_by_team:
    gt = _aggregate_xg(df, group_by_team)
    display(gt.head(10))
    outt = metrics_out / 'top_xg_by_team.csv'
    gt.to_csv(outt)
    print('Saved', outt)
else:
    print('No team column found; skipping team xG analysis')

## 4) Interactive prediction widget (ipywidgets)

In [None]:
# Small interactive UI to input shot features and get xG prediction from chosen model
# Use dynamic import to avoid static-analysis missing-import diagnostics in editors
try:
    import importlib
    widgets = importlib.import_module('ipywidgets')
    ipy_disp = importlib.import_module('IPython.display')
    display = ipy_disp.display
    HTML = ipy_disp.HTML
except Exception:
    print('ipywidgets not available. To enable interactive widgets install ipywidgets.')
    widgets = None

if widgets is not None:
    # One input control per feature, keyed by feature name. Insertion order is
    # numeric, binary, categorical - the same order used to build feature_cols.
    input_widgets = {}
    for f in numeric_feats:
        input_widgets[f] = widgets.FloatText(value=float(df[f].median()) if f in df.columns else 0.0, description=f)
    for f in binary_feats:
        # ensure default is one of the options to avoid TraitError
        # NOTE(review): options are fixed to 0/1; if any column listed in
        # binary_feats is coded differently, its other values cannot be
        # selected here - confirm against the processed data.
        opts = [0, 1]
        default = opts[0]
        if f in df.columns:
            try:
                mode_val = df[f].mode().iloc[0]
                mode_int = int(mode_val) if pd.notnull(mode_val) else default
                if mode_int in opts:
                    default = mode_int
            except Exception:
                default = opts[0]
        input_widgets[f] = widgets.Dropdown(options=opts, value=default, description=f)
    for f in cat_feats:
        opts = sorted(df[f].dropna().unique().tolist()) if f in df.columns else ['missing']
        if not opts:
            opts = ['missing']
        default = opts[0]
        input_widgets[f] = widgets.Dropdown(options=opts, value=default, description=f)
    model_select = widgets.Dropdown(options=list(models.keys()) if models else [], description='model')
    out = widgets.Output()

    def on_predict(_):
        """Button callback: score the current widget values with the selected model.

        The model is first given a one-row frame of raw feature values. Only
        if that fails, and a notebook-level preprocessor exists, is the row
        passed through ``preprocessor.transform`` and scored again. When both
        attempts fail, the original error is reported first because it
        describes the real problem.
        """
        with out:
            out.clear_output()
            if not model_select.value:
                print('No model selected')
                return
            row = {k: (w.value if not isinstance(w.value, np.ndarray) else w.value.item()) for k,w in input_widgets.items()}
            xdf = pd.DataFrame([row])
            mdl = models[model_select.value]
            try:
                p = mdl.predict_proba(xdf)[:,1][0]
            except Exception as raw_err:
                if preprocessor is None:
                    # No alternative input to try; repeating the same call
                    # would only fail the same way.
                    print('Prediction failed:', raw_err)
                    return
                try:
                    p = mdl.predict_proba(preprocessor.transform(xdf))[:,1][0]
                except Exception as e:
                    print('Prediction failed:', raw_err)
                    print('Fallback via notebook preprocessor also failed:', e)
                    return
            print(f'Predicted xG probability: {p:.3f}')

    btn = widgets.Button(description='Predict')
    btn.on_click(on_predict)
    ui = widgets.VBox([widgets.HBox(list(input_widgets.values())), widgets.HBox([model_select, btn]), out])
    display(ui)
else:
    print('Interactive widgets not available in this environment')