In [None]:
# Imports and helpers
import json
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, brier_score_loss, precision_recall_curve, roc_curve, auc, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# repo-root aware helper
def find_repo_root(start=None, markers=('setup.py','requirements.txt','README.md'), max_depth=10):
    """Walk upwards from ``start`` until a directory containing a marker file is found.

    Parameters
    ----------
    start : str or Path, optional
        Directory to begin the search from. Defaults to the current working
        directory *at call time* (a ``Path.cwd()`` default argument would be
        frozen when the function is defined and ignore later ``chdir`` calls).
    markers : iterable of str
        File or directory names whose presence identifies the repository root.
    max_depth : int
        Maximum number of directory levels inspected, counting ``start`` itself.

    Returns
    -------
    Path
        The first directory (searching upwards) that contains any marker, or
        the resolved ``start`` when none is found within ``max_depth`` levels
        or before reaching the filesystem root.
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    cur = origin
    for _ in range(max_depth):
        if any((cur / m).exists() for m in markers):
            return cur
        if cur.parent == cur:
            # Reached the filesystem root; nothing further to inspect.
            break
        cur = cur.parent
    return origin

# Resolve the repository root, then read the metrics summary JSON if it exists.
# When the file is absent, metrics_summary stays an empty dict and a hint is printed.
repo_root = find_repo_root()
metrics_path = repo_root.joinpath('results', 'metrics', 'metrics_summary.json')
metrics_summary = {}
if not metrics_path.exists():
    print('metrics_summary.json not found at', metrics_path, '- run training script first')
else:
    with metrics_path.open('r', encoding='utf-8') as fh:
        metrics_summary = json.load(fh)
    print('Loaded metrics_summary.json')

In [None]:
# Load processed data and prepare features (intended to mirror the training script)
proc_dir = repo_root.joinpath('data', 'processed')

# Candidate CSVs ordered oldest -> newest by modification time; the newest one is used.
files = sorted(proc_dir.glob('processed_shots_*.csv'), key=lambda p: p.stat().st_mtime)
if len(files) == 0:
    raise FileNotFoundError('No processed_shots_*.csv files in data/processed/')
data_path = files[-1]
print('Using processed CSV:', data_path)

df = pd.read_csv(data_path)
if 'outcome' not in df.columns:
    raise ValueError('processed data missing outcome column')

# Binary target: 1 when the outcome text equals "goal" (case-insensitive), otherwise 0.
outcome_text = df['outcome'].astype(str).str.lower()
y = (outcome_text == 'goal').astype(int)

# Only keep the candidate feature columns that are actually present in the CSV.
available_cols = set(df.columns)
numeric_feats = [c for c in ('distance', 'minute_num') if c in available_cols]
binary_feats = [c for c in ('body_head', 'body_foot', 'body_other', 'big_chance', 'half') if c in available_cols]
cat_feats = [c for c in ('shot_type', 'assist_type') if c in available_cols]
feature_cols = numeric_feats + binary_feats + cat_feats
X = df.loc[:, feature_cols].copy()
print('Features used:', feature_cols)
print('Class counts:', y.value_counts().to_dict())

# build preprocessor: each sub-pipeline exists only when its feature group is non-empty
num_pipe = None
if numeric_feats:
    num_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('sc', StandardScaler()),
    ])

bin_pipe = None
if binary_feats:
    bin_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent'))])

cat_pipe = None
if cat_feats:
    cat_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])

transformers = [
    (label, pipe, cols)
    for label, pipe, cols in (
        ('num', num_pipe, numeric_feats),
        ('bin', bin_pipe, binary_feats),
        ('cat', cat_pipe, cat_feats),
    )
    if cols
]
preprocessor = ColumnTransformer(transformers, remainder='drop')

# splits: 60% train, then the 40% hold-out is halved into validation and test
X_train_full, X_hold, y_train_full, y_hold = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_hold, y_hold, test_size=0.5, stratify=y_hold, random_state=42
)
print('Train/Val/Test sizes:', len(X_train_full), len(X_val), len(X_test))

# model files
model_files = {
    'logistic': repo_root / 'results' / 'metrics' / 'model_logistic_calibrated.joblib',
    'random_forest': repo_root / 'results' / 'metrics' / 'model_random_forest_calibrated.joblib',
    'xgboost': repo_root / 'results' / 'metrics' / 'model_xgboost_calibrated.joblib',
    'neural_network': repo_root / 'results' / 'metrics' / 'model_neural_network_calibrated.joblib',
}

# Preprocessed copy of the test features, built lazily (at most once) for models
# that cannot consume the raw feature frame directly.
X_test_transformed = None

# Evaluate every saved model that exists on disk against the held-out test split.
eval_rows = []
for name, path in model_files.items():
    if not path.exists():
        print('Missing model file:', path)
        continue

    # NOTE(security): joblib.load unpickles arbitrary Python objects; only load
    # model files that were produced by a trusted training run.
    mdl = joblib.load(path)

    try:
        # First attempt: the saved object accepts raw features (e.g. it embeds its own preprocessing).
        proba = mdl.predict_proba(X_test)[:, 1]
    except Exception as raw_err:
        try:
            # Fallback: fit the notebook's preprocessor on the training split and
            # score the *transformed* test features.
            # NOTE(review): assumes the training script used equivalent preprocessing
            # and the same split, so the transformed column layout matches — confirm.
            if X_test_transformed is None:
                X_test_transformed = preprocessor.fit(X_train_full).transform(X_test)
            proba = mdl.predict_proba(X_test_transformed)[:, 1]
        except Exception as pre_err:
            print('predict_proba failed for', name)
            print('  raw features error:', repr(raw_err))
            print('  preprocessed features error:', repr(pre_err))
            continue

    pred = (proba >= 0.5).astype(int)
    # ROC AUC is undefined when the test labels contain a single class.
    auc_score = roc_auc_score(y_test, proba) if len(np.unique(y_test)) > 1 else None
    brier = brier_score_loss(y_test, proba)
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    cm = confusion_matrix(y_test, pred)
    eval_rows.append({'model': name, 'roc_auc': auc_score, 'brier': brier, 'proba': proba, 'report': report, 'cm': cm})

# summary
if eval_rows:
    df_summary = pd.DataFrame([{ 'model': r['model'], 'roc_auc': r['roc_auc'], 'brier': r['brier']} for r in eval_rows])
    display(df_summary)
else:
    print('No models evaluated')

## ROC curves for available models

In [None]:
# ROC curve per evaluated model, with the chance diagonal for reference.
fig, ax = plt.subplots(figsize=(8, 6))
for r in eval_rows:
    fpr, tpr, _ = roc_curve(y_test, r['proba'])
    # Reuse the AUC stored by the evaluation cell; it is None when the test labels
    # contain a single class, in which case recomputing it here would raise.
    auc_label = 'n/a' if r['roc_auc'] is None else f"{r['roc_auc']:.3f}"
    ax.plot(fpr, tpr, label=f"{r['model']} (AUC={auc_label})")
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC curves (test set)')
if eval_rows:
    # Skip the legend when no model was evaluated (there would be no labelled curves).
    ax.legend(loc='lower right')
ax.grid(True)
plt.show()

## Simple upsampling experiment (Logistic Regression)

In [None]:
# Upsample the positive class (with replacement) to the size of the negative class,
# fit a logistic-regression pipeline on the balanced frame, and score it on the test split.
train_df = pd.concat([X_train_full, y_train_full.rename('y')], axis='columns')
minor = train_df.loc[train_df['y'].eq(1)]
major = train_df.loc[train_df['y'].eq(0)]
print('Before upsample counts:', train_df['y'].value_counts().to_dict())

if len(minor) > 0:
    minor_up = resample(
        minor,
        replace=True,
        n_samples=len(major),
        random_state=42,
    )
    train_up = pd.concat([major, minor_up])
    print('After upsample counts:', train_up['y'].value_counts().to_dict())

    X_train_up, y_train_up = train_up[feature_cols], train_up['y']

    # The shared preprocessor is refit here as the first pipeline step.
    pipe_up = Pipeline([
        ('pre', preprocessor),
        ('clf', LogisticRegression(max_iter=1000)),
    ])
    pipe_up.fit(X_train_up, y_train_up)

    proba_up = pipe_up.predict_proba(X_test)[:, 1]
    pred_up = (proba_up >= 0.5).astype(int)
    print('Upsampled logistic ROC AUC:', roc_auc_score(y_test, proba_up))
    print(classification_report(y_test, pred_up, zero_division=0))
else:
    print('No positive samples in training set; skipping upsampling')

## Calibration and Precision-Recall Plots
The cells below compute reliability (calibration) diagrams, predicted-probability histograms, and precision–recall curves for the evaluated models and save the figures to `results/metrics/figures/`.

In [None]:
# Calibration (reliability) diagram + probability histograms
from sklearn.calibration import calibration_curve

# Output directory for every figure saved by this cell (created on demand).
fig_dir = repo_root.joinpath('results', 'metrics', 'figures')
fig_dir.mkdir(parents=True, exist_ok=True)

# Reliability diagram: one 10-bin calibration curve per evaluated model,
# plus the diagonal that a perfectly calibrated model would follow.
rel_fig, rel_ax = plt.subplots(figsize=(8, 6))
for r in eval_rows:
    frac_positive, mean_predicted = calibration_curve(y_test, r['proba'], n_bins=10)
    rel_ax.plot(
        mean_predicted,
        frac_positive,
        marker='o',
        label=f"{r['model']} (Brier={r['brier']:.3f})",
    )
rel_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
rel_ax.set_xlabel('Mean predicted probability')
rel_ax.set_ylabel('Fraction of positives')
rel_ax.set_title('Calibration plot (reliability diagram)')
rel_ax.legend()
rel_ax.grid(True)
cal_fig = fig_dir / 'calibration_reliability.png'
rel_fig.savefig(cal_fig, dpi=150, bbox_inches='tight')
plt.show()
plt.close(rel_fig)

# Predicted probability histograms: overlaid, semi-transparent, 20 bins per model.
dist_fig, dist_ax = plt.subplots(figsize=(8, 3))
for r in eval_rows:
    dist_ax.hist(r['proba'], bins=20, alpha=0.4, label=r['model'])
dist_ax.legend()
dist_ax.set_title('Predicted probability distribution (test set)')
hist_fig = fig_dir / 'probability_histograms.png'
dist_fig.savefig(hist_fig, dpi=150, bbox_inches='tight')
plt.show()
plt.close(dist_fig)

print('Saved calibration and probability histogram figures to', fig_dir)

In [None]:
# Precision-Recall curves
from sklearn.metrics import precision_recall_curve, average_precision_score

# One PR curve per evaluated model, labelled with its average precision.
pr_figure, pr_ax = plt.subplots(figsize=(8, 6))
for r in eval_rows:
    scores = r['proba']
    precision, recall, _ = precision_recall_curve(y_test, scores)
    ap = average_precision_score(y_test, scores)
    pr_ax.plot(recall, precision, label=f"{r['model']} (AP={ap:.3f})")
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall curves (test set)')
pr_ax.legend()
pr_ax.grid(True)

# fig_dir is the figures directory created by the calibration cell.
pr_fig = fig_dir / 'precision_recall_curves.png'
pr_figure.savefig(pr_fig, dpi=150, bbox_inches='tight')
plt.show()
plt.close(pr_figure)
print('Saved precision-recall figure to', pr_fig)

## Interpretation of Results
This section summarizes the key numeric outcomes and provides concise interpretation and next-step recommendations based on the evaluation above. Run the cell below to see a short report and save it to `results/metrics/interpretation.txt`.

In [None]:
# Simple interpretation report and save to file
out_dir = repo_root / 'results' / 'metrics'
out_dir.mkdir(parents=True, exist_ok=True)

# Heuristic thresholds used by the quick interpretation below.
LOW_AUC = 0.6        # below this: "low discrimination"
MODERATE_AUC = 0.7   # below this (and >= LOW_AUC): "moderate discrimination"
NOISY_BRIER = 0.10   # above this: probability estimates flagged as noisy


def _summary_lines(metrics):
    """Return the per-model numeric summary lines (ROC AUC, Brier, confusion matrix).

    ``metrics`` maps model name -> dict of metric values; missing or ``None``
    values are rendered as ``None`` instead of raising on the float format.
    """
    lines = ['Model summary (roc_auc, brier):']
    for model, row in metrics.items():
        roc = row.get('roc_auc')
        brier = row.get('brier')
        lines.append(f'- {model}: ROC AUC={roc:.3f}' if roc is not None else f'- {model}: ROC AUC=None')
        lines.append(f'  Brier={brier:.4f}' if brier is not None else '  Brier=None')
        cm = row.get('cm')
        if cm is not None:
            lines.append(f'  Confusion matrix: {cm}')
    return lines


def _interpretation_lines(metrics):
    """Return the heuristic interpretation lines for each model in ``metrics``.

    Uses explicit ``None`` checks rather than ``or`` defaults so that a valid
    score of 0.0 is not mistaken for a missing one.
    """
    lines = ['Quick interpretation:']
    for model, row in metrics.items():
        roc = row.get('roc_auc')
        brier = row.get('brier')
        if roc is None:
            lines.append(f'- {model}: ROC AUC unavailable; discrimination cannot be assessed.')
        elif roc < LOW_AUC:
            lines.append(f'- {model}: Low discrimination (ROC AUC={roc:.3f}). Consider richer features or resampling/hyperparameter tuning.')
        elif roc < MODERATE_AUC:
            lines.append(f'- {model}: Moderate discrimination (ROC AUC={roc:.3f}). Consider calibration and threshold tuning for downstream decisions.')
        else:
            lines.append(f'- {model}: Good discrimination (ROC AUC={roc:.3f}). Evaluate calibration and practical thresholds.')
        if brier is not None and brier > NOISY_BRIER:
            lines.append(f'  Brier score {brier:.3f} indicates probability estimates are noisy; recalibration or more data may help.')
    return lines


report_lines = []

# Keep only well-formed entries (model name -> dict of metrics) from metrics_summary,
# which was loaded from metrics_summary.json earlier (empty when the file was missing).
model_metrics = {}
if isinstance(metrics_summary, dict):
    model_metrics = {m: row for m, row in metrics_summary.items() if isinstance(row, dict)}

# Tabular view of the metrics; a failure here should not prevent the text report.
df_metrics = None
if model_metrics:
    try:
        df_metrics = pd.DataFrame.from_dict(model_metrics, orient='index').reset_index().rename(columns={'index':'model'})
    except Exception as err:
        print('Could not build metrics table:', repr(err))

if df_metrics is not None:
    display(df_metrics)

if model_metrics:
    report_lines.extend(_summary_lines(model_metrics))
    report_lines.extend(_interpretation_lines(model_metrics))
else:
    report_lines.append('No metrics available to summarize. Run the evaluation cells first.')

# reference generated figures
fig_dir = out_dir / 'figures'
if fig_dir.exists():
    report_lines.append('\nSaved figures:')
    for p in sorted(fig_dir.glob('*.png')):
        report_lines.append(f'- {p.name}')

# write to file
int_path = out_dir / 'interpretation.txt'
with open(int_path, 'w', encoding='utf-8') as fh:
    fh.write('\n'.join(report_lines))
print('Wrote interpretation to', int_path)
for line in report_lines:
    print(line)
