# Model Training
This notebook runs the training script which fits several models (Logistic Regression, Random Forest, XGBoost, and an optional Neural Network MLP), calibrates probabilities, and saves metrics and models under `results/metrics/`.

In [None]:
import sys
import json
from pathlib import Path
# Report the interpreter and the resolved working directory of this kernel.
interpreter_path = sys.executable
working_dir = Path('.').resolve()
print('Python executable:', interpreter_path)
print('Working directory:', working_dir)

## Run training script
Run the training script from the repository root. This will read the latest processed CSV in `data/processed/`, train models (including the added MLP neural network), calibrate, and save metrics and joblib models to `results/metrics/`.

In [None]:
# Run the training script using the current Python interpreter from the repository root
import subprocess, sys
from pathlib import Path
def find_repo_root(start=None, markers=('setup.py','requirements.txt','README.md'), max_depth=10):
    """Locate the repository root by walking upward from ``start``.

    Args:
        start: Directory to begin the search from (``str`` or ``Path``).
            ``None`` means the current working directory *at call time*.
        markers: File names whose presence identifies the repository root.
        max_depth: Maximum number of directories to inspect, counting
            ``start`` itself.

    Returns:
        The first directory (``start`` or one of its ancestors) that contains
        any of ``markers``. If none is found within ``max_depth`` directories,
        or the filesystem root is reached, the resolved ``start`` is returned.
    """
    # NOTE(review): this helper is defined in several cells of this notebook;
    # keep the copies in sync or move it into a shared module.
    # Resolve the default here rather than in the signature: a default of
    # `Path.cwd()` is evaluated once, when the `def` statement runs, and would
    # ignore any later change of working directory.
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    cur = origin
    for _ in range(max_depth):
        if any((cur / m).exists() for m in markers):
            return cur
        if cur.parent == cur:
            # Reached the filesystem root; nothing further to inspect.
            break
        cur = cur.parent
    return origin
# Locate the repository root, then run the training script there with the same
# interpreter that backs this kernel.
repo_root = find_repo_root()
script_rel = 'scripts/train_models.py'
# find_repo_root() falls back to the starting directory when no marker file is
# found, so confirm the script really exists before launching it.
if not (repo_root / script_rel).is_file():
    raise FileNotFoundError(f'{script_rel} not found under {repo_root}')
print('Running training script from repo root:', repo_root)
# Output is captured, so it is printed only after the script has finished.
proc = subprocess.run([sys.executable, script_rel], cwd=str(repo_root), capture_output=True, text=True)
print('train_models.py exit code:', proc.returncode)
print('--- STDOUT ---')
print(proc.stdout)
print('--- STDERR ---')
print(proc.stderr)
# Fail the cell on a non-zero exit code (after the logs above are shown) so a
# failed run is not followed by cells that read results from an earlier run.
proc.check_returncode()

## Load and inspect saved metrics
Load the JSON summary that the script outputs to `results/metrics/metrics_summary.json`.

In [9]:
from pathlib import Path
import json, pandas as pd

def find_repo_root(start=None, markers=('setup.py','requirements.txt','README.md'), max_depth=10):
    """Locate the repository root by walking upward from ``start``.

    Args:
        start: Directory to begin the search from (``str`` or ``Path``).
            ``None`` means the current working directory *at call time*.
        markers: File names whose presence identifies the repository root.
        max_depth: Maximum number of directories to inspect, counting
            ``start`` itself.

    Returns:
        The first directory (``start`` or one of its ancestors) that contains
        any of ``markers``. If none is found within ``max_depth`` directories,
        or the filesystem root is reached, the resolved ``start`` is returned.
    """
    # NOTE(review): this helper is defined in several cells of this notebook;
    # keep the copies in sync or move it into a shared module.
    # Resolve the default here rather than in the signature: a default of
    # `Path.cwd()` is evaluated once, when the `def` statement runs, and would
    # ignore any later change of working directory.
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    cur = origin
    for _ in range(max_depth):
        if any((cur / m).exists() for m in markers):
            return cur
        if cur.parent == cur:
            # Reached the filesystem root; nothing further to inspect.
            break
        cur = cur.parent
    return origin

# Read the metrics summary JSON and tabulate ROC AUC and Brier score per entry.
repo_root = find_repo_root()
metrics_path = repo_root.joinpath('results', 'metrics', 'metrics_summary.json')

if not metrics_path.exists():
    print('metrics_summary.json not found at', metrics_path)
else:
    with open(metrics_path, 'r') as fh:
        metrics = json.load(fh)
    # One row per top-level key; a score missing from an entry shows up as None.
    rows = [
        {
            'model': model_name,
            'roc_auc': scores.get('roc_auc'),
            'brier_score': scores.get('brier_score'),
        }
        for model_name, scores in metrics.items()
    ]
    display(pd.DataFrame(rows))

Unnamed: 0,model,roc_auc,brier_score
0,logistic,0.738323,0.089516
1,random_forest,0.638377,0.097723
2,xgboost,0.657066,0.096564
3,neural_network,0.5,0.100158


## Load a saved model example
You can load any of the saved calibrated models (joblib files) from `results/metrics/`. The example below loads the calibrated neural-network model if it is present.

In [11]:
import joblib
from pathlib import Path

def find_repo_root(start=None, markers=('setup.py','requirements.txt','README.md'), max_depth=10):
    """Locate the repository root by walking upward from ``start``.

    Args:
        start: Directory to begin the search from (``str`` or ``Path``).
            ``None`` means the current working directory *at call time*.
        markers: File names whose presence identifies the repository root.
        max_depth: Maximum number of directories to inspect, counting
            ``start`` itself.

    Returns:
        The first directory (``start`` or one of its ancestors) that contains
        any of ``markers``. If none is found within ``max_depth`` directories,
        or the filesystem root is reached, the resolved ``start`` is returned.
    """
    # NOTE(review): this helper is defined in several cells of this notebook;
    # keep the copies in sync or move it into a shared module.
    # Resolve the default here rather than in the signature: a default of
    # `Path.cwd()` is evaluated once, when the `def` statement runs, and would
    # ignore any later change of working directory.
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    cur = origin
    for _ in range(max_depth):
        if any((cur / m).exists() for m in markers):
            return cur
        if cur.parent == cur:
            # Reached the filesystem root; nothing further to inspect.
            break
        cur = cur.parent
    return origin

# Load the calibrated neural-network model from results/metrics/ if it exists.
repo_root = find_repo_root()
mdl = repo_root.joinpath('results', 'metrics', 'model_neural_network_calibrated.joblib')
if not mdl.exists():
    print(mdl, 'not found')
else:
    # NOTE(review): joblib files are pickle-based; only load artifacts that
    # come from a trusted source such as your own training run.
    model = joblib.load(mdl)
    print('Loaded model type:', type(model))
    # To predict, prepare features like in the training script and call model.predict_proba(X)[:,1]

Loaded model type: <class 'sklearn.calibration.CalibratedClassifierCV'>
