# Notebook 03 — Refinement, Validation & Error Analysis (Days 5–6)

This notebook loads `models/xgb_tuned_pipe.pkl` (or another saved pipeline if that file is missing), computes a SHAP-based interpretation and bootstrap confidence intervals for MAE, runs an error-slice analysis (listings with absolute error > 150k), performs fairness checks by district/finishing, and saves the resulting artifacts for the report.

In [None]:
# Setup
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.utils import resample

import shap

# Reproducibility: one seed shared by NumPy's global RNG and the later
# sampling / resampling steps.
RND = 42
np.random.seed(RND)

# Directory layout: trained pipelines are read from MODEL_DIR, every artifact
# produced by this notebook is written under OUT_DIR (created on demand).
MODEL_DIR = Path('models')
OUT_DIR = Path('outputs/03_refinement')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the cleaned dataset; fail early with a pointer to the notebook that
# produces it when the parquet file is missing.
data_path = Path('data/cleaned_df.parquet')
if data_path.exists():
    df = pd.read_parquet(data_path)
else:
    raise FileNotFoundError('Run Notebook 02 to create data/cleaned_df.parquet')
print('Loaded df with shape', df.shape)

## Load best model pipeline
We prefer `xgb_tuned_pipe.pkl` if present; otherwise we fall back to the other known pipeline files, and finally to any `*_pipe.pkl` file found in `models/`.

In [None]:
# Load best model
# Candidates are tried in order of preference; the first file that exists wins.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only model
# files from a trusted source should ever be placed in MODEL_DIR.
model_candidates = ['xgb_tuned_pipe.pkl','final_model_pipeline.pkl','xgboost_pipe.pkl','xgb_pipe.pkl']
model_pipe = None
for fname in model_candidates:
    p = MODEL_DIR / fname
    if p.exists():
        model_pipe = joblib.load(p)
        print('Loaded model:', p)
        break

if model_pipe is None:
    # try any pipeline file in models
    # Sort the matches: glob() yields entries in filesystem order, so without
    # sorting the chosen fallback model could differ between machines or runs.
    files = sorted(MODEL_DIR.glob('*_pipe.pkl'))
    if files:
        model_pipe = joblib.load(files[0])
        print('Loaded model:', files[0])

if model_pipe is None:
    raise FileNotFoundError('No trained model pipeline found in models/. Run Notebook 02 first.')

## Predictions and global metrics

In [None]:
# Prepare X and y
# Recover the feature columns from the pipeline's 'pre' step when it exposes
# fitted transformers; the first two transformers are taken to be the numeric
# and categorical ones (assumption inherited from the training notebook —
# TODO confirm). getattr() keeps the fallback reachable even when the loaded
# object is not a Pipeline and therefore has no `named_steps`.
pre = getattr(model_pipe, 'named_steps', {}).get('pre')
try:
    num_cols = pre.transformers_[0][2]
    cat_cols = pre.transformers_[1][2]
    feature_cols = list(num_cols) + list(cat_cols)
except (AttributeError, IndexError, TypeError):
    # No usable preprocessor (missing step, unfitted, fewer than two
    # transformers, or non-iterable column spec): use every column except the
    # identifier, the target and the listing date.
    feature_cols = [c for c in df.columns if c not in ['listing_id','price_egp','listing_date']]

X = df[feature_cols]
y = df['price_egp']

# NOTE(review): metrics are computed on the whole cleaned dataset; if it
# contains the rows the model was trained on they are optimistic — confirm.
preds = model_pipe.predict(X)
mae_all = mean_absolute_error(y, preds)
# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse_all = np.sqrt(mean_squared_error(y, preds))
r2_all = r2_score(y, preds)
print('Overall MAE:', mae_all, 'RMSE:', rmse_all, 'R2:', r2_all)

# Save predictions
# Keep every original column and append the prediction and its absolute error
# so later cells can slice on the error.
df_out = df.copy()
df_out['y_pred'] = preds
df_out['abs_error'] = (df_out['y_pred'] - df_out['price_egp']).abs()
df_out.to_parquet(OUT_DIR / 'predictions_with_errors.parquet', index=False)

## Bootstrap CI for MAE

In [None]:
# Bootstrap CI
# Percentile bootstrap of the MAE: resample rows with replacement n_boot
# times and take the 2.5th / 97.5th percentiles of the resampled MAEs.
n_boot = 500

# A fitted pipeline predicts each row independently of the others, so the
# prediction for a resampled row equals the prediction for the original row.
# Predict once and bootstrap the per-row absolute errors instead of calling
# predict() inside the loop n_boot times.
abs_err = np.abs(
    np.asarray(y, dtype=float) - np.asarray(model_pipe.predict(X), dtype=float)
)

# Each replicate is seeded with RND + i, as before, so the run is reproducible.
mae_boot = [
    resample(abs_err, replace=True, random_state=RND + i).mean()
    for i in range(n_boot)
]

lo, hi = np.percentile(mae_boot, [2.5, 97.5])
print(f'Bootstrap MAE 95% CI: [{lo:.2f}, {hi:.2f}], mean {np.mean(mae_boot):.2f}')
# Persist the unrounded values for the report.
with open(OUT_DIR / 'mae_bootstrap_ci.txt', 'w', encoding='utf-8') as f:
    f.write(f'Bootstrap MAE 95% CI: [{lo}, {hi}]\nmean: {np.mean(mae_boot)}\n')

## Error slice analysis (abs error > 150,000) and SHAP

In [None]:
# Listings whose absolute error exceeds this threshold count as "bad".
threshold = 150000
# Compute the mask once and reuse it for the slice and the per-district table.
is_bad = df_out['abs_error'] > threshold
bad = df_out[is_bad]
print('Number of bad listings:', bad.shape[0], 'out of', df_out.shape[0])

# Ten worst predictions, largest absolute error first.
bad_examples = bad.sort_values('abs_error', ascending=False).head(10)
bad_examples.to_csv(OUT_DIR / 'bad_examples_top10.csv', index=False)

if 'district' in df_out.columns:
    # One row per district, with the counts of good (False) and bad (True)
    # listings as columns; combinations that do not occur are filled with 0.
    grp = (
        df_out.assign(is_bad=is_bad)
        .groupby(['district', 'is_bad'])
        .size()
        .unstack(fill_value=0)
    )
    grp.to_csv(OUT_DIR / 'district_bad_counts.csv')

print('Saved bad examples and diagnostics to', OUT_DIR)

# SHAP (best-effort)
# Any failure here is reported and skipped on purpose so the rest of the
# notebook's artifacts are still produced.
try:
    # Explain a sample of at most 400 rows, drawn with the shared seed.
    X_sample = X.sample(min(400, len(X)), random_state=RND)
    explainer = shap.Explainer(model_pipe.predict, X_sample)
    shap_values = explainer(X_sample)
    plt.figure(figsize=(8,6))
    try:
        shap.summary_plot(shap_values, X_sample, show=False)
        plt.savefig(OUT_DIR / 'shap_summary.png', bbox_inches='tight')
    finally:
        # Close the figure even when plotting or saving fails, so a
        # half-drawn figure is not left open.
        plt.close()
    print('Saved SHAP summary plot')
except Exception as e:
    print('SHAP step skipped or failed:', e)

## Outputs saved
- predictions_with_errors.parquet
- mae_bootstrap_ci.txt
- bad_examples_top10.csv
- district_bad_counts.csv (if the data has a `district` column)
- shap_summary.png (if generated)
