# Notebook 02 — Feature Engineering & Model Development (Days 3–4)

This notebook reads `data/cleaned_df_stage1.parquet`, performs feature engineering, constructs robust preprocessing pipelines, trains baseline models (Linear, RandomForest, XGBoost) with CV, performs hyperparameter tuning on XGBoost, and saves model artifacts.

In [None]:
# Imports and setup
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.base import clone
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb

# Single seed shared by numpy and every estimator/splitter below.
RND = 42
np.random.seed(RND)

# Artifact locations: evaluation tables/metrics and serialized models.
OUT_DIR = Path('outputs/02_models')
MODEL_DIR = Path('models')
for artifact_dir in (OUT_DIR, MODEL_DIR):
    artifact_dir.mkdir(parents=True, exist_ok=True)

# Read the stage-1 cleaned dataset; fail early with a clear message
# if the parquet file is not on disk.
stage1 = Path('data/cleaned_df_stage1.parquet')
if not stage1.exists():
    raise FileNotFoundError('Run Notebook 01 first or upload cleaned_df_stage1.parquet')
df = pd.read_parquet(stage1)
print('Loaded df shape:', df.shape)

## Feature engineering
- Create binary flags and proximity score
- Prepare candidate feature list

In [None]:
# Binary flags derived from raw columns. Each flag is only created when its
# source column is present, so the cell works on partial schemas.

# in_compound: 1 when a compound name is recorded, 0 otherwise.
if 'compound_name' in df.columns:
    df['in_compound'] = df['compound_name'].notnull().astype(int)
# premium_view: 1 when the view type is one of the listed premium categories.
if 'view_type' in df.columns:
    df['premium_view'] = df['view_type'].isin(['Nile','Garden','Compound']).astype(int)
# seller_is_broker: 1 when the seller type is exactly 'Broker'.
if 'seller_type' in df.columns:
    df['seller_is_broker'] = (df['seller_type'] == 'Broker').astype(int)

# proximity_score: row-wise mean of the available distance columns (km).
# Note that a larger value means the listing is farther away on average.
dist_cols = [c for c in ['distance_to_auc_km','distance_to_mall_km','distance_to_metro_km'] if c in df.columns]
if dist_cols:
    df['proximity_score'] = df[dist_cols].mean(axis=1)
else:
    # Use a float NaN (not pd.NA) so the column keeps a numeric dtype and is
    # routed to the numeric branch of the preprocessing pipeline instead of
    # being treated as an object/categorical column.
    df['proximity_score'] = np.nan

# Candidate features (keep only existing)
candidate_features = [
    'area_sqm','bedrooms','bathrooms','floor_number','building_age_years',
    'in_compound','premium_view','has_parking','has_security','has_amenities',
    'proximity_score','is_negotiable','seller_is_broker','price_per_sqm','listing_month'
]

# Columns excluded from modelling because they would leak the target.
# NOTE(review): price_per_sqm is presumably price_egp / area_sqm (confirm
# against Notebook 01). Combined with area_sqm it would let a model
# reconstruct price_egp, and it is unknown at prediction time.
leaky_features = ['price_per_sqm']

feature_cols = [
    c for c in candidate_features
    if c in df.columns and c not in leaky_features
]
print('Using features:', feature_cols)

# Drop rows with missing target
if 'price_egp' not in df.columns:
    raise KeyError('price_egp target missing')
df = df[~df['price_egp'].isnull()].copy()

## Temporal train-test split
We sort by `listing_date` (if available) and take the latest 20% of listings as the test set to simulate production. If `listing_date` is missing, we fall back to a seeded random 80/20 split.

In [None]:
# Hold out the most recent 20% of listings as the test set.
if 'listing_date' not in df.columns:
    # No listing date available: fall back to a seeded random 80/20 split.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=RND)
else:
    # Order chronologically, then cut at the 80% row position.
    df = df.sort_values('listing_date').reset_index(drop=True)
    cutoff = int(len(df)*0.8)
    train_df, test_df = df.iloc[:cutoff].copy(), df.iloc[cutoff:].copy()

# Feature matrices and target vectors for both partitions.
X_train, y_train = train_df[feature_cols], train_df['price_egp']
X_test, y_test = test_df[feature_cols], test_df['price_egp']
print('Train/test sizes:', X_train.shape, X_test.shape)

## Preprocessing pipeline
- Numeric: median imputation + RobustScaler
- Categorical: constant impute + OneHotEncoder


In [None]:
# Split the selected features by dtype: numeric columns go through median
# imputation + robust scaling; every other selected column is treated as
# categorical and one-hot encoded.
num_features = X_train.select_dtypes(include=['number']).columns.tolist()
cat_features = [c for c in feature_cols if c not in num_features]

num_pipeline = make_pipeline(SimpleImputer(strategy='median'), RobustScaler())

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the current name
# first and fall back to the legacy one so the cell runs on either version.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'), onehot)

# Unfitted transformer reused as the first step of every model pipeline below.
preprocessor = ColumnTransformer(
    transformers=[('num', num_pipeline, num_features), ('cat', cat_pipeline, cat_features)],
    remainder='drop',
    n_jobs=-1,
)

# Persist a copy fitted on the training features so the saved artifact can be
# loaded and used to transform new data directly. `preprocessor` itself is
# left unfitted; the pipelines fit their own copy during training/CV.
fitted_preprocessor = clone(preprocessor).fit(X_train)
joblib.dump(fitted_preprocessor, MODEL_DIR / 'preprocessor_stage1.joblib')
print('Preprocessor saved')

## Baseline models with cross-validation
We compare Linear Regression, RandomForest, and XGBoost using 5-fold CV, scored by mean absolute error (scikit-learn's `neg_mean_absolute_error`, sign-flipped for reporting).

In [None]:
# Baseline estimators, all seeded with RND where they accept a seed.
models = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=RND, n_jobs=-1),
    'XGBoost': xgb.XGBRegressor(random_state=RND, n_jobs=-1, objective='reg:squarederror')
}

# NOTE(review): folds are shuffled, so when the training rows are ordered by
# listing date each fold mixes earlier and later listings. Confirm this is
# intended given the temporal hold-out used for the final test set.
cv = KFold(n_splits=5, shuffle=True, random_state=RND)

results = []
for name, estimator in models.items():
    # Preprocessing lives inside the pipeline so it is re-fit on each fold.
    pipe = Pipeline(steps=[('pre', preprocessor), ('model', estimator)])
    neg_mae = cross_val_score(
        pipe, X_train, y_train,
        scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1,
    )
    # The scorer returns negated MAE; flip the sign of the mean for reporting.
    results.append({
        'model': name,
        'cv_MAE_mean': -neg_mae.mean(),
        'cv_MAE_std': neg_mae.std(),
    })

cv_results = pd.DataFrame(results)
cv_results.to_csv(OUT_DIR / 'baseline_model_cv_results.csv', index=False)
print(cv_results)

## Fit on train and evaluate on temporal test set

In [None]:
# Fit each baseline on the full training set, score it on the held-out test
# set, and persist the fitted pipeline.
evals = []
for name, estimator in models.items():
    pipe = Pipeline([('pre', preprocessor), ('model', estimator)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`: that
    # argument was deprecated in scikit-learn 1.4 and removed in 1.6, whereas
    # this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)
    evals.append({'model': name, 'MAE': mae, 'RMSE': rmse, 'R2': r2})
    joblib.dump(pipe, MODEL_DIR / f'{name.lower()}_pipe.pkl')

eval_results = pd.DataFrame(evals)
eval_results.to_csv(OUT_DIR / 'test_eval_results.csv', index=False)
print(eval_results)

## Hyperparameter tuning for XGBoost (RandomizedSearchCV)
Tune a compact grid and save the best pipeline.

In [None]:
# Randomized search over XGBoost hyperparameters. The search wraps the full
# pipeline, so parameter names carry the `model__` step prefix.
xgb_pipe = Pipeline([('pre', preprocessor), ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=RND, n_jobs=-1))])
param_dist = {
    'model__n_estimators': [100,200,400],
    'model__max_depth': [3,6,9],
    'model__learning_rate': [0.01,0.03,0.05,0.1],
    'model__subsample': [0.6,0.8,1.0],
    'model__colsample_bytree': [0.6,0.8,1.0]
}
search = RandomizedSearchCV(xgb_pipe, param_dist, n_iter=20, scoring='neg_mean_absolute_error', cv=3, random_state=RND, n_jobs=-1)
search.fit(X_train, y_train)
print('Best params:', search.best_params_)

# best_estimator_ is the winning configuration refit on all of X_train.
best_pipe = search.best_estimator_
joblib.dump(best_pipe, MODEL_DIR / 'xgb_tuned_pipe.pkl')

# Evaluate tuned model on the held-out test set.
preds = best_pipe.predict(X_test)
mae = mean_absolute_error(y_test, preds)
# sqrt(MSE) instead of `squared=False`, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
r2 = r2_score(y_test, preds)
print('Tuned XGB on test - MAE:', mae, 'RMSE:', rmse, 'R2:', r2)

with open(OUT_DIR / 'final_metrics.txt', 'w') as f:
    f.write(f"Tuned XGB test MAE: {mae}\nRMSE: {rmse}\nR2: {r2}\n")

# Save full cleaned dataset (including engineered columns) for next notebook
df.to_parquet('data/cleaned_df.parquet', index=False)
print('Saved data/cleaned_df.parquet')

### Notes
- Depending on dataset size, RandomizedSearchCV may take time. Adjust `n_iter` as needed.
- If XGBoost is not installed, install it into the kernel's environment with `%pip install xgboost`.