# 🧩 04 — Model Optimization

Here we tune:
- RandomForest
- XGBoost

Both models are tuned with `RandomizedSearchCV`, scored by PR-AUC (scikit-learn's `average_precision` scorer).

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

from src.data_prep import load_raw_data, train_val_test_split, build_preprocessor
from src.models import get_rf_model, get_xgb_model
from src.utils import evaluate_probabilities

In [None]:
# Load the raw dataset and split it into train / validation / test parts.
# NOTE(review): the split proportions and any stratification are decided
# inside `train_val_test_split` — confirm in src.data_prep.
df = load_raw_data()
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_split(df)

# Preprocessing step shared by the model pipelines built in the cells below.
preprocessor = build_preprocessor()

In [None]:
# --- Random forest: randomized hyperparameter search -----------------------
# Preprocessing and the estimator live in one pipeline, so the search tunes
# and cross-validates them as a single unit.
rf = get_rf_model()
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", rf)])

# Shuffled, stratified 5-fold splitter with a fixed seed; the XGBoost cell
# reuses this same `cv` object.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate values per hyperparameter. The "model__" prefix addresses the
# pipeline step named "model". The grid holds 3 * 4 * 3 = 36 combinations,
# of which the search samples `n_iter` below.
param_dist = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [3, 5, 7, None],
    "model__min_samples_split": [2, 5, 10],
}

search_rf = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=10,
    scoring="average_precision",  # PR-AUC
    cv=cv,
    random_state=42,  # makes the sampled candidates reproducible
    n_jobs=-1,  # use all available cores
    verbose=1,
)

# Only the training split is used for tuning.
search_rf.fit(X_train, y_train)

# Last expression of the cell: the notebook displays this tuple.
(search_rf.best_params_, search_rf.best_score_)

In [None]:
# --- XGBoost: randomized hyperparameter search -----------------------------
# Same setup as the random-forest cell: preprocessing and estimator are
# wrapped in one pipeline. Note that `pipe` is rebound to the XGBoost
# pipeline here.
xgb = get_xgb_model()
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", xgb)])

# Candidate values per hyperparameter. The "model__" prefix addresses the
# pipeline step named "model". The grid holds 3 * 4 * 3 = 36 combinations,
# of which the search samples `n_iter` below.
param_dist_xgb = {
    "model__n_estimators": [200, 300, 400],
    "model__max_depth": [3, 4, 5, 7],
    "model__learning_rate": [0.01, 0.05, 0.1],
}

search_xgb = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist_xgb,
    n_iter=10,
    scoring="average_precision",  # PR-AUC
    cv=cv,  # splitter defined in the random-forest cell
    random_state=42,  # makes the sampled candidates reproducible
    n_jobs=-1,  # use all available cores
    verbose=1,
)

# Only the training split is used for tuning.
search_xgb.fit(X_train, y_train)

# Last expression of the cell: the notebook displays this tuple.
(search_xgb.best_params_, search_xgb.best_score_)