In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
from joblib import dump


In [9]:
import sys
import os
# Put the parent of the current working directory on the import path so that
# `data_preparation` can be imported below (presumably the package lives one
# directory above this notebook -- confirm). The result depends on the
# kernel's working directory at the moment this cell runs.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Project helpers: one builds the preprocessing pipeline, the other returns
# the train/dev/test splits used by the cells below.
from data_preparation.fish_survival_data_preparation import create_fish_pipeline, prepare_fish_data

In [10]:
def evaluate_xgb(X_train, y_train, X_dev, y_dev):
    """Grid-search an XGBoost regression pipeline and score the winner on the dev split.

    The preprocessing pipeline returned by ``create_fish_pipeline`` is chained
    with an ``XGBRegressor`` and tuned with 5-fold cross-validation. Candidates
    are ranked by negative mean squared error (not R²).

    Parameters:
        X_train, y_train: features and targets handed to ``GridSearchCV.fit``.
        X_dev, y_dev: held-out features and targets used for the returned metrics.

    Returns:
        tuple: ``(best_estimator, rmse, mape, r2)`` where the three metrics are
        computed from the best estimator's predictions on ``X_dev``.
    """
    print("Evaluating XGBoost Regressor...")

    # Preprocessing and regressor live in a single estimator so the grid
    # search fits them together.
    regressor = XGBRegressor(objective="reg:squarederror", random_state=42)
    full_pipeline = Pipeline([
        ("preprocessor", create_fish_pipeline()),
        ("algo", regressor),
    ])

    # The "algo__" prefix routes each hyperparameter to the regressor step.
    search_space = {
        "algo__n_estimators": [1000],
        "algo__max_depth": [2, 3, 4],
        "algo__learning_rate": [0.01, 0.05, 0.1],
        "algo__subsample": [0.8, 1.0],
    }

    grid_search = GridSearchCV(
        estimator=full_pipeline,
        param_grid=search_space,
        cv=5,
        scoring="neg_mean_squared_error",
        verbose=1,  # progress output while fitting
    )
    grid_search.fit(X_train, y_train)

    # Winning pipeline according to the cross-validated negative MSE.
    best_estimator = grid_search.best_estimator_

    # Feature-importance report. Best effort: any failure is printed and the
    # evaluation carries on.
    try:
        fitted_steps = best_estimator.named_steps
        booster = fitted_steps["algo"]
        transformer = fitted_steps["preprocessor"]
        ranking = pd.DataFrame(
            {
                "Feature": transformer.get_feature_names_out(),
                "Importance": booster.feature_importances_,
            }
        )
        ranking = ranking.sort_values(by="Importance", ascending=False)

        print("\nTop 10 Most Important Features:")
        print(ranking.head(10))
    except Exception as err:
        print("Could not extract feature importances:", err)

    # Dev-set metrics for the selected pipeline.
    dev_predictions = best_estimator.predict(X_dev)
    dev_rmse = np.sqrt(mean_squared_error(y_dev, dev_predictions))
    dev_mape = mean_absolute_percentage_error(y_dev, dev_predictions)
    dev_r2 = r2_score(y_dev, dev_predictions)

    # Cross-validation summary: best mean score and the parameters behind it.
    print("Grid searching is done!")
    print("Best score (neg MSE):", grid_search.best_score_)
    print("Best hyperparameters:")
    print(grid_search.best_params_)

    return best_estimator, dev_rmse, dev_mape, dev_r2


In [11]:
# Step 2: Define a reusable function to evaluate metrics
def evaluate_metrics(y_true, y_pred, label):
    """Print and return RMSE, MAPE and R² for one data split.

    Parameters:
        y_true: observed target values.
        y_pred: predicted values aligned with ``y_true``.
        label: split name used in the printed header (e.g. "Train").

    Returns:
        tuple: ``(rmse, mape, r2)``.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    abs_pct_error = mean_absolute_percentage_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)
    target_mean = np.mean(y_true)

    print(f"\n📊 {label} Set Performance:")
    # The target mean gives a scale against which to read the RMSE.
    print(f"Mean of y_{label.lower()}: {target_mean:.4f}")
    for metric_name, metric_value in (
        ("RMSE", root_mse),
        ("MAPE", abs_pct_error),
        ("R²", r_squared),
    ):
        print(f"{metric_name}: {metric_value:.4f}")

    return root_mse, abs_pct_error, r_squared

In [12]:
def main():
    """Train, evaluate and persist the fish survival model.

    Steps:
        1. Load the train/dev/test splits via ``prepare_fish_data``.
        2. Tune the XGBoost pipeline with ``evaluate_xgb``.
        3. Print split shapes and per-split metrics via ``evaluate_metrics``.
        4. Save the fitted pipeline to both model directories, creating the
           directories first so a missing folder cannot discard the result of
           the (long) grid search.
    """
    print("\n🚀 Evaluating model for: fish survival model)")
    # The two ratios are presumably the dev and test fractions -- confirm
    # against prepare_fish_data.
    X_train, X_dev, X_test, y_train, y_dev, y_test = prepare_fish_data(ratios=((1/10), (1/10)))

    # The dev metrics returned here are discarded; they are recomputed below
    # together with the train and test metrics.
    best_model, _, _, _ = evaluate_xgb(X_train, y_train, X_dev, y_dev)

    print("✅ Data Split Shapes:")
    for split_name, split in (
        ("X_train", X_train),
        ("X_dev", X_dev),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_dev", y_dev),
        ("y_test", y_test),
    ):
        print(f"  {split_name}:", split.shape)

    for label, features, targets in (
        ("Train", X_train, y_train),
        ("Dev", X_dev, y_dev),
        ("Test", X_test, y_test),
    ):
        evaluate_metrics(targets, best_model.predict(features), label)

    # ✅ Save model
    # The file name (including the "survial" spelling) is kept unchanged
    # because the loading cell in this notebook reads the same path.
    model_paths = (
        "../models/fish_survial_model.joblib",
        "../../app/models/fish_survial_model.joblib",
    )
    for model_path in model_paths:
        # dump() does not create parent directories.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        dump(best_model, model_path)

    print("✅ Model saved as: models/fish_survial_model.joblib")

# Entry point: run the full train / evaluate / save workflow when this code is
# executed as the main module.
if __name__ == "__main__":
    main()


🚀 Evaluating model for: fish survival model)
Evaluating XGBoost Regressor...
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Top 10 Most Important Features:
                          Feature  Importance
1               num__Max air temp    0.151554
17  transparency__PM Transparency    0.138291
0            num__Spring Temp (F)    0.103562
5                     num__# fish    0.081378
12          num__Dec Rain (Lag 1)    0.074135
16  transparency__AM Transparency    0.069202
6         num__Spring_Temp x Rain    0.051962
8           num__Dec Rain (Lag 3)    0.047732
19             cat__Season_Spring    0.044108
15     num__Calmar Rain 7-day avg    0.034960
Grid searching is done!
Best score (neg MSE): -0.06556099133870939
Best hyperparameters:
{'algo__learning_rate': 0.1, 'algo__max_depth': 2, 'algo__n_estimators': 1000, 'algo__subsample': 1.0}
✅ Data Split Shapes:
  X_train: (21400, 19)
  X_dev: (2674, 19)
  X_test: (2674, 19)
  y_train: (21400,)
  y_dev: (2674,)
  y_test

In [7]:
from joblib import load

# Reload the pipeline from the first path that main() writes to.
model = load("../models/fish_survial_model.joblib")
# List the feature names produced by the pipeline's "preprocessor" step.
print(model.named_steps["preprocessor"].get_feature_names_out())

['num__Spring Temp (F)' 'num__Max air temp' 'num__Min air temp'
 'num__Dec Rain' 'num__Calmar Rain' 'num__# fish'
 'num__Spring_Temp x Rain' 'num__Max Air Temp x Rain'
 'num__Dec Rain (Lag 3)' 'num__Calmar Rain (Lag 3)'
 'num__Dec Rain (Lag 2)' 'num__Calmar Rain (Lag 2)'
 'num__Dec Rain (Lag 1)' 'num__Calmar Rain (Lag 1)'
 'num__Dec Rain 7-day avg' 'num__Calmar Rain 7-day avg'
 'transparency__AM Transparency' 'transparency__PM Transparency'
 'cat__Season_Fall' 'cat__Season_Spring' 'cat__Season_Summer'
 'cat__Season_Winter']
