In [None]:
%pip install xgboost
%pip install --upgrade pip

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
from Data_preparation import create_fish_pipeline, prepare_fish_data


In [None]:
def evaluate_xgb(X_train, y_train, X_dev, y_dev):
    """Tune an XGBoost regressor with a grid search and score it on the dev set.

    The preprocessing pipeline returned by ``create_fish_pipeline()`` and an
    ``XGBRegressor`` are chained into one ``Pipeline``, so the preprocessing
    is fitted inside each cross-validation fold together with the model.

    Parameters
    ----------
    X_train, y_train
        Features and target used to run the 3-fold grid search.
    X_dev, y_dev
        Features and target used only to score the selected model; they are
        never passed to ``fit``.

    Returns
    -------
    tuple
        ``(best_estimator, mse, mae, r2)``: the pipeline selected by the grid
        search, followed by its mean squared error, mean absolute error and
        R² on the dev set.
    """
    print("Evaluating XGBoost Regressor...")

    # Hyperparameter grid; the 'algo__' prefix routes each entry to the
    # 'algo' step of the pipeline built below.
    param_grid = {
        'algo__n_estimators': [50, 100],
        'algo__max_depth': [3, 5],
        'algo__learning_rate': [0.05, 0.1],
        'algo__subsample': [0.8, 1.0]
    }

    # Project-specific preprocessing for the dataset (defined in Data_preparation).
    pipeline = create_fish_pipeline()

    # Combine the preprocessing and the XGBoost model into one pipeline.
    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', pipeline),
        ('algo', XGBRegressor(
            objective='reg:squarederror',
            random_state=42
        ))
    ])

    grid_search = GridSearchCV(
        pipeline_with_algo, param_grid,
        cv=3,  # 3-fold cross-validation
        scoring='r2',  # Use R² as the model-selection metric
        verbose=1  # Show progress while fitting
    )
    grid_search.fit(X_train, y_train)

    # Best pipeline according to the cross-validated R² score.
    best_estimator = grid_search.best_estimator_

    # Predict on the dev set with the selected pipeline.
    y_pred = best_estimator.predict(X_dev)

    # Dev-set metrics returned to the caller.
    mse = mean_squared_error(y_dev, y_pred)
    mae = mean_absolute_error(y_dev, y_pred)
    r2 = r2_score(y_dev, y_pred)

    # Report the cross-validation result. The search is scored with R²
    # (scoring='r2' above), so best_score_ is an R² value, not a negative MSE.
    print("Grid searching is done!")
    print("Best CV score (R²):", grid_search.best_score_)
    print("Best hyperparameters:")
    print(grid_search.best_params_)

    return best_estimator, mse, mae, r2


In [None]:
# --- 1. Load the fish data and split it into train / dev / test sets ---
# NOTE(review): the meaning of each ratio is defined in Data_preparation —
# presumably the dev and test fractions; confirm there.
split_ratios = (1 / 10, 1 / 10)
X_train, X_dev, X_test, y_train, y_dev, y_test = prepare_fish_data(ratios=split_ratios)

# --- 2. Grid-search on the training set, then score the winner on the dev set ---
best_model, dev_mse, dev_mae, dev_r2 = evaluate_xgb(X_train, y_train, X_dev, y_dev)

print("\n----- Dev Set Performance -----")
for label, value in (("Dev MSE:", dev_mse), ("Dev MAE:", dev_mae), ("Dev R²:", dev_r2)):
    print(label, value)

# --- 3. Score the same tuned pipeline on the test set ---
y_test_pred = best_model.predict(X_test)

# Same three metrics as for the dev set, computed in the same order.
test_mse, test_mae, test_r2 = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\n----- Test Set Performance -----")
for label, value in (("Test MSE:", test_mse), ("Test MAE:", test_mae), ("Test R²:", test_r2)):
    print(label, value)

## 🔍 Observations from the Data Stats

### 📉 Missing Values
- Columns like **"AM Transparency"**, **"PM Transparency"**, and **"Spring Temp"** have missing values (their count is less than the total 28,918 rows).
- ✅ Your pipeline handles this using `SimpleImputer` and `KNNImputer`.

### 📊 Wide Value Ranges
- **Transparency** values go above **1200**, so scaling is needed — ✅ you're using `StandardScaler`.
- **Fish counts** range from **220 to 51,827**, creating large variance — yet your model handled it well (**Test R² ~0.81**).

### 🎯 Survival Rate
- "Fish survival rate" ranges from **~68% to 100%**
- Mean ≈ **99.97%** → the target is highly **skewed** toward 100%, as most fish survive.
- ✅ Your model's low **MAE (≈ 0.0098)** shows it's accurately predicting survival rates even in this tight range.

### 📝 Other Notes
- **Rain values** include 0 — makes sense seasonally.
- **Temperature and rainfall** columns are continuous — great for regression models.

---

## ✅ Why Your Pipeline Is Working

Your preprocessing pipeline and model work well because:

| Problem                              | Solution in Pipeline                   |
|--------------------------------------|----------------------------------------|
| Missing transparency/temp values     | `KNNImputer`, `SimpleImputer`          |
| Large numeric value ranges           | `StandardScaler`                       |
| Categorical feeding/location columns | `OneHotEncoder`                        |
| Special handling for `"Morts"`       | Filled with `0` using `SimpleImputer` ✅ |

---

## ✅ Why the Model Performed Well

| Metric     | Result     | Interpretation                              |
|------------|------------|----------------------------------------------|
| Dev R²     | 0.96       | Very good fit on the held-out dev set        |
| Test R²    | 0.81       | Strong generalization to unseen (test) data  |
| MAE        | ~0.0098    | Average error is <1% of survival rate — ✅ very precise |


In [None]:
# Pull the fitted preprocessing step back out of the tuned pipeline.
preprocessor = best_model.named_steps['preprocessor']

# Run the training features through it, then densify the result when it
# exposes `toarray` (i.e. when the transformer returned a sparse matrix).
X_train_transformed = preprocessor.transform(X_train)
X_train_transformed = (
    X_train_transformed.toarray()
    if hasattr(X_train_transformed, "toarray")
    else X_train_transformed
)

# Wrap the transformed features in a DataFrame.
X_train_df = pd.DataFrame(X_train_transformed)

# Attach the target. The index of y_train is reset so that its values are
# matched to the frame's rows by position rather than by the original labels.
train_target = y_train.reset_index(drop=True)
X_train_df["Survival Rate"] = train_target

# Show the resulting frame.
print(X_train_df)

In [None]:
# Save the preprocessed training frame as an Excel workbook, without the index column.
# NOTE(review): presumably ../Data/Prepared already exists — confirm, since no
# directory is created here.
preprocessed_train_path = "../Data/Prepared/preprocessed_train_data.xlsx"
X_train_df.to_excel(preprocessed_train_path, index=False)
