# In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# === Train model using grid search with defined hyperparameters ===
def train_model_with_grid(model_name, X_train, y_train):
    """Tune one regressor with a cross-validated grid search and return it.

    Args:
        model_name: Short model code. Supported values are "RF"
            (RandomForestRegressor), "GBRT" (HistGradientBoostingRegressor),
            "XGB" (XGBRegressor) and "LR" (LinearRegression).
        X_train: Training features, passed to ``GridSearchCV.fit``.
        y_train: Training target, passed to ``GridSearchCV.fit``.

    Returns:
        The ``best_estimator_`` of the fitted grid search, selected by
        negative mean squared error.

    Raises:
        ValueError: If ``model_name`` is not one of the supported codes, or
            if fewer than two training samples are supplied.
    """
    if model_name == "RF":
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
        base_model = RandomForestRegressor(random_state=42)
    elif model_name == "GBRT":
        param_grid = {
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5, 10],
            'min_samples_leaf': [1, 2],
            'max_iter': [100, 200]
        }
        base_model = HistGradientBoostingRegressor(random_state=42)
    elif model_name == "XGB":
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 6],
            'learning_rate': [0.05, 0.1]
        }
        base_model = XGBRegressor(random_state=42, verbosity=0)
    elif model_name == "LR":
        param_grid = {
            'fit_intercept': [True, False],
            'positive': [False]
        }
        base_model = LinearRegression()
    else:
        # Name the offending value: this branch is reached for any unknown
        # code (e.g. a typo), not only for polynomial regression.
        raise ValueError(
            f"Unsupported model type {model_name!r}; expected one of "
            "'RF', 'GBRT', 'XGB', 'LR'. "
            "Polynomial Regression is handled separately."
        )

    n_samples = len(X_train)
    if n_samples < 2:
        raise ValueError(
            f"Grid search for {model_name!r} needs at least 2 training "
            f"samples, got {n_samples}."
        )

    # K-fold CV cannot use more folds than samples; cap the default of 5 so
    # small training sets are still tuned instead of failing inside fit().
    n_splits = min(5, n_samples)

    grid = GridSearchCV(
        base_model,
        param_grid,
        cv=n_splits,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    return grid.best_estimator_

# === File paths ===
# Everything lives under one project root; inputs are split between the
# model-selection folder and the shared data folder.
base_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns"
_data_dir = os.path.join(base_path, "data")

models_path = os.path.join(
    base_path, "murilo_salmon", "julia_models", "best_models_per_river.csv"
)
training_data_path = os.path.join(_data_dir, "Combined_FeatureSet_For_Model.csv")
prediction_data_path = os.path.join(_data_dir, "Samples_2024.csv")

# === Load data ===
# Read in a fixed order: model selection table, training set, prediction set.
best_models_df, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (models_path, training_data_path, prediction_data_path)
)

# === Result storage ===
# One dict per prediction is appended here by the river loop.
results = list()

# === Loop through each river-model pair ===
# Each row of best_models_df supplies a river name, a model code (the text
# before " - " in the "Model" column) and a ", "-separated feature list.
# NOTE(review): neither df_train nor df_test is filtered by river below, so
# every model is fit on all training rows and scored on all prediction rows
# -- confirm this is intended.
for _, row in best_models_df.iterrows():
    river_name = row["River_Name"]
    model_type = row["Model"].split(" - ")[0]
    features = row["Feature_Names"].split(", ")

    # Drop missing values (only in the columns this model actually uses)
    df_train_clean = df_train.dropna(subset=features + ["Total_Returns_NextYear"])
    df_test_clean = df_test.dropna(subset=features)

    if df_train_clean.empty or df_test_clean.empty:
        # Report the skip so a river missing from the output can be traced.
        print(
            f"Skipping {river_name} ({model_type}): no complete rows "
            f"(train={len(df_train_clean)}, test={len(df_test_clean)})."
        )
        continue

    # Prepare training and test sets
    X_train = df_train_clean[features]
    y_train = df_train_clean["Total_Returns_NextYear"]
    X_test = df_test_clean[features]

    # Train model
    if model_type == "PR":
        # Polynomial regression has no grid: fixed degree-2 expansion + OLS.
        model = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), LinearRegression())
        model.fit(X_train, y_train)
    else:
        model = train_model_with_grid(model_type, X_train, y_train)

    # Predict
    predictions = model.predict(X_test)

    # Read IDs from the column rather than row by row: a row-wise lookup
    # builds one Series per sample and can upcast an integer ID to float.
    if "Sample_ID" in df_test_clean.columns:
        sample_ids = df_test_clean["Sample_ID"].tolist()
    else:
        # Fall back to the original row label, not the position after
        # dropna, so the same ID means the same sample for every river.
        sample_ids = [f"Index_{label}" for label in df_test_clean.index]

    results.extend(
        {
            "River_Name": river_name,
            "Model": model_type,
            "Prediction": pred,
            "Sample_ID": sample_id,
        }
        for sample_id, pred in zip(sample_ids, predictions)
    )

# === Save output ===
# Name the columns explicitly so the CSV still gets its header row when no
# river produced predictions; a frame built from an empty list has no
# columns and would be written without one.
output_columns = ["River_Name", "Model", "Prediction", "Sample_ID"]
output_df = pd.DataFrame(results, columns=output_columns)
output_path = os.path.join(base_path, "murilo_salmon", "julia_models", "predictions_2024_tuned_clean.csv")
output_df.to_csv(output_path, index=False)
