In [2]:
import pandas as pd
import numpy as np
import os
from rf_model import train_and_apply_rf_with_tuning
from statsmodels.tsa.arima.model import ARIMA

# Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Paths
base_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon"
csv_best_models = os.path.join(base_path, "julia_models", "best_models_per_river.csv")
csv_data = os.path.join(base_path, "data", "Combined_FeatureSet_For_Model.csv")
csv_output = os.path.join(base_path, "julia_models", "predictions_2025.csv")

# Load model selection and full dataset
best_models_df = pd.read_csv(csv_best_models)
df_full = pd.read_csv(csv_data)

# Columns to drop globally
columns_to_drop = [
    'mean_temp_JunAug', 'mean_temp_MaySep', 'max_temp_JunAug',
    'mean_discharge_MarMay', 'max_discharge_AprSep'
]

# Storage
all_predictions = []

for _, row in best_models_df.iterrows():
    model_name = row["Model"].split(" - ")[0]
    river_name = row["River_Name"]
    system = row["System"]
    top_k = int(row["TopK_Features"])
    use_extra_features = row["Additional_Features_Used"]
    use_arima = row["ARIMA_Enabled"]

    print(f"\nProcessing {river_name} | {model_name} | TopK={top_k} | ExtraFeat={use_extra_features} | ARIMA={use_arima}")

    # Clone dataframe and drop specified columns
    df = df_full.copy()
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Special logic for "Bristol Bay - Ugashik"
    if system == "Bristol Bay - Ugashik":
        actual_system = "Bristol Bay"
        is_ugashik = True
        apply_strict_dropna = False
    else:
        actual_system = system
        is_ugashik = False
        apply_strict_dropna = True

    df = df[df["System"] == actual_system]
    if is_ugashik:
        df = df[df["River"] == "Ugashik"]

    if actual_system == "Bristol Bay":
        df = df[df["Year"] >= 1995]

    # Add lag features if required
    if use_extra_features:
        features_to_lag = [
            'Total_Returns', 'AgeClass_0.1', 'AgeClass_0.2', 'AgeClass_0.3', 'AgeClass_0.4', 'AgeClass_0.5',
            'AgeClass_1.1', 'AgeClass_1.2', 'AgeClass_1.3', 'AgeClass_1.4', 'AgeClass_1.5',
            'AgeClass_2.1', 'AgeClass_2.2', 'AgeClass_2.3', 'AgeClass_2.4',
            'AgeClass_3.1', 'AgeClass_3.2', 'AgeClass_3.3', 'AgeClass_3.4',
            'Total_Returns_NextYear', 'Pacea_ALPI_Anomaly', 'npi_mean_NovMar', 'oni_mean_DecFeb',
            'npgo_mean_DecFeb', 'ao_mean_DecMar', 'pdo_mean_DecMar', 'pdo_mean_MaySep'
        ]
        for feat in features_to_lag:
            for lag in [1, 2, 3, 4, 5]:
                df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)

    # Conditional dropna based on flag
    if apply_strict_dropna and use_extra_features:
        if 'total_spawners_y_minus_2_to_4' in df.columns:
            df = df.dropna(subset=['total_spawners_y_minus_2_to_4'])
        if 'AgeClass_0.2_Yminus5' in df.columns:
            df = df.dropna(subset=['AgeClass_0.2_Yminus5'])

    df = df.dropna(axis=1, how="any").dropna()

    # Final river filtering for safety
    df_river = df[df["River"] == river_name].copy()

    df_train = df_river[df_river["Year"] < 2025].copy()
    df_test = df_river[df_river["Year"] == 2025].copy()

    if df_train.empty or df_test.empty:
        print(f"⚠️ Skipping {river_name}: not enough data.")
        continue

    # Add River_Name field
    df_train["River_Name"] = df_train["River"]
    df_test["River_Name"] = df_test["River"]

    # Encode river one-hot
    df_train_encoded = pd.get_dummies(df_train, columns=["River"], prefix="River")
    df_test_encoded = pd.get_dummies(df_test, columns=["River"], prefix="River")
    df_test_encoded = df_test_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)

    # Train and predict
    result = train_and_apply_rf_with_tuning(
        model=model_name,
        train_df=df_train_encoded,
        test_df=df_test_encoded,
        topk_feat=top_k
    )

    # ARIMA correction if enabled
    if use_arima:
        residuals = result["Timeline_train"]["Actual"] - result["Timeline_train"]["Predicted"]
        residuals_series = pd.Series(residuals.values, index=result["Timeline_train"]["Year"])

        try:
            arima_model = ARIMA(residuals_series, order=(1, 0, 0))
            arima_fit = arima_model.fit()
            forecast = arima_fit.forecast(steps=1)
            pred_value = result["Timeline_test"]["Predicted"].values[0] + forecast.values[0]
        except Exception as e:
            print(f"❌ ARIMA failed for {river_name}: {e}")
            pred_value = result["Timeline_test"]["Predicted"].values[0]
    else:
        pred_value = result["Timeline_test"]["Predicted"].values[0]

    # Save prediction
    all_predictions.append({
        "River_Name": river_name,
        "Model": model_name,
        "System": system,
        "TopK": top_k,
        "Extra_Features": use_extra_features,
        "ARIMA_Enabled": use_arima,
        "Prediction_2025": round(pred_value, 2)
    })

# Export predictions
pred_df = pd.DataFrame(all_predictions)
pred_df.to_csv(csv_output, index=False)
print(f"\n✅ Final predictions saved to:\n{csv_output}")



Processing Alagnak | LR | TopK=10 | ExtraFeat=True | ARIMA=False
⚠️ Skipping Alagnak: not enough data.

Processing Egegik | LR | TopK=6 | ExtraFeat=False | ARIMA=True
⚠️ Skipping Egegik: not enough data.

Processing Igushik | PR | TopK=10 | ExtraFeat=False | ARIMA=False
⚠️ Skipping Igushik: not enough data.

Processing Kvichak | PR | TopK=10 | ExtraFeat=False | ARIMA=True
⚠️ Skipping Kvichak: not enough data.

Processing Naknek | LR | TopK=6 | ExtraFeat=False | ARIMA=False
⚠️ Skipping Naknek: not enough data.

Processing Nushagak | XGB | TopK=0 | ExtraFeat=False | ARIMA=False
⚠️ Skipping Nushagak: not enough data.

Processing Ugashik | LR | TopK=10 | ExtraFeat=True | ARIMA=True


  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 

⚠️ Skipping Ugashik: not enough data.

Processing Wood | XGB | TopK=6 | ExtraFeat=True | ARIMA=True
⚠️ Skipping Wood: not enough data.

Processing Bonneville Lock & Dam | LR | TopK=6 | ExtraFeat=True | ARIMA=True


  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 

⚠️ Skipping Bonneville Lock & Dam: not enough data.

Processing Chilko | LR | TopK=10 | ExtraFeat=True | ARIMA=True
⚠️ Skipping Chilko: not enough data.

Processing Late Stuart | LR | TopK=6 | ExtraFeat=True | ARIMA=False


  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 

⚠️ Skipping Late Stuart: not enough data.

Processing Quesnel | XGB | TopK=6 | ExtraFeat=False | ARIMA=False
⚠️ Skipping Quesnel: not enough data.

Processing Stellako | LR | TopK=6 | ExtraFeat=True | ARIMA=False
⚠️ Skipping Stellako: not enough data.

Processing Raft | LR | TopK=6 | ExtraFeat=True | ARIMA=False
⚠️ Skipping Raft: not enough data.

✅ Final predictions saved to:
C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\julia_models\predictions_2025.csv


  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 'River'])[feat].shift(lag)
  df[f"{feat}_Yminus{lag}"] = df.groupby(['System', 