Connected to Python 3.10.11

### Load libraries

In [None]:
import sys
import os
import matplotlib.pyplot as plt
import pandas as pd

# Make the project's helper modules importable before they are imported below.
# NOTE(review): `notebook_dir` is assigned but never read in the visible code.
notebook_dir = os.getcwd()
# NOTE(review): absolute, user-specific Windows path — it will not exist on other
# machines; consider deriving it from the notebook location instead.
src_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\src"
# Append only once so re-running the cell does not add duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)

# This import relies on the sys.path entry added above, so it must stay below it.
# presumably add_src_to_path() registers the src directory as well — verify in utils.
from utils import add_src_to_path
add_src_to_path()

from data_split import split_time_series_by_river
from rf_model import train_and_apply_rf_with_tuning
from plot_predictions import plot_predictions_by_river
from plot_predictions import plot_actual_vs_predicted

# Choose from "Bristol Bay", "Fraser River" and "Columbia River"
river_system = "Fraser River"

# The feature table is read from <project root>/data, where the project root is
# the parent of the current working directory (assumes notebook is in /notebooks).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
data_path = os.path.join(project_root, 'data', 'Combined_FeatureSet_For_Model.csv')

combined_df = pd.read_csv(data_path)
# Optional: Select river system
# .copy() detaches the filtered rows from the full table, so the column
# assignments made further down do not raise pandas' SettingWithCopyWarning.
combined_df = combined_df[combined_df["System"] == river_system].copy()
# Optional: Select river
#combined_df = combined_df[combined_df["River"] == "Alagnak"].copy()

# Inspect the available columns (only echoed when evaluated interactively).
combined_df.columns

if True:
    # Add lagged copies (1 to 5 rows back within each river's series) of every
    # listed feature as new columns named '<feature>_Yminus<lag>'.
    # NOTE(review): 'Total_Returns_NextYear' is lagged too; its 1-year lag
    # presumably duplicates 'Total_Returns' — confirm this is intended.
    features_to_lag = ['Total_Returns', 'AgeClass_0.1',
       'AgeClass_0.2', 'AgeClass_0.3', 'AgeClass_0.4', 'AgeClass_0.5',
       'AgeClass_1.1', 'AgeClass_1.2', 'AgeClass_1.3', 'AgeClass_1.4',
       'AgeClass_1.5', 'AgeClass_2.1', 'AgeClass_2.2', 'AgeClass_2.3',
       'AgeClass_2.4', 'AgeClass_3.1', 'AgeClass_3.2', 'AgeClass_3.3',
       'AgeClass_3.4', 'Total_Returns_NextYear', 'Pacea_ALPI_Anomaly',
       'npi_mean_NovMar', 'oni_mean_DecFeb', 'npgo_mean_DecFeb',
       'ao_mean_DecMar', 'pdo_mean_DecMar', 'pdo_mean_MaySep']
    lag_years = [1, 2, 3, 4, 5]

    # shift() moves values by row position, not by calendar year, so the rows
    # are ordered chronologically before grouping. The stable sort keeps the
    # file order for equal years; the shifted Series keep the original row
    # labels, so they align back onto combined_df without reordering it.
    series_by_river = (
        combined_df.sort_values('Year', kind='stable').groupby(['System', 'River'])
    )

    # Build every lag column first and attach them with a single concat:
    # inserting the columns one at a time fragments the frame and makes
    # pandas emit a PerformanceWarning.
    lagged = pd.DataFrame(
        {
            f'{feat}_Yminus{lag}': series_by_river[feat].shift(lag)
            for feat in features_to_lag
            for lag in lag_years
        },
        index=combined_df.index,
    )

    # Drop any same-named columns first so re-created lags replace old ones,
    # as plain column assignment would.
    combined_df = pd.concat(
        [combined_df.drop(columns=lagged.columns, errors='ignore'), lagged],
        axis=1,
    )

# Optional: Standardize Total_Returns_NextYear per river (z-score)
if (False):
    # Disabled by default. When enabled, rescales Total_Returns_NextYear to a
    # z-score using each river's own mean and standard deviation.

    # Per-river mean and standard deviation of the column, one row per river.
    return_stats = (
        combined_df
        .groupby('River')['Total_Returns_NextYear']
        .agg(returns_mean='mean', returns_std='std')
        .reset_index()
    )

    # Left-join the statistics onto the rows of the matching river.
    combined_df = combined_df.merge(return_stats, on='River', how='left')

    # Centre on the river mean, then scale by the river standard deviation.
    centered = combined_df['Total_Returns_NextYear'] - combined_df['returns_mean']
    combined_df['Total_Returns_NextYear'] = centered / combined_df['returns_std']

    # The helper columns were only needed for the scaling step.
    combined_df = combined_df.drop(columns=['returns_mean', 'returns_std'])

# Optional: Keep spawner data and remove river Ugashik and the first four years (1963-1966),
# as no data is available for them.
if False:
    # Disabled by default. When enabled, keeps only the rows that have a value
    # in both the spawner column and the 5-year lag of AgeClass_0.2.
    combined_df = combined_df.dropna(
        subset=['total_spawners_y_minus_2_to_4', 'AgeClass_0.2_Yminus5']
    )

# Remove every column that still contains at least one missing value.
# NOTE(review): shift() leaves NaN in the first rows of each river, so every
# '_Yminus*' lag column created earlier has missing values. While the optional
# row-dropping block above stays disabled, all lag columns are removed here —
# confirm this is intended.
missing_summary = combined_df.isna().sum()               # NaN count per column
missing_cols = missing_summary.loc[missing_summary.gt(0)]  # columns with any NaN

combined_df = combined_df.drop(columns=list(missing_cols.index))

# Chronological train/test split performed per (System, River) group.
train_df, test_df = split_time_series_by_river(
    combined_df,
    time_column="Year",
    group_columns=["System", "River"],
    test_fraction=0.2,
    gap_years=0  # Set to 1 if you want a 1-year gap between train and test
)

# Work on independent copies so adding a column below can neither write into
# nor warn about a view of combined_df.
train_df = train_df.copy()
test_df = test_df.copy()

# Keep the plain river name: the "River" column is replaced by dummies below.
train_df["River_Name"] = train_df["River"] # For visualization
test_df["River_Name"] = test_df["River"]

# One-hot encode "River" against one shared, sorted category list. Encoding the
# two splits independently yields different dummy columns whenever a river is
# missing from one split; a fixed category list gives both frames the same
# River_* columns in the same order (an all-zero column for an absent river).
river_levels = sorted(combined_df["River"].unique())
train_df_encoded = pd.get_dummies(
    train_df.assign(River=pd.Categorical(train_df["River"], categories=river_levels)),
    columns=["River"],
    prefix="River",
)
test_df_encoded = pd.get_dummies(
    test_df.assign(River=pd.Categorical(test_df["River"], categories=river_levels)),
    columns=["River"],
    prefix="River",
)

# Train and evaluate each model type in turn. Successful runs are stored in
# all_results keyed by model name; a failing model is reported and skipped.
# NOTE(review): `results` keeps the value of the last model that trained
# successfully, and the cells further down read it.
model_list = ["RF", "GBRT", "XGB", "LR", "PR"]
all_results = {}

# Per-river tables to print for every model, in display order.
river_tables = (
    ("Test", 'Metrics_by_River_Test'),
    ("Train", 'Metrics_by_River_Train'),
)

for model_name in model_list:
    print(f"\n===================== {model_name} =====================")
    try:
        results = train_and_apply_rf_with_tuning(
            model=model_name,
            train_df=train_df_encoded,
            test_df=test_df_encoded,
            topk_feat=10,
        )

        # Store before printing so a formatting problem does not lose the run.
        all_results[model_name] = results

        # Overall scores.
        print(f"✅ R2 Train: {results['R2_train']:.4f}")
        print(f"✅ R2 Test : {results['R2']:.4f}")
        print(f"📉 MSE     : {results['MSE']:.2f}")
        print(f"📊 MAPE    : {results['MAPE']:.2f}%")

        # Tuned hyper-parameters, when the model reports any.
        tuned_params = results['Best_Params']
        if tuned_params is None:
            print("ℹ️ No parameter tuning applied.")
        else:
            print(f"🔧 Best Params: {tuned_params}")

        # Per-river breakdown, test split first.
        for split_label, table_key in river_tables:
            print(f"\n📍 Metrics by River ({split_label}):")
            print(results[table_key].round(2).to_string(index=False))

    except Exception as err:
        # Best effort: report the failure and continue with the next model.
        print(f"❌ Error while running model {model_name}: {err}")



# 📦 Collect and save all river test metrics from all models
all_river_metrics = []

for model_name, result in all_results.items():
    # Copy so the table stored in all_results is not modified.
    df = result['Metrics_by_River_Test'].copy()
    df.insert(0, "Model", model_name)  # 👈 Insert model name as first column
    all_river_metrics.append(df)

# NOTE(review): absolute, user-specific output location.
output_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\julia_models\results.csv"

if all_river_metrics:
    # Concatenate all into one DataFrame
    final_df = pd.concat(all_river_metrics, ignore_index=True)

    # Save to CSV
    final_df.to_csv(output_path, index=False)

    print(f"\n✅ Results saved to: {output_path}")
else:
    # pd.concat raises ValueError on an empty list; when every model failed,
    # report that explicitly instead of failing with an unrelated error.
    print("\n⚠️ No model produced results; nothing was saved.")



### Optional: Fit ARIMA model on residuals (only valid when a single river has been selected)

In [None]:
if True:
    # Hybrid forecast: fit an AR(1) model on the training residuals of the
    # current `results` and add its forecast to the test predictions.
    # NOTE(review): `results` is whatever the model loop assigned last.
    # NOTE(review): "Predicted" is overwritten in place, so running this cell
    # twice adds the residual forecast twice.
    from statsmodels.tsa.arima.model import ARIMA
    from sklearn.metrics import mean_squared_error, r2_score

    timeline_train = results["Timeline_train"]
    timeline_test = results["Timeline_test"]

    # The residuals are treated as one time series indexed by Year. Repeated
    # years mean several rivers are stacked together, in which case the fit
    # and the positional addition below are meaningless — skip instead.
    if not pd.Index(timeline_train["Year"]).is_unique:
        print("⚠️ Skipping ARIMA residual correction: it requires a single river "
              "(Year values repeat in the training timeline).")
    else:
        residuals = timeline_train["Actual"] - timeline_train["Predicted"]
        residuals_series = pd.Series(residuals.values, index=timeline_train["Year"])

        arima_model = ARIMA(residuals_series, order=(1,0,0))  # You may want to auto-tune this
        arima_fit = arima_model.fit()

        # One forecast step per test row, added by position.
        residual_forecast = arima_fit.forecast(steps=len(timeline_test["Predicted"]))
        hybrid_pred = timeline_test["Predicted"] + residual_forecast.values

        r2 = r2_score(timeline_test["Actual"], hybrid_pred)
        print(r2)

        # The plotting cells below read "Predicted", so store the hybrid there.
        results["Timeline_test"]["Predicted"] = hybrid_pred

### Performance metrics

In [None]:
# Display the system-level metrics of the current `results`.
# NOTE(review): `results` is whatever the model loop assigned last — confirm
# that is the model you mean to inspect.
results["Metrics_by_System"]

In [None]:
# Display the per-river metrics of the current `results`.
# NOTE(review): the training loop reads 'Metrics_by_River_Test' and
# 'Metrics_by_River_Train'; confirm that a plain 'Metrics_by_River' key is
# also returned, otherwise this raises KeyError.
results["Metrics_by_River"]

### Plot predictions

In [None]:
# Plot the test timeline; if the ARIMA cell ran, "Predicted" holds the hybrid values.
plot_predictions_by_river(results["Timeline_test"])

In [None]:
# Plot the training timeline of the current `results`.
plot_predictions_by_river(results["Timeline_train"])

In [None]:
# Plot Predicted vs Actual for the current `results` (passed whole to the helper).
plot_actual_vs_predicted(results)

### Feature Importances

In [None]:
# Rank the features of the current `results` by importance, largest first,
# and split the (name, value) pairs into two parallel tuples.
sorted_items = sorted(
    results["Feature_Importances"].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
features, importances = zip(*sorted_items)

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, importances, color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importances')
ax.invert_yaxis()  # Most important on top
fig.tight_layout()
plt.show()