Splitting the data set into train and test

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Absolute locations of the train and test CSV files on the author's machine.
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"

# Read both files into DataFrames (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Step 1: Rank every numeric column by its absolute correlation with the target
# and keep the five strongest as candidate features.
target_col = 'Total_Returns_NextYear'
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix[target_col].drop(target_col)
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# Step 2: Keep only top 5 + 'Year' + target
# 'Year' is one of the ranked candidates, so it may already be in the top 5.
# dict.fromkeys removes the repeat (first-seen order is kept) so the frames
# never carry two identical 'Year' columns into the model.
columns_to_keep = list(dict.fromkeys(top_5_features + ['Year', target_col]))
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]


# Step 3: Split into features and labels
X_train = df_train_top5.drop(columns=target_col)
y_train = df_train_top5[target_col]
X_test = df_test_top5.drop(columns=target_col)
y_test = df_test_top5[target_col]

# Step 4: Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Predict and evaluate on both splits
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the actual value is 0 (division by zero -> inf),
# so those rows are left out of the average instead of blowing up the metric.
# With no zero actuals this is exactly the plain mean absolute % error.
abs_pct_error = np.abs((y_test - y_pred_test) / y_test)
mape = np.mean(abs_pct_error[y_test != 0]) * 100

# Step 6: Report the metrics on screen and append them to the shared results file.
# The four metric lines are the same in both places, so they are rendered once.
metric_lines = [
    f"R²   (R-squared test):          {r2_test:.4f}",
    f"R²   (R-squared train):         {r2_train:.4f}",
    f"MSE  (Mean Squared Error):      {mse:.2f}",
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%",
]

# NOTE(review): "FIRSH" looks like a typo for "FIRST"; the headers are kept
# verbatim so new entries match what is already in the results file.
print("FIRSH APPROACH - TOP 5 FEATURES")
print("Linear Regression (Top 5 Correlated Features)")
for metric_line in metric_lines:
    print(metric_line)

# Text block for the results file: separator, headers, then one metric per line.
result_str = "".join(
    [
        "\n\n===============================\n",
        "FIRSH APPROACH - TOP  5 FEATURES \n",
        "Linear Regression:\n",
    ]
    + [rendered + "\n" for rendered in metric_lines]
)

# Results file shared across experiments; opened in append mode so earlier
# entries are preserved.
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\results.txt"

with open(results_path, mode="a", encoding="utf-8") as results_file:
    results_file.write(result_str)



FIRSH APPROACH - TOP 5 FEATURES
Linear Regression (Top 5 Correlated Features)
R²   (R-squared test):          0.3033
R²   (R-squared train):         0.5416
MSE  (Mean Squared Error):      16586756763735.38
MAPE (Mean Absolute % Error):   45.89%


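Here I was using another model setup: the same linear regression on the top 5 features, but with the test metrics reported per river.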

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# === Paths (absolute locations on the author's machine) ===
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"

# === Load data (train first, then test) ===
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# === Recover River column ===
# Names of all training columns that carry the 'River_' prefix.
river_cols = list(filter(lambda name: name.startswith('River_'), df_train.columns))

def recover_river(df, cols=None):
    """Add a 'River' column decoded from the 'River_*' indicator columns.

    For each row, the indicator column holding the largest value is taken as
    the row's river, and the leading 'River_' prefix is stripped from its name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the indicator columns. It is modified in place.
    cols : list of str, optional
        Indicator columns to decode. Defaults to the module-level
        ``river_cols`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a 'River' column.
    """
    if cols is None:
        cols = river_cols

    # idxmax(axis=1) yields, per row, the label of the column with the largest
    # value; on ties it returns the first such column.
    # NOTE(review): if the encoding dropped a baseline category, its rows are
    # all zeros and would be labelled with the first indicator column -- confirm.
    hot_column = df[cols].idxmax(axis=1)

    # Strip only the leading prefix. The anchored, explicit regex keeps any
    # later 'River_' inside the name intact and does not depend on the pandas
    # version's default for `regex`.
    df['River'] = hot_column.str.replace(r'^River_', '', regex=True)
    return df

# Attach the decoded 'River' label to both splits.
df_train = recover_river(df_train)
df_test = recover_river(df_test)

# === Correlation and top 5 features ===
# Rank numeric columns by absolute correlation with the target; the string
# 'River' column is excluded by numeric_only=True.
target_col = 'Total_Returns_NextYear'
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix[target_col].drop(target_col)
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# === Select features ===
# 'Year' is one of the ranked candidates and may already be in the top 5;
# dict.fromkeys removes the repeat (order kept) so no column is selected twice.
columns_to_keep = list(dict.fromkeys(top_5_features + ['Year', target_col, 'River']))
df_train_top5 = df_train[columns_to_keep].copy()
df_test_top5 = df_test[columns_to_keep].copy()

# === Train/Test sets ===
# 'River' is kept in the frames for grouping only; it is not a model input.
X_train = df_train_top5.drop([target_col, 'River'], axis=1)
y_train = df_train_top5[target_col]
X_test = df_test_top5.drop([target_col, 'River'], axis=1)
y_test = df_test_top5[target_col]

# === Train model ===
model = LinearRegression()
model.fit(X_train, y_train)

# The training R² belongs to the single global model, not to any one river,
# so it is computed once here rather than once per river inside the loop.
r2_train = r2_score(y_train, model.predict(X_train))

# === Global predictions ===
df_test_top5['Predicted'] = model.predict(X_test)
df_test_top5['Actual'] = y_test.values

# === Grouped Metrics per River ===
grouped = df_test_top5.groupby('River')
metrics_list = []

for river, group in grouped:
    # Rivers with fewer than two test rows are skipped.
    if len(group) < 2:
        continue

    actual = group['Actual']
    predicted = group['Predicted']

    r2 = r2_score(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    # MAPE is undefined where the actual value is 0 (division by zero -> inf),
    # so those rows are left out of the average. With no zero actuals this is
    # exactly the plain mean absolute % error.
    abs_pct_error = np.abs((actual - predicted) / actual)
    mape = np.mean(abs_pct_error[actual != 0]) * 100

    metrics_list.append({
        'River': river,
        'R2': r2,
        'MSE': mse,
        'MAPE': mape
    })

    # ✅ Print block for this river ("R² Train" is the same global value for all rivers)
    print(f"✅ FINAL METRICS - River: {river}")
    print(f"R² Test:      {r2:.4f}")
    print(f"R² Train:     {r2_train:.4f}")
    print(f"MSE Test:     {mse:.2f}")
    print(f"MAPE Test:    {mape:.2f}%\n")

# === Save CSV if needed ===
df_metrics = pd.DataFrame(metrics_list)
df_metrics.to_csv(r"C:\Users\MuriloFarias\Desktop\metrics_by_river_printed.csv", index=False)


✅ FINAL METRICS - River: Alagnak
R² Test:      -0.9769
R² Train:     0.5416
MSE Test:     3334497226146.19
MAPE Test:    51.75%

✅ FINAL METRICS - River: Egegik
R² Test:      -0.8511
R² Train:     0.5416
MSE Test:     41870960796820.15
MAPE Test:    59.01%

✅ FINAL METRICS - River: Igushik
R² Test:      -3.0603
R² Train:     0.5416
MSE Test:     1268714267348.30
MAPE Test:    67.75%

✅ FINAL METRICS - River: Kvichak
R² Test:      -0.6784
R² Train:     0.5416
MSE Test:     7805190330021.62
MAPE Test:    26.54%

✅ FINAL METRICS - River: Naknek
R² Test:      -0.4454
R² Train:     0.5416
MSE Test:     5250565917132.77
MAPE Test:    35.49%

✅ FINAL METRICS - River: Nushagak
R² Test:      -0.2406
R² Train:     0.5416
MSE Test:     25553781044312.93
MAPE Test:    50.14%

✅ FINAL METRICS - River: Wood
R² Test:      -0.9208
R² Train:     0.5416
MSE Test:     31023587764365.73
MAPE Test:    30.52%

