FIRST_APPROACH
TOP_10 FEATURES

GradientBoostingRegressor

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Cell purpose: fit a GradientBoostingRegressor on the features most correlated
# with next year's total returns (plus 'Year'), report train/test metrics, and
# append the same metrics to the shared results log.

# File paths (absolute and machine-specific; adjust when running elsewhere)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\results.txt"

# Load full datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Rank features by absolute correlation with the target.
# The ranking uses the training split only, so the test split does not
# influence which features are selected.
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_10_features = target_corr.abs().sort_values(ascending=False).head(10).index.tolist()

# Step 2: Select top 10 features + 'Year' + target.
# 'Year' may itself be one of the top 10; de-duplicate (keeping order) so it is
# not selected twice, which would hand the model the same column two times.
columns_to_keep = list(dict.fromkeys(top_10_features + ['Year', 'Total_Returns_NextYear']))
df_train_top10 = df_train[columns_to_keep]
df_test_top10 = df_test[columns_to_keep]

# Step 3: Prepare training and testing data
X_train = df_train_top10.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top10['Total_Returns_NextYear']
X_test = df_test_top10.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top10['Total_Returns_NextYear']

# Step 4: Train Gradient Boosting model (fixed seed for reproducible fits)
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
gbr.fit(X_train, y_train)

# Step 5: Predict
y_pred_test = gbr.predict(X_test)
y_pred_train = gbr.predict(X_train)

# Step 6: Evaluate
r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)
# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole mean into inf/NaN), so it is averaged over non-zero targets only.
# With no zero targets this equals the plain mean absolute percentage error.
nonzero_target = (y_test != 0).to_numpy()
abs_pct_error = np.abs(
    (y_test[nonzero_target] - y_pred_test[nonzero_target]) / y_test[nonzero_target]
)
mape = np.mean(abs_pct_error) * 100

# Step 7: Output results.
# The metric lines are built once and reused for the console and the log file
# so the two can never drift apart.
metrics_report = (
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# NOTE(review): "FIRSH" is presumably a typo for "FIRST"; left unchanged so new
# entries match the headers already recorded by earlier runs.
print("FIRSH APPROACH - TOP 10 FEATURES")
print("Gradient Boosting (Top 10 Correlated Features)")
print(metrics_report, end="")

# Define result string (entry appended to the results log)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  10 FEATURES \n"
    "Gradient Boosting:\n"
    + metrics_report
)

# Append to file (mode "a" keeps entries written by earlier runs/models)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 10 FEATURES
Gradient Boosting (Top 10 Correlated Features)
R²   (R-squared test):          0.1162
R²   (R-squared train):         0.9934
MSE  (Mean Squared Error):      21043753439138.82
MAPE (Mean Absolute % Error):   49.93%


****************************************************************

Polynomial Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Cell purpose: fit a degree-2 polynomial regression on the features most
# correlated with next year's total returns (plus 'Year'), report train/test
# metrics, and append the same metrics to the shared results log.

# File paths (absolute and machine-specific; adjust when running elsewhere)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\results.txt"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Rank features by absolute correlation with the target.
# The ranking uses the training split only, so the test split does not
# influence which features are selected.
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_10_features = target_corr.abs().sort_values(ascending=False).head(10).index.tolist()

# Step 2: Filter relevant columns (top 10 + 'Year' + target).
# 'Year' may itself be one of the top 10; de-duplicate (keeping order) so it is
# not selected twice, which would also duplicate every polynomial term built
# from it.
columns_to_keep = list(dict.fromkeys(top_10_features + ['Year', 'Total_Returns_NextYear']))
df_train_top10 = df_train[columns_to_keep]
df_test_top10 = df_test[columns_to_keep]

# Step 3: Separate features and target
X_train = df_train_top10.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top10['Total_Returns_NextYear']
X_test = df_test_top10.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top10['Total_Returns_NextYear']

# Step 4: Polynomial transformation (degree=2).
# The expansion is fitted on the training features and only applied to the
# test features. include_bias=False because LinearRegression fits its own
# intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Step 5: Train model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Step 6: Predict and evaluate
y_pred_test = model.predict(X_test_poly)
y_pred_train = model.predict(X_train_poly)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)
# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole mean into inf/NaN), so it is averaged over non-zero targets only.
# With no zero targets this equals the plain mean absolute percentage error.
nonzero_target = (y_test != 0).to_numpy()
abs_pct_error = np.abs(
    (y_test[nonzero_target] - y_pred_test[nonzero_target]) / y_test[nonzero_target]
)
mape = np.mean(abs_pct_error) * 100

# Step 7: Output results.
# The metric lines are built once and reused for the console and the log file
# so the two can never drift apart.
metrics_report = (
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# NOTE(review): "FIRSH" is presumably a typo for "FIRST"; left unchanged so new
# entries match the headers already recorded by earlier runs.
print("FIRSH APPROACH - TOP 10 FEATURES")
print("Polynomial Regression (Top 10 Correlated Features)")
print(metrics_report, end="")

# Define result string (entry appended to the results log).
# The model label previously read "Polynomial Regressiong:", which mislabelled
# the entry in the log.
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  10 FEATURES \n"
    "Polynomial Regression:\n"
    + metrics_report
)

# Append to file (mode "a" keeps entries written by earlier runs/models)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 10 FEATURES
Polynomial Regression (Top 10 Correlated Features)
R²   (R-squared test):          -4.1189
R²   (R-squared train):         0.8912
MSE  (Mean Squared Error):      121877601694846.55
MAPE (Mean Absolute % Error):   77.30%


***************************************************************************************

Linear Regression

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Cell purpose: fit an ordinary least-squares LinearRegression on the features
# most correlated with next year's total returns (plus 'Year'), report
# train/test metrics, and append the same metrics to the shared results log.

# File paths (absolute and machine-specific; adjust when running elsewhere)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\results.txt"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Rank features by absolute correlation with the target.
# The ranking uses the training split only, so the test split does not
# influence which features are selected.
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_10_features = target_corr.abs().sort_values(ascending=False).head(10).index.tolist()

# Step 2: Keep top 10 + 'Year' + target.
# 'Year' may itself be one of the top 10; de-duplicate (keeping order) so it is
# not selected twice, which would give the regression two identical
# (perfectly collinear) columns.
columns_to_keep = list(dict.fromkeys(top_10_features + ['Year', 'Total_Returns_NextYear']))
df_train_top10 = df_train[columns_to_keep]
df_test_top10 = df_test[columns_to_keep]

# Step 3: Prepare feature/target split
X_train = df_train_top10.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top10['Total_Returns_NextYear']
X_test = df_test_top10.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top10['Total_Returns_NextYear']

# Step 4: Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)
# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole mean into inf/NaN), so it is averaged over non-zero targets only.
# With no zero targets this equals the plain mean absolute percentage error.
nonzero_target = (y_test != 0).to_numpy()
abs_pct_error = np.abs(
    (y_test[nonzero_target] - y_pred_test[nonzero_target]) / y_test[nonzero_target]
)
mape = np.mean(abs_pct_error) * 100

# Step 6: Print metrics.
# The metric lines are built once and reused for the console and the log file
# so the two can never drift apart.
metrics_report = (
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# NOTE(review): "FIRSH" is presumably a typo for "FIRST"; left unchanged so new
# entries match the headers already recorded by earlier runs.
print("FIRSH APPROACH - TOP 10 FEATURES")
print("Linear Regression (Top 10 Correlated Features)")
print(metrics_report, end="")

# Define result string (entry appended to the results log)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  10 FEATURES \n"
    "Linear Regression:\n"
    + metrics_report
)

# Append to file (mode "a" keeps entries written by earlier runs/models)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 10 FEATURES
Linear Regression (Top 10 Correlated Features)
R²   (R-squared test):          0.2845
R²   (R-squared train):         0.6075
MSE  (Mean Squared Error):      17036243804633.45
MAPE (Mean Absolute % Error):   41.10%


************************************************************************************

Random Forest model

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Cell purpose: fit a RandomForestRegressor on the features most correlated
# with next year's total returns (plus 'Year'), report train/test metrics, and
# append the same metrics to the shared results log.

# File paths (absolute and machine-specific; adjust when running elsewhere)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\results.txt"

# Load full datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Rank features by absolute correlation with the target.
# The ranking uses the training split only, so the test split does not
# influence which features are selected.
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_10_features = target_corr.abs().sort_values(ascending=False).head(10).index.tolist()

# Step 2: Filter top features + Year + target.
# 'Year' may itself be one of the top 10; de-duplicate (keeping order) so it is
# not selected twice, which would hand the model the same column two times.
columns_to_keep = list(dict.fromkeys(top_10_features + ['Year', 'Total_Returns_NextYear']))
df_train_top10 = df_train[columns_to_keep]
df_test_top10 = df_test[columns_to_keep]

# Step 3: Prepare feature/target splits
X_train = df_train_top10.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top10['Total_Returns_NextYear']
X_test = df_test_top10.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top10['Total_Returns_NextYear']

# Step 4: Train Random Forest model (fixed seed for reproducible fits)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred_test = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)
# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole mean into inf/NaN), so it is averaged over non-zero targets only.
# With no zero targets this equals the plain mean absolute percentage error.
nonzero_target = (y_test != 0).to_numpy()
abs_pct_error = np.abs(
    (y_test[nonzero_target] - y_pred_test[nonzero_target]) / y_test[nonzero_target]
)
mape = np.mean(abs_pct_error) * 100

# Step 6: Output results.
# The metric lines are built once and reused for the console and the log file
# so the two can never drift apart.
metrics_report = (
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# NOTE(review): "FIRSH" is presumably a typo for "FIRST"; left unchanged so new
# entries match the headers already recorded by earlier runs.
print("FIRSH APPROACH - TOP 10 FEATURES")
print("Random Forest (Top 10 Correlated Features)")
print(metrics_report, end="")

# Define result string (entry appended to the results log)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  10 FEATURES \n"
    "Random Forest:\n"
    + metrics_report
)

# Append to file (mode "a" keeps entries written by earlier runs/models)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 10 FEATURES
Random Forest (Top 10 Correlated Features)
R²   (R-squared test):          0.2023
R²   (R-squared train):         0.9573
MSE  (Mean Squared Error):      18993281636184.82
MAPE (Mean Absolute % Error):   41.61%


*************************************************************************************

XGBoost Regressor

In [5]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Cell purpose: fit an XGBRegressor on the features most correlated with next
# year's total returns (plus 'Year'), report train/test metrics, and append the
# same metrics to the shared results log.

# File paths (absolute and machine-specific; adjust when running elsewhere)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\FIRST_APPROACH\data_test_encoded.csv"
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\BristolBay\results.txt"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Rank features by absolute correlation with the target.
# The ranking uses the training split only, so the test split does not
# influence which features are selected.
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_10_features = target_corr.abs().sort_values(ascending=False).head(10).index.tolist()

# Step 2: Select columns (top 10 + 'Year' + target).
# 'Year' may itself be one of the top 10; de-duplicate (keeping order) so it is
# not selected twice. Duplicate column names are a problem here in particular:
# XGBoost is expected to reject non-unique feature names when fitting on a
# DataFrame (NOTE(review): confirm against the installed xgboost version).
columns_to_keep = list(dict.fromkeys(top_10_features + ['Year', 'Total_Returns_NextYear']))
df_train_top10 = df_train[columns_to_keep]
df_test_top10 = df_test[columns_to_keep]

# Step 3: Split into features and target
X_train = df_train_top10.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top10['Total_Returns_NextYear']
X_test = df_test_top10.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top10['Total_Returns_NextYear']

# Step 4: Train model (squared-error objective, fixed seed for reproducibility)
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 5: Predict
y_pred_test = xgb_model.predict(X_test)
y_pred_train = xgb_model.predict(X_train)

# Step 6: Evaluate
r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)
# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole mean into inf/NaN), so it is averaged over non-zero targets only.
# With no zero targets this equals the plain mean absolute percentage error.
nonzero_target = (y_test != 0).to_numpy()
abs_pct_error = np.abs(
    (y_test[nonzero_target] - y_pred_test[nonzero_target]) / y_test[nonzero_target]
)
mape = np.mean(abs_pct_error) * 100

# Step 7: Print results.
# The metric lines are built once and reused for the console and the log file
# so the two can never drift apart.
metrics_report = (
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# NOTE(review): "FIRSH" is presumably a typo for "FIRST"; left unchanged so new
# entries match the headers already recorded by earlier runs.
print("FIRSH APPROACH - TOP 10 FEATURES")
print("XGBoost (Top 10 Correlated Features)")
print(metrics_report, end="")

# Define result string (entry appended to the results log).
# The model label previously read "GXGBoost :", which mislabelled the entry in
# the log.
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  10 FEATURES \n"
    "XGBoost:\n"
    + metrics_report
)

# Append to file (mode "a" keeps entries written by earlier runs/models)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 10 FEATURES
XGBoost (Top 10 Correlated Features)
R²   (R-squared test):          -0.0937
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      26039471182985.89
MAPE (Mean Absolute % Error):   50.82%
