In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Gradient Boosting baseline on the full feature set.
# Reads the pre-split train/test CSVs, fits the model, prints the metrics
# and appends the same report to the shared results file.

# Input CSVs (training split and test split)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

train_loaded = pd.read_csv(train_path)
test_loaded = pd.read_csv(test_path)

# Every column except the target is used as a predictor.
target_col = 'Total_Returns_NextYear'
X_train, y_train = train_loaded.drop(columns=target_col), train_loaded[target_col]
X_test, y_test = test_loaded.drop(columns=target_col), test_loaded[target_col]

# Fixed hyperparameters; random_state pins the run for reproducibility.
gbr_settings = dict(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
gbr = GradientBoostingRegressor(**gbr_settings)
gbr.fit(X_train, y_train)

# Predictions on both splits (train predictions are only used for the train R²).
y_pred_train = gbr.predict(X_train)
y_pred_test = gbr.predict(X_test)

# Metrics: R² on both splits, MSE and MAPE on the test split only.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
abs_pct_error = np.abs((y_test - y_pred_test) / y_test)
mape = np.mean(abs_pct_error) * 100

# Build the report once; it is both printed and appended to the results file.
report_lines = [
    "FIRSH APPROACH - ALL FEATURES",
    "Gradient Boosting Evaluation Metrics:",
    f"R²   (R-squared test):          {r2_test:.4f}",
    f"R²   (R-squared train):         {r2_train:.4f}",
    f"MSE  (Mean Squared Error):      {mse:.2f}",
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%",
]
report = "\n".join(report_lines)
print(report)

# The file entry is the printed report preceded by a separator block.
result_str = "\n\n===============================\n" + report + "\n"

results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Mode "a": earlier entries in the results file are kept.
with open(results_path, "a", encoding="utf-8") as results_file:
    results_file.write(result_str)




FIRSH APPROACH - ALL FEATURES
Gradient Boosting Evaluation Metrics:
R²   (R-squared test):          0.6299
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      21634369392.12
MAPE (Mean Absolute % Error):   69.04%


In [2]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Degree-2 polynomial regression on the full feature set.
# Loads the train/test CSVs, expands the predictors, fits ordinary least
# squares, then prints the metrics and appends them to the results file.

# Input CSVs (training split and test split)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

train_loaded = pd.read_csv(train_path)
test_loaded = pd.read_csv(test_path)

# Split each frame into predictors (all other columns) and target.
target_col = 'Total_Returns_NextYear'
X_train = train_loaded.drop(columns=target_col)
X_test = test_loaded.drop(columns=target_col)
y_train = train_loaded[target_col]
y_test = test_loaded[target_col]

# Polynomial expansion (degree 2, no bias column). The expansion is fitted
# on the training predictors only and then re-applied to the test predictors.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Linear regression on the expanded features (fit returns the estimator).
model = LinearRegression().fit(X_train_poly, y_train)

# Predictions on both splits
y_pred_train = model.predict(X_train_poly)
y_pred_test = model.predict(X_test_poly)

# Metrics: R² on both splits, MSE and MAPE on the test split only.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
relative_error = (y_test - y_pred_test) / y_test
mape = np.mean(np.abs(relative_error)) * 100

# One report, reused for the console and for the results file.
report_lines = [
    "FIRSH APPROACH - ALL FEATURES",
    "Polynomial Regression Evaluation Metrics:",
    f"R²   (R-squared test):          {r2_test:.4f}",
    f"R²   (R-squared train):         {r2_train:.4f}",
    f"MSE  (Mean Squared Error):      {mse:.2f}",
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%",
]
print("\n".join(report_lines))

# File entry: separator block, then each report line terminated by a newline.
result_str = "\n\n===============================\n" + "".join(
    line + "\n" for line in report_lines
)

results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append so that previously logged runs are preserved.
with open(results_path, "a", encoding="utf-8") as results_file:
    results_file.write(result_str)


FIRSH APPROACH - ALL FEATURES
Polynomial Regression Evaluation Metrics:
R²   (R-squared test):          -207.8937
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      12210595312425.88
MAPE (Mean Absolute % Error):   1529.88%


In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# Plain linear regression on the full feature set.
# Loads the train/test CSVs, fits the model, prints the metrics and appends
# the same report to the results file.

# Input CSVs (training split and test split)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

train_loaded = pd.read_csv(train_path)
test_loaded = pd.read_csv(test_path)

# Target column vs. everything else
target_col = 'Total_Returns_NextYear'
y_train = train_loaded[target_col]
y_test = test_loaded[target_col]
X_train = train_loaded.drop(columns=[target_col])
X_test = test_loaded.drop(columns=[target_col])

# Fit ordinary least squares on the raw predictors.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on both splits
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Metrics: R² on both splits, MSE and MAPE on the test split only.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
pct_error = np.abs((y_test - y_pred_test) / y_test)
mape = np.mean(pct_error) * 100

# Report body shared by the console output and the results file.
report = (
    "FIRSH APPROACH - ALL FEATURES\n"
    "Linear Regression Evaluation Metrics:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# The report already ends with a newline, so suppress print's own.
print(report, end="")

# File entry = separator block + report.
result_str = "\n\n===============================\n" + report

results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append mode keeps the entries written by earlier runs.
with open(results_path, "a", encoding="utf-8") as results_file:
    results_file.write(result_str)



FIRSH APPROACH - ALL FEATURES
Linear Regression Evaluation Metrics:
R²   (R-squared test):          -1.0277
R²   (R-squared train):         0.9947
MSE  (Mean Squared Error):      118524980899.37
MAPE (Mean Absolute % Error):   161.63%


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# Random Forest with a cross-validated grid search, full feature set.
# Loads the train/test CSVs, tunes the forest, evaluates the best estimator,
# prints the metrics and appends them to the results file.

# Input CSVs (training split and test split)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

train_loaded = pd.read_csv(train_path)
test_loaded = pd.read_csv(test_path)

# Predictors are all columns other than the target.
target_col = 'Total_Returns_NextYear'
X_train, y_train = train_loaded.drop(columns=target_col), train_loaded[target_col]
X_test, y_test = test_loaded.drop(columns=target_col), test_loaded[target_col]

# Candidate hyperparameter values explored by the search
params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# 5-fold grid search scored by R²; n_jobs=-1 runs the fits in parallel.
forest = RandomForestRegressor(random_state=42)
grid = GridSearchCV(estimator=forest, param_grid=params, scoring='r2', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Estimator with the best-scoring parameter combination.
best_model = grid.best_estimator_

# Predictions on both splits
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Metrics: R² on both splits, MSE and MAPE on the test split only.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
abs_pct_error = np.abs((y_test - y_pred_test) / y_test)
mape = np.mean(abs_pct_error) * 100

# Build the report once; print it and reuse it for the file entry.
report_lines = [
    "FIRSH APPROACH - ALL FEATURES",
    "Random Forest Evaluation Metrics:",
    f"R²   (R-squared test):          {r2_test:.4f}",
    f"R²   (R-squared train):         {r2_train:.4f}",
    f"MSE  (Mean Squared Error):      {mse:.2f}",
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%",
]
report = "\n".join(report_lines)
print(report)

# File entry: separator block followed by the report and a trailing newline.
result_str = "\n\n===============================\n" + report + "\n"

results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append so earlier results in the file are not overwritten.
with open(results_path, "a", encoding="utf-8") as results_file:
    results_file.write(result_str)



FIRSH APPROACH - ALL FEATURES
Random Forest Evaluation Metrics:
R²   (R-squared test):          0.5143
R²   (R-squared train):         0.9593
MSE  (Mean Squared Error):      28388870083.53
MAPE (Mean Absolute % Error):   77.27%


In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# Random Forest trained on a log1p-transformed target, full feature set.
# The grid search and the model operate on log1p(target); predictions are
# mapped back with expm1 so every reported metric is on the original scale.

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load datasets
train_loaded = pd.read_csv(train_path)
test_loaded = pd.read_csv(test_path)

X_train = train_loaded.drop('Total_Returns_NextYear', axis=1)
y_train = train_loaded['Total_Returns_NextYear']

X_test = test_loaded.drop('Total_Returns_NextYear', axis=1)
y_test = test_loaded['Total_Returns_NextYear']

# Step 1: Apply log transformation to the target
y_train_log = np.log1p(y_train)
# NOTE: y_test_log is not used by the evaluation below (metrics are computed
# on the original scale); it is kept in case other cells read it.
y_test_log = np.log1p(y_test)

# Step 2: Set up hyperparameter grid
params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Step 3: Train the model using the log-transformed target.
# The cross-validation R² is therefore measured on the log scale.
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=params,
    cv=5,
    scoring='r2',
    n_jobs=-1
)
grid.fit(X_train, y_train_log)

best_model = grid.best_estimator_

# Step 4: Predict on the test split and reverse the log transformation
y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)  # Reverse log1p

# Step 5: Predict on the training split and reverse the log transformation
y_pred_log_train = best_model.predict(X_train)
y_pred_train = np.expm1(y_pred_log_train)  # Reverse log1p

# Step 6: Evaluate on the original scale
r2 = r2_score(y_test, y_pred)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Step 7: Build the report once so the console output and the results file
# carry identical, column-aligned lines (previously the printed labels used
# different padding from the file entry and from the other model cells).
report_lines = [
    "FIRSH APPROACH - ALL FEATURES",
    "Log-Transformed Random Forest Evaluation Metrics:",
    f"R²   (R-squared test):          {r2:.4f}",
    f"R²   (R-squared train):         {r2_train:.4f}",
    f"MSE  (Mean Squared Error):      {mse:.2f}",
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%",
]
report = "\n".join(report_lines)
print(report)

# File entry: separator block, then the report with a trailing newline
# (byte-identical to what this cell wrote before).
result_str = "\n\n===============================\n" + report + "\n"

# Path to results file
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append to file (mode "a" keeps the entries from earlier runs)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)



FIRSH APPROACH - ALL FEATURES
Log-Transformed Random Forest Evaluation Metrics:
R²   (R-squared test):              0.0082
R²   (R-squared train):              0.8392
MSE  (Mean Squared Error):     57971846214.59
MAPE (Mean Absolute % Error):  50.07%


In [6]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

# XGBoost regressor on the full feature set.
# Loads the train/test CSVs, fits the model, prints the metrics and appends
# the same report to the results file.

# Input CSVs (training split and test split)
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

train_loaded = pd.read_csv(train_path)
test_loaded = pd.read_csv(test_path)

# Separate the target from the predictor columns.
target_col = 'Total_Returns_NextYear'
X_train = train_loaded.drop(columns=target_col)
y_train = train_loaded[target_col]
X_test = test_loaded.drop(columns=target_col)
y_test = test_loaded[target_col]

# Squared-error objective; all other settings are left at library defaults.
xgb_settings = {'objective': 'reg:squarederror', 'random_state': 42}
xgb_model = XGBRegressor(**xgb_settings)
xgb_model.fit(X_train, y_train)

# Predictions on both splits
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

# Metrics: R² on both splits, MSE and MAPE on the test split only.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
relative_error = (y_test - y_pred_test) / y_test
mape = np.mean(np.abs(relative_error)) * 100

# Report lines shared by the console output and the results file.
report_lines = [
    "FIRSH APPROACH - ALL FEATURES",
    "XGBoost Evaluation Metrics:",
    f"R²   (R-squared test):          {r2_test:.4f}",
    f"R²   (R-squared train):         {r2_train:.4f}",
    f"MSE  (Mean Squared Error):      {mse:.2f}",
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%",
]
for report_line in report_lines:
    print(report_line)

# File entry: separator block followed by the newline-terminated report lines.
result_str = "\n\n===============================\n" + "\n".join(report_lines) + "\n"

results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append mode: existing entries in the results file are preserved.
with open(results_path, "a", encoding="utf-8") as results_file:
    results_file.write(result_str)



FIRSH APPROACH - ALL FEATURES
XGBoost Evaluation Metrics:
R²   (R-squared test):          0.4018
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      34969460875.24
MAPE (Mean Absolute % Error):   86.53%
