In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Gradient Boosting baseline.
# Picks the five training columns most correlated (in absolute value) with
# 'Total_Returns_NextYear', fits a GradientBoostingRegressor on them plus
# 'Year', prints train/test metrics and appends them to the results file.

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Get top 5 features most correlated with the target.
# The ranking is computed on the training frame only.
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
target_corr_sorted = target_corr.reindex(target_corr.abs().sort_values(ascending=False).index)
top_5_features = target_corr_sorted.head(5).index.tolist()

# Step 2: Filter the datasets.
# 'Year' is appended only when it is not already one of the top-5 features;
# otherwise the column would be selected twice and the model would be fed a
# duplicated feature.
extra_columns = [col for col in ['Year', 'Total_Returns_NextYear'] if col not in top_5_features]
columns_to_keep = top_5_features + extra_columns
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]

# Step 3: Prepare training and testing data (everything except the target is a feature)
X_train = df_train_top5.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top5['Total_Returns_NextYear']
X_test = df_test_top5.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top5['Total_Returns_NextYear']

# Step 4: Train Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
gbr.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred_test = gbr.predict(X_test)
y_pred_train = gbr.predict(X_train)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole metric into inf/nan), so those rows are left out of the average.
y_true_arr = np.asarray(y_test, dtype=float)
nonzero = y_true_arr != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true_arr[nonzero] - y_pred_test[nonzero]) / y_true_arr[nonzero])) * 100
else:
    mape = float("nan")

# Step 6: Show results
print("FIRSH APPROACH - TOP 5 FEATURES")
print("Gradient Boosting (Top 5 Correlated Features)")
print(f"R²   (R-squared test):          {r2_test:.4f}")
print(f"R²   (R-squared train):         {r2_train:.4f}")
print(f"MSE  (Mean Squared Error):      {mse:.2f}")
print(f"MAPE (Mean Absolute % Error):   {mape:.2f}%")

# Define result string (same metrics, formatted for the shared results file)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  5 FEATURES \n"
    "Gradient Boosting:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# Path to results file
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append to file (mode "a": every run of this cell adds another entry)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 5 FEATURES
Gradient Boosting (Top 5 Correlated Features)
R²   (R-squared test):          0.5010
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      29168909613.09
MAPE (Mean Absolute % Error):   86.53%


In [2]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Gradient Boosting on a log-transformed target.
# Same feature selection as the plain Gradient Boosting cell (top-5 absolute
# correlation + 'Year'), but the model is fitted on log1p(target) and its
# predictions are mapped back with expm1 before the metrics are computed.

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load original datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Select Top 5 features by correlation with the target (training frame only)
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# Step 2: Filter features + 'Year' + target.
# 'Year' is appended only when it is not already one of the top-5 features;
# otherwise the column would be selected twice and the model would be fed a
# duplicated feature.
extra_columns = [col for col in ['Year', 'Total_Returns_NextYear'] if col not in top_5_features]
columns_to_keep = top_5_features + extra_columns
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]

# Step 3: Split into X and y
X_train = df_train_top5.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top5['Total_Returns_NextYear']
X_test = df_test_top5.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top5['Total_Returns_NextYear']

# Step 4: Apply log1p to avoid log(0) errors
y_train_log = np.log1p(y_train)

# Step 5: Train Gradient Boosting on log-transformed target
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
gbr.fit(X_train, y_train_log)

# Step 6: Predict and inverse log (expm1 undoes log1p, back to the original scale)
y_pred_log_test = gbr.predict(X_test)
y_pred_test = np.expm1(y_pred_log_test)

y_pred_log_train = gbr.predict(X_train)
y_pred_train = np.expm1(y_pred_log_train)

# Step 7: Evaluate on the original (non-log) scale
r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole metric into inf/nan), so those rows are left out of the average.
y_true_arr = np.asarray(y_test, dtype=float)
nonzero = y_true_arr != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true_arr[nonzero] - y_pred_test[nonzero]) / y_true_arr[nonzero])) * 100
else:
    mape = float("nan")

# Step 8: Print results
print("FIRSH APPROACH - TOP 5 FEATURES")
print("Gradient Boosting (Top 5 Correlated + Log Target)")
print(f"R²   (R-squared test):          {r2_test:.4f}")
print(f"R²   (R-squared train):         {r2_train:.4f}")
print(f"MSE  (Mean Squared Error):      {mse:.2f}")
print(f"MAPE (Mean Absolute % Error):   {mape:.2f}%")

# Define result string (same metrics, formatted for the shared results file)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  5 FEATURES \n"
    "Gradient Boosting Correlated + Log Target:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# Path to results file
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append to file (mode "a": every run of this cell adds another entry)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)



FIRSH APPROACH - TOP 5 FEATURES
Gradient Boosting (Top 5 Correlated + Log Target)
R²   (R-squared test):          0.6328
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      21465047392.60
MAPE (Mean Absolute % Error):   42.27%


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Polynomial Regression baseline.
# Expands the selected features (top-5 absolute correlation + 'Year') to
# degree-2 polynomial terms, fits an ordinary LinearRegression on them,
# prints train/test metrics and appends them to the results file.

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load original datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Select Top 5 features by correlation (training frame only)
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# Step 2: Filter the data.
# 'Year' is appended only when it is not already one of the top-5 features;
# otherwise the column would be selected twice and the model would be fed a
# duplicated feature.
extra_columns = [col for col in ['Year', 'Total_Returns_NextYear'] if col not in top_5_features]
columns_to_keep = top_5_features + extra_columns
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]

# Step 3: Split features and target
X_train = df_train_top5.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top5['Total_Returns_NextYear']
X_test = df_test_top5.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top5['Total_Returns_NextYear']

# Step 4: Create polynomial features.
# The expansion is fitted on the training features and only applied to the
# test features, so both go through the identical transformation.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Step 5: Train the model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Step 6: Predict and evaluate
y_pred_test = model.predict(X_test_poly)
y_pred_train = model.predict(X_train_poly)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole metric into inf/nan), so those rows are left out of the average.
y_true_arr = np.asarray(y_test, dtype=float)
nonzero = y_true_arr != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true_arr[nonzero] - y_pred_test[nonzero]) / y_true_arr[nonzero])) * 100
else:
    mape = float("nan")

# Step 7: Print results
print("FIRSH APPROACH - TOP 5 FEATURES")
print("Polynomial Regression (Top 5 Correlated Features)")
print(f"R²   (R-squared test):          {r2_test:.4f}")
print(f"R²   (R-squared train):         {r2_train:.4f}")
print(f"MSE  (Mean Squared Error):      {mse:.2f}")
print(f"MAPE (Mean Absolute % Error):   {mape:.2f}%")

# Define result string (same metrics, formatted for the shared results file)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  5 FEATURES \n"
    "Polynomial Regression:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# Path to results file
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append to file (mode "a": every run of this cell adds another entry)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)



FIRSH APPROACH - TOP 5 FEATURES
Polynomial Regression (Top 5 Correlated Features)
R²   (R-squared test):          -53.3972
R²   (R-squared train):         0.9925
MSE  (Mean Squared Error):      3179712483228.92
MAPE (Mean Absolute % Error):   1063.95%


In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Linear Regression baseline.
# Fits an ordinary LinearRegression on the five training columns most
# correlated (in absolute value) with 'Total_Returns_NextYear' plus 'Year',
# prints train/test metrics and appends them to the results file.

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Compute correlation with target (training frame only)
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# Step 2: Keep only top 5 + 'Year' + target.
# 'Year' is appended only when it is not already one of the top-5 features;
# otherwise the column would be selected twice and the model would be fed a
# duplicated (perfectly collinear) feature.
extra_columns = [col for col in ['Year', 'Total_Returns_NextYear'] if col not in top_5_features]
columns_to_keep = top_5_features + extra_columns
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]

# Step 3: Split into features and labels
X_train = df_train_top5.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top5['Total_Returns_NextYear']
X_test = df_test_top5.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top5['Total_Returns_NextYear']

# Step 4: Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole metric into inf/nan), so those rows are left out of the average.
y_true_arr = np.asarray(y_test, dtype=float)
nonzero = y_true_arr != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true_arr[nonzero] - y_pred_test[nonzero]) / y_true_arr[nonzero])) * 100
else:
    mape = float("nan")

# Step 6: Print metrics
print("FIRSH APPROACH - TOP 5 FEATURES")
print("Linear Regression (Top 5 Correlated Features)")
print(f"R²   (R-squared test):          {r2_test:.4f}")
print(f"R²   (R-squared train):         {r2_train:.4f}")
print(f"MSE  (Mean Squared Error):      {mse:.2f}")
print(f"MAPE (Mean Absolute % Error):   {mape:.2f}%")

# Define result string (same metrics, formatted for the shared results file)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  5 FEATURES \n"
    "Linear Regression:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# Path to results file
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append to file (mode "a": every run of this cell adds another entry)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)



FIRSH APPROACH - TOP 5 FEATURES
Linear Regression (Top 5 Correlated Features)
R²   (R-squared test):          0.3999
R²   (R-squared train):         0.8768
MSE  (Mean Squared Error):      35076693351.73
MAPE (Mean Absolute % Error):   98.08%


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Random Forest baseline.
# Fits a RandomForestRegressor on the five training columns most correlated
# (in absolute value) with 'Total_Returns_NextYear' plus 'Year', prints
# train/test metrics and appends them to the results file.

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Get Top 5 most correlated features with the target (training frame only)
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# Step 2: Keep only top 5 + 'Year' + target.
# 'Year' is appended only when it is not already one of the top-5 features;
# otherwise the column would be selected twice and the model would be fed a
# duplicated feature.
extra_columns = [col for col in ['Year', 'Total_Returns_NextYear'] if col not in top_5_features]
columns_to_keep = top_5_features + extra_columns
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]

# Step 3: Split into X and y
X_train = df_train_top5.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top5['Total_Returns_NextYear']
X_test = df_test_top5.drop('Total_Returns_NextYear', axis=1)
y_test = df_test_top5['Total_Returns_NextYear']

# Step 4: Train Random Forest model (fixed random_state for repeatable runs)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Step 5: Predict
y_pred_test = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Step 6: Evaluate
r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole metric into inf/nan), so those rows are left out of the average.
y_true_arr = np.asarray(y_test, dtype=float)
nonzero = y_true_arr != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true_arr[nonzero] - y_pred_test[nonzero]) / y_true_arr[nonzero])) * 100
else:
    mape = float("nan")

# Step 7: Print metrics
print("FIRSH APPROACH - TOP 5 FEATURES")
print("Random Forest (Top 5 Correlated Features)")
print(f"R²   (R-squared test):          {r2_test:.4f}")
print(f"R²   (R-squared train):         {r2_train:.4f}")
print(f"MSE  (Mean Squared Error):      {mse:.2f}")
print(f"MAPE (Mean Absolute % Error):   {mape:.2f}%")

# Define result string (same metrics, formatted for the shared results file)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  5 FEATURES \n"
    "Random Forest:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

# Path to results file
results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"

# Append to file (mode "a": every run of this cell adds another entry)
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 5 FEATURES
Random Forest (Top 5 Correlated Features)
R²   (R-squared test):          0.5582
R²   (R-squared train):         0.9727
MSE  (Mean Squared Error):      25826327079.81
MAPE (Mean Absolute % Error):   76.03%


In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# XGBoost baseline.
# Fits an XGBRegressor on the five training columns most correlated (in
# absolute value) with 'Total_Returns_NextYear' plus 'Year', prints
# train/test metrics and appends them to the results file.
# Unlike the scikit-learn cells, features and targets are handed to the model
# as plain numpy arrays (.values).

# File paths
train_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_train_encoded.csv"
test_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\FIRST_APPROACH\data_test_encoded.csv"

# Load datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Step 1: Get top 5 correlated features (training frame only)
corr_matrix = df_train.corr(numeric_only=True)
target_corr = corr_matrix['Total_Returns_NextYear'].drop('Total_Returns_NextYear')
top_5_features = target_corr.abs().sort_values(ascending=False).head(5).index.tolist()

# Step 2: Filter columns.
# 'Year' is appended only when it is not already one of the top-5 features;
# otherwise the column would be selected twice and the model would be fed a
# duplicated feature.
extra_columns = [col for col in ['Year', 'Total_Returns_NextYear'] if col not in top_5_features]
columns_to_keep = top_5_features + extra_columns
df_train_top5 = df_train[columns_to_keep]
df_test_top5 = df_test[columns_to_keep]

# Step 3: Split X and y
X_train = df_train_top5.drop('Total_Returns_NextYear', axis=1)
X_test = df_test_top5.drop('Total_Returns_NextYear', axis=1)
y_train = df_train_top5['Total_Returns_NextYear'].values
y_test = df_test_top5['Total_Returns_NextYear'].values

# Step 4: Train XGBoost Regressor using numpy arrays
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train.values, y_train)

# Step 5: Predict and evaluate using numpy arrays
y_pred_test = xgb_model.predict(X_test.values)
y_pred_train = xgb_model.predict(X_train.values)

# Step 6: Metrics
r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
mse = mean_squared_error(y_test, y_pred_test)

# MAPE is undefined where the true value is 0 (division by zero would turn the
# whole metric into inf/nan), so those rows are left out of the average.
y_true_arr = np.asarray(y_test, dtype=float)
nonzero = y_true_arr != 0
if nonzero.any():
    mape = np.mean(np.abs((y_true_arr[nonzero] - y_pred_test[nonzero]) / y_true_arr[nonzero])) * 100
else:
    mape = float("nan")

# Step 7: Print results
print("FIRSH APPROACH - TOP 5 FEATURES")
print("XGBoost (Top 5 Correlated Features)")
print(f"R²   (R-squared test):          {r2_test:.4f}")
print(f"R²   (R-squared train):         {r2_train:.4f}")
print(f"MSE  (Mean Squared Error):      {mse:.2f}")
print(f"MAPE (Mean Absolute % Error):   {mape:.2f}%")

# Step 8: Save results to file (mode "a": every run of this cell adds another entry)
result_str = (
    "\n\n===============================\n"
    "FIRSH APPROACH - TOP  5 FEATURES \n"
    "XGBoost:\n"
    f"R²   (R-squared test):          {r2_test:.4f}\n"
    f"R²   (R-squared train):         {r2_train:.4f}\n"
    f"MSE  (Mean Squared Error):      {mse:.2f}\n"
    f"MAPE (Mean Absolute % Error):   {mape:.2f}%\n"
)

results_path = r"C:\Users\MuriloFarias\Desktop\NNS-JULIA\PredictSalmonRuns\murilo_salmon\Columbia\results.txt"
with open(results_path, "a", encoding="utf-8") as f:
    f.write(result_str)


FIRSH APPROACH - TOP 5 FEATURES
XGBoost (Top 5 Correlated Features)
R²   (R-squared test):          0.4455
R²   (R-squared train):         1.0000
MSE  (Mean Squared Error):      32412714191.55
MAPE (Mean Absolute % Error):   83.48%
