In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='resources/final_data_wins.csv')

In [3]:
# Feature matrix: every column except the target, passed through
# pd.get_dummies with the first level of each encoded column dropped, then
# rows that still contain missing feature values are removed.
X = pd.get_dummies(df.drop(columns='current_value'), drop_first=True).dropna()

# Target values for exactly the rows that survived the dropna above.
y = df.loc[X.index, 'current_value']

# IQR fences on the target (1.5 * IQR beyond the quartiles).
# NOTE(review): the fences are computed on the full target *before* the
# train/test split, so test rows influence them and the test set never
# contains targets outside the fences -- confirm this is intended.
Q1, Q3 = y.quantile(0.25), y.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose target lies within the fences (both ends inclusive).
non_outliers = y.between(lower_bound, upper_bound)
X, y = X[non_outliers], y[non_outliers]

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 2 * 3 * 3 = 18 candidate settings.
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Base estimator with a fixed seed.
gbr_model = GradientBoostingRegressor(random_state=42)

# Exhaustive 5-fold search over the grid, selecting on (negated) MSE and
# using all available cores.
grid_search_gbr = GridSearchCV(
    estimator=gbr_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
)
grid_search_gbr.fit(X_train, y_train)

# Winning configuration and the corresponding fitted estimator.
best_gbr_params = grid_search_gbr.best_params_
best_gbr_model = grid_search_gbr.best_estimator_

# Evaluate on the hold-out set.
gbr_predictions = best_gbr_model.predict(X_test)
gbr_mae = mean_absolute_error(y_test, gbr_predictions)
gbr_mse = mean_squared_error(y_test, gbr_predictions)
gbr_r2 = r2_score(y_test, gbr_predictions)

# Per-row absolute error, used to pick the closest and the furthest prediction.
gbr_errors = np.abs(gbr_predictions - y_test.to_numpy())
best_gbr_idx = np.argmin(gbr_errors)
worst_gbr_idx = np.argmax(gbr_errors)

# Indices are positional, hence .iloc on the target Series.
best_gbr_prediction, best_gbr_actual = gbr_predictions[best_gbr_idx], y_test.iloc[best_gbr_idx]
worst_gbr_prediction, worst_gbr_actual = gbr_predictions[worst_gbr_idx], y_test.iloc[worst_gbr_idx]

# Summary report, one line per entry.
report_lines = (
    "Gradient Boosting Regression Performance (After Hyperparameter Tuning):",
    f"Best Parameters: {best_gbr_params}",
    f"R²: {gbr_r2:.2f}",
    f"MSE: {gbr_mse:.2f}",
    f"MAE: {gbr_mae:.2f}",
    f"Best GBR Prediction: {best_gbr_prediction:.2f}, Actual: {best_gbr_actual:.2f}",
    f"Worst GBR Prediction: {worst_gbr_prediction:.2f}, Actual: {worst_gbr_actual:.2f}",
)
print(*report_lines, sep="\n")

Gradient Boosting Regression Performance (After Hyperparameter Tuning):
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
R²: 0.82
MSE: 428164209931.84
MAE: 358480.11
Best GBR Prediction: 249897.58, Actual: 250000.00
Worst GBR Prediction: 1789490.16, Actual: 6000000.00


In [None]:
from sklearn.model_selection import cross_val_score

# Overfitting check. Overfitting shows up as a score on the training data that
# is much higher than the score on held-out data, so the tuned model's R² on
# X_train is compared with its 5-fold cross-validated R². Comparing CV R² with
# test R² alone cannot show this, because both are held-out estimates.
# NOTE: the CV estimate is computed on the same X_train the grid search tuned
# on, so it is somewhat optimistic; the hold-out test R² remains the unbiased
# reference.
cv_scores = cross_val_score(best_gbr_model, X_train, y_train, cv=5, scoring='r2')

# best_gbr_model is the grid search's best_estimator_ (fitted on X_train), and
# a regressor's .score() returns R², so this is the training-set R².
train_r2 = best_gbr_model.score(X_train, y_train)

print(f"Cross-validated R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.2f}")
print(f"Train R²: {train_r2:.2f}")
# A large positive gap means the model fits the training data far better than
# unseen data, i.e. it is overfitting.
print(f"Train - CV R² gap: {train_r2 - cv_scores.mean():.2f}")