In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv('resources/test.csv')

In [3]:
X = pd.get_dummies(df.drop('goals', axis=1), drop_first=True)
X = X.dropna()

y = df['goals']
y = y[X.index]


# Remove outliers from y using IQR method
Q1 = y.quantile(0.25)
Q3 = y.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
non_outliers = (y >= lower_bound) & (y <= upper_bound)
X = X[non_outliers]
y = y[non_outliers]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
lasso_model = Lasso()
grid_search_lasso = GridSearchCV(lasso_model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search_lasso.fit(X_train, y_train)

best_lasso_params = grid_search_lasso.best_params_
best_lasso_model = grid_search_lasso.best_estimator_
lasso_predictions = best_lasso_model.predict(X_test)

# Calculate performance metrics
lasso_mse = mean_squared_error(y_test, lasso_predictions)
lasso_r2 = r2_score(y_test, lasso_predictions)
lasso_mae = mean_absolute_error(y_test, lasso_predictions)

# Find best and worst predictions
lasso_errors = abs(lasso_predictions - y_test)
best_lasso_idx = lasso_errors.argmin()  # index of smallest error (best prediction)
worst_lasso_idx = lasso_errors.argmax()  # index of largest error (worst prediction)

best_lasso_prediction = lasso_predictions[best_lasso_idx]
worst_lasso_prediction = lasso_predictions[worst_lasso_idx]
best_lasso_actual = y_test.iloc[best_lasso_idx]
worst_lasso_actual = y_test.iloc[worst_lasso_idx]

# Print results for Lasso
print(f"Lasso Regression Performance (After Hyperparameter Tuning):")
print(f"Best Alpha: {best_lasso_params['alpha']}")
print(f"R²: {lasso_r2:.2f}")
print(f"MSE: {lasso_mse:.2f}")
print(f"MAE: {lasso_mae:.2f}")
print(f"Best Lasso Prediction: {best_lasso_prediction:.2f}, Actual: {best_lasso_actual:.2f}")
print(f"Worst Lasso Prediction: {worst_lasso_prediction:.2f}, Actual: {worst_lasso_actual:.2f}")

Lasso Regression Performance (After Hyperparameter Tuning):
Best Alpha: 0.001
R²: 0.61
MSE: 0.01
MAE: 0.06
Best Lasso Prediction: 0.07, Actual: 0.07
Worst Lasso Prediction: 0.05, Actual: 0.40
