In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [20]:
df = pd.read_csv('resources/2014_player_data_standardized.csv')
df = df.drop('Unnamed: 0', axis=1)

In [21]:
X = pd.get_dummies(df.drop('player_performance_valuation_standardized', axis=1), drop_first=True)
X = X.dropna()

y = df['player_performance_valuation_standardized']
y = y[X.index]


# Remove outliers from y using IQR method
Q1 = y.quantile(0.25)
Q3 = y.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
non_outliers = (y >= lower_bound) & (y <= upper_bound)
X = X[non_outliers]
y = y[non_outliers]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
lasso_model = Lasso()
grid_search_lasso = GridSearchCV(lasso_model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search_lasso.fit(X_train, y_train)

best_lasso_params = grid_search_lasso.best_params_
best_lasso_model = grid_search_lasso.best_estimator_
lasso_predictions = best_lasso_model.predict(X_test)

# Calculate performance metrics
lasso_mse = mean_squared_error(y_test, lasso_predictions)
lasso_r2 = r2_score(y_test, lasso_predictions)
lasso_mae = mean_absolute_error(y_test, lasso_predictions)

# Find best and worst predictions
lasso_errors = abs(lasso_predictions - y_test)
best_lasso_idx = lasso_errors.argmin()  # index of smallest error (best prediction)
worst_lasso_idx = lasso_errors.argmax()  # index of largest error (worst prediction)

best_lasso_prediction = lasso_predictions[best_lasso_idx]
worst_lasso_prediction = lasso_predictions[worst_lasso_idx]
best_lasso_actual = y_test.iloc[best_lasso_idx]
worst_lasso_actual = y_test.iloc[worst_lasso_idx]

# Print results for Lasso
print(f"Lasso Regression Performance (After Hyperparameter Tuning):")
print(f"Best Alpha: {best_lasso_params['alpha']}")
print(f"R²: {lasso_r2:.2f}")
print(f"MSE: {lasso_mse:.2f}")
print(f"MAE: {lasso_mae:.2f}")
print(f"Best Lasso Prediction: {best_lasso_prediction:.2f}, Actual: {best_lasso_actual:.2f}")
print(f"Worst Lasso Prediction: {worst_lasso_prediction:.2f}, Actual: {worst_lasso_actual:.2f}")

  model = cd_fast.enet_coordinate_descent(


Lasso Regression Performance (After Hyperparameter Tuning):
Best Alpha: 0.1
R²: 0.06
MSE: 0.06
MAE: 0.18
Best Lasso Prediction: -0.30, Actual: -0.31
Worst Lasso Prediction: -0.25, Actual: 0.57


In [19]:
# Get feature names if using a DataFrame
feature_names = X_train.columns  # Assuming X_train is a DataFrame

# Get the coefficients from the best Lasso model
lasso_coefficients = best_lasso_model.coef_

# Identify features with non-zero coefficients
used_features = feature_names[lasso_coefficients != 0]
excluded_features = feature_names[lasso_coefficients == 0]

print("Features used by Lasso regression:")
print(used_features.tolist())

print("\nFeatures excluded (coefficient = 0):")
print(excluded_features.tolist())

Features used by Lasso regression:
['Unnamed: 0', 'minutes_played', 'key_passes', 'market_value_in_eur', 'age_in_months_2015', 'team_rank']

Features excluded (coefficient = 0):
['goals', 'npg', 'assists', 'xG', 'xA', 'npxG', 'xG90', 'xA90', 'npxG90', 'shots', 'yellow_cards', 'red_cards', 'xGBuildup', 'xGChain', 'height_in_cm', 'points_per_game', 'player_name_Abdoulay Konko', 'player_name_Abdoulaye Keita', 'player_name_Accursio Bentivegna', 'player_name_Achraf Lazaar', 'player_name_Adam Hlousek', 'player_name_Adam Johnson', 'player_name_Adam Lallana', 'player_name_Adlène Guédioura', 'player_name_Adnan Januzaj', 'player_name_Adrian Mariappa', 'player_name_Adrián Embarba', 'player_name_Adrián Ramos', 'player_name_Alan Hutton', 'player_name_Alberto Gilardino', 'player_name_Alberto Grassi', 'player_name_Alberto Moreno', 'player_name_Albin Ekdal', 'player_name_Aleksandar Ignjovski', 'player_name_Aleksandar Kolarov', 'player_name_Alessandro Diamanti', 'player_name_Alessandro Florenzi', 'play