In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
from google.colab import files

In [None]:
# Ask the user to upload the picks CSV through Colab's `files` helper.
# The returned object is iterated later to obtain the uploaded file name.
uploaded_file = files.upload()

Saving picktable.csv to picktable.csv


In [None]:
# Load the uploaded CSV into the raw picks table.
# Only the first uploaded file name is used. An empty upload (for example a
# cancelled dialog) would otherwise surface as a bare StopIteration, so it is
# reported with an explicit error instead.
data_filename = next(iter(uploaded_file), None)
if data_filename is None:
    raise ValueError("No file was uploaded; re-run the upload cell and select the picks CSV.")
data = pd.read_csv(data_filename)

Our goal is to find the champion combinations with the best synergy that have been picked over the years, based on the 1st pick in the draft phase.

It is important to note that this synergy could also be analysed using every pick, but our goal is to find the result based on the 1st pick, because the 1st pick is often the most important pick during the draft phase (i.e. the strongest champion, or the best champion for a specific player/role).

# Data Preprocessing

We keep only the pick combinations from winning results (`result == 1`) for better output.

In [None]:
# Keep only the rows whose `result` column equals 1 (the winning drafts).
is_win = data['result'] == 1
winning_data = data[is_win]

We use a one-hot encoder (`OneHotEncoder`) to encode the champion names so that they can be fed to the model.

In [None]:
# One-hot encode the five pick columns in a single pass; the sparse result is
# converted to a dense array so it can be sliced column-wise afterwards.
pick_columns = ['pick1', 'pick2', 'pick3', 'pick4', 'pick5']
encoder = OneHotEncoder()
encoded_winning_picks = encoder.fit_transform(winning_data[pick_columns]).toarray()

We use pick1 as the input variable and the remaining 4 picks as the target variables,
i.e. given the pick 1 champion, the model will give us 4 more champions.

In [None]:
# The encoder lays its output columns out feature by feature, so the first
# block (as wide as the number of pick1 categories) is the input and
# everything after it is the target.
n_pick1_columns = encoder.categories_[0].size
X_winning = encoded_winning_picks[:, :n_pick1_columns]  # pick1
y_winning = encoded_winning_picks[:, n_pick1_columns:]  # pick2 to pick5

In [None]:
# Keep the 10% "train" side of the split as the working subset; the 90%
# remainder is discarded into `_`.
X_subset, _, y_subset, _ = train_test_split(
    X_winning,
    y_winning,
    test_size=0.9,
    random_state=42,
)

We use only 10% of the dataset for faster results, because the grid search takes a long time.

In [None]:
# Split the working subset 80/20 into a training part (used by the grid
# searches) and a validation part (used to score each best model).
X_subset_train, X_subset_val, y_subset_train, y_subset_val = train_test_split(
    X_subset,
    y_subset,
    test_size=0.2,
    random_state=42,
)

# Grid Search

## XGBRegressor

In [None]:
# Wrap a squared-error XGBRegressor so that one regressor is fitted per
# one-hot target column.
xgb_base_regressor = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model_winning_sample = MultiOutputRegressor(estimator=xgb_base_regressor)

In [None]:
# Search space for the XGBRegressor grid search (3 x 3 x 4 = 36 candidates).
# The `estimator__` prefix addresses the regressor wrapped by MultiOutputRegressor.
param_grid_xgb = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [3, 4, 5, 6],
}

In [None]:
# Exhaustive 3-fold search over `param_grid_xgb`, ranked by negative MSE.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model_winning_sample,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the XGBRegressor search on the training part of the subset.
grid_search_xgb.fit(X=X_subset_train, y=y_subset_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
# Pull the winning hyper-parameters and the corresponding refitted model
# out of the finished XGBRegressor search.
best_params_xgb, best_model_xgb = (
    grid_search_xgb.best_params_,
    grid_search_xgb.best_estimator_,
)


In [None]:
# Score the best XGBRegressor model on the held-out validation part of the subset.
y_subset_pred_best_xgb = best_model_xgb.predict(X_subset_val)
mse_subset_best_xgb = mean_squared_error(
    y_true=y_subset_val,
    y_pred=y_subset_pred_best_xgb,
)

In [None]:
# Report the validation MSE and the selected hyper-parameters for XGBRegressor.
xgb_report = (
    ("Best MSE on subset (XGBRegressor):", mse_subset_best_xgb),
    ("Best parameters (XGBRegressor) from subset:", best_params_xgb),
)
for label, value in xgb_report:
    print(label, value)

Best MSE on subset (XGBRegressor): 0.006485400442611549
Best parameters (XGBRegressor) from subset: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}


Best MSE on subset (XGBRegressor): 0.006485400442611549

Best parameters (XGBRegressor) from subset: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}

## RandomForestRegressor

In [None]:
# Wrap a seeded RandomForestRegressor so that one forest is fitted per
# one-hot target column.
rf_base_regressor = RandomForestRegressor(random_state=42)
rf_model_winning_sample = MultiOutputRegressor(estimator=rf_base_regressor)

In [None]:
# Search space for the RandomForestRegressor grid search (3 x 4 = 12 candidates).
# The `estimator__` prefix addresses the regressor wrapped by MultiOutputRegressor.
param_grid_rf = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [3, 4, 5, 6],
}

In [None]:
# Exhaustive 3-fold search over `param_grid_rf`, ranked by negative MSE.
grid_search_rf = GridSearchCV(
    estimator=rf_model_winning_sample,
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the RandomForestRegressor search on the training part of the subset.
grid_search_rf.fit(X=X_subset_train, y=y_subset_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
# Pull the winning hyper-parameters and the corresponding refitted model
# out of the finished RandomForestRegressor search.
best_params_rf, best_model_rf = (
    grid_search_rf.best_params_,
    grid_search_rf.best_estimator_,
)

In [None]:
# Score the best RandomForestRegressor model on the held-out validation part of the subset.
y_subset_pred_best_rf = best_model_rf.predict(X_subset_val)
mse_subset_best_rf = mean_squared_error(
    y_true=y_subset_val,
    y_pred=y_subset_pred_best_rf,
)

In [None]:
# Report the validation MSE and the selected hyper-parameters for RandomForestRegressor.
rf_report = (
    ("Best MSE on subset (RandomForestRegressor):", mse_subset_best_rf),
    ("Best parameters (RandomForestRegressor) from subset:", best_params_rf),
)
for label, value in rf_report:
    print(label, value)

Best MSE on subset (RandomForestRegressor): 0.00678526258377234
Best parameters (RandomForestRegressor) from subset: {'estimator__max_depth': 3, 'estimator__n_estimators': 100}


Best MSE on subset (RandomForestRegressor): 0.00678526258377234

Best parameters (RandomForestRegressor) from subset: {'estimator__max_depth': 3, 'estimator__n_estimators': 100}

## GradientBoostingRegressor

In [None]:
# Wrap a seeded GradientBoostingRegressor so that one booster is fitted per
# one-hot target column.
gbr_base_regressor = GradientBoostingRegressor(random_state=42)
gbr_model_winning_sample = MultiOutputRegressor(estimator=gbr_base_regressor)

In [None]:
# Search space for the GradientBoostingRegressor grid search (3 x 3 x 4 = 36 candidates).
# The `estimator__` prefix addresses the regressor wrapped by MultiOutputRegressor.
param_grid_gbr = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [3, 4, 5, 6],
}

In [None]:
# Exhaustive 3-fold search over `param_grid_gbr`, ranked by negative MSE.
grid_search_gbr = GridSearchCV(
    estimator=gbr_model_winning_sample,
    param_grid=param_grid_gbr,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the GradientBoostingRegressor search on the training part of the subset.
grid_search_gbr.fit(X=X_subset_train, y=y_subset_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
# Pull the winning hyper-parameters and the corresponding refitted model
# out of the finished GradientBoostingRegressor search.
best_params_gbr, best_model_gbr = (
    grid_search_gbr.best_params_,
    grid_search_gbr.best_estimator_,
)

In [None]:
# Score the best GradientBoostingRegressor model on the held-out validation part of the subset.
y_subset_pred_best_gbr = best_model_gbr.predict(X_subset_val)
mse_subset_best_gbr = mean_squared_error(
    y_true=y_subset_val,
    y_pred=y_subset_pred_best_gbr,
)

In [None]:
# Report the validation MSE and the selected hyper-parameters for GradientBoostingRegressor.
gbr_report = (
    ("Best MSE on subset (GradientBoostingRegressor):", mse_subset_best_gbr),
    ("Best parameters (GradientBoostingRegressor) from subset:", best_params_gbr),
)
for label, value in gbr_report:
    print(label, value)

Best MSE on subset (GradientBoostingRegressor): 0.006523597808535117
Best parameters (GradientBoostingRegressor) from subset: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}


Best MSE on subset (GradientBoostingRegressor): 0.006523597808535117

Best parameters (GradientBoostingRegressor) from subset: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}

## DecisionTreeRegressor

In [None]:
# Wrap a seeded DecisionTreeRegressor so that one tree is fitted per
# one-hot target column.
dt_base_regressor = DecisionTreeRegressor(random_state=42)
dt_model_winning_sample = MultiOutputRegressor(estimator=dt_base_regressor)

In [None]:
# Search space for the DecisionTreeRegressor grid search (4 x 2 x 2 = 16 candidates).
# The `estimator__` prefix addresses the regressor wrapped by MultiOutputRegressor.
param_grid_dt = {
    'estimator__max_depth': [3, 4, 5, 6],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [2, 4],
}

In [None]:
# Exhaustive 3-fold search over `param_grid_dt`, ranked by negative MSE.
grid_search_dt = GridSearchCV(
    estimator=dt_model_winning_sample,
    param_grid=param_grid_dt,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the DecisionTreeRegressor search on the training part of the subset.
grid_search_dt.fit(X=X_subset_train, y=y_subset_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [None]:
# Pull the winning hyper-parameters and the corresponding refitted model
# out of the finished DecisionTreeRegressor search.
best_params_dt, best_model_dt = (
    grid_search_dt.best_params_,
    grid_search_dt.best_estimator_,
)

In [None]:
# Score the best DecisionTreeRegressor model on the held-out validation part of the subset.
y_subset_pred_best_dt = best_model_dt.predict(X_subset_val)
mse_subset_best_dt = mean_squared_error(
    y_true=y_subset_val,
    y_pred=y_subset_pred_best_dt,
)

In [None]:
# Report the validation MSE and the selected hyper-parameters for DecisionTreeRegressor.
dt_report = (
    ("Best MSE on subset (DecisionTreeRegressor):", mse_subset_best_dt),
    ("Best parameters (DecisionTreeRegressor) from subset:", best_params_dt),
)
for label, value in dt_report:
    print(label, value)

Best MSE on subset (DecisionTreeRegressor): 0.0066696733724682265
Best parameters (DecisionTreeRegressor) from subset: {'estimator__max_depth': 3, 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 2}


Best MSE on subset (DecisionTreeRegressor): 0.0066696733724682265

Best parameters (DecisionTreeRegressor) from subset: {'estimator__max_depth': 3, 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 2}

# Result

Best MSE on subset (XGBRegressor): 0.006485400442611549

Best parameters (XGBRegressor) from subset: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}

Best MSE on subset (RandomForestRegressor): 0.00678526258377234

Best parameters (RandomForestRegressor) from subset: {'estimator__max_depth': 3, 'estimator__n_estimators': 100}

Best MSE on subset (GradientBoostingRegressor): 0.006523597808535117

Best parameters (GradientBoostingRegressor) from subset: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}

Best MSE on subset (DecisionTreeRegressor): 0.0066696733724682265

Best parameters (DecisionTreeRegressor) from subset: {'estimator__max_depth': 3, 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 2}