In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset under
# MyDrive can be read with an ordinary file path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the per-player match statistics from the mounted Drive folder.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/Final players df.csv'
)

Approach 3, without the major metrics - "kills", "deaths", "assists"

The columns were chosen using a mix of a correlation matrix and Random Forest feature selection, keeping the metrics that are most impactful in-game.

# Data Preprocessing

In [None]:
# Feature columns: for each snapshot minute (10, 15) the player's economy
# stats followed by the opponent's, then the player's combat stats followed
# by the opponent's. The target column 'result' comes last.
columns = [
    f'{side}{stat}at{minute}'
    for minute in (10, 15)
    for stat_group in (('gold', 'xp', 'cs'), ('kills', 'assists', 'deaths'))
    for side in ('', 'opp_')
    for stat in stat_group
] + ['result']

In [None]:
# Keep only the selected feature columns plus the target.
columns_df = df.loc[:, columns]

In [None]:
# Count missing values per selected column.
columns_df.isnull().sum()

Unnamed: 0,0
goldat10,0
xpat10,0
csat10,0
opp_goldat10,0
opp_xpat10,0
opp_csat10,0
killsat10,0
assistsat10,0
deathsat10,0
opp_killsat10,0


In [None]:
# Separate the target label from the feature matrix.
y = columns_df['result']
X = columns_df.drop('result', axis='columns')

In [None]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,goldat10,xpat10,csat10,opp_goldat10,opp_xpat10,opp_csat10,killsat10,assistsat10,deathsat10,opp_killsat10,...,csat15,opp_goldat15,opp_xpat15,opp_csat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
0,3319.0,4466.0,69.0,2905.0,4125.0,72.0,1.0,0.0,0.0,0.0,...,107.0,4293.0,6875.0,111.0,2.0,0.0,0.0,0.0,0.0,2.0
1,3201.0,3177.0,64.0,3305.0,2983.0,55.0,0.0,1.0,0.0,1.0,...,93.0,4955.0,5309.0,99.0,2.0,1.0,0.0,1.0,0.0,0.0
2,3139.0,4360.0,82.0,3253.0,4766.0,83.0,0.0,1.0,1.0,0.0,...,142.0,4923.0,7388.0,132.0,0.0,1.0,1.0,0.0,1.0,0.0
3,3167.0,2950.0,77.0,2956.0,2503.0,72.0,0.0,0.0,0.0,0.0,...,132.0,4477.0,4479.0,119.0,1.0,2.0,0.0,0.0,0.0,2.0
4,2315.0,2306.0,1.0,2138.0,2431.0,1.0,0.0,0.0,0.0,0.0,...,2.0,3155.0,3645.0,5.0,0.0,3.0,0.0,0.0,0.0,1.0


# Method

Splitting the dataset

In [None]:
# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
    stratify=y,
)

Grid search

In [None]:
# KNN search space: neighbourhood size and distance metric.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 10],
    metric=['euclidean', 'manhattan', 'chebyshev'],
)

In [None]:
# Decision-tree search space: split criterion and maximum tree depth.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 7)),
)

In [None]:
# Logistic-regression search space: inverse regularisation strength and solver.
param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

In [None]:
# Random-forest search space: number of trees and split criterion.
param_grid_rf = dict(
    n_estimators=[10, 100, 200],
    criterion=['gini', 'entropy'],
)

In [None]:
# Gaussian naive-Bayes search space: variance-smoothing term only.
param_grid_nb = dict(var_smoothing=[1e-8, 1e-7, 1e-6, 1e-5])

In [None]:
# Gradient-boosting search space: number of boosting stages and tree depth.
param_grid_gb = dict(
    n_estimators=[10, 100, 200],
    max_depth=list(range(3, 7)),
)

In [None]:
# XGBoost search space: number of boosting rounds and tree depth.
param_grid_xgb = dict(
    n_estimators=[10, 100, 200],
    max_depth=list(range(3, 7)),
)

In [None]:
# LightGBM search space: number of boosting rounds and tree depth.
param_grid_lgb = dict(
    n_estimators=[10, 100, 200],
    max_depth=list(range(3, 7)),
)

In [None]:
# Bagging search space: ensemble size and fraction of features per estimator.
param_grid_bagging = dict(
    n_estimators=[10, 100, 200],
    max_features=[0.5, 0.7, 1.0],
)

Defining the models


In [None]:
# First batch of candidates: display name -> (estimator, hyper-parameter grid).
# Estimators that use randomness get a fixed seed (the same value as the
# train/test split) so a fresh Restart & Run All reproduces the scores.
# KNN has no random component and therefore takes no seed.
models = {
    'KNN': (KNeighborsClassifier(), param_grid_knn),
    'Decision Tree': (DecisionTreeClassifier(random_state=23), param_grid_dt),
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=23), param_grid_lr),
    'Random Forest': (RandomForestClassifier(random_state=23), param_grid_rf),
}

In [None]:
# Second batch of candidates: display name -> (estimator, hyper-parameter grid).
# The boosting estimators get a fixed seed (the same value as the train/test
# split) so repeated runs give the same scores. GaussianNB is deterministic.
models2 = {
    'Naive Bayes': (GaussianNB(), param_grid_nb),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=23), param_grid_gb),
    'XGBoost': (XGBClassifier(random_state=23), param_grid_xgb),
}

In [None]:
# LightGBM is tuned in its own batch.
# random_state fixes the seed (same value as the train/test split) so runs are
# repeatable. verbose=-1 silences LightGBM's per-fit [Info] logging, which
# otherwise emits thousands of lines across the grid-search fits and buries
# the actual result.
models3 = {
    'LightGBM': (LGBMClassifier(random_state=23, verbose=-1), param_grid_lgb),
}

In [None]:
# Bagging is tuned in its own batch.
# random_state fixes the bootstrap/feature sampling (same seed as the
# train/test split) so runs are repeatable. n_jobs=1 keeps the estimator
# single-process, leaving parallelism to the grid search that wraps it.
models4 = {
    'Bagging': (BaggingClassifier(n_jobs=1, random_state=23), param_grid_bagging),
}

Using a pipeline to scale the features before fitting each model

In [None]:
# Tune every (estimator, grid) pair in `models` and report the best
# cross-validated accuracy together with the accuracy on the held-out test set.
for model_name, (model, param_grid) in models.items():

    # Scaling is a pipeline step, so it is re-fitted on the training part of
    # every CV fold rather than on the full training set.
    pipeline = Pipeline(steps=[('scaler', RobustScaler()), ('model', model)])

    # Pipeline parameters are addressed as '<step name>__<parameter>'.
    adjusted_param_grid = {
        f'model__{name}': options for name, options in param_grid.items()
    }

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # best_estimator_ is the pipeline refitted on all of X_train.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

KNN - Best Params: {'model__metric': 'euclidean', 'model__n_neighbors': 10}, Best Score: 0.6204985219371711
Test Accuracy for KNN: 0.6242362525458248
Decision Tree - Best Params: {'model__criterion': 'gini', 'model__max_depth': 6}, Best Score: 0.6438043743610977
Test Accuracy for Decision Tree: 0.6439548231808925
Logistic Regression - Best Params: {'model__C': 100, 'model__solver': 'liblinear'}, Best Score: 0.663904809628483
Test Accuracy for Logistic Regression: 0.6668672468061471
Random Forest - Best Params: {'model__criterion': 'gini', 'model__n_estimators': 200}, Best Score: 0.6542422478044042
Test Accuracy for Random Forest: 0.6561747824476949


KNN - Best Params: {'model__metric': 'euclidean', 'model__n_neighbors': 10}, Best Score: 0.6204985219371711

Test Accuracy for KNN: 0.6242362525458248

Decision Tree - Best Params: {'model__criterion': 'gini', 'model__max_depth': 6}, Best Score: 0.6438043743610977

Test Accuracy for Decision Tree: 0.6439548231808925

Logistic Regression - Best Params: {'model__C': 100, 'model__solver': 'liblinear'}, Best Score: 0.663904809628483

Test Accuracy for Logistic Regression: 0.6668672468061471

Random Forest - Best Params: {'model__criterion': 'gini', 'model__n_estimators': 200}, Best Score: 0.6542422478044042

Test Accuracy for Random Forest: 0.6561747824476949

In [None]:
# Same tuning procedure for the second batch (`models2`): scale, grid-search
# with 5-fold CV on the training set, then score the winner on the test set.
for model_name, (model, param_grid) in models2.items():

    scaling_and_model_steps = [('scaler', RobustScaler()), ('model', model)]
    pipeline = Pipeline(steps=scaling_and_model_steps)

    # Prefix each grid key with the pipeline step name it belongs to.
    adjusted_param_grid = {
        f'model__{name}': options for name, options in param_grid.items()
    }

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Evaluate the refitted best pipeline on data it has never seen.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.6475652513088438
Test Accuracy for Naive Bayes: 0.6508516941307165
Gradient Boosting - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.6632799268204139
Test Accuracy for Gradient Boosting: 0.6658026291427513
XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 100}, Best Score: 0.6620764293786745
Test Accuracy for XGBoost: 0.6635345306424736


Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.6475652513088438

Test Accuracy for Naive Bayes: 0.6508516941307165

Gradient Boosting - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.6632799268204139

Test Accuracy for Gradient Boosting: 0.6658026291427513

XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 100}, Best Score: 0.6620764293786745

Test Accuracy for XGBoost: 0.6635345306424736

In [None]:
# Same tuning procedure for the LightGBM batch (`models3`).
for model_name, (model, param_grid) in models3.items():

    pipeline = Pipeline(
        steps=[
            ('scaler', RobustScaler()),
            ('model', model),
        ]
    )

    # Route each grid entry to the 'model' step of the pipeline.
    adjusted_param_grid = {
        f'model__{name}': options for name, options in param_grid.items()
    }

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Held-out accuracy of the best configuration.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 34564, number of negative: 34569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2737
[LightGBM] [Info] Number of data points in the train set: 69133, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499964 -> initscore=-0.000145
[LightGBM] [Info] Start training from score -0.000145
[LightGBM] [Info] Number of positive: 34563, number of negative: 34570
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2735
[LightGBM] [Info] Number of data points in the

LightGBM - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.6642056712679485

Test Accuracy for LightGBM: 0.6657563414182559

In [None]:
# Tune the Bagging batch (`models4`): scale, grid-search with 5-fold CV on the
# training set, then score the best pipeline on the held-out test set.
for model_name, (model, param_grid) in models4.items():

    # Use the estimator configured in `models4` instead of a hard-coded
    # BaggingClassifier, so edits to that dictionary actually take effect.
    # The estimator is kept single-process where it supports n_jobs: the grid
    # search below already fans out across all cores, and nesting worker
    # pools would only oversubscribe the machine.
    if 'n_jobs' in model.get_params():
        model.set_params(n_jobs=1)

    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('model', model)
    ])

    # Pipeline parameters are addressed as '<step name>__<parameter>'.
    adjusted_param_grid = {f'model__{key}': value for key, value in param_grid.items()}

    grid_search = GridSearchCV(estimator=pipeline, param_grid=adjusted_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # best_estimator_ is the pipeline refitted on all of X_train.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')



Bagging - Best Params: {'model__max_features': 0.5, 'model__n_estimators': 200}, Best Score: 0.6530271628985616
Test Accuracy for Bagging: 0.6539066839474171


Bagging - Best Params: {'model__max_features': 0.5, 'model__n_estimators': 200}, Best Score: 0.6530271628985616

Test Accuracy for Bagging: 0.6539066839474171

# Result

KNN - Best Params: {'model__metric': 'euclidean', 'model__n_neighbors': 10}, Best Score: 0.6204985219371711

Test Accuracy for KNN: 0.6242362525458248

Decision Tree - Best Params: {'model__criterion': 'gini', 'model__max_depth': 6}, Best Score: 0.6438043743610977

Test Accuracy for Decision Tree: 0.6439548231808925

Logistic Regression - Best Params: {'model__C': 100, 'model__solver': 'liblinear'}, Best Score: 0.663904809628483

Test Accuracy for Logistic Regression: 0.6668672468061471

Random Forest - Best Params: {'model__criterion': 'gini', 'model__n_estimators': 200}, Best Score: 0.6542422478044042

Test Accuracy for Random Forest: 0.6561747824476949

Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.6475652513088438

Test Accuracy for Naive Bayes: 0.6508516941307165

Gradient Boosting - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.6632799268204139

Test Accuracy for Gradient Boosting: 0.6658026291427513

XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 100}, Best Score: 0.6620764293786745

Test Accuracy for XGBoost: 0.6635345306424736

LightGBM - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.6642056712679485

Test Accuracy for LightGBM: 0.6657563414182559

Bagging - Best Params: {'model__max_features': 0.5, 'model__n_estimators': 200}, Best Score: 0.6530271628985616

Test Accuracy for Bagging: 0.6539066839474171