In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be opened by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the source CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Final players df 2.csv',
)

Approach 2: models trained on the columns selected by performing Random Forest feature selection

# Data Preprocessing

In [None]:
# Columns kept for this approach; 'result' is listed last and is split off
# as the prediction target further down.
columns = [
    'kills',
    'deaths',
    'assists',
    'earnedgold',
    'earned gpm',
    'earnedgoldshare',
    'totalgold',
    'golddiffat15',
    'opp_goldat15',
    'goldspent',
    'result',
]

In [None]:
# Restrict the full frame to the selected feature columns plus the target.
columns_df = df[columns]

# Method

In [None]:
# Separate the label from the predictors: 'result' is the target and every
# other selected column is a feature.
y = columns_df['result']
X = columns_df.drop('result', axis='columns')

In [None]:
# Preview the first five feature rows as a quick sanity check.
X.head(n=5)

Unnamed: 0,kills,deaths,assists,earnedgold,earned gpm,earnedgoldshare,totalgold,golddiffat15,opp_goldat15,goldspent
0,3,1,6,7875,261.3385,0.192233,11839,724,4293,10600
1,4,1,5,7129,236.5819,0.174023,11093,338,4955,10350
2,3,2,8,8896,295.2212,0.217156,12860,410,4923,12075
3,9,0,9,12607,418.3739,0.307742,16571,1174,4477,14500
4,1,0,17,4459,147.9757,0.108847,8423,530,3155,7585


Splitting the dataset

In [None]:
# Hold out 20% of the rows for the final test; the split is stratified on the
# label and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
    stratify=y,
)

Grid search

In [None]:
# Search space for k-nearest neighbours: neighbourhood size and distance metric.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 10],
    metric=['euclidean', 'manhattan', 'chebyshev'],
)

In [None]:
# Search space for the decision tree: split criterion and maximum depth.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 4, 5, 6],
)

In [None]:
# Search space for logistic regression: inverse regularisation strength and solver.
param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

In [None]:
# Search space for the random forest: number of trees and split criterion.
param_grid_rf = dict(
    n_estimators=[10, 100, 200],
    criterion=['gini', 'entropy'],
)

In [None]:
# Search space for Gaussian naive Bayes: the variance-smoothing term only.
param_grid_nb = dict(
    var_smoothing=[1e-08, 1e-07, 1e-06, 1e-05],
)

In [None]:
# Search space for gradient boosting: number of boosting stages and tree depth.
param_grid_gb = dict(
    n_estimators=[10, 100, 200],
    max_depth=[3, 4, 5, 6],
)

In [None]:
# Search space for XGBoost: number of boosting rounds and tree depth.
param_grid_xgb = dict(
    n_estimators=[10, 100, 200],
    max_depth=[3, 4, 5, 6],
)

In [None]:
# Search space for LightGBM: number of boosting rounds and tree depth.
param_grid_lgb = dict(
    n_estimators=[10, 100, 200],
    max_depth=[3, 4, 5, 6],
)

In [None]:
# Search space for bagging: ensemble size and the fraction of features
# drawn for each base estimator.
param_grid_bagging = dict(
    n_estimators=[10, 100, 200],
    max_features=[0.5, 0.7, 1.0],
)

Defining the models


In [None]:
# First batch of candidates: display name -> (estimator, hyper-parameter grid).
# The stochastic estimators get a fixed random_state (23, the seed already used
# for the train/test split) so the tuned scores are reproducible on re-run.
# KNN has no random component and is left at its defaults.
models = {
    'KNN': (KNeighborsClassifier(), param_grid_knn),
    'Decision Tree': (DecisionTreeClassifier(random_state=23), param_grid_dt),
    # The 'saga' solver in the grid shuffles the data, hence the seed here too.
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=23), param_grid_lr),
    'Random Forest': (RandomForestClassifier(random_state=23), param_grid_rf),
}

In [None]:
# Second batch of candidates: display name -> (estimator, hyper-parameter grid).
# The boosting estimators get a fixed random_state (23, the seed already used
# for the train/test split) so the tuned scores are reproducible on re-run.
# Gaussian naive Bayes is deterministic and needs no seed.
models2 = {
    'Naive Bayes': (GaussianNB(), param_grid_nb),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=23), param_grid_gb),
    'XGBoost': (XGBClassifier(random_state=23), param_grid_xgb),
}

In [None]:
# LightGBM is tuned in its own batch: display name -> (estimator, grid).
#   random_state=23  same seed as the train/test split, for reproducible scores.
#   verbose=-1       silences the per-fit [Info]/[Warning] log lines, which
#                    otherwise flood the cell output once per CV fit.
models3 = {
    'LightGBM': (LGBMClassifier(random_state=23, verbose=-1), param_grid_lgb),
}

In [None]:
# Bagging is tuned in its own batch: display name -> (estimator, grid).
# The estimator is seeded (23, the seed already used for the train/test split)
# so that a search run on this instance draws the same bootstrap samples and
# feature subsets every time.
models4 = {
    'Bagging': (BaggingClassifier(random_state=23), param_grid_bagging),
}

Using a pipeline to scale the features before fitting each model

In [None]:
# Tune and evaluate every candidate in `models`:
#   1. wrap the estimator in a scaler -> model pipeline,
#   2. grid-search its hyper-parameters with 5-fold CV on the training split,
#   3. report the best CV accuracy and the held-out test accuracy.
for model_name, (model, param_grid) in models.items():

    # The scaler is a pipeline step, so it is fitted together with the model
    # on whatever data the search passes in.
    pipeline = Pipeline(steps=[
        ('scaler', RobustScaler()),
        ('model', model),
    ])

    # Prefix each grid key with the step name so GridSearchCV routes it to
    # the 'model' step of the pipeline.
    adjusted_param_grid = dict(
        (f'model__{key}', value) for key, value in param_grid.items()
    )

    grid_search = GridSearchCV(
        pipeline,
        adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Cross-validated result for the winning parameter combination.
    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Score the refitted best pipeline on the untouched test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

KNN - Best Params: {'model__metric': 'manhattan', 'model__n_neighbors': 10}, Best Score: 0.9315636215250211
Test Accuracy for KNN: 0.9316330309202
Decision Tree - Best Params: {'model__criterion': 'entropy', 'model__max_depth': 6}, Best Score: 0.8959104855107956
Test Accuracy for Decision Tree: 0.8982132938344751
Logistic Regression - Best Params: {'model__C': 10, 'model__solver': 'saga'}, Best Score: 0.9579823373490488
Test Accuracy for Logistic Regression: 0.9606554341788558
Random Forest - Best Params: {'model__criterion': 'entropy', 'model__n_estimators': 200}, Best Score: 0.9513400403104757
Test Accuracy for Random Forest: 0.9537122755045362


KNN - Best Params: {'model__metric': 'manhattan', 'model__n_neighbors': 10}, Best Score: 0.9315636215250211

Test Accuracy for KNN: 0.9316330309202

Decision Tree - Best Params: {'model__criterion': 'entropy', 'model__max_depth': 6}, Best Score: 0.8959104855107956

Test Accuracy for Decision Tree: 0.8982132938344751

Logistic Regression - Best Params: {'model__C': 10, 'model__solver': 'saga'}, Best Score: 0.9579823373490488

Test Accuracy for Logistic Regression: 0.9606554341788558

Random Forest - Best Params: {'model__criterion': 'entropy', 'model__n_estimators': 200}, Best Score: 0.9513400403104757

Test Accuracy for Random Forest: 0.9537122755045362

In [None]:
# Same tune-then-evaluate procedure, applied to the candidates in `models2`.
for model_name, (model, param_grid) in models2.items():

    # Robust scaling followed by the candidate estimator.
    pipeline = Pipeline(steps=[
        ('scaler', RobustScaler()),
        ('model', model),
    ])

    # Grid keys must carry the 'model__' prefix to address the pipeline step.
    adjusted_param_grid = dict(
        (f'model__{key}', value) for key, value in param_grid.items()
    )

    # Exhaustive search over the grid, 5-fold CV, ranked by accuracy.
    grid_search = GridSearchCV(
        pipeline,
        adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Held-out accuracy of the best pipeline found by the search.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.8256109838099152
Test Accuracy for Naive Bayes: 0.8307720792445843
Gradient Boosting - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.9586187835183317
Test Accuracy for Gradient Boosting: 0.9609331605258286
XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.9599958445279538
Test Accuracy for XGBoost: 0.9621366413627106


Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.8256109838099152

Test Accuracy for Naive Bayes: 0.8307720792445843

Gradient Boosting - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.9586187835183317

Test Accuracy for Gradient Boosting: 0.9609331605258286

XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.9599958445279538

Test Accuracy for XGBoost: 0.9621366413627106

In [None]:
# Same tune-then-evaluate procedure, applied to the candidates in `models3`.
for model_name, (model, param_grid) in models3.items():

    # Robust scaling followed by the candidate estimator.
    pipeline = Pipeline(steps=[
        ('scaler', RobustScaler()),
        ('model', model),
    ])

    # Address the hyper-parameters to the pipeline's 'model' step.
    adjusted_param_grid = dict(
        (f'model__{key}', value) for key, value in param_grid.items()
    )

    # Exhaustive search over the grid, 5-fold CV, ranked by accuracy.
    grid_search = GridSearchCV(
        pipeline,
        adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Held-out accuracy of the best pipeline found by the search.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

[LightGBM] [Info] Number of positive: 34563, number of negative: 34569
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1849
[LightGBM] [Info] Number of data points in the train set: 69132, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499957 -> initscore=-0.000174
[LightGBM] [Info] Start training from score -0.000174
[LightGBM] [Info] Number of positive: 34564, number of negative: 34569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1849
[LightGBM] [Info] Number of data points in the train set: 69133, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499964 -> initscore=-0.000145
[L

LightGBM - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.958722942816042

Test Accuracy for LightGBM: 0.9612571745972968

In [None]:
# Tune and evaluate the candidates in `models4`. Unlike the other tuning cells,
# the grid search itself runs in parallel (n_jobs=-1), so the estimator is
# pinned to a single job.
for model_name, (model, param_grid) in models4.items():

    # Use the estimator registered in `models4` instead of a hard-coded
    # BaggingClassifier, so any configuration set in that dict actually
    # reaches the search. set_params returns the estimator itself.
    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('model', model.set_params(n_jobs=1))
    ])

    # Prefix each grid key with the step name so GridSearchCV routes it to
    # the 'model' step of the pipeline.
    adjusted_param_grid = {f'model__{key}': value for key, value in param_grid.items()}

    grid_search = GridSearchCV(estimator=pipeline, param_grid=adjusted_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Cross-validated result for the winning parameter combination.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Score the refitted best pipeline on the untouched test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

Bagging - Best Params: {'model__max_features': 0.7, 'model__n_estimators': 200}, Best Score: 0.9546033151140932
Test Accuracy for Bagging: 0.958387335678578


Bagging - Best Params: {'model__max_features': 0.7, 'model__n_estimators': 200}, Best Score: 0.9546033151140932

Test Accuracy for Bagging: 0.958387335678578

# Results

KNN - Best Params: {'model__metric': 'manhattan', 'model__n_neighbors': 10}, Best Score: 0.9315636215250211

Test Accuracy for KNN: 0.9316330309202

Decision Tree - Best Params: {'model__criterion': 'entropy', 'model__max_depth': 6}, Best Score: 0.8959104855107956

Test Accuracy for Decision Tree: 0.8982132938344751

Logistic Regression - Best Params: {'model__C': 10, 'model__solver': 'saga'}, Best Score: 0.9579823373490488

Test Accuracy for Logistic Regression: 0.9606554341788558

Random Forest - Best Params: {'model__criterion': 'entropy', 'model__n_estimators': 200}, Best Score: 0.9513400403104757

Test Accuracy for Random Forest: 0.9537122755045362

Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.8256109838099152

Test Accuracy for Naive Bayes: 0.8307720792445843

Gradient Boosting - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.9586187835183317

Test Accuracy for Gradient Boosting: 0.9609331605258286

XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.9599958445279538

Test Accuracy for XGBoost: 0.9621366413627106

LightGBM - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.958722942816042

Test Accuracy for LightGBM: 0.9612571745972968

Bagging - Best Params: {'model__max_features': 0.7, 'model__n_estimators': 200}, Best Score: 0.9546033151140932

Test Accuracy for Bagging: 0.958387335678578