In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Mount Google Drive so files stored under MyDrive can be read from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Location of the players CSV on the mounted Drive.
PLAYERS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/Final players df.csv'

df = pd.read_csv(PLAYERS_CSV_PATH)

Approach 4: adds new metric columns — "dpm", "damagemitigatedperminute", and "vspm".

These columns are then estimated for the 10th and 15th minute by applying the growth-rate logic.

# Data Preprocessing

In [None]:
def calculate_growth_rate(value_per_minute, total_time=30):
    """Return the per-minute compound growth rate over ``total_time`` minutes.

    The total after ``total_time`` minutes is taken to be
    ``value_per_minute * total_time``; the returned rate ``g`` satisfies
    ``(1 + g) ** total_time == total / value_per_minute``.

    NOTE(review): ``value_per_minute`` cancels out of that ratio, so the
    result is ``total_time ** (1 / total_time) - 1`` for every non-zero
    input. Confirm that a value-independent rate is what is intended.

    Args:
        value_per_minute: Per-minute value of the metric.
        total_time: Number of minutes the total is accumulated over.

    Returns:
        The growth rate per minute. A zero ``value_per_minute`` that would
        raise ``ZeroDivisionError`` returns the same rate as any other input
        instead of failing.
    """
    total_value_at_time = value_per_minute * total_time
    try:
        ratio = total_value_at_time / value_per_minute
    except ZeroDivisionError:
        # 0 / 0 for plain Python numbers. The ratio equals total_time for
        # every non-zero input, so use that value here as well.
        ratio = total_time
    return ratio ** (1 / total_time) - 1

In [None]:
def calculate_value_at_time(value_per_minute, minute, growth_rate):
    """Compound ``value_per_minute`` at ``growth_rate`` for ``minute`` periods.

    Args:
        value_per_minute: Starting per-minute value.
        minute: Number of compounding periods (the exponent).
        growth_rate: Growth rate applied per period.

    Returns:
        ``value_per_minute * (1 + growth_rate) ** minute``.
    """
    compounding_factor = pow(1 + growth_rate, minute)
    return value_per_minute * compounding_factor

In [None]:
# Derive 10- and 15-minute values from each per-minute metric using the
# growth-rate helpers. Each entry is (new column, source column, minute mark).
projection_specs = [
    ('damage_at_10', 'dpm', 10),
    ('damage_at_15', 'dpm', 15),
    ('damage_taken_at_10', 'damagetakenperminute', 10),
    ('damage_taken_at_15', 'damagetakenperminute', 15),
    ('damage_mitigated_at_10', 'damagemitigatedperminute', 10),
    ('damage_mitigated_at_15', 'damagemitigatedperminute', 15),
    ('vision_score_at_10', 'vspm', 10),
    ('vision_score_at_15', 'vspm', 15),
]

for target_column, source_column, minute_mark in projection_specs:
    # `at=minute_mark` binds the current minute to the lambda explicitly.
    df[target_column] = df[source_column].apply(
        lambda rate, at=minute_mark: calculate_value_at_time(rate, at, calculate_growth_rate(rate))
    )

In [None]:
# Columns used for modelling: the 10-minute features, the 15-minute features,
# and finally the 'result' column.
features_at_10 = [
    'damage_at_10', 'damage_taken_at_10', 'damage_mitigated_at_10', 'vision_score_at_10',
    'goldat10', 'xpat10', 'csat10',
    'opp_goldat10', 'opp_xpat10', 'opp_csat10',
    'killsat10', 'assistsat10', 'deathsat10',
    'opp_killsat10', 'opp_assistsat10', 'opp_deathsat10',
]
features_at_15 = [
    'damage_at_15', 'damage_taken_at_15', 'damage_mitigated_at_15', 'vision_score_at_15',
    'goldat15', 'xpat15', 'csat15',
    'opp_goldat15', 'opp_xpat15', 'opp_csat15',
    'killsat15', 'assistsat15', 'deathsat15',
    'opp_killsat15', 'opp_assistsat15', 'opp_deathsat15',
]
columns = features_at_10 + features_at_15 + ['result']

In [None]:
# Keep only the columns listed in `columns` (the features plus 'result').
columns_df = df[columns]

In [None]:
# Count missing values in each selected column.
columns_df.isna().sum()

Unnamed: 0,0
damage_at_10,0
damage_taken_at_10,0
damage_mitigated_at_10,0
vision_score_at_10,0
goldat10,0
xpat10,0
csat10,0
opp_goldat10,0
opp_xpat10,0
opp_csat10,0


In [None]:
# Separate the target column from the predictors.
TARGET_COLUMN = 'result'

y = columns_df[TARGET_COLUMN]
X = columns_df.drop(columns=[TARGET_COLUMN])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,damage_at_10,damage_taken_at_10,damage_mitigated_at_10,vision_score_at_10,goldat10,xpat10,csat10,opp_goldat10,opp_xpat10,opp_csat10,...,csat15,opp_goldat15,opp_xpat15,opp_csat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
0,1987.357014,2394.04714,2116.045844,3.29957,3319.0,4466.0,69.0,2905.0,4125.0,72.0,...,107.0,4293.0,6875.0,111.0,2.0,0.0,0.0,0.0,0.0,2.0
1,1062.920853,2392.809529,1262.553705,8.558561,3201.0,3177.0,64.0,3305.0,2983.0,55.0,...,93.0,4955.0,5309.0,99.0,2.0,1.0,0.0,1.0,0.0,0.0
2,1775.143909,2125.429376,1860.420973,3.609051,3139.0,4360.0,82.0,3253.0,4766.0,83.0,...,142.0,4923.0,7388.0,132.0,0.0,1.0,1.0,0.0,1.0,0.0
3,3562.24973,1423.208812,809.358562,3.40273,3167.0,2950.0,77.0,2956.0,2503.0,72.0,...,132.0,4477.0,4479.0,119.0,1.0,2.0,0.0,0.0,0.0,2.0
4,562.395412,781.104497,478.046478,7.630742,2315.0,2306.0,1.0,2138.0,2431.0,1.0,...,2.0,3155.0,3645.0,5.0,0.0,3.0,0.0,0.0,0.0,1.0


# Method

Splitting the dataset

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

Grid search

In [None]:
# Search space for the KNN model: neighbourhood sizes and distance metrics.
knn_neighbor_counts = [3, 5, 7, 10]
knn_distance_metrics = ['euclidean', 'manhattan', 'chebyshev']

param_grid_knn = {
    'n_neighbors': knn_neighbor_counts,
    'metric': knn_distance_metrics,
}

In [None]:
# Search space for the decision tree: split criterion and maximum depth.
dt_split_criteria = ['gini', 'entropy']
dt_depth_limits = [3, 4, 5, 6]

param_grid_dt = {
    'criterion': dt_split_criteria,
    'max_depth': dt_depth_limits,
}

In [None]:
# Search space for logistic regression: values of C and the solver.
lr_c_values = [0.1, 1, 10, 100]
lr_solvers = ['liblinear', 'saga']

param_grid_lr = {
    'C': lr_c_values,
    'solver': lr_solvers,
}

In [None]:
# Search space for the random forest: number of estimators and split criterion.
rf_estimator_counts = [10, 100, 200]
rf_split_criteria = ['gini', 'entropy']

param_grid_rf = {
    'n_estimators': rf_estimator_counts,
    'criterion': rf_split_criteria,
}

In [None]:
# Search space for Gaussian naive Bayes: var_smoothing values only.
nb_smoothing_values = [1e-08, 1e-07, 1e-06, 1e-05]

param_grid_nb = {'var_smoothing': nb_smoothing_values}

In [None]:
# Search space for gradient boosting: number of estimators and maximum depth.
gb_estimator_counts = [10, 100, 200]
gb_depth_limits = [3, 4, 5, 6]

param_grid_gb = {
    'n_estimators': gb_estimator_counts,
    'max_depth': gb_depth_limits,
}

In [None]:
# Search space for XGBoost: number of estimators and maximum depth.
xgb_estimator_counts = [10, 100, 200]
xgb_depth_limits = [3, 4, 5, 6]

param_grid_xgb = {
    'n_estimators': xgb_estimator_counts,
    'max_depth': xgb_depth_limits,
}

In [None]:
# Search space for LightGBM: number of estimators and maximum depth.
lgb_estimator_counts = [10, 100, 200]
lgb_depth_limits = [3, 4, 5, 6]

param_grid_lgb = {
    'n_estimators': lgb_estimator_counts,
    'max_depth': lgb_depth_limits,
}

In [None]:
# Search space for bagging: number of estimators and the max_features setting.
bagging_estimator_counts = [10, 100, 200]
bagging_feature_fractions = [0.5, 0.7, 1.0]

param_grid_bagging = {
    'n_estimators': bagging_estimator_counts,
    'max_features': bagging_feature_fractions,
}

Defining the models


In [None]:
# First batch of candidate estimators, each paired with the grid searched for it.
# Estimators that use randomness are seeded (same seed as the train/test split)
# so that re-running the notebook reproduces the same fits.
models = {
    'KNN': (KNeighborsClassifier(), param_grid_knn),
    'Decision Tree': (DecisionTreeClassifier(random_state=23), param_grid_dt),
    'Logistic Regression': (LogisticRegression(max_iter=1000, random_state=23), param_grid_lr),
    'Random Forest': (RandomForestClassifier(random_state=23), param_grid_rf),
}

In [None]:
# Second batch of candidate estimators, each paired with the grid searched for it.
# The boosting models are seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the same fits.
models2 = {
    'Naive Bayes': (GaussianNB(), param_grid_nb),
    'Gradient Boosting': (GradientBoostingClassifier(random_state=23), param_grid_gb),
    'XGBoost': (XGBClassifier(random_state=23), param_grid_xgb),
}

In [None]:
# LightGBM candidate paired with its grid.
# random_state makes the fits repeatable (same seed as the train/test split);
# verbose=-1 silences the per-fit [Info] log lines that otherwise flood the
# cell output during grid search.
models3 = {
    'LightGBM': (LGBMClassifier(random_state=23, verbose=-1), param_grid_lgb),
}

In [None]:
# Bagging candidate paired with its grid.
# Seeded (same seed as the train/test split) so the bootstrap sampling is
# repeatable across runs.
models4 = {
    'Bagging': (BaggingClassifier(random_state=23), param_grid_bagging),
}

Using a pipeline to scale the features before fitting each model

In [None]:
# Tune every candidate in `models` with 5-fold grid search on the training
# split, then score the best estimator found on the held-out test split.
for model_name, (model, param_grid) in models.items():
    # The scaler is a pipeline step, so it is fitted together with the model.
    pipeline = Pipeline(steps=[('scaler', RobustScaler()), ('model', model)])

    # Prefix each grid key with the pipeline step name ("model__<param>").
    adjusted_param_grid = dict(
        (f'model__{key}', value) for key, value in param_grid.items()
    )

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Cross-validated result of the search.
    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Accuracy of the selected pipeline on the test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

KNN - Best Params: {'model__metric': 'manhattan', 'model__n_neighbors': 10}, Best Score: 0.6655133380502688
Test Accuracy for KNN: 0.673949268653953
Decision Tree - Best Params: {'model__criterion': 'gini', 'model__max_depth': 6}, Best Score: 0.6552028115538487
Test Accuracy for Decision Tree: 0.6631642288465099
Logistic Regression - Best Params: {'model__C': 0.1, 'model__solver': 'saga'}, Best Score: 0.6987479680427288
Test Accuracy for Logistic Regression: 0.7057026476578412
Random Forest - Best Params: {'model__criterion': 'entropy', 'model__n_estimators': 200}, Best Score: 0.7130855698237073
Test Accuracy for Random Forest: 0.719866691353453


KNN - Best Params: {'model__metric': 'manhattan', 'model__n_neighbors': 10}, Best Score: 0.6655133380502688

Test Accuracy for KNN: 0.673949268653953

Decision Tree - Best Params: {'model__criterion': 'gini', 'model__max_depth': 6}, Best Score: 0.6552028115538487

Test Accuracy for Decision Tree: 0.6631642288465099

Logistic Regression - Best Params: {'model__C': 0.1, 'model__solver': 'saga'}, Best Score: 0.6987479680427288

Test Accuracy for Logistic Regression: 0.7057026476578412

Random Forest - Best Params: {'model__criterion': 'entropy', 'model__n_estimators': 200}, Best Score: 0.7130855698237073

Test Accuracy for Random Forest: 0.719866691353453

In [None]:
# Tune every candidate in `models2` with 5-fold grid search on the training
# split, then score the best estimator found on the held-out test split.
for model_name, (model, param_grid) in models2.items():
    # The scaler is a pipeline step, so it is fitted together with the model.
    pipeline = Pipeline(steps=[('scaler', RobustScaler()), ('model', model)])

    # Prefix each grid key with the pipeline step name ("model__<param>").
    adjusted_param_grid = dict(
        (f'model__{key}', value) for key, value in param_grid.items()
    )

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Cross-validated result of the search.
    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Accuracy of the selected pipeline on the test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.6713224785435074
Test Accuracy for Naive Bayes: 0.670847991112757
Gradient Boosting - Best Params: {'model__max_depth': 4, 'model__n_estimators': 200}, Best Score: 0.7192534168825182
Test Accuracy for Gradient Boosting: 0.7255137937418996
XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.7180036439016081
Test Accuracy for XGBoost: 0.7246343269764859


Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.6713224785435074

Test Accuracy for Naive Bayes: 0.670847991112757

Gradient Boosting - Best Params: {'model__max_depth': 4, 'model__n_estimators': 200}, Best Score: 0.7192534168825182

Test Accuracy for Gradient Boosting: 0.7255137937418996

XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.7180036439016081

Test Accuracy for XGBoost: 0.7246343269764859

In [None]:
# Tune every candidate in `models3` with 5-fold grid search on the training
# split, then score the best estimator found on the held-out test split.
for model_name, (model, param_grid) in models3.items():
    # The scaler is a pipeline step, so it is fitted together with the model.
    pipeline = Pipeline(steps=[('scaler', RobustScaler()), ('model', model)])

    # Prefix each grid key with the pipeline step name ("model__<param>").
    adjusted_param_grid = dict(
        (f'model__{key}', value) for key, value in param_grid.items()
    )

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=adjusted_param_grid,
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Cross-validated result of the search.
    best_params, best_score = grid_search.best_params_, grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Accuracy of the selected pipeline on the test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')

[LightGBM] [Info] Number of positive: 34563, number of negative: 34569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013907 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4780
[LightGBM] [Info] Number of data points in the train set: 69132, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499957 -> initscore=-0.000174
[LightGBM] [Info] Start training from score -0.000174
[LightGBM] [Info] Number of positive: 34564, number of negative: 34569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4777
[LightGBM] [Info] Number of data points in the train set: 69133, number of used features: 32
[LightGBM] [Info] 

LightGBM - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.7199014518549538

Test Accuracy for LightGBM: 0.7288465099055731

In [None]:
# Tune every candidate in `models4` with 5-fold grid search on the training
# split (folds evaluated in parallel), then score the best estimator found on
# the held-out test split.
for model_name, (model, param_grid) in models4.items():

    # Use the estimator registered in `models4` rather than a hard-coded one, so
    # that whatever is configured in the dictionary actually takes effect.
    # The estimator itself stays single-threaded: parallelism comes from
    # GridSearchCV(n_jobs=-1) below.
    if 'n_jobs' in model.get_params():
        model.set_params(n_jobs=1)

    pipeline = Pipeline([
        ('scaler', RobustScaler()),
        ('model', model)
    ])

    # Prefix each grid key with the pipeline step name ("model__<param>").
    adjusted_param_grid = {f'model__{key}': value for key, value in param_grid.items()}

    grid_search = GridSearchCV(estimator=pipeline, param_grid=adjusted_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Cross-validated result of the search.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f'{model_name} - Best Params: {best_params}, Best Score: {best_score}')

    # Accuracy of the selected pipeline on the test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {model_name}: {test_accuracy}')



Bagging - Best Params: {'model__max_features': 0.5, 'model__n_estimators': 200}, Best Score: 0.712020930467141
Test Accuracy for Bagging: 0.7220422144047398


Bagging - Best Params: {'model__max_features': 0.5, 'model__n_estimators': 200}, Best Score: 0.712020930467141

Test Accuracy for Bagging: 0.7220422144047398

# Result

KNN - Best Params: {'model__metric': 'manhattan', 'model__n_neighbors': 10}, Best Score: 0.6655133380502688

Test Accuracy for KNN: 0.673949268653953

Decision Tree - Best Params: {'model__criterion': 'gini', 'model__max_depth': 6}, Best Score: 0.6552028115538487

Test Accuracy for Decision Tree: 0.6631642288465099

Logistic Regression - Best Params: {'model__C': 0.1, 'model__solver': 'saga'}, Best Score: 0.6987479680427288

Test Accuracy for Logistic Regression: 0.7057026476578412

Random Forest - Best Params: {'model__criterion': 'entropy', 'model__n_estimators': 200}, Best Score: 0.7130855698237073

Test Accuracy for Random Forest: 0.719866691353453

Naive Bayes - Best Params: {'model__var_smoothing': 1e-08}, Best Score: 0.6713224785435074

Test Accuracy for Naive Bayes: 0.670847991112757

Gradient Boosting - Best Params: {'model__max_depth': 4, 'model__n_estimators': 200}, Best Score: 0.7192534168825182

Test Accuracy for Gradient Boosting: 0.7255137937418996

XGBoost - Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}, Best Score: 0.7180036439016081

Test Accuracy for XGBoost: 0.7246343269764859

LightGBM - Best Params: {'model__max_depth': 6, 'model__n_estimators': 200}, Best Score: 0.7199014518549538

Test Accuracy for LightGBM: 0.7288465099055731

Bagging - Best Params: {'model__max_features': 0.5, 'model__n_estimators': 200}, Best Score: 0.712020930467141

Test Accuracy for Bagging: 0.7220422144047398
