In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.stats import sem
import joblib

lan_df = pd.read_csv("/Users/dhanv/Desktop/New folder (9)/New folder (7)/lan_dataset.csv")
na_df = pd.read_csv("/Users/dhanv/Desktop/New folder (9)/New folder (7)/na_dataset.csv")
lan_df.head()


Unnamed: 0,Blue Mastery 1,Blue Mastery 2,Blue Mastery 3,Blue Mastery 4,Blue Mastery 5,Blue Masteries Avg,Blue Masteries Median,Blue Masteries Kurtorsis,Blue Masteries Skewness,Blue Masteries Std,...,Red Winrate 3,Red Winrate 4,Red Winrate 5,Red Winrates Avg,Red Winrates Median,Red Winrates Kurtorsis,Red Winrates Skewness,Red Winrates Std,Red Winrates Variance,Blue Won
0,302361,32548,137831,42344,2594552,621927.2,137831.0,4.804286,2.183943,991060.524461,...,0.485714,0.333333,0.588384,0.511934,0.552239,1.755027,-1.441162,0.097786,0.009562,0
1,244724,55894,166393,151398,17928,127267.4,151398.0,-1.336965,0.013165,81189.183929,...,0.551136,0.641509,0.5,0.589079,0.551136,-0.950929,0.766032,0.077926,0.006073,0
2,1370461,165699,328554,11922,64623,388251.8,165699.0,4.066366,1.991324,502829.634615,...,0.631579,0.535714,0.529412,0.580125,0.535714,-0.171439,0.832254,0.092372,0.008533,0
3,859153,8207,152833,30736,94462,229078.2,94462.0,4.494974,2.09872,319077.677286,...,0.142857,0.515789,0.523632,0.336456,0.5,-2.252575,-0.789161,0.22119,0.048925,1
4,295938,326564,1812645,64742,83541,516686.0,295938.0,4.480466,2.090117,656696.848512,...,0.565284,0.6,0.416667,0.547907,0.565284,-1.091492,0.001576,0.094206,0.008875,1


In [7]:
# Combine and shuffle datasets
frames = [lan_df, na_df]
result = pd.concat(frames)
dataset = result.to_numpy()
np.random.shuffle(dataset)

# Split features and target
X = dataset[:, :44]
Y = dataset[:, 44]

print(f"Dataset shape: X={X.shape}, Y={Y.shape}")

Dataset shape: X=(17010, 44), Y=(17010,)


Let's define our model

In [8]:
model = GradientBoostingClassifier(n_estimators=55, learning_rate=0.14)

Let's test it with Stratified K Fold validation

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.stats import sem

accuracy = []

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, random_state=None)

for train_index, test_index in skf.split(X, Y):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]

    model.fit(x_train_fold, y_train_fold)
    prediction = model.predict(x_test_fold)
    score = accuracy_score(prediction, y_test_fold)
    accuracy.append(score * 100)


# Print the output.
print("List of possible accuracy: {}".format(accuracy))
print("\nMaximum Accuracy: {:.2f}%".format(max(accuracy)))
print("\nMinimum Accuracy: {:.2f}%".format(min(accuracy)))
print("\nOverall Accuracy: {:.2f}%".format(np.mean(accuracy)))
print("\nStandard Deviation is: {:.2f}%".format(np.std(accuracy)))
print("\nStandard Error is: {:.2f}%".format(sem(accuracy)))


List of possible accuracy: [88.94767783656673, 89.7119341563786, 90.29982363315696, 90.24103468547912, 90.06466784244563, 88.47736625514403, 88.77131099353322, 89.94708994708994, 89.00646678424457, 88.47736625514403]

Maximum Accuracy: 90.30%

Minimum Accuracy: 88.48%

Overall Accuracy: 89.39%

Standard Deviation is: 0.69%

Standard Error is: 0.23%


Now let's divide the datasets from LAN and NA to a final training and testing.

In [10]:
from sklearn.metrics import confusion_matrix, classification_report

lan_dataset = lan_df.to_numpy()
na_dataset = na_df.to_numpy()

np.random.shuffle(lan_dataset)
np.random.shuffle(na_dataset)

x_train = lan_dataset[:, 0:44]
y_train = lan_dataset[:, 44]

x_test = na_dataset[:, 0:44]
y_test = na_dataset[:, 44]


print(f"x_train size: {len(x_train)}")
print(f"y_train size: {len(y_train)}")


print(f"x_test size: {len(x_test)}")
print(f"y_test size: {len(y_test)}")

model_final_test = GradientBoostingClassifier(n_estimators=55, learning_rate=0.14)
model_final_test.fit(x_train, y_train)

print("Accuracy: {:.2f}%".format(model_final_test.score(x_test, y_test) * 100))

predictions = model_final_test.predict(x_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report:")
print(classification_report(y_test, predictions))


x_train size: 12458
y_train size: 12458
x_test size: 4552
y_test size: 4552
Accuracy: 88.58%
Confusion Matrix:
[[1987  246]
 [ 274 2045]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.89      0.88      2233
         1.0       0.89      0.88      0.89      2319

    accuracy                           0.89      4552
   macro avg       0.89      0.89      0.89      4552
weighted avg       0.89      0.89      0.89      4552



We can notice that it performed really well by predicting 4552 matches from a totally different server. With an accuracy of **88.60%**


Finally we create a final model and save it for live games predictions with streamlit.

In [11]:
import joblib

model_final = GradientBoostingClassifier(n_estimators=55, learning_rate=0.14)
model_final.fit(X, Y)

# save the model to disk
filename = "finalized_model.sav"
joblib.dump(model_final, filename)


['finalized_model.sav']