In [28]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [29]:
from matplotlib.colors import ListedColormap
import time

# for splitting data into training and testing
from sklearn.model_selection import train_test_split

# standardize features removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler

# multilayer perceptron classifier
from sklearn.neural_network import MLPClassifier

# K nearest neighbors classifier
from sklearn.neighbors import KNeighborsClassifier

# Support vector machine, C-support vector classifier
from sklearn.svm import SVC

# Gaussian process classifier
from sklearn.gaussian_process import GaussianProcessClassifier

# radial basis function kernel (squared exponential kernel)
from sklearn.gaussian_process.kernels import RBF

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble classifiers: Random forest and AdaBoost
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Quadratic Discriminant Analysis classifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn import metrics

In [30]:
# Load the dataset from CSV and print a per-column summary (dtype, non-null count).
# Alternative input file kept for reference: 'la2_rankIIIonly_extended.csv'
df = pd.read_csv(filepath_or_buffer='csv/la2_50matchs.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118292 entries, 0 to 118291
Data columns (total 48 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   gameId                       118292 non-null  int64 
 1   region                       118292 non-null  object
 2   summonerName                 118292 non-null  object
 3   tier                         118292 non-null  object
 4   rank                         118292 non-null  object
 5   tierRank                     118292 non-null  object
 6   wins                         118292 non-null  int64 
 7   losses                       118292 non-null  int64 
 8   win                          118292 non-null  bool  
 9   lane                         118292 non-null  object
 10  role                         118292 non-null  object
 11  championId                   118292 non-null  int64 
 12  spell1Id                     118292 non-null  int64 
 13  spell2Id      

In [31]:
# Count the missing values in every column and print the per-column totals.
null_mask = df.isnull()
print(null_mask.sum())

gameId                         0
region                         0
summonerName                   0
tier                           0
rank                           0
tierRank                       0
wins                           0
losses                         0
win                            0
lane                           0
role                           0
championId                     0
spell1Id                       0
spell2Id                       0
kills                          0
deaths                         0
assists                        0
largestKillingSpree            0
largestMultiKill               0
killingSprees                  0
longestTimeSpentLiving         0
doubleKills                    0
tripleKills                    0
quadraKills                    0
pentaKills                     0
totalDamageDealt               0
totalDamageDealtToChampions    0
totalHeal                      0
totalUnitsHealed               0
damageDealtToObjectives        0
timeCCingO

In [32]:
# Discard every row that contains at least one missing value.
# Rebinding (instead of inplace=True) avoids mutating the frame object that
# earlier cells already displayed and keeps the cell chain-friendly.
df = df.dropna()

In [33]:
# Build the feature matrix from everything except the listed label/identifier
# columns; the dropped frame is computed once and reused for values and names.
feature_frame = df.drop(
    columns=['tier', 'rank', 'tierRank', 'region', 'summonerName', 'lane', 'role']
)
X = feature_frame.values
X_columns = feature_frame.columns

# Target: the 'tier' column as a flat 1-D array.
y = df['tier'].to_numpy()

In [34]:
# Replace column 1 of X with integer codes produced by a LabelEncoder.
# NOTE(review): going by the df.info() listing, column 1 of X after the drop
# appears to be `wins` (a numeric column) rather than a categorical field --
# confirm that label-encoding it is intended.
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
labelencoder_X.fit(X[:, 1])
X[:, 1] = labelencoder_X.transform(X[:, 1])

# Turn the tier labels in y into integer codes; class_names keeps the unique
# labels in order of first appearance so codes can be mapped back to names.
y, class_names = pd.factorize(y)

In [35]:
# Hold out 25% of the rows as a test set. The split is stratified on y and
# seeded so that it is repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [36]:
# Function to pretty print the confusion matrix

import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a status line and draw a confusion matrix on the current pyplot axes.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix; rows are labelled as true classes and columns
        as predicted classes. Must hold integers when ``normalize`` is False,
        because cells are then formatted with the ``'d'`` format code.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        When True, each row is divided by its row sum before plotting.
        Rows whose sum is zero are shown as all zeros instead of NaN.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
    """
    if normalize:
        counts = cm.astype('float')
        row_totals = counts.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; a class with no samples
        # would otherwise yield NaN cells and a NaN colour threshold below.
        cm = np.divide(counts, row_totals,
                       out=np.zeros_like(counts),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell text switches to white on cells darker than half the maximum value
    # so the number stays readable against the colormap.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are included in it.
    plt.tight_layout()

In [37]:
# Display names for the models, in the same order as the `classifiers` list
# they are zipped with. Every active entry needs its own trailing comma:
# adjacent string literals without a comma are silently concatenated into one
# entry, which shortens this list and makes zip() skip later classifiers.
names = [
    # "Nearest Neighbors",
    # "SVM Linear",
    # "SVM rbf",
    # "GP",
    "Decision Tree",
    # "Random Forest",
    "Neural Net",
    # "AdaBoost",
    # "Naive Bayes",
    # "QDA"
]
        
# Model instances to evaluate, in the same order as `names`.
# Commented-out entries are alternative models that are currently disabled.
classifiers = [
    # KNeighborsClassifier(500),
    # SVC(kernel="linear", C=0.03, probability = True),
    # SVC(gamma=1.5, C=1, probability = True),
    # GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(criterion='gini', max_depth=27, random_state=42),
    # RandomForestClassifier(max_depth=200, n_estimators=400),
    # random_state is fixed (same seed as the tree and the train/test split)
    # so the MLP's random weight initialisation is repeatable between runs.
    MLPClassifier(alpha=2, random_state=42),
    # AdaBoostClassifier(),
    # GaussianNB(),
    # QuadraticDiscriminantAnalysis()
    ]

In [38]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report


In [41]:
# Fit every configured classifier on the training split, then report accuracy,
# the confusion matrix and the per-class report on the test split.
# No figure is created here: the loop only prints text, and the subplots that
# used to be allocated per classifier were never drawn on.
for name, clf in zip(names, classifiers):
    # Time the fit on the training set.
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()

    # Time the prediction on the test set. The second timestamp is taken
    # after predict() so the reported duration actually covers that call.
    y_pred = clf.predict(X_test)
    end2 = time.time()

    # Classification results on test set
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2f}".format(accuracy))

    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: \n', cm)
    print(classification_report(y_test, y_pred, target_names=class_names))

    # name, fit duration (s), predict duration (s)
    print(name, str(end - start), str(end2 - end))

In [None]:
# # Visualize the tree with Graphviz
# import graphviz
# from sklearn import tree
# import os
# os.environ["PATH"] += os.pathsep + 'C:/Users/matia/anaconda3/Library/bin/graphviz/'
# feature_names = X_columns
# dot_data = tree.export_graphviz(classifier, out_file=None, filled=True, rounded = True, feature_names=feature_names, class_names=class_names)
# graph = graphviz.Source(dot_data)
# graph

In [None]:
# i=1
# plt.figure(figsize=(20,4))
# for name, clf in zip(names, classifiers):
#         ax = plt.subplot(1 , len(classifiers) + 1, i)
        

#         start = time.time()
#         # fit the model using the training set
#         clf.fit(X_train, y_train)
#         end = time.time()
#         # compute the mean accuracy of the classifier
#         score = clf.score(X_test, y_test)
#         end2 = time.time()
#         compute ROC curve
#         y_test_pred = clf.predict_proba(X_test)[:, 1]
#         fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred, pos_label=1)
#         auc = metrics.roc_auc_score(y_test, y_test_pred, multi_class='ovo')
#         acc = metrics.accuracy_score(y_test, y_test_pred >= 0.5, multi_class='ovo')
#         f1 = metrics.f1_score(y_test, y_test_pred >= 0.5, multi_class='ovo')
#         ax.set_xlim(-.05, 1.05)
#         ax.set_ylim(-.05, 1.05)
#         ax.set_xticks(())
#         ax.set_yticks(())
#         ax.text(0.95, 0.3, "Acc: %.2f" % acc, ha = 'right')
#         ax.text(0.95, 0.2, "F1-score: %.2f" % f1, ha = 'right')
#         ax.text(0.95, 0.1, "AUC: %.2f" % auc, ha = 'right')
#         ax.plot(fpr, tpr, lw = 5)
#         idx = np.argmin(np.abs(thresholds - 0.5))
#         ax.scatter(fpr[idx], tpr[idx], marker = 'o', c = 'r')
#         print(name,str(end-start),str(end2 - end))
#         counter 
#         i += 1