In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, recall_score, make_scorer, plot_confusion_matrix, confusion_matrix, accuracy_score
from scipy.stats import uniform


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Apply seaborn's 'ticks' axes style to the plots that follow.
sns.set_style('ticks')

In [None]:
#Auxiliary functions

def check_variance(clf):
    """Plot train vs. test accuracy, macro recall and macro precision of ``clf``.

    The two bars per metric make the gap between training and test
    performance visible at a glance.

    Parameters
    ----------
    clf : fitted classifier
        Any object exposing ``predict``; it is called on both splits.

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    so the train/test split cell must have been executed first.
    Scores are rounded to two decimals before plotting.
    """
    clfr_train = classification_report(y_true=y_train, y_pred=clf.predict(X_train), output_dict=True)
    clfr_test = classification_report(y_true=y_test, y_pred=clf.predict(X_test), output_dict=True)

    metrics = ['accuracy', 'recall (macro)', 'precision (macro)']

    def _scores(report):
        # Pull the three headline numbers out of a classification_report dict.
        raw = [report['accuracy'], report['macro avg']['recall'], report['macro avg']['precision']]
        return [round(x, 2) for x in raw]

    ser = pd.concat((pd.Series(_scores(clfr_train), index=metrics, name='Train'),
                     pd.Series(_scores(clfr_test), index=metrics, name='Test')), axis=1)
    # Long format: one row per (metric, split) pair, as seaborn's hue grouping expects.
    ser = ser.reset_index().melt(id_vars=['index'])

    # Scope the darkgrid look to this figure only. The previous bare sns.set()
    # reset the global theme and silently restyled every later plot.
    with sns.axes_style('darkgrid'):
        plt.figure(figsize=(7, 7))
        g = sns.barplot(data=ser, x='index', y='value', hue='variable')
        g.set(xlabel='Metrics', ylabel='Score', ylim=(0, 1.1))
        for p in g.patches:
            # Centre each label on its bar whatever the bar width is
            # (the old fixed +0.2 offset assumed exactly two hue levels).
            g.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()),
                       ha='center', va='bottom',
                       color='black')
        plt.show()


In [None]:
# Load the dataset from the Kaggle input directory and summarise its columns.
csv_path = '/kaggle/input/league-of-legends-diamond-ranked-games-10-min/high_diamond_ranked_10min.csv'
df = pd.read_csv(csv_path)
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

Some features are probably redundant (perfectly, or almost perfectly, correlated with another feature). Let's check which ones.

In [None]:
# Collect every pair of columns whose Pearson correlation is >= HIGH or <= LOW.
# feat1[k] / feat2[k] are the two column names of pair k, val[k] its coefficient.
cols = df.columns
feat1 = []
feat2 = []
val = []
HIGH = 0.95
LOW = -0.95

# One call yields the full correlation matrix (columns are the variables),
# instead of recomputing a 2x2 matrix for every pair of columns.
corr_matrix = np.corrcoef(df.to_numpy(dtype=float), rowvar=False)

for i1 in range(len(cols)):
    # Upper triangle only: each unordered pair is visited once.
    for i2 in range(i1 + 1, len(cols)):
        cor_coef = corr_matrix[i1, i2]
        # A NaN coefficient (e.g. a constant column) fails both comparisons
        # and is therefore skipped.
        if cor_coef >= HIGH or cor_coef <= LOW:
            feat1.append(cols[i1])
            feat2.append(cols[i2])
            val.append(cor_coef)

In [None]:
# Tabulate the highly correlated pairs together with their coefficient.
pair_columns = {'Feature 1': feat1, 'Feature 2': feat2, "Corr. Coef.": val}
cors = pd.DataFrame(pair_columns)
cors

For each pair of highly correlated features (|r| ≥ 0.95), we remove one of the two features.

In [None]:
# Drop the left-hand feature of each correlated pair, unless that feature also
# appears on the right-hand side of some pair.
# dict.fromkeys de-duplicates while keeping order, so a feature that is
# correlated with several others is dropped once instead of raising KeyError
# on the second attempt.
removed_left = list(dict.fromkeys(y for y in feat1 if y not in feat2))

# errors='ignore' keeps the cell re-runnable after the columns are already gone.
df = df.drop(columns=removed_left, errors='ignore')

In [None]:
# True when every left-hand feature of the correlated pairs was dropped.
removed_left == feat1

In [None]:
# Class balance of the target column before it is relabelled.
df['blueWins'].value_counts()

In [None]:
# Replace the 0/1 target with a readable team label and rename the column.
# Guarded so that re-running the cell, when `blueWins` has already become
# `Winner`, is a no-op instead of a KeyError.
if 'blueWins' in df.columns:
    df['blueWins'] = df['blueWins'].map({0: "Red", 1: "Blue"})
    df = df.rename(columns={'blueWins': "Winner"})

# How does ward placement affect the game?

In [None]:
# Descriptive statistics of the wards placed by the blue team.
print('Summary for `blueWardsPlaced`')
blue_wards = df['blueWardsPlaced']
blue_wards.describe()

We see that the distribution of `blueWardsPlaced` is skewed to the right. Let's use a histogram to verify that.

In [None]:
# Histogram of the wards placed by the blue team.
dataframe = df
feature = 'blueWardsPlaced'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of {feature}", fontsize=25)
plt.show()

Let's check how many outliers there are. (We define an outlier as a value larger than Q3 + 1.5 × IQR.)

In [None]:
# Upper fence: anything above Q3 + 1.5 * IQR is counted as an outlier.
wards = df['blueWardsPlaced']
q1, q3 = wards.quantile(0.25), wards.quantile(0.75)
IQR = q3 - q1
thr = q3 + IQR * 1.5
outliers_count = wards[wards > thr].count()
total_count = wards.count()

print(f'Number of outliers: {outliers_count}')
print(f'Number of all entries: {total_count}')

Is there any correlation between `blueWardsPlaced` and other metrics? (Like kill, gold and experience)

In [None]:
# Correlation of `blueWardsPlaced` with experience, gold and kills.
dataframe = df[['blueWardsPlaced','blueTotalExperience','blueGoldPerMin','redDeaths','Winner']]

# `Winner` holds the strings "Blue"/"Red". pandas >= 2.0 raises when corr()
# meets a non-numeric column, so restrict to the numeric columns explicitly.
numeric_part = dataframe.select_dtypes(include='number')
dff = pd.DataFrame(numeric_part.corr().iloc[:, 0])

# Skip the first row: it is the self-correlation of `blueWardsPlaced`.
dff.iloc[1:, :].plot.bar(title='Correlation with `blueWardsPlaced`')
plt.show()


We see that there is pretty weak correlation between wards placed and other important metrics (like kills, gold and experience)

How good is `blueWardsPlaced` at separating winning blue teams and losing ones?

In [None]:
# Wards placed by the blue team, split by match winner (outliers included).
dataframe = df
feature_1 = 'Winner' #cat
feature_2 = 'blueWardsPlaced' #cont
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2,
            order=['Blue', 'Red'], whis=1.5, ax=ax)
ax.set_title('Box plot for `blueWardsPlaced` (with outliers)')
plt.show()

In [None]:
# Same box plot after removing rows above the upper fence Q3 + 1.5 * IQR.
wards = df['blueWardsPlaced']
q1 = wards.quantile(0.25)
q3 = wards.quantile(0.75)
interq = q3 - q1

dataframe = df[wards <= (q3 + interq*1.5)]
feature_1 = 'Winner' #cat
feature_2 = 'blueWardsPlaced' #cont
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2,
            order=['Blue', 'Red'], ax=ax)
ax.set_title('Box plot for `blueWardsPlaced` (without outliers)')
plt.show()

Seems like losing and winning blue teams place the same amount of wards.

In [None]:
# Scatter `x_feat` against each feature in `y_feat`, one panel per feature,
# optionally colouring the points by the categorical column `cat_feat`.
dataframe = df
x_feat = 'blueWardsPlaced'
y_feat = ['blueTotalExperience','blueGoldPerMin','redDeaths']
cat_feat = 'Winner'  # set to None to plot all points in a single colour

row = 1
col = len(y_feat)
cat_feat_labels = dataframe[cat_feat].unique() if cat_feat is not None else None
colors = {'Blue': 'Blue','Red': 'Red'}  # set to None to let matplotlib pick the colours

# squeeze=False always returns a 2-D array of Axes, so indexing by panel
# also works when `y_feat` holds a single feature.
fig, ax = plt.subplots(row, col, figsize=(17,5), squeeze=False)
ax = ax[0]
for i in range(col):
    if cat_feat is not None:
        for label in cat_feat_labels:
            conditional_dataframe = dataframe[dataframe[cat_feat] == label]
            ax[i].scatter(conditional_dataframe[x_feat],
                          conditional_dataframe[y_feat[i]],
                          alpha=0.2,
                          color=colors[label] if colors is not None else None,
                          label=label)
        # Only draw a legend when there are labelled groups to list.
        ax[i].legend(title=cat_feat)
    else:
        ax[i].scatter(dataframe[x_feat], dataframe[y_feat[i]], alpha=0.2, color="Blue")
    ax[i].set_xlabel(x_feat)
    ax[i].set_ylabel(y_feat[i])
plt.show()

By visualizing the relation between `blueWardsPlaced` and the other continuous features, we see that `blueWardsPlaced` affects neither the values of those features nor the chance of winning a game.

# How does number of kills made by blue team affect a game? (Note that `blueKills = redDeaths`. We will be using `redDeaths`)

In [None]:
# Histogram of red-team deaths.
dataframe = df
feature = 'redDeaths'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of {feature}", fontsize=25)
plt.show()

Although it is pretty obvious, let's check it anyway: how does the number of kills made by the blue team affect the gold earned?

In [None]:
# Joint KDE of blue kills (`redDeaths`) against blue gold per minute.
dataframe = df
feature1 = 'redDeaths'
feature2 = 'blueGoldPerMin'

g = sns.jointplot(x=dataframe[feature1], y=dataframe[feature2], kind="kde")
g.fig.set_size_inches(11, 13)
plt.show()

 As expected, we see the positive correlation between the kills by blue team and the gold earned.

In [None]:
# Joint KDE of blue kills (`redDeaths`) against blue total experience.
dataframe = df
feature1 = 'redDeaths'
feature2 = 'blueTotalExperience'

g = sns.jointplot(x=dataframe[feature1], y=dataframe[feature2], kind="kde")
g.fig.set_size_inches(11, 13)
plt.show()

We see that the joint distribution is a bit more dispersed (which signifies weaker correlation)

Now let's see how well blue kills fare at predicting the winner.

In [None]:
# Blue kills (`redDeaths`) split by match winner.
dataframe = df
feature_1 = 'Winner'
feature_2 = 'redDeaths'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, order=['Blue', 'Red'], ax=ax)
plt.show()

Kills do a better job at separating the winner than wards placed, but the separation is clearly not perfect.

# Which feature separates the winner best: Number of kills or CS?

In [None]:
# Compare how well kills and CS separate the winner: for each feature, overlay
# its distribution in games won by Blue and in games won by Red.
dataframe = df
cat_feat = 'Winner'

# (column to plot, panel title), one entry per subplot, left to right.
panels = [('redDeaths', 'Kills conditional on Winner'),
          ('blueCSPerMin', 'CS conditional on Winner')]

# A single figure holds both panels. The earlier extra plt.figure() calls
# only produced empty figures below the real plot.
figure, axes = plt.subplots(1, 2, figsize=(14,7))
for axis, (cont_feat, panel_title) in zip(axes, panels):
    for value in dataframe[cat_feat].unique():
        # NOTE(review): distplot is deprecated in recent seaborn releases. It is
        # kept because the rest of the notebook targets an older seaborn API.
        sns.distplot(dataframe.loc[dataframe[cat_feat] == value, cont_feat],
                     label=value, ax=axis)
    axis.set_title(panel_title)
    axis.legend()
plt.show()

Visually, CS and kills do an equally good job at separating the winner. To compare the importance of these two features more rigorously, a quantitative approach should be used.

In [None]:
# Pearson correlation of gold per minute with kills and with CS per minute.
gold = df['blueGoldPerMin']
for what, column in (('kills', 'redDeaths'), ('CS', 'blueCSPerMin')):
    cr = np.corrcoef(df[column], gold)[0][1]
    print(f'Correlation between {what} and gold: {cr}')

Kills are more strongly correlated with gold than CS is, which suggests that killing heroes builds up the economy faster than farming.

# How does blueAssists affect the game?

In [None]:
# Histogram of blue-team assists.
dataframe = df
feature = 'blueAssists'
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(10, 7))
dataframe[feature].hist(ax=ax)
ax.set_title(f"Distribution of {feature}", fontsize=25)
plt.show()

One can observe that distribution of assists is very similar to the distribution of kills. It is indicative of high correlation between the two features. Let's check it

In [None]:
# Pearson correlation between blue kills (`redDeaths`) and blue assists.
kills_assists_corr = np.corrcoef(df['redDeaths'], df['blueAssists'])
cr = kills_assists_corr[0][1]
print(f'Correlation between kills and assists: {cr}')

Indeed, features are highly correlated. High correlations between kills and assists suggests that most kills are made with the help of the other teammates. 

In [None]:
# Blue assists split by match winner.
dataframe = df
feature_1 = 'Winner'
feature_2 = 'blueAssists'
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(data=dataframe, x=feature_1, y=feature_2, order=['Blue', 'Red'], ax=ax)
plt.show()

Consequently, the conditional distributions of assists are almost identical to the conditional distributions of blue kills.

# How does elite monsters kills by blue affect the game?

In [None]:
# Share of matches by number of elite monsters killed by the blue team.
dataframe = df
feature = 'blueEliteMonsters'
series = dataframe[feature].value_counts()

labels = series.index
sizes = series.values
explode = [0] * series.size  # all zeros: no slice is pulled out of the pie

fig1, ax1 = plt.subplots(figsize=(10,15))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1.set_title(f'Distribution of `{feature}`', fontsize=20)
plt.show()

We see that in about half of the matches, the blue team killed 0 elite monsters.
In only 7% of matches were 2 elite monsters killed.

Let's see how well the number of elite monsters killed separates the winner.

In [None]:
# Number of matches per winner, broken down by elite monsters killed by blue.
dataframe = df
feature_1 = 'Winner'
feature_2 = 'blueEliteMonsters'
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=df, x=feature_1, hue=feature_2, ax=ax)
plt.show()

We see that the distributions do differ. In matches where red team wins, blue teams mostly kill 0 monsters. While if blue team wins, it is most likely that 1 monster will be killed.

# Conclusions:

1. Based on our dataset, the number of wards placed does not affect the outcome of the game. (However, there is a caveat: we have only considered the NUMBER of wards, not their QUALITY. Each ward has a different impact on a game depending on WHERE it was placed; in other words, the position of a ward is very important.)

2. Team play is important. As the correlation coefficient showed, most kills are made with the help of teammates. And since kills are an important metric for predicting the winner, it follows that to increase your chance of winning, you should make sure that you play around your teammates.

3. The features `elite monsters killed` and `winner` are correlated, in the sense that the winner tends to kill more elite monsters. It should be noted, though, that the causation here is unclear: does killing more monsters increase your chance of winning, does winning make you kill more elite monsters, or is it something else? A further look is needed.

4. Economy-wise, killing heroes is more important than farming (signified by higher correlation coefficient). 



# Feature Selection: Random Forest

Note that the dataset does not have any high-cardinality features, which implies that RF will do a pretty good job at finding features with high importance.

In [None]:
# Rank all candidate features by random-forest importance.
# NOTE(review): the forest is fitted on the full dataset, before the
# train/test split further down, so the selection has seen the test rows.
X = df.drop(['gameId','Winner'], axis=1)
y = df['Winner']

# Fixed seed: without it the ranking, and therefore the features selected
# from it, can change from run to run.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=22)
forest_clf.fit(X, y)

importances = forest_clf.feature_importances_
indices = np.argsort(importances)[::-1]  # column positions, most important first

plt.figure(figsize=(12, 5))
plt.bar(range(len(indices)), importances[indices])
# Label the bars with feature names rather than opaque column positions.
plt.xticks(range(len(indices)), X.columns[indices], rotation=90)
plt.ylabel('Importance')
plt.title('Random forest feature importances')
plt.show()


Let's select 6 top features

In [None]:
# Preview the six columns with the highest random-forest importance.
X.iloc[:,indices[:6]].head()

Split and standardize

In [None]:
# Select the six most important features. The frame is rebuilt from `df`
# instead of slicing `X` in place, so re-running this cell does not apply
# `indices` to an already reduced `X`.
X = df.drop(['gameId','Winner'], axis=1).iloc[:, indices[:6]]
y = df['Winner']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# The scaler is fitted on the training split only; the test split is
# transformed with the training statistics.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Maps model name -> test accuracy rounded to two decimals; filled in by the
# model cells below and plotted in the final comparison chart.
acc_score = {}

# NB

In [None]:
# Gaussian Naive Bayes: fit on the training split, report on the test split.
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
nb_test_pred = nb_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=nb_test_pred))
plot_confusion_matrix(nb_clf, X_test, y_test)


In [None]:
# Record the Naive Bayes test accuracy for the final comparison.
nb_pred = nb_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=nb_pred)
acc_score['NB'] = round(acc, 2)

# Logistic regression

In [None]:
# Logistic regression: fit on the training split, report on the test split.
log_random_state = None
log_clf = LogisticRegression(random_state=log_random_state)
log_clf.fit(X_train, y_train)
log_test_pred = log_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=log_test_pred))
plot_confusion_matrix(log_clf, X_test, y_test)


In [None]:
# Record the logistic-regression test accuracy for the final comparison.
log_pred = log_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=log_pred)
acc_score['Logistic'] = round(acc, 2)

# KNN

In [None]:
# k-NN: grid-search the number of neighbours over [MIN, MAX] by accuracy.
MIN = 15 #Min number of neighbors
MAX = 30 #Max number of neighbors
knn_estimator = KNeighborsClassifier()
knn_param_grid = {'n_neighbors': range(MIN, MAX + 1)}
knn_clf = GridSearchCV(knn_estimator, knn_param_grid, scoring='accuracy')
knn_clf.fit(X_train, y_train)
print(f"Best estimator: {knn_clf.best_estimator_}")
knn_test_pred = knn_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=knn_test_pred))
plot_confusion_matrix(knn_clf, X_test, y_test)

In [None]:
# Record the k-NN test accuracy for the final comparison.
knn_pred = knn_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=knn_pred)
acc_score['KNN'] = round(acc, 2)

# Decision Tree

In [None]:
# Shallow decision tree (depth 3): fit on the training split, report on the test split.
tree_clf = tree.DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
tree_test_pred = tree_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=tree_test_pred))
plot_confusion_matrix(tree_clf, X_test, y_test)

# Record the accuracy like every other model does, so the decision tree
# is not missing from the final comparison chart.
acc = accuracy_score(y_true=y_test, y_pred=tree_test_pred)
acc_score['Decision Tree'] = round(acc, 2)


# Random Forest

In [None]:
# Random forest: grid-search the maximum tree depth by accuracy.
estimator = RandomForestClassifier(n_estimators=150, random_state=13)
rf_param_grid = {'max_depth': [3,5,7,None]}
rf_clf = GridSearchCV(estimator, param_grid=rf_param_grid, scoring='accuracy')
rf_clf.fit(X_train, y_train)

rf_test_pred = rf_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=rf_test_pred))
plot_confusion_matrix(rf_clf, X_test, y_test)

# Record the random-forest test accuracy for the final comparison.
acc = accuracy_score(y_true=y_test, y_pred=rf_test_pred)
acc_score['Random Forest'] = round(acc,2)

# SVM

In [None]:
# Support vector classifier with default settings.
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

svm_test_pred = svm_clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=svm_test_pred))
plot_confusion_matrix(svm_clf, X_test, y_test)

# Record the SVM test accuracy for the final comparison.
acc = accuracy_score(y_true=y_test, y_pred=svm_test_pred)
acc_score['SVM'] = round(acc,2)

In [None]:
# Bar chart of the recorded test accuracies, best model first.
metric_str = 'Accuracy'
models = np.array(list(acc_score.keys()))
score = np.array([acc_score[x] for x in models])

# Compute the descending order once and apply it to both arrays.
order = np.argsort(score)[::-1]
models = models[order]
score = score[order]

plt.figure(figsize=(7,7))
# x=/y= must be keywords: seaborn >= 0.12 rejects positional data vectors.
graph = sns.barplot(x=models, y=score)
graph.set(ylim=(0, 1.1), title=metric_str)
for p in graph.patches:
    # Put each value label above the centre of its bar.
    graph.annotate(p.get_height(), (p.get_x() + p.get_width() / 2, p.get_height()),
                   ha='center', va='bottom',
                   color='black')
plt.show()
