# How to win League of Legends?

#### The following notebook shows the results of our joint work on the Machine Learning Course Final Project

## Our goal

We wanted to create a model that predicts the final result of a match from the match statistics at the 10th minute of the game. As input we used 90 features, 45 per team.

In [None]:
import os
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
import sklearn.preprocessing

from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from catboost import CatBoostClassifier
from lightgbm  import LGBMClassifier

from skopt import BayesSearchCV

from mlxtend.classifier import StackingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

## Data


## Data collecting

We collected data using a hand-written Python class that creates datasets using the [API](https://developer.riotgames.com/) provided by Riot Games. More information about the data-collection process can be found in the rest of the repository.

### Getting data

We collected 5 datasets with information about games from 5 leagues: bronze, silver, gold, platinum and diamond.
It turned out that there is a huge problem with those datasets: the data from different lanes of the game is mixed up.
We tried to restore the true data in the following "best" datasets.

In [None]:
# Raw ("CLEAN") per-league exports, keyed by league name.
dfs = {
    'bronze': pd.read_csv("/kaggle/input/ml-project-data/eun1_BRONZE_RANKED_SOLO_CLEAN.csv"),
    'silver': pd.read_csv("/kaggle/input/ml-project-data/eun1_SILVER_RANKED_SOLO_CLEAN.csv"),
    'gold': pd.read_csv("/kaggle/input/ml-project-data/eun1_GOLD_RANKED_SOLO_CLEAN.csv"),
    'platinum': pd.read_csv("/kaggle/input/ml-project-data/eun1_PLATINUM_RANKED_SOLO_CLEAN.csv"),
    'diamond': pd.read_csv("/kaggle/input/ml-project-data/eun1_DIAMOND_RANKED_SOLO_CLEAN.csv"),
}

# Restored ("BEST") per-league exports, keyed the same way.
dfs_best = {
    'bronze': pd.read_csv("/kaggle/input/ml-project-data/eun1_BRONZE_RANKED_SOLO_BEST.csv"),
    'silver': pd.read_csv("/kaggle/input/ml-project-data/eun1_SILVER_RANKED_SOLO_BEST.csv"),
    'gold': pd.read_csv("/kaggle/input/ml-project-data/eun1_GOLD_RANKED_SOLO_BEST.csv"),
    'platinum': pd.read_csv("/kaggle/input/ml-project-data/eun1_PLATINUM_RANKED_SOLO_BEST.csv"),
    'diamond': pd.read_csv("/kaggle/input/ml-project-data/eun1_DIAMOND_RANKED_SOLO_BEST.csv"),
}

# Individual names are kept because later cells refer to them directly
# (e.g. df_diamond and df_diamond_best).
df_bronze = dfs['bronze']
df_silver = dfs['silver']
df_gold = dfs['gold']
df_platinum = dfs['platinum']
df_diamond = dfs['diamond']

df_bronze_best = dfs_best['bronze']
df_silver_best = dfs_best['silver']
df_gold_best = dfs_best['gold']
df_platinum_best = dfs_best['platinum']
df_diamond_best = dfs_best['diamond']

# Report the number of rows in every restored dataset.
for name, df in dfs_best.items():
    print(name)
    print(len(df))

Each dataset contains 6-10k rows.

Example rows of the diamond dataframe:

In [None]:
# Show the first rows of the raw ("CLEAN") diamond-league dataframe.
df_diamond.head()

In the "best" datasets we also translated champion_id to champion_name and derived an attribute for each champion. These are the only categorical columns in our dataset. The second column will be the target of our classifiers.

In [None]:
# Show the first rows of the restored ("BEST") diamond-league dataframe.
df_diamond_best.head()

### Data pre-processing

We decided to use one-hot encoding for the champion attributes.

In [None]:
from sklearn.preprocessing import OneHotEncoder

def get_rid_of_categorical_columns(
    df,
    categories=('Assassin', 'Fighter', 'Mage', 'Marksman', 'Support', 'Tank'),
):
    """One-hot encode the ``*champion_attribute`` columns and drop the raw categorical ones.

    Every column whose name ends with ``champion_attribute`` is expanded into
    one float indicator column per entry of ``categories``, named
    ``<column>_<category>``. The original ``*champion_attribute`` and
    ``*champion_name`` columns are removed. The input frame is not modified.

    The category list is fixed up front instead of being inferred per column,
    so the output layout does not depend on which attributes happen to occur
    in a given column or dataset, and any number of attribute columns is
    supported. Values outside ``categories`` (including missing values) are
    encoded as an all-zero row for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names.
    categories : sequence of str, optional
        Attribute values to create indicator columns for, in output order.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the indicator columns,
        sharing ``df``'s index. A frame without attribute columns is returned
        with only the ``*champion_name`` columns (if any) dropped.
    """
    attribute_cols = [col for col in df.columns if col.endswith('champion_attribute')]
    to_drop_feats = [
        col for col in df.columns
        if col.endswith(('champion_attribute', 'champion_name'))
    ]

    # A shared categorical dtype makes get_dummies emit a column for every
    # listed category, even those that never occur in a particular column.
    attribute_dtype = pd.CategoricalDtype(categories=list(categories))
    encoded = [
        pd.get_dummies(df[col].astype(attribute_dtype), prefix=col, prefix_sep='_', dtype=float)
        for col in attribute_cols
    ]
    return pd.concat([df.drop(columns=to_drop_feats), *encoded], axis=1)

# Swap every restored league frame for its one-hot encoded counterpart.
for name in list(dfs_best):
    dfs_best[name] = get_rid_of_categorical_columns(dfs_best[name])

In [None]:
# Bind the encoded diamond frame to `df`; the later cell that builds
# bool_columns / numerical_columns reads this variable.
df = dfs_best['diamond']
# Print column names, non-null counts and dtypes.
df.info()

To keep only significant columns, we delete the ones with only one value and the ones that are highly correlated with another column.

In [None]:
def drop_one_value_columns(df):
    """Return ``df`` without the columns that hold exactly one distinct value.

    The names of the removed columns are printed. ``df`` itself is left
    untouched; a new frame is returned.
    """
    is_constant = df.nunique() == 1
    to_drop = df.columns[is_constant]
    print("Dropping one value columns:")
    print(to_drop)
    return df.drop(columns=to_drop)


def drop_high_correlated_columns(df, corr_threshold = 0.95):
    """Return ``df`` without one column of every highly correlated pair.

    The pairwise correlation matrix of ``df`` is flattened into
    (column, other column, correlation) rows. Whenever the absolute
    correlation exceeds ``corr_threshold``, the column whose name compares
    smaller is scheduled for removal; the ``<`` comparison also skips the
    diagonal and keeps each pair from being handled twice. The scheduled
    names are printed. ``df`` itself is not modified.
    """
    pairs = df.corr().unstack().reset_index()
    left = pairs.iloc[:, 0]
    right = pairs.iloc[:, 1]
    strength = pairs.iloc[:, -1]

    is_strong = strength.abs() > corr_threshold
    is_ordered = left < right
    to_drop = left[is_strong & is_ordered]

    print("Dropping high correlated columns:")
    print(to_drop)
    return df.drop(columns=to_drop)

# Split the columns of the current `df` (the encoded diamond frame bound in
# the previous cell) by number of distinct values: fewer than three counts as
# binary, three or more as numerical. Both lists are computed before any
# column is dropped below.
bool_columns = df.columns[(df.nunique()<3)]
numerical_columns = df.columns[(df.nunique()>=3)]

# Each step must receive the output of the previous one. Passing the original
# frame to the second step would discard the constant-column removal.
for name, df in dfs_best.items():
    dfs_best[name] = drop_high_correlated_columns(drop_one_value_columns(df))

for name, df in dfs.items():
    dfs[name] = drop_high_correlated_columns(drop_one_value_columns(df))

We'll present our data as histograms. They cover all non-binary columns with statistics about the first team.

In [None]:
# Numerical columns whose name starts with '1' (first-team statistics).
first_team_columns = list(filter(lambda col: col.startswith('1'), numerical_columns))
# One histogram per selected column of the raw diamond dataset.
dfs['diamond'][first_team_columns].hist(bins=50, figsize=(20, 20))

We can see that the distribution of the data is nice -- we don't strictly need a log transformation. There are no visible outliers.
Let's point out that the distribution of features doesn't vary between lanes (middle, top, support, etc.). This is concerning, and it turned out that this is because these data are not true.

After our attempt to restore the true distribution of values we obtained these different results.

In [None]:
# Same first-team selection as before, this time plotted for the restored
# ("best") diamond dataset.
first_team_columns = [
    column_name
    for column_name in numerical_columns
    if column_name.startswith('1')
]
dfs_best['diamond'][first_team_columns].hist(bins=50, figsize=(20, 20))

As part of the preprocessing we also tried applying some transformations to improve our dataset.
It turned out that they have no visible impact on the results, and in the end we used a simple min-max scaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.decomposition import PCA

def min_max_scaled(df, numerical_columns):
    """Min-max scale ``numerical_columns`` of ``df`` in place and return ``df``.

    A ``MinMaxScaler`` is fitted on the selected columns and its output is
    written back into the same columns of the frame that was passed in.
    """
    selected = df[numerical_columns]
    df[numerical_columns] = MinMaxScaler().fit(selected).transform(selected)
    return df

def standard_scaled(df, numerical_columns):
    """Standardise ``numerical_columns`` of ``df`` in place and return ``df``.

    A ``StandardScaler`` is fitted on the selected columns and its output
    overwrites those columns of the frame that was passed in.
    """
    standardiser = StandardScaler()
    values = df[numerical_columns]
    standardiser.fit(values)
    df[numerical_columns] = standardiser.transform(values)
    return df

def power_yeo_scaled(df, numerical_columns):
    """Apply a Yeo-Johnson power transform to ``numerical_columns`` in place.

    The transformer is fitted on the selected columns, the transformed values
    replace them in ``df``, and the same ``df`` object is returned.
    """
    transformer = PowerTransformer(method='yeo-johnson')
    columns_view = df[numerical_columns]
    df[numerical_columns] = transformer.fit(columns_view).transform(columns_view)
    return df

def power_box_cox_scaled(df, numerical_columns):
    """Apply a Box-Cox power transform to ``numerical_columns`` in place.

    The transformer is fitted on the selected columns, the transformed values
    replace them in ``df``, and the same ``df`` object is returned.
    """
    # NOTE(review): Box-Cox is documented to require strictly positive input --
    # confirm the selected columns satisfy that before using this helper.
    transformer = PowerTransformer(method='box-cox')
    source = df[numerical_columns]
    transformer.fit(source)
    df[numerical_columns] = transformer.transform(source)
    return df

def PCA_scaled(df):
    """Replace every column after the first with its PCA projection, in place.

    A full-rank ``PCA`` is fitted on ``df.iloc[:, 1:]`` and the projected
    values are written back into the same positions. ``df`` is returned.
    """
    # NOTE(review): other cells of this notebook treat column 1 as the target
    # (y = df.iloc[:, 1]); here it is part of the PCA input -- confirm intent.
    pca = PCA()
    tail = df.iloc[:, 1:]
    pca.fit(tail)
    df.iloc[:, 1:] = pca.transform(tail)
    return df

def PCA_reduced(df, n_components):
    """Return the first column of ``df`` next to a PCA reduction of the rest.

    A ``PCA`` with ``n_components`` components is fitted on ``df.iloc[:, 1:]``;
    the fitted estimator is printed. The result has ``df``'s first column
    followed by the component columns (labelled 0..n_components-1) and keeps
    ``df``'s index. ``df`` itself is not modified.
    """
    # NOTE(review): other cells of this notebook treat column 1 as the target
    # (y = df.iloc[:, 1]); here it is part of the PCA input -- confirm intent.
    scaler = PCA(n_components = n_components)
    features = df.iloc[:, 1:]
    scaler.fit(features)
    print(scaler)
    # Reuse df's index: a default RangeIndex on the component frame would make
    # the column-wise concat misalign rows whenever df is not indexed 0..n-1.
    components = pd.DataFrame(scaler.transform(features), index=df.index)
    return pd.concat([df.iloc[:, 0], components], axis=1)


# Min-max scale the numerical columns of every restored dataset.
# `numerical_columns` was derived before the per-league column drops, so it is
# restricted here to the columns each frame still contains; selecting a
# removed column would raise a KeyError.
for name, df in dfs_best.items():
    present_numerical = [col for col in numerical_columns if col in df.columns]
    dfs_best[name] = min_max_scaled(df, present_numerical)

In [None]:
# Summary statistics of the restored diamond dataset after scaling.
dfs_best['diamond'].describe()

### Early results

In [None]:
# Baseline classifiers keyed by the display name used in the result tables and
# plots. Apart from XGBoost's `use_label_encoder=False`, every estimator is
# constructed with its library defaults.
clfs = {
    'SVC': SVC(),
    'Logistic Regression': LogisticRegression(),
    'Ridge Regression': RidgeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False),
    'AdaBoost': AdaBoostClassifier(),
    'Bernouli Naive Bayes': BernoulliNB(),
    'Gaussian Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Simple Neural Network': MLPClassifier(),
    'LGBM': LGBMClassifier(),
}

We wanted to try our data on a few plain classifiers with default settings.
Here are our early results:

In [None]:
from collections import defaultdict
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the accuracy per fold.

    Uses stratified 5-fold cross-validation repeated twice with a fixed seed,
    so ten scores are returned. Folds are evaluated with three parallel jobs
    and any failure inside a fold is raised instead of being scored.
    """
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=splitter,
        verbose=1,
        n_jobs=3,
        error_score='raise',
    )

In [None]:
# Cross-validate every baseline classifier on every raw ("CLEAN") league
# dataset. Column 1 is the target and columns 2.. are the features.
# Per-league results are collected in a list and concatenated once. Growing an
# initially empty, object-typed frame inside the loop relies on how pandas
# infers dtypes from empty frames and re-copies all earlier rows each time.
league_frames = []

for league, df in dfs.items():
    X = df.iloc[:,2:]
    y = df.iloc[:,1]
    model_scores = {}
    for name, model in clfs.items():
        print('Evaluating {}'.format(name))
        model_scores[name] = evaluate_model(model, X, y)
    # Wide (one column per model) -> long (one row per model/fold score).
    df_temp = pd.melt(pd.DataFrame.from_dict(model_scores))
    df_temp.columns = ['model','accuracy']
    df_temp['league'] = league
    df_temp['unshuffled'] = False
    league_frames.append(df_temp)

# Same column order as before: league, model, unshuffled, accuracy.
df_scores = pd.concat(league_frames)[['league', 'model', 'unshuffled', 'accuracy']]


In [None]:
# Repeat the baseline evaluation on the restored ("best") datasets and append
# the results to df_scores, flagged with unshuffled=True.
restored_frames = []
for league, df in dfs_best.items():
    X, y = df.iloc[:, 2:], df.iloc[:, 1]
    model_scores = {}
    for name, model in clfs.items():
        print('Evaluating {}'.format(name))
        model_scores[name] = evaluate_model(model, X, y)
    # Wide (one column per model) -> long (one row per model/fold score).
    df_temp = pd.DataFrame.from_dict(model_scores).melt()
    df_temp.columns = ['model','accuracy']
    df_temp['league'] = league
    df_temp['unshuffled'] = True
    restored_frames.append(df_temp)

df_scores = pd.concat([df_scores] + restored_frames)

Shown here is the performance of different classifiers on the raw and the restored dataset (diamond).

In [None]:
# Swarm plot of the per-fold accuracies for the diamond league, coloured by
# whether the restored ("unshuffled") dataset was used.
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 100
filter_diamond = df_scores['league'].eq('diamond')
sns.catplot(
    data=df_scores[filter_diamond],
    x="model",
    y="accuracy",
    hue='unshuffled',
    kind="swarm",
    legend_out=True,
    s = 3,
)
plt.xticks(rotation=70)
plt.title('Performance of Different Models Using 5-Fold Cross-Validation, diamond league')
#plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)

A comparison of the accuracy of all these classifiers across leagues on the unshuffled datasets.

In [None]:
# One swarm-plot row per league, restricted to the restored datasets.
plt.rcParams['figure.dpi'] = 100
filter_best_df = df_scores['unshuffled'].eq(True)
sns.catplot(
    data=df_scores[filter_best_df],
    x="model",
    y="accuracy",
    row='league',
    kind="swarm",
)
plt.xticks(rotation=70)
plt.tight_layout()

We can see that at this stage the best performing methods are Ridge regression and logistic regression.
The best results are obtained on the bronze league -- up to 72.5% accuracy. The hardest league to predict is diamond -- we can only get 70% accuracy.

In [None]:
# Collect, per model, the list of fold accuracies obtained on the restored
# diamond dataset.
is_restored = df_scores['unshuffled'] == True
is_diamond = df_scores['league'] == 'diamond'
filter_best_df_diamond = is_restored & is_diamond
df_temp = df_scores[filter_best_df_diamond].groupby(by='model')['accuracy'].apply(list)
model_scores = df_temp.to_dict()

In [None]:
import plotly.graph_objects as go

def plot_results(model_scores, name):
    """Show one box plot of accuracy scores per model in a single figure.

    ``model_scores`` maps a model name to its sequence of accuracy values;
    each entry becomes a box trace that also displays every individual point.
    ``name`` is accepted but not used by the function body.
    """
    fig = go.Figure()
    for model_name, accuracies in model_scores.items():
        trace = go.Box(
            y=accuracies,
            name=model_name,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker_size=2,
            line_width=1,
        )
        fig.add_trace(trace)

    layout_options = dict(
        title='Performance of Different Models Using 5-Fold Cross-Validation',
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        xaxis_title='Model',
        yaxis_title='Accuracy',
        showlegend=False,
    )
    fig.update_layout(**layout_options)
    fig.show()
    
# Box-plot the per-model accuracy lists held in `model_scores`; the `name`
# argument is not used inside plot_results.
plot_results(model_scores, name='base_models_cv')

We also wanted to know whether the data from different leagues differs a lot. We tried training a model on the data of one league and predicting on the others. We use the Ridge classifier.

In [None]:
# Train a Ridge classifier on 80% of one league and measure accuracy on
# (a) the held-out 20% of that league and (b) the full data of every other
# league. Column 1 is the target and columns 2.. are the features.
df_results = pd.DataFrame(None, columns=['train_league', 'test_league', 'accuracy'])

for train_league, df in dfs_best.items():
    X = df.iloc[:,2:]
    y = df.iloc[:,1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RidgeClassifier()
    model.fit(X_train, y_train)

    # The first entry is the in-league score on the held-out split.
    model_scores = {train_league: accuracy_score(y_test, model.predict(X_test))}

    for league, df_league in dfs_best.items():
        if league == train_league:
            continue
        X = df_league.iloc[:,2:]
        y = df_league.iloc[:,1]
        model_scores[league] = accuracy_score(y, model.predict(X))

    # One-row wide frame -> long frame with one row per test league.
    df_temp = pd.DataFrame([model_scores]).melt()
    df_temp.columns = ['test_league','accuracy']
    df_temp['train_league'] = train_league
    df_results = pd.concat([df_results, df_temp])

df_results.head()

Accuracy doesn't differ a lot. Using platinum as training set gives the best results.

In [None]:
# One panel per training league: accuracy as a function of the test league.
plt.rcParams['figure.dpi'] = 100
sns.catplot(x="test_league", y="accuracy", col='train_league', data=df_results, kind="point")

The easiest to predict is bronze league.

In [None]:
# One panel per test league: accuracy as a function of the training league.
sns.catplot(x="train_league", y="accuracy", col='test_league', data=df_results, kind="point")

# Feature importance

In [None]:
# Take the restored bronze dataset, drop constant columns, then run the
# categorical-column removal on it, and preview the result.
df = get_rid_of_categorical_columns(drop_one_value_columns(dfs_best['bronze']))
df.head()

In [None]:
from sklearn import feature_selection

def enc_attrs(series, mapping=None):
    """Encode a string-valued column through a lookup table.

    Non-string columns are returned unchanged (the same object). String
    columns are mapped value by value through ``mapping``; the result keeps
    the original index, so the function can be used with ``DataFrame.apply``.
    Values missing from ``mapping`` become NaN.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.
    mapping : dict, optional
        Value -> code table. When omitted, the module-level ``attr_enc`` table
        is used; it is not defined in the visible part of this notebook, so a
        ``NameError`` is raised if it does not exist.
    """
    # The previous check compared the *type object* with the string 'str',
    # which is never equal, so every input was returned untouched.
    if not pd.api.types.is_string_dtype(series):
        return series
    if mapping is None:
        mapping = attr_enc
    return series.map(mapping)

# Score the first-team features (names starting with "1") of the current `df`
# against the target in column 1 using the ANOVA F-test.
# To score every feature instead, use columns = df.columns[2:].
columns = [col for col in df.columns[2:] if col.startswith("1")]
X = df[columns]
Y = df.iloc[:,1]

fs = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=30)
fs.fit(X, Y)
prs = list(zip(columns, fs.scores_))

# One row per feature with its F-score.
df_feature_importance = pd.DataFrame({'feature': columns, 'score': fs.scores_})
df_feature_importance.head()

In [None]:
# Horizontal bar chart of the 20 highest-scoring features.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 150, 'figure.figsize': [10, 5]})

df_feature_importance = df_feature_importance.sort_values(by='score', ascending = False)
top_features = df_feature_importance.head(20)

ax = sns.barplot(x='score', y='feature', data=top_features)

In [None]:
# Work on a copy so the loaded diamond frame is not changed by the feature
# engineering in the following cells.
df = df_diamond_best.copy()
# Display the copied frame.
df

In [None]:
# Per-team aggregate features over all "gold" columns of that team.
# The derived columns themselves contain "gold" in their names, so they are
# excluded explicitly; otherwise re-running this cell would fold the previous
# aggregates back into the new ones.
derived_gold_columns = {'1_mean_gold', '1_gold_std', '2_mean_gold', '2_gold_std'}
first_gold_columns = [
    col for col in df.columns
    if col.startswith('1') and 'gold' in col and col not in derived_gold_columns
]
second_gold_columns = [
    col for col in df.columns
    if col.startswith('2') and 'gold' in col and col not in derived_gold_columns
]

# Vectorised row-wise reductions instead of a Python-level apply per row.
# NOTE(review): ddof=1 keeps the sample standard deviation that the earlier
# row-wise np.std call resolved to through pandas -- confirm that population
# std (ddof=0) was not intended.
df['1_mean_gold'] = df[first_gold_columns].mean(axis=1)
df['1_gold_std'] = df[first_gold_columns].std(axis=1, ddof=1)
df['2_mean_gold'] = df[second_gold_columns].mean(axis=1)
df['2_gold_std'] = df[second_gold_columns].std(axis=1, ddof=1)

In [None]:
# Preprocess the engineered diamond frame with the same steps used earlier:
# encode/drop categoricals, min-max scale, then drop highly correlated and
# constant columns.
df = get_rid_of_categorical_columns(df)
df = min_max_scaled(df, numerical_columns)
df = drop_one_value_columns(drop_high_correlated_columns(df))

# Score the first-team features (names starting with "1") against the target
# in column 1 using the ANOVA F-test.
# To score every feature instead, use columns = df.columns[2:].
columns = [col for col in df.columns[2:] if col.startswith("1")]
X = df[columns]
Y = df.iloc[:,1]

fs = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=30)
fs.fit(X, Y)
prs = list(zip(columns, fs.scores_))

# One row per feature with its F-score, highest first.
df_feature_importance = pd.DataFrame({'feature': columns, 'score': fs.scores_})
df_feature_importance = df_feature_importance.sort_values(by='score', ascending = False)


In [None]:
# Show the 20 highest-scoring features (the frame was sorted in the previous cell).
df_feature_importance[:20]

## On kaggle dataset

In [None]:
# Load the public Kaggle "high diamond ranked, 10 min" dataset and preview it.
df_kaggle = pd.read_csv("/kaggle/input/league-of-legends-diamond-ranked-games-10-min/high_diamond_ranked_10min.csv")
df_kaggle.head()

In [None]:
# Score the "red"-prefixed features of the Kaggle dataset against column 1
# using the ANOVA F-test.
columns = [col for col in df_kaggle.columns[2:] if col.startswith("red")]
X = df_kaggle[columns]
Y = df_kaggle.iloc[:,1]

fs = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=15)
fs.fit(X, Y)
prs = list(zip(columns, fs.scores_))

# One row per feature with its F-score.
df_feature_importance = pd.DataFrame({'feature': columns, 'score': fs.scores_})
df_feature_importance.head()

In [None]:
# Horizontal bar chart of the 15 highest-scoring Kaggle features.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 150, 'figure.figsize': [10, 5]})

df_feature_importance = df_feature_importance.sort_values(by='score', ascending = False)
top_features = df_feature_importance.head(15)

ax = sns.barplot(x='score', y='feature', data=top_features)