In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import sklearn
import sklearn.model_selection

## Data

In [None]:
# Notebook-wide settings
# class labels
blue = -1
red = 1
# frame at which every match is snapshotted (later cells compare it against minute thresholds)
frame = 25
random_state = 0
dpi = 300

# match dataset ('Unnamed: 0' is presumably a leftover index column from the export -- verify)
df = (
    pd.read_csv('../data/raw/lol-data-matches.csv')
    .drop(columns=['Unnamed: 0'])
)
# match frames dataset, restricted to the rows of the selected frame
df1 = (
    pd.read_csv('../data/raw/lol-data-match-frames.csv')
    .drop(columns=['Unnamed: 0'])
    .loc[lambda frames: frames['frame'] == frame]
)

# Count the rows whose match_id already appeared earlier in each dataset
dup_matches = df.duplicated(subset=['match_id']).sum()
dup_frames = df1.duplicated(subset=['match_id']).sum()
print('Number of duplicate data points in lol-data-matches.csv: ' + str(dup_matches))
print('Number of duplicate data points in lol-data-match-frames.csv: ' + str(dup_frames))

# Keep only the first row seen for every match_id
df = df.drop_duplicates(subset=['match_id'])
df1 = df1.drop_duplicates(subset=['match_id'])

# Inner join on match_id: a match with no row at the selected frame
# (e.g. it ended earlier) has no partner in df1 and drops out of the result
df = df1.merge(df, on='match_id')

# For every column that has at least one missing value, report how many rows lack it
print('Number of data points with missing features: ')
missing_counts = df.isna().sum()
for f, n_missing in missing_counts[missing_counts > 0].items():
    print(f + ': ' + str(n_missing))

# Rows without a class label are removed
df = df.dropna(subset=['winning_team'])

# Summarise the merged dataset: feature names, class balance and tier sizes
print('Initial set of features:\n', df.columns.tolist())

# Class shares in percent, ordered by team id as groupby sorts them
# (100 first, 200 second; a later cell maps 100 -> blue and 200 -> red)
class_pct = (df.groupby('winning_team').size() / df.shape[0] * 100).tolist()
print('Initial class distribution:\nBlue: ' + '%.2f%%' % class_pct[0]
      + '\nRed: ' + '%.2f%%' % class_pct[1])

# groupby sorts the tier labels alphabetically (BRONZE, DIAMOND, GOLD, GRANDMASTERS),
# so the counts are looked up by label; indexing by position would swap Gold and Diamond
tier_counts = df.groupby('tier').size()
print('Initial tier distribution: \nBronze: ', tier_counts.get('BRONZE', 0),
     '\nGold:', tier_counts.get('GOLD', 0), '\nDiamond:', tier_counts.get('DIAMOND', 0),
     '\nGrandmaster:', tier_counts.get('GRANDMASTERS', 0))
df.describe()

## Data Preprocessing

### Sampling

In [None]:
# Sampling equal number of data points from each tier
# NOTE: .sample(n) raises ValueError when a tier holds fewer than n rows
n = 4900
bronze = df[df['tier'] == 'BRONZE'].sample(n, random_state=random_state)
gold = df[df['tier'] == 'GOLD'].sample(n, random_state=random_state)
diamond = df[df['tier'] == 'DIAMOND'].sample(n, random_state=random_state)
gm = df[df['tier'] == 'GRANDMASTERS'].sample(n, random_state=random_state)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the samples in the same order
df = pd.concat([bronze, gold, diamond, gm])

# Class shares in percent, ordered by team id as groupby sorts them (100 first, 200 second)
class_pct = (df.groupby('winning_team').size() / df.shape[0] * 100).tolist()
print('Sample class distribution:\nBlue: ' + '%.2f%%' % class_pct[0]
      + '\nRed: ' + '%.2f%%' % class_pct[1])

### Data Cleaning and Formatting

In [None]:
# Discarding irrelevant features
df = df.drop(columns=['match_id', 'frame', 'division', 'patch', 'region'])
# Dragon, inhibitor, rift_herald are available after the 5th minute
if frame < 5:
    df = df.drop(columns=['red_dragons', 'blue_dragons', 'red_inhibitors',
                          'blue_inhibitors', 'red_rift_heralds', 'blue_rift_heralds'])
# Baron is available after the 20th minute
if frame < 20:
    df = df.drop(columns=['red_barons', 'blue_barons'])

# Replacing team ids (200 / 100) with the chosen class labels.
# The mapping is applied only to the team-id columns: a frame-wide replace would also
# rewrite any numeric feature (kills, cs, gold, ...) that happens to equal 100 or 200.
# NOTE(review): assumes 'winning_team' and the 'first_*' columns are the only columns
# holding team ids -- confirm against the dataset schema.
team_cols = ['winning_team'] + [c for c in df.columns if c.startswith('first_')]
df[team_cols] = df[team_cols].replace({200: red, 100: blue})
# Replacing missing values with 0 in every column
df = df.fillna(0)

df = df.rename(columns={'winning_team': 'winner', 'first_champion':'first_kill'})
print('Set of features after removing irrelevant features and renaming:\n', df.columns.tolist())

### Feature Transformation

#### Binary
Adding an advantage column for each objective and filling its rows by comparing the objective values in the two teams' columns.
If red = blue, the value assigned to the advantage column is 0; if red > blue, the value is 1; otherwise it is -1.
The original objective columns are discarded once the advantage column is built.

In [None]:
# Deep copy of the df
df_bi = df.copy(deep=True)

# (advantage column, red team column, blue team column) for every objective
objective_pairs = [
    ('kill_advantage', 'red_total_kills', 'blue_total_kills'),
    ('gold_advantage', 'red_total_gold', 'blue_total_gold'),
    ('cs_advantage', 'red_total_cs', 'blue_total_cs'),
    ('damage_advantage', 'red_total_damage', 'blue_total_damage'),
    ('tower_advantage', 'red_towers', 'blue_towers'),
    ('plate_advantage', 'red_plates', 'blue_plates'),
]
# Dragon, inhibitor, rift_herald are available after the 5th minute.
# The cleaning cell keeps these columns whenever frame >= 5, so the same bound is used
# here; with `frame > 5` the raw columns would be left untransformed at frame == 5.
if frame >= 5:
    objective_pairs += [
        ('inhibitor_advantage', 'red_inhibitors', 'blue_inhibitors'),
        ('dragon_advantage', 'red_dragons', 'blue_dragons'),
        ('rift_advantage', 'red_rift_heralds', 'blue_rift_heralds'),
    ]
# Baron is available after the 20th minute (columns are kept whenever frame >= 20)
if frame >= 20:
    objective_pairs.append(('baron_advantage', 'red_barons', 'blue_barons'))

for advantage, red_col, blue_col in objective_pairs:
    # 0 on a tie, otherwise the class label of the team that is ahead
    df_bi[advantage] = np.where(df_bi[red_col] == df_bi[blue_col], 0,
                                np.where(df_bi[red_col] > df_bi[blue_col], red, blue))
    # the per-team columns are replaced by the single advantage column
    df_bi = df_bi.drop(columns=[red_col, blue_col])
print('Set of features after binary feature transformation:\n', df_bi.columns.tolist())

#### Difference
Adding an advantage column for each objective and filling its rows by subtracting the blue team's objective value from the red team's (red - blue).
If red = blue, the value assigned to the advantage column is 0; if red > blue, the value is positive; otherwise it is negative.
The original objective columns are discarded once the advantage column is built.

In [None]:
# Deep copy of the df
df_diff = df.copy(deep=True)

# (advantage column, red team column, blue team column) for every objective
objective_pairs = [
    ('kill_advantage', 'red_total_kills', 'blue_total_kills'),
    ('gold_advantage', 'red_total_gold', 'blue_total_gold'),
    ('cs_advantage', 'red_total_cs', 'blue_total_cs'),
    ('damage_advantage', 'red_total_damage', 'blue_total_damage'),
    ('tower_advantage', 'red_towers', 'blue_towers'),
    ('plate_advantage', 'red_plates', 'blue_plates'),
]
# Dragon, inhibitor, rift_herald are available after the 5th minute.
# The cleaning cell keeps these columns whenever frame >= 5, so the same bound is used
# here; with `frame > 5` the raw columns would be left untransformed at frame == 5.
if frame >= 5:
    objective_pairs += [
        ('inhibitor_advantage', 'red_inhibitors', 'blue_inhibitors'),
        ('dragon_advantage', 'red_dragons', 'blue_dragons'),
        ('rift_advantage', 'red_rift_heralds', 'blue_rift_heralds'),
    ]
# Baron is available after the 20th minute (columns are kept whenever frame >= 20)
if frame >= 20:
    objective_pairs.append(('baron_advantage', 'red_barons', 'blue_barons'))

for advantage, red_col, blue_col in objective_pairs:
    # red minus blue: positive when red is ahead, negative when blue is ahead, 0 on a tie
    df_diff[advantage] = df_diff[red_col] - df_diff[blue_col]
    # the per-team columns are replaced by the single advantage column
    df_diff = df_diff.drop(columns=[red_col, blue_col])
print('Set of features after difference feature transformation:\n', df_diff.columns.tolist())

## Feature Importance Analysis
Splitting the data into training and test sets. Training data will be used for feature importance analysis.
Each tier is split separately to maintain the same tier distribution in the training and test sets, and each split is stratified on the class label to maintain the same class distribution.

In [None]:
# 80/20 train/test split of the binary-feature frame, done per tier and
# stratified on the class label within each tier
bronze = df_bi[df_bi['tier'] == 'BRONZE']
gold = df_bi[df_bi['tier'] == 'GOLD']
diamond = df_bi[df_bi['tier'] == 'DIAMOND']
gm = df_bi[df_bi['tier'] == 'GRANDMASTERS']
bronze_train, bronze_test = sklearn.model_selection.train_test_split(bronze, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=bronze['winner'])
gold_train, gold_test = sklearn.model_selection.train_test_split(gold, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=gold['winner'])
diamond_train, diamond_test = sklearn.model_selection.train_test_split(diamond, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=diamond['winner'])
gm_train, gm_test = sklearn.model_selection.train_test_split(gm, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=gm['winner'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the tiers in the same order
df_bi_train = pd.concat([bronze_train, gold_train, diamond_train, gm_train])
df_bi_test = pd.concat([bronze_test, gold_test, diamond_test, gm_test])

# Class shares in percent, ordered by label as groupby sorts them (-1 = blue first, 1 = red second)
class_pct = (df_bi_train.groupby('winner').size() / df_bi_train.shape[0] * 100).tolist()
print('Class distribution in training data:\nBlue: ' + '%.2f%%' % class_pct[0]
      + '\nRed: ' + '%.2f%%' % class_pct[1])

### Binary Features

In [None]:
# plotting the relationship of a single feature with the class label
def plot_objective_first(df, f):
    """Draw two side-by-side bar charts of feature ``f`` against ``winner``.

    The left panel shows the cross-tabulation normalized per ``winner`` column,
    the right panel shows the raw counts.

    Parameters
    ----------
    df : DataFrame containing the column ``f`` and a ``winner`` column.
    f : name of the feature column; also used in both panel titles.

    The figure resolution is taken from the module-level ``dpi``.
    """
    fig = plt.figure(dpi=dpi, figsize=(12,5))

    # Left panel: share of each feature value within every winner class
    ax_share = fig.add_subplot(121)
    shares = pd.crosstab(df[f], df['winner'], normalize='columns')
    shares.plot(kind='bar', color=['b', 'r'], title= f + ' vs winner (normalized)', ax=ax_share)
    ax_share.set_ylabel('win percentage')
    ax_share.legend(bbox_to_anchor = (1.0, 1.0), loc='upper left')

    # Right panel: raw counts of each feature value per winner class
    ax_count = fig.add_subplot(122)
    counts = pd.crosstab(df[f], df['winner'])
    counts.plot(kind='bar', color=['b', 'r'], title= f + ' vs winner', ax=ax_count)
    ax_count.set_ylabel('win count')
    ax_count.legend(bbox_to_anchor = (1.0, 1.0), loc='upper left')

    fig.tight_layout()

# Swap the numeric codes for readable labels, then draw one figure per feature
code_labels = {-1:'blue', 0:'none', 1:'red'}
plt_df = df_bi_train.drop(columns=['tier', 'game_duration']).replace(code_labels)
for f in plt_df.columns.drop('winner'):
    plot_objective_first(plt_df, f)

In [None]:
def plot_correlation_bar_plot(df, title):
    """Draw a horizontal bar chart of correlation values, sorted ascending.

    Parameters
    ----------
    df : Series of correlation values indexed by feature name.
    title : figure title.

    Each bar is annotated with its own value. The figure resolution is taken
    from the module-level ``dpi``.
    """
    ordered = df.sort_values()
    plt.figure(dpi=dpi, figsize=(12, 10))
    plot = plt.barh(ordered.index.tolist(), ordered, color=sns.color_palette(as_cmap=True))
    plt.title(title)
    plt.xlabel('correlation');
    for bar in plot:
        value = bar.get_width()
        # Push the label 0.03 beyond the end of the bar, away from zero.
        # The offset affects only the position: the printed text is the bar's real value.
        offset = -0.03 if value < 0 else 0.03
        plt.text(value + offset, bar.get_y()+0.3, '%.3f' % value, ha='center', va='bottom')

# plotting the correlation of features' transformed using binary method with class label
# corr() is restricted to numeric columns: the frame still holds the string column
# 'tier', on which DataFrame.corr() raises in pandas >= 2.0
plt_df = df_bi_train.select_dtypes(include='number').corr()['winner'].drop(
    ['game_duration', 'winner', 'first_kill', 'first_tower',
     'first_inhibitor', 'first_baron', 'first_dragon', 'first_rift_herald'])
plot_correlation_bar_plot(plt_df, title='Binary Features\' Correlation with Class Label')

### Difference Features

In [None]:
# 80/20 train/test split of the difference-feature frame, done per tier and
# stratified on the class label within each tier
bronze = df_diff[df_diff['tier'] == 'BRONZE']
gold = df_diff[df_diff['tier'] == 'GOLD']
diamond = df_diff[df_diff['tier'] == 'DIAMOND']
gm = df_diff[df_diff['tier'] == 'GRANDMASTERS']
bronze_train, bronze_test = sklearn.model_selection.train_test_split(bronze, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=bronze['winner'])
gold_train, gold_test = sklearn.model_selection.train_test_split(gold, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=gold['winner'])
diamond_train, diamond_test = sklearn.model_selection.train_test_split(diamond, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=diamond['winner'])
gm_train, gm_test = sklearn.model_selection.train_test_split(gm, train_size=0.8, test_size=0.2,
                            random_state=random_state, stratify=gm['winner'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the tiers in the same order
df_diff_train = pd.concat([bronze_train, gold_train, diamond_train, gm_train])
df_diff_test = pd.concat([bronze_test, gold_test, diamond_test, gm_test])

# Class shares in percent, ordered by label as groupby sorts them (-1 = blue first, 1 = red second)
class_pct = (df_diff_train.groupby('winner').size() / df_diff_train.shape[0] * 100).tolist()
print('Class distribution in training data:\nBlue:' + '%.2f%%' % class_pct[0]
      + '\nRed:' + '%.2f%%' % class_pct[1])

In [None]:
# plotting the correlation of features' transformed using difference method with class label
# corr() is restricted to numeric columns: the frame still holds the string column
# 'tier', on which DataFrame.corr() raises in pandas >= 2.0
plt_df = df_diff_train.select_dtypes(include='number').corr()['winner'].drop(
    ['game_duration', 'winner', 'first_kill', 'first_tower',
     'first_inhibitor', 'first_baron', 'first_dragon', 'first_rift_herald'])
plot_correlation_bar_plot(plt_df, title='Difference Features\' Correlation with Class Label')

In [None]:
# plotting the correlation of features' transformed using difference method and
# the initial first-objective features with class label

# corr() is restricted to numeric columns: the frame still holds the string column
# 'tier', on which DataFrame.corr() raises in pandas >= 2.0
plt_df = df_diff_train.select_dtypes(include='number').corr()['winner'].drop(['game_duration', 'winner'])
plot_correlation_bar_plot(plt_df, title='Difference Features\' Correlation with Class Label')


In [None]:
# Write the four train/test splits to the processed-data folder.
# to_csv is called with its defaults, so the row index is written as well.
processed_outputs = {
    '../data/processed/bi_train.csv': df_bi_train,
    '../data/processed/bi_test.csv': df_bi_test,
    '../data/processed/diff_train.csv': df_diff_train,
    '../data/processed/diff_test.csv': df_diff_test,
}
for out_path, split_df in processed_outputs.items():
    split_df.to_csv(out_path)