In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
# Raise the pandas display limits: up to 150 columns and 100 rows are shown
# before the rendered output is truncated.
display_limits = {
    'display.max_columns': 150,
    'display.max_rows': 100,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [None]:
# Load the training rows and derive per-minute rate columns from raw totals.
train = pd.read_csv('../data/train.csv')

# Raw total columns that get a rate counterpart. Every entry `name` produces a
# new column `name_per_min` computed as `name / duration * 60`.
# NOTE(review): the `* 60` implies `duration` is measured in seconds — confirm.
# NOTE(review): a row with `duration == 0` would yield inf/NaN here — confirm
# the data cannot contain one.
per_min_source_columns = [
    'amount_collected',
    'amount_stolen',
    'amount_collected_big',
    'amount_stolen_big',
    'amount_collected_small',
    'amount_stolen_small',
    'count_collected_big',
    'count_stolen_big',
    'count_collected_small',
    'count_stolen_small',
    'amount_overfill',
    'amount_overfill_stolen',
    'amount_used_while_supersonic',
    'total_distance',
    'time_powerslide',
    'count_powerslide',
    'demos_inflicted',
    'demos_taken',
]
for column in per_min_source_columns:
    train[f'{column}_per_min'] = train[column] / train['duration'] * 60

# Columns that the later cells aggregate per (match_id, rank).
variables = [
    # match-level and scoring statistics
    'duration',
    'possession_time',
    'time_in_side',
    'shots',
    'goals',
    'saves',
    'score',
    'shooting_percentage',
    # boost statistics
    'bpm',
    'bcpm',
    'avg_amount',
    'amount_collected_per_min',
    'amount_stolen_per_min',
    'amount_collected_big_per_min',
    'amount_stolen_big_per_min',
    'amount_collected_small_per_min',
    'amount_stolen_small_per_min',
    'count_collected_big_per_min',
    'count_stolen_big_per_min',
    'count_collected_small_per_min',
    'count_stolen_small_per_min',
    'amount_overfill_per_min',
    'amount_overfill_stolen_per_min',
    'amount_used_while_supersonic_per_min',
    'percent_zero_boost',
    'percent_full_boost',
    'percent_boost_0_25',
    'percent_boost_25_50',
    'percent_boost_50_75',
    'percent_boost_75_100',
    # movement statistics
    'avg_speed',
    'total_distance_per_min',
    'time_powerslide_per_min',
    'count_powerslide_per_min',
    'avg_powerslide_duration',
    'avg_speed_percentage',
    'percent_slow_speed',
    'percent_boost_speed',
    'percent_supersonic_speed',
    'percent_ground',
    'percent_low_air',
    'percent_high_air',
    # positioning statistics
    'avg_distance_to_ball',
    'avg_distance_to_ball_possession',
    'avg_distance_to_ball_no_possession',
    'percent_defensive_third',
    'percent_offensive_third',
    'percent_neutral_third',
    'percent_defensive_half',
    'percent_offensive_half',
    'percent_behind_ball',
    'percent_infront_ball',
    # demolition statistics
    'demos_inflicted_per_min',
    'demos_taken_per_min',
]

In [None]:
# Correlation heatmap of the per-(match_id, rank) MEAN of every variable,
# with `rank` encoded as an integer so it takes part in the correlation.
rank_order = {
    'bronze': 1,
    'silver': 2,
    'gold': 3,
    'platinum': 4,
    'diamond': 5,
    'champion': 6,
}

heatmap_df = (
    train.groupby(['match_id', 'rank'])[variables]
    .mean()
    .reset_index()
    .drop(columns='match_id')
)
heatmap_df['rank'] = heatmap_df['rank'].map(rank_order)

corr = heatmap_df.corr()
# Mask the upper triangle (diagonal included); only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(32, 32), dpi=600)
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap=plt.cm.RdBu_r,
    cbar=False,
    ax=ax,
)
plt.show()

In [None]:
# Correlation heatmap of the per-(match_id, rank) STANDARD DEVIATION of every
# variable, with `rank` encoded as an integer so it takes part in the correlation.
rank_order = {
    'bronze': 1,
    'silver': 2,
    'gold': 3,
    'platinum': 4,
    'diamond': 5,
    'champion': 6,
}

# `duration` is removed along with the id column.
# NOTE(review): presumably because duration does not vary within a match — confirm.
heatmap_df = (
    train.groupby(['match_id', 'rank'])[variables]
    .std()
    .reset_index()
    .drop(columns=['match_id', 'duration'])
)
heatmap_df['rank'] = heatmap_df['rank'].map(rank_order)

corr = heatmap_df.corr()
# Mask the upper triangle (diagonal included); only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(32, 32), dpi=600)
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap=plt.cm.RdBu_r,
    cbar=False,
    ax=ax,
)
plt.show()

In [None]:
# Aggregate the training rows to one row per (match_id, rank): the mean and the
# standard deviation of every variable, placed side by side.
match_groups = train.groupby(['match_id', 'rank'])[variables]
train_mean = match_groups.mean().reset_index()
train_std = match_groups.std().reset_index()

# Mean columns keep their original names; std columns receive the `_std` suffix.
train_prepped = train_mean.merge(
    train_std,
    on=['match_id', 'rank'],
    suffixes=('', '_std'),
)

variables_mean_std = variables + [f'{var}_std' for var in variables]

# Every mean/std column except `duration_std` forms the design matrix.
design_columns = [col for col in variables_mean_std if col != 'duration_std']
X = train_prepped[design_columns]
y = train_prepped['rank']

# Hold out 20% of the matches, stratified by rank, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Per-match mean columns used as model inputs.
features = [
    'shots',
    'saves',
    'score',
    'bpm',
    'bcpm',
    'count_collected_small_per_min',
    'percent_full_boost',
    'percent_boost_0_25',
    'percent_boost_75_100',
    'avg_speed',
    'total_distance_per_min',
    'avg_powerslide_duration',
    'time_powerslide_per_min',
    'count_powerslide_per_min',
    'percent_slow_speed',
    'percent_supersonic_speed',
    'percent_ground',
    'percent_low_air',
    'avg_distance_to_ball',
    'percent_behind_ball',
]

# Standardise the inputs, add interaction-only polynomial terms (no bias
# column), then fit a logistic regression with a generous iteration budget.
pipeline_steps = [
    ('scale', StandardScaler()),
    ('pf', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('logreg', LogisticRegression(max_iter=10000)),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline.fit(X_train[features], y_train)

# Score on the held-out split; kept as the final expression so the notebook
# displays the value.
pipeline.score(X_test[features], y_test)

In [None]:
# Print the classification report for the held-out split; metrics that would
# divide by zero are reported as 0.
y_test_pred = pipeline.predict(X_test[features])
print(classification_report(y_test, y_test_pred, zero_division=0))

In [None]:
# Load the test rows and derive per-minute rate columns from raw totals.
test = pd.read_csv('../data/test.csv')

# Raw total columns that get a rate counterpart. Every entry `name` produces a
# new column `name_per_min` computed as `name / duration * 60`.
# NOTE(review): the `* 60` implies `duration` is measured in seconds — confirm.
# NOTE(review): a row with `duration == 0` would yield inf/NaN here — confirm
# the data cannot contain one.
test_per_min_source_columns = [
    'total_distance',
    'time_powerslide',
    'count_powerslide',
    'amount_collected_big',
    'amount_collected_small',
    'count_collected_big',
    'count_collected_small',
    'demos_inflicted',
    'demos_taken',
]
for column in test_per_min_source_columns:
    test[f'{column}_per_min'] = test[column] / test['duration'] * 60

In [None]:
# Collapse the test rows to one row per match: the mean of each model feature.
test = test.groupby('match_id')[features].mean().reset_index()

# Integer code written to the submission file for each predicted rank name.
ranks = {
    'bronze': 1,
    'silver': 2,
    'gold': 3,
    'platinum': 4,
    'diamond': 5,
    'champion': 6,
}
predicted_labels = pipeline.predict(test[features])
y_pred = pd.Series(predicted_labels).map(ranks)

# Both inputs share the same 0..n-1 index, so they line up row by row.
submission = pd.DataFrame({'match_id': test['match_id'], 'rank': y_pred})
submission.to_csv('../submissions/logreg.csv', index=False)