In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import chi2, f_classif, RFE, SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Raise the pandas display limits so wide / long frames are shown without truncation.
display_limits = {
    'display.max_columns': 150,
    'display.max_rows': 100,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [None]:
# Load the raw training rows and derive the engineered columns used for modelling.
train = pd.read_csv('../data/train.csv')

# Every engineered column is collected here first so that division artefacts can be
# cleaned in one place before the columns are attached to `train`.
engineered = {}

# Share-of-match features.
# NOTE(review): the first two are named "percent" but are scaled by 60 rather than 100
# (compare percent_powerslide). The factor is left unchanged because the test-set
# features are computed with the same factor and must stay consistent with training.
engineered['percent_possession'] = train['possession_time'] / train['duration'] * 60
engineered['percent_in_side'] = train['time_in_side'] / train['duration'] * 60
engineered['percent_powerslide'] = train['time_powerslide'] / train['duration'] * 100
engineered['percent_stolen'] = train['amount_stolen'] / train['amount_collected'] * 100
engineered['percent_boost_wasted'] = train['amount_used_while_supersonic'] / train['amount_collected'] * 100

# Raw totals converted to a rate: value / duration * 60.
# Assumes `duration` is recorded in seconds -- TODO confirm against the data dictionary.
per_minute_sources = [
    'amount_stolen',
    'amount_collected_big',
    'amount_stolen_big',
    'amount_collected_small',
    'amount_stolen_small',
    'count_collected_big',
    'count_stolen_big',
    'count_collected_small',
    'count_stolen_small',
    'amount_overfill',
    'amount_overfill_stolen',
    'amount_used_while_supersonic',
    'total_distance',
    'count_powerslide',
    'demos_inflicted',
    'demos_taken',
]
for source in per_minute_sources:
    engineered[f'{source}_per_min'] = train[source] / train['duration'] * 60

# Plain numerator / denominator ratios: (new column, numerator, denominator).
plain_ratios = [
    ('saves_to_shots_against_ratio', 'saves', 'shots_against'),
    ('amount_collected_big_to_small_ratio', 'amount_collected_big', 'amount_collected_small'),
    ('count_collected_big_to_small_ratio', 'count_collected_big', 'count_collected_small'),
    ('slow_speed_to_boost_speed_ratio', 'time_slow_speed', 'time_boost_speed'),
    ('slow_speed_to_supersonic_speed_ratio', 'time_slow_speed', 'time_supersonic_speed'),
    ('boost_speed_to_supersonic_speed_ratio', 'time_boost_speed', 'time_supersonic_speed'),
]
for new_column, numerator, denominator in plain_ratios:
    engineered[new_column] = train[numerator] / train[denominator]

# Ground/air ratios carry a +1 in the denominator, which keeps it non-zero when the
# air times are 0.
engineered['ground_to_air_ratio'] = train['time_ground'] / (train['time_low_air'] + train['time_high_air'] + 1)
engineered['ground_to_low_air_ratio'] = train['time_ground'] / (train['time_low_air'] + 1)
engineered['low_air_to_high_air_ratio'] = train['time_low_air'] / (train['time_high_air'] + 1)
engineered['ground_to_high_air_ratio'] = train['time_ground'] / (train['time_high_air'] + 1)

# A zero denominator yields +/-inf (x / 0) rather than NaN. The later fillna(0) does not
# remove inf, and scikit-learn's scalers / selectors reject it, so turn inf into NaN here;
# NaN is skipped by the per-match mean/std aggregation.
for new_column, values in engineered.items():
    train[new_column] = values.replace([np.inf, -np.inf], np.nan)

# Candidate model inputs, organised into loosely themed groups. The grouping is purely
# organisational; only the flattened `variables` list (same names, same order) is used.
_variable_groups = {
    'match_and_scoring': [
        'duration', 'percent_possession', 'percent_in_side',
        'shots', 'goals', 'saves', 'saves_to_shots_against_ratio',
        'score', 'shooting_percentage',
    ],
    'boost_collection': [
        'bpm', 'bcpm', 'avg_amount',
        'amount_stolen_per_min', 'percent_stolen',
        'amount_collected_big_per_min', 'amount_collected_small_per_min',
        'amount_collected_big_to_small_ratio',
        'amount_stolen_big_per_min', 'amount_stolen_small_per_min',
        'count_collected_big_per_min', 'count_collected_small_per_min',
        'count_collected_big_to_small_ratio',
        'count_stolen_big_per_min', 'count_stolen_small_per_min',
        'amount_overfill_per_min', 'amount_overfill_stolen_per_min',
        'amount_used_while_supersonic_per_min',
    ],
    'boost_level': [
        'percent_zero_boost', 'percent_full_boost',
        'percent_boost_0_25', 'percent_boost_25_50',
        'percent_boost_50_75', 'percent_boost_75_100',
        'percent_boost_wasted',
    ],
    'movement': [
        'avg_speed', 'total_distance_per_min',
        'percent_powerslide', 'count_powerslide_per_min', 'avg_powerslide_duration',
        'percent_slow_speed', 'percent_boost_speed', 'percent_supersonic_speed',
        'slow_speed_to_boost_speed_ratio',
        'slow_speed_to_supersonic_speed_ratio',
        'boost_speed_to_supersonic_speed_ratio',
    ],
    'ground_and_air': [
        'percent_ground', 'percent_low_air', 'percent_high_air',
        'ground_to_air_ratio', 'ground_to_low_air_ratio',
        'ground_to_high_air_ratio', 'low_air_to_high_air_ratio',
    ],
    'positioning': [
        'avg_distance_to_ball',
        'avg_distance_to_ball_possession',
        'avg_distance_to_ball_no_possession',
        'percent_defensive_third', 'percent_offensive_third', 'percent_neutral_third',
        'percent_defensive_half', 'percent_offensive_half',
        'percent_behind_ball',
    ],
    'demolitions': [
        'demos_inflicted_per_min', 'demos_taken_per_min',
    ],
}

# Flatten in insertion order so downstream column order matches the groups above.
variables = [name for group in _variable_groups.values() for name in group]

In [None]:
# Correlation heatmap of the per-match MEAN of every candidate variable, with rank
# encoded as an ordinal (1 = bronze ... 6 = champion) so it takes part in the correlation.
rank_order = ['bronze', 'silver', 'gold', 'platinum', 'diamond', 'champion']
rank_to_ordinal = {label: position for position, label in enumerate(rank_order, start = 1)}

heatmap_df = (
    train.groupby(['match_id', 'rank'])[variables]
    .mean()
    .reset_index()
    .drop(columns = 'match_id')
)
heatmap_df['rank'] = heatmap_df['rank'].map(rank_to_ordinal)

corr = heatmap_df.corr()
# Hide the upper triangle (including the diagonal); the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize = (32, 32), dpi = 600)
sns.heatmap(
    corr,
    mask = mask,
    annot = True,
    fmt = '.2f',
    cmap = plt.cm.RdBu_r,
    cbar = False,
)
plt.show()

In [None]:
# Correlation heatmap of the per-match STANDARD DEVIATION of every candidate variable.
# `duration` is dropped together with `match_id` before correlating.
rank_order = ['bronze', 'silver', 'gold', 'platinum', 'diamond', 'champion']
rank_to_ordinal = {label: position for position, label in enumerate(rank_order, start = 1)}

heatmap_df = (
    train.groupby(['match_id', 'rank'])[variables]
    .std()
    .reset_index()
    .drop(columns = ['match_id', 'duration'])
)
heatmap_df['rank'] = heatmap_df['rank'].map(rank_to_ordinal)

corr = heatmap_df.corr()
# Hide the upper triangle (including the diagonal); the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize = (32, 32), dpi = 600)
sns.heatmap(
    corr,
    mask = mask,
    annot = True,
    fmt = '.2f',
    cmap = plt.cm.RdBu_r,
    cbar = False,
)
plt.show()

In [None]:
# Collapse the training rows to one row per (match, rank): the mean and the standard
# deviation of every variable, with the std columns carrying a `_std` suffix.
match_keys = ['match_id', 'rank']
per_match = train.groupby(match_keys)[variables]

train_mean = per_match.mean().reset_index()
train_std = per_match.std().reset_index()
# Any NaN left after the merge is replaced by 0.
train_prepped = train_mean.merge(train_std, on = match_keys, suffixes = ('', '_std')).fillna(0)

variables_mean_std = variables + [f'{name}_std' for name in variables]
# `duration_std` is excluded from the model inputs.
model_columns = [name for name in variables_mean_std if name != 'duration_std']
X = train_prepped[model_columns]
y = train_prepped['rank']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 69420,
    stratify = y,
)

In [None]:
# Number of features each of the three selectors keeps.
k = 22

# Two scaled views of the training matrix: standardised (fed to RFE's logistic
# regression) and standardised-then-min-max scaled (fed to chi2, which needs
# non-negative inputs).
X_train_std_scaled = StandardScaler().fit_transform(X_train)
X_train_std_minmax_scaled = MinMaxScaler().fit_transform(X_train_std_scaled)

# Univariate chi-squared ranking on the min-max scaled matrix.
chi2_KBest = SelectKBest(score_func = chi2, k = k).fit(X_train_std_minmax_scaled, y_train)
chi2_KBest_features = X_train.columns[chi2_KBest.get_support(indices = True)]

# Univariate ANOVA F-test ranking on the unscaled matrix.
f_classif_KBest = SelectKBest(score_func = f_classif, k = k).fit(X_train, y_train)
f_classif_KBest_features = X_train.columns[f_classif_KBest.get_support(indices = True)]

# Recursive feature elimination driven by a logistic regression on the standardised matrix.
RFE_KBest = RFE(
    estimator = LogisticRegression(max_iter = 10000),
    n_features_to_select = k,
).fit(X_train_std_scaled, y_train)
RFE_KBest_features = X_train.columns[RFE_KBest.get_support(indices = True)]

In [None]:
# Union of the features picked by the three selectors.
# Sorted on purpose: iteration order of a set of strings depends on per-process hash
# randomisation, so an unsorted union can put the columns in a different order on each
# kernel restart, which changes the column order the pipeline is fitted on.
features = sorted(
    set(chi2_KBest_features) | set(f_classif_KBest_features) | set(RFE_KBest_features)
)

# Interaction terms -> VarianceThreshold filter -> standardise -> multinomial
# logistic regression.
pipeline = Pipeline(
    steps = [
        ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
        ('variancethreshold', VarianceThreshold()),
        ('scale', StandardScaler()),
        ('logreg', LogisticRegression(max_iter = 10000))
    ]
)

pipeline.fit(X_train[features], y_train)
# Accuracy on the held-out split (displayed as the cell output).
pipeline.score(X_test[features], y_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split; zero_division = 0 reports 0
# for classes with no predicted samples.
y_test_pred = pipeline.predict(X_test[features])
report_text = classification_report(y_test, y_test_pred, zero_division = 0)
print(report_text)

In [None]:
# features = [
#     'shots', 
#     'saves',
#     'score', 
#     'bpm', 
#     'bcpm',
#     'amount_collected_big_per_min',
#     'count_collected_small_per_min',
#     'count_stolen_big_per_min',
#     'percent_zero_boost',
#     'percent_boost_75_100',
#     'percent_full_boost',
#     'percent_full_boost_std',
#     'percent_boost_wasted',
#     'percent_boost_wasted_std',
#     'percent_powerslide',
#     'count_powerslide_per_min',
#     'avg_powerslide_duration',
#     'avg_speed',
#     'total_distance_per_min',
#     'percent_slow_speed',
#     'percent_boost_speed',
#     'percent_supersonic_speed',
#     'percent_ground',
#     'percent_ground_std',
#     'percent_low_air',
#     'percent_high_air',
#     'ground_to_air_ratio',
#     'ground_to_low_air_ratio',
#     'ground_to_low_air_ratio_std',
#     'avg_distance_to_ball',
#     'avg_distance_to_ball_std',
#     'avg_distance_to_ball_no_possession',
#     'percent_defensive_third',
#     'percent_neutral_third',
#     'percent_behind_ball',
# ]

# pipeline = Pipeline(
#     steps = [
#         ('pf', PolynomialFeatures(interaction_only = True, include_bias = False)),
#         ('variancethreshold', VarianceThreshold()),
#         ('scale', StandardScaler()),
#         ('logreg', LogisticRegression(max_iter = 10000))
#     ]
# )

# pipeline.fit(X_train[features], y_train)
# pipeline.score(X_test[features], y_test)

In [None]:
# print(classification_report(y_test, pipeline.predict(X_test[features]), zero_division = 0))

In [None]:
# Load the raw test rows and derive the same engineered columns that are computed for
# the training frame (the formulas and scale factors must match what the model was fitted on).
test = pd.read_csv('../data/test.csv')

# Every engineered column is collected here first so that division artefacts can be
# cleaned in one place before the columns are attached to `test`.
engineered = {}

# Share-of-match features.
# NOTE(review): the first two are named "percent" but are scaled by 60 rather than 100
# (compare percent_powerslide). The factor is left unchanged because the training
# features are computed with the same factor.
engineered['percent_possession'] = test['possession_time'] / test['duration'] * 60
engineered['percent_in_side'] = test['time_in_side'] / test['duration'] * 60
engineered['percent_powerslide'] = test['time_powerslide'] / test['duration'] * 100
engineered['percent_stolen'] = test['amount_stolen'] / test['amount_collected'] * 100
engineered['percent_boost_wasted'] = test['amount_used_while_supersonic'] / test['amount_collected'] * 100

# Raw totals converted to a rate: value / duration * 60.
# Assumes `duration` is recorded in seconds -- TODO confirm against the data dictionary.
per_minute_sources = [
    'amount_stolen',
    'amount_collected_big',
    'amount_stolen_big',
    'amount_collected_small',
    'amount_stolen_small',
    'count_collected_big',
    'count_stolen_big',
    'count_collected_small',
    'count_stolen_small',
    'amount_overfill',
    'amount_overfill_stolen',
    'amount_used_while_supersonic',
    'total_distance',
    'count_powerslide',
    'demos_inflicted',
    'demos_taken',
]
for source in per_minute_sources:
    engineered[f'{source}_per_min'] = test[source] / test['duration'] * 60

# Plain numerator / denominator ratios: (new column, numerator, denominator).
plain_ratios = [
    ('saves_to_shots_against_ratio', 'saves', 'shots_against'),
    ('amount_collected_big_to_small_ratio', 'amount_collected_big', 'amount_collected_small'),
    ('count_collected_big_to_small_ratio', 'count_collected_big', 'count_collected_small'),
    ('slow_speed_to_boost_speed_ratio', 'time_slow_speed', 'time_boost_speed'),
    ('slow_speed_to_supersonic_speed_ratio', 'time_slow_speed', 'time_supersonic_speed'),
    ('boost_speed_to_supersonic_speed_ratio', 'time_boost_speed', 'time_supersonic_speed'),
]
for new_column, numerator, denominator in plain_ratios:
    engineered[new_column] = test[numerator] / test[denominator]

# Ground/air ratios carry a +1 in the denominator, which keeps it non-zero when the
# air times are 0.
engineered['ground_to_air_ratio'] = test['time_ground'] / (test['time_low_air'] + test['time_high_air'] + 1)
engineered['ground_to_low_air_ratio'] = test['time_ground'] / (test['time_low_air'] + 1)
engineered['low_air_to_high_air_ratio'] = test['time_low_air'] / (test['time_high_air'] + 1)
engineered['ground_to_high_air_ratio'] = test['time_ground'] / (test['time_high_air'] + 1)

# A zero denominator yields +/-inf (x / 0) rather than NaN. The later fillna(0) does not
# remove inf, and the fitted pipeline rejects it at predict time, so turn inf into NaN
# here; NaN is skipped by the per-match mean/std aggregation.
for new_column, values in engineered.items():
    test[new_column] = values.replace([np.inf, -np.inf], np.nan)

In [None]:
# Collapse the test rows to one row per match (mean and std of every variable), in the
# same column layout the model was trained on.
per_match_test = test.groupby('match_id')[variables]
test_mean = per_match_test.mean().reset_index()
test_std = per_match_test.std().reset_index()
test_prepped = test_mean.merge(test_std, on = 'match_id', suffixes = ('', '_std')).fillna(0)

# The pipeline predicts rank labels; the submission file stores them as ordinals 1..6.
rank_labels = ['bronze', 'silver', 'gold', 'platinum', 'diamond', 'champion']
ranks = {label: position for position, label in enumerate(rank_labels, start = 1)}

predicted_labels = pipeline.predict(test_prepped[features])
y_pred = pd.Series(predicted_labels).map(ranks)

# Two-column submission: match_id and the predicted ordinal rank.
submission = pd.DataFrame({'match_id': test_prepped['match_id'], 'rank': y_pred})
submission.to_csv('../submissions/kbest_features_logreg.csv', index = False)