# DataLouna, тестовое задание DS
## Пивнев Игорь, qw1gor.p@gmail.com

In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier


SEED = 31415

# Загрузка и беглый осмотр данных

In [2]:
test = pd.read_csv('test.csv', index_col='index')
test['is_test'] = 1
test.head()

Unnamed: 0_level_0,map_id,team1_id,team2_id,map_name,is_test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
713,309,5973,5752,Dust2,1
714,541,5973,5752,Vertigo,1
715,1,8297,7020,Nuke,1
716,392,8297,7020,Mirage,1
717,684,8297,7020,Overpass,1


In [3]:
train = pd.read_csv('train.csv')
train.index.rename('index', inplace=True)
train['is_test'] = 0
train.head()

Unnamed: 0_level_0,map_id,team1_id,team2_id,map_name,who_win,is_test
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,289,6665,7718,Ancient,0,0
1,715,4411,10577,Inferno,0,0
2,157,11251,9455,Nuke,1,0
3,524,4608,7532,Mirage,0,0
4,404,8637,6667,Overpass,1,0


In [4]:
players = pd.read_csv('players_feats.csv')
players.head()

Unnamed: 0,p1_id,p1_total_kills,p1_headshots,p1_total_deaths,p1_kd_ratio,p1_damage_per_round,p1_grenade_damage_per_round,p1_maps_played,p1_rounds_played,p1_kills_per_round,...,p5_kill_death_difference,p5_total_opening_kills,p5_total_opening_deaths,p5_opening_kill_ratio,p5_opening_kill_rating,p5_team_win_percent_after_first_kill,p5_first_kill_in_won_rounds,team_id,map_name,map_id
0,4954,90,42.2,112,0.8,76.3,5.9,6,156,0.58,...,5,25,12,2.08,1.28,84.0,25.0,6665,Ancient,635
1,5794,45,60.0,57,0.79,82.3,10.9,3,68,0.66,...,96,54,34,1.59,1.17,70.4,16.7,7532,Ancient,635
2,4954,156,51.9,167,0.93,63.5,3.4,10,265,0.59,...,22,26,19,1.37,1.1,88.5,20.5,6665,Dust2,583
3,5794,449,53.5,427,1.05,86.7,13.1,23,618,0.73,...,104,62,49,1.27,1.1,79.0,17.4,7532,Dust2,583
4,7998,173,32.9,130,1.33,82.4,2.9,9,225,0.77,...,19,27,25,1.08,1.08,81.5,16.2,4608,Dust2,439


# Исследование данных

In [5]:
data = pd.concat([train, test]).rename(columns={'team1_id': 'team_id'})
data = pd.merge(data, players.drop('map_name', axis=1), on=['team_id', 'map_id'])
data.drop('team_id', axis=1, inplace=True)

data = data.rename(columns={'team2_id': 'team_id'})
data = pd.merge(data, players.drop('map_name', axis=1), on=['team_id', 'map_id'])
data.drop(['team_id', 'map_id'], axis=1, inplace=True)
data.head()

Unnamed: 0,map_name,who_win,is_test,p1_id_x,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,p1_grenade_damage_per_round_x,...,p5_kill_death_y,p5_kill_round_y,p5_rounds_with_kills_y,p5_kill_death_difference_y,p5_total_opening_kills_y,p5_total_opening_deaths_y,p5_opening_kill_ratio_y,p5_opening_kill_rating_y,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y
0,Ancient,0.0,0,4954,258,36.0,293,0.88,71.1,6.3,...,0.75,0.55,75,-34,21,20,1.05,1.06,76.2,19.0
1,Inferno,0.0,0,8611,178,39.3,208,0.86,64.1,6.5,...,1.43,0.78,197,90,64,25,2.56,1.39,75.0,23.3
2,Nuke,1.0,0,7938,494,52.8,397,1.24,94.0,5.8,...,0.81,0.55,101,-32,32,35,0.91,1.09,53.1,15.5
3,Mirage,0.0,0,7998,474,29.1,304,1.56,86.9,2.4,...,0.92,0.54,109,-13,15,21,0.71,0.82,73.3,7.9
4,Overpass,1.0,0,2898,217,55.8,248,0.88,64.8,4.6,...,1.17,0.72,122,26,20,10,2.0,1.01,80.0,12.4


In [6]:
data.map_name = LabelEncoder().fit_transform(data.map_name)
data.head()

Unnamed: 0,map_name,who_win,is_test,p1_id_x,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,p1_grenade_damage_per_round_x,...,p5_kill_death_y,p5_kill_round_y,p5_rounds_with_kills_y,p5_kill_death_difference_y,p5_total_opening_kills_y,p5_total_opening_deaths_y,p5_opening_kill_ratio_y,p5_opening_kill_rating_y,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y
0,0,0.0,0,4954,258,36.0,293,0.88,71.1,6.3,...,0.75,0.55,75,-34,21,20,1.05,1.06,76.2,19.0
1,2,0.0,0,8611,178,39.3,208,0.86,64.1,6.5,...,1.43,0.78,197,90,64,25,2.56,1.39,75.0,23.3
2,4,1.0,0,7938,494,52.8,397,1.24,94.0,5.8,...,0.81,0.55,101,-32,32,35,0.91,1.09,53.1,15.5
3,3,0.0,0,7998,474,29.1,304,1.56,86.9,2.4,...,0.92,0.54,109,-13,15,21,0.71,0.82,73.3,7.9
4,5,1.0,0,2898,217,55.8,248,0.88,64.8,4.6,...,1.17,0.72,122,26,20,10,2.0,1.01,80.0,12.4


In [7]:
print('Признаки с пропусками:',
      data.columns[data.isna().any()].tolist())

Признаки с пропусками: ['who_win', 'p1_team_win_percent_after_first_kill_x', 'p2_team_win_percent_after_first_kill_x', 'p3_team_win_percent_after_first_kill_x', 'p4_team_win_percent_after_first_kill_x', 'p5_team_win_percent_after_first_kill_x', 'p1_team_win_percent_after_first_kill_y', 'p2_team_win_percent_after_first_kill_y', 'p3_team_win_percent_after_first_kill_y', 'p4_team_win_percent_after_first_kill_y', 'p5_team_win_percent_after_first_kill_y']


In [8]:
print('Всего пропусков в датасете:',
      data.isna().sum().sum())

Всего пропусков в датасете: 48


In [9]:
data.fillna(0, inplace=True)
data = data.astype({'who_win': int})

for col_name in data.columns:
    for name in ['id']:
        if name in col_name:
            data.drop(col_name, axis=1, inplace=True)
data.head()

Unnamed: 0,map_name,who_win,is_test,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,p1_grenade_damage_per_round_x,p1_maps_played_x,...,p5_kill_death_y,p5_kill_round_y,p5_rounds_with_kills_y,p5_kill_death_difference_y,p5_total_opening_kills_y,p5_total_opening_deaths_y,p5_opening_kill_ratio_y,p5_opening_kill_rating_y,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y
0,0,0,0,258,36.0,293,0.88,71.1,6.3,17,...,0.75,0.55,75,-34,21,20,1.05,1.06,76.2,19.0
1,2,0,0,178,39.3,208,0.86,64.1,6.5,12,...,1.43,0.78,197,90,64,25,2.56,1.39,75.0,23.3
2,4,1,0,494,52.8,397,1.24,94.0,5.8,22,...,0.81,0.55,101,-32,32,35,0.91,1.09,53.1,15.5
3,3,0,0,474,29.1,304,1.56,86.9,2.4,18,...,0.92,0.54,109,-13,15,21,0.71,0.82,73.3,7.9
4,5,1,0,217,55.8,248,0.88,64.8,4.6,14,...,1.17,0.72,122,26,20,10,2.0,1.01,80.0,12.4


In [10]:
play_features = []
for col_name in data.columns:
    if 'p1' == col_name[:2] and col_name[-1] == 'x':
        play_features.append(col_name[3:-2])
print(play_features)

['total_kills', 'headshots', 'total_deaths', 'kd_ratio', 'damage_per_round', 'grenade_damage_per_round', 'maps_played', 'rounds_played', 'kills_per_round', 'assists_per_round', 'deaths_per_round', 'saved_by_teammate_per_round', 'saved_teammates_per_round', 'rating', 'kill_death', 'kill_round', 'rounds_with_kills', 'kill_death_difference', 'total_opening_kills', 'total_opening_deaths', 'opening_kill_ratio', 'opening_kill_rating', 'team_win_percent_after_first_kill', 'first_kill_in_won_rounds']


In [11]:
kd_cols = []
maps_cols = []
rating_cols = []
for col_name in data.columns:
    if 'kd_ratio' in col_name:
        kd_cols.append(col_name)
    elif 'maps_played' in col_name:
        maps_cols.append(col_name)
    elif 'rating' in col_name:
        rating_cols.append(col_name)

data['kd_ratio_x'] = data[kd_cols[0]]
data['kd_ratio_y'] = data[kd_cols[5]]
data['maps_x'] = data[maps_cols[0]]
data['maps_y'] = data[maps_cols[5]]
data['rating_x'] = data[rating_cols[0]]
data['rating_y'] = data[rating_cols[5]]

for i in range(1, 5):
    data['kd_ratio_x'] += data[kd_cols[i]]
    data['kd_ratio_y'] += data[kd_cols[i+5]]
    data['maps_x'] += data[maps_cols[i]]
    data['maps_y'] += data[maps_cols[i+5]]
    data['rating_x'] += data[rating_cols[i]]
    data['rating_y'] += data[rating_cols[i+5]]

data.head()

Unnamed: 0,map_name,who_win,is_test,p1_total_kills_x,p1_headshots_x,p1_total_deaths_x,p1_kd_ratio_x,p1_damage_per_round_x,p1_grenade_damage_per_round_x,p1_maps_played_x,...,p5_opening_kill_ratio_y,p5_opening_kill_rating_y,p5_team_win_percent_after_first_kill_y,p5_first_kill_in_won_rounds_y,kd_ratio_x,kd_ratio_y,maps_x,maps_y,rating_x,rating_y
0,0,0,0,258,36.0,293,0.88,71.1,6.3,17,...,1.05,1.06,76.2,19.0,5.38,4.83,74,35,4.97,5.51
1,2,0,0,178,39.3,208,0.86,64.1,6.5,12,...,2.56,1.39,75.0,23.3,4.68,5.64,76,152,4.83,5.54
2,4,1,0,494,52.8,397,1.24,94.0,5.8,22,...,0.91,1.09,53.1,15.5,5.67,5.42,105,65,5.97,5.03
3,3,0,0,474,29.1,304,1.56,86.9,2.4,18,...,0.71,0.82,73.3,7.9,5.71,5.09,87,76,5.89,4.96
4,5,1,0,217,55.8,248,0.88,64.8,4.6,14,...,2.0,1.01,80.0,12.4,5.27,5.53,67,45,4.85,5.63


# Подготовка и обучение модели

In [12]:
X = data[data.is_test == 0].drop(['who_win', 'is_test'], axis=1)
y = data[data.is_test == 0].who_win

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.18, random_state=SEED)

In [13]:
model = CatBoostClassifier(depth=7, l2_leaf_reg=5.0,
                           silent=True, random_state=SEED)
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f26f4dd2410>

In [14]:
y_pred = model.predict(X_val)
print('ROC-AUC на валидации: ',
      roc_auc_score(y_val, y_pred))

ROC-AUC на валидации:  0.5930992736077483


# Выгрузка результата

In [15]:
test_data = data[data.is_test == 1].drop(['who_win', 'is_test'], axis=1)
predict = model.predict(test_data)

In [16]:
output = pd.DataFrame(columns=['index', 'who_win'])
output['index'] = test.index
output['who_win'] = predict
output.to_csv('prediction.csv', index=False)

In [17]:
output.head()

Unnamed: 0,index,who_win
0,713,0
1,714,0
2,715,1
3,716,0
4,717,0
