In [160]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [161]:
# Load the main and teamfight tables for the train and test splits.
df_train, df_test, tf_train, tf_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'main_train.csv',
        'main_test.csv',
        'teamfights_train.csv',
        'teamfights_test.csv',
    )
)

In [162]:
# Preview the first five rows of the main training table.
df_train.head(n=5)

Unnamed: 0,match_id,radiant,hero,gold_0,lh_0,xp_0,gold_60,lh_60,xp_60,gold_120,...,xp_600,level_180,level_240,level_300,level_360,level_420,level_480,level_540,level_600,radiant_win
0,0,1,Rubick,0.0,0.0,0.0,100.0,0.0,46.0,250.0,...,1741.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0,1,Wraith King,0.0,0.0,0.0,175.0,2.0,124.0,526.0,...,2319.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2,0,1,Riki,0.0,0.0,0.0,137.0,1.0,93.0,439.0,...,3859.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1
3,0,1,Tusk,0.0,0.0,0.0,100.0,0.0,62.0,200.0,...,1676.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0,1,Templar Assassin,0.0,0.0,0.0,320.0,3.0,352.0,668.0,...,4453.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1


In [163]:
# Preview the first five rows of the teamfight training table.
tf_train.head(n=5)

Unnamed: 0,teamfight_id,match_id,player_slot,radiant,buybacks,damage,deaths_player,gold_delta,xp_end,xp_start,start,end,last_death
0,3288,1,0,1,0,673,0,89,2681,2263,423,467,452
1,3288,1,1,1,0,154,0,199,2016,1694,423,467,452
2,3288,1,2,1,0,403,0,320,1640,1230,423,467,452
3,3288,1,3,1,0,201,0,158,2367,1957,423,467,452
4,3288,1,4,1,0,369,1,165,2445,2061,423,467,452


В качестве признаков возьмём суммарное количество золота, опыта и уровней каждой команды на 10-й минуте (столбцы `gold_600`, `xp_600` и `level_600`).

In [164]:
# Raw value matrix of the training frame; later cells address its columns by position.
df_train_numpy = df_train.values

In [165]:
def convert_players_values_to_team(players_values):
    """Sum per-player values into per-team totals and split them by side.

    The flat input is cut into consecutive groups of five values, each group
    is summed, and the sums are then split by parity of the group index.

    Parameters
    ----------
    players_values : numpy.ndarray
        1-D array whose length is a multiple of 5. Presumably rows are ordered
        as five players of one team followed by five of the other for every
        match -- verify against the input table.

    Returns
    -------
    tuple of numpy.ndarray
        ``(even_group_sums, odd_group_sums)``; callers in this notebook treat
        the first as the radiant totals and the second as the dire totals.
        A NaN in any player's value makes that group's sum NaN.
    """
    per_player = players_values.reshape((-1, 5))
    per_team = per_player.sum(axis=1)
    first_side, second_side = per_team[0::2], per_team[1::2]
    return first_side, second_side

# Positions of the 600-second snapshot columns inside the raw value matrix.
train_columns = df_train.columns
level_index = np.flatnonzero(train_columns == 'level_600')[0]
gold_index = np.flatnonzero(train_columns == 'gold_600')[0]
xp_index = np.flatnonzero(train_columns == 'xp_600')[0]

# Collapse each per-player column into per-team totals for both sides.
levels_rad, levels_dare = convert_players_values_to_team(df_train_numpy[:, level_index])
gold_rad, gold_dare = convert_players_values_to_team(df_train_numpy[:, gold_index])
xp_rad, xp_dare = convert_players_values_to_team(df_train_numpy[:, xp_index])

In [166]:
# One target per match: every 10th row of the last column, stored as float32.
y_train = df_train_numpy[::10, -1].astype(np.float32)

In [167]:
# Feature matrix with columns ordered (levels, gold, xp) x (radiant, dire).
team_features = (levels_rad, levels_dare, gold_rad, gold_dare, xp_rad, xp_dare)
X_train = np.vstack(team_features).T.astype(np.float32)

In [168]:
# Column means of the training features; a NaN marks a column with missing values.
X_train.mean(axis=0)

array([            nan,             nan,  12937.47558594,  12890.57128906,
        13811.203125  ,  13812.41015625], dtype=float32)

Как мы видим, в суммарных уровнях команд есть пропущенные значения (среднее по первым двум столбцам равно NaN). Заполним их средним по известным значениям.

In [169]:
# Impute missing team-level sums with the mean of the observed ones.
# `levels` is a basic-slice view of the first two columns of X_train, so
# assigning through it updates X_train in place. The NaN mask is restricted to
# these two columns: a NaN in a gold/xp column must not be silently replaced by
# a level-scale value. `levels` is kept as a module-level name because the
# test-set imputation cell reads it.
levels = X_train[:, :2]
missing_levels = np.isnan(levels)
levels[missing_levels] = np.mean(levels[~missing_levels])

In [170]:
from sklearn.model_selection import cross_val_score

In [171]:
# Mean ROC AUC of a default logistic regression over 5 cross-validation folds.
cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=5,
    scoring='roc_auc',
).mean()

0.7058422026032084

In [172]:
# Fit the final model on the full training set; the last expression displays
# the fitted estimator.
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Осталось предсказать на тесте.

In [173]:
# Same per-team aggregation as for the training split, applied to the test frame.
df_test_numpy = df_test.values

# Positions of the 600-second snapshot columns inside the raw test matrix.
test_columns = df_test.columns
level_index = np.flatnonzero(test_columns == 'level_600')[0]
gold_index = np.flatnonzero(test_columns == 'gold_600')[0]
xp_index = np.flatnonzero(test_columns == 'xp_600')[0]

# Collapse each per-player column into per-team totals for both sides.
levels_rad, levels_dare = convert_players_values_to_team(df_test_numpy[:, level_index])
gold_rad, gold_dare = convert_players_values_to_team(df_test_numpy[:, gold_index])
xp_rad, xp_dare = convert_players_values_to_team(df_test_numpy[:, xp_index])

In [174]:
# Test feature matrix with the same column order as the training one.
team_features = (levels_rad, levels_dare, gold_rad, gold_dare, xp_rad, xp_dare)
X_test = np.vstack(team_features).T.astype(np.float32)

In [175]:
# Column means of the test features; a NaN marks a column with missing values.
X_test.mean(axis=0)

array([            nan,             nan,  12920.08886719,  12877.01464844,
        13785.1875    ,  13796.19335938], dtype=float32)

In [176]:
# Impute missing test level sums with the training statistic: `levels` is the
# view of X_train's two level columns created in the training imputation cell.
# Only the two level columns of X_test are touched, so a NaN in a gold/xp
# column is never overwritten with a level-scale value.
test_levels = X_test[:, :2]
test_levels[np.isnan(test_levels)] = np.mean(levels[~np.isnan(levels)])

In [177]:
# Class probabilities for every test row; one column per class.
predictions = clf.predict_proba(X=X_test)

In [178]:
# One match id per match: every 10th row of the test table, as integers.
indeces = df_test['match_id'].values[::10].astype(int)

In [179]:
# Submission table: match id plus the probability from the second class column.
submission = pd.DataFrame({
    'index': indeces,
    'proba': predictions[:, 1],
})

In [180]:
# Write the submission without the DataFrame's own row index.
submission.to_csv(path_or_buf='baseline.csv', index=False)