In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import json
import pickle
import tqdm

from sklearn import linear_model
from sklearn.utils.testing import ignore_warnings

np.set_printoptions(precision=4, suppress=True)

from collections import Counter

In [2]:
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150

----

### Задание

**Background**: в спортивном “Что? Где? Когда?” соревнующиеся команды отвечают на одни и те же вопросы. После минуты обсуждения команды записывают и сдают свои ответы на карточках; побеждает тот, кто ответил на большее число вопросов. Турнир обычно состоит из нескольких десятков вопросов (обычно 36 или 45, иногда 60, больше редко). Часто бывают синхронные турниры, когда на одни и те же вопросы отвечают команды на сотнях игровых площадок по всему миру, т.е. в одном турнире могут играть сотни, а то и тысячи команд. Соответственно, нам нужно:
- построить рейтинг-лист, который способен нетривиально предсказывать результаты будущих турниров;
- при этом, поскольку ЧГК — это хобби, и контрактов тут никаких нет, игроки постоянно переходят из команды в команду, сильный игрок может на один турнир сесть поиграть за другую команду и т.д.; поэтому единицей рейтинг-листа должна быть не команда, а отдельный игрок;


### Структура решения

I часть, в которой генерятся фичи для задачи предсказания вероятностности ответить на вопрос

II часть, в которой я понял, что на тесте будут ровно те же игроки, что и на трейне, а значит можно переобучаться сколько душе угодно

III часть, где предсказываются результаты турниров 2020 года с известными составами, но неизвестными вопросами

IV часть, где вводится ЕМ-алгоритм для учета влияния скрытой переменной "сила команды"

V часть, с рейтинг-листом турниров по сложности вопросов.

----

### I часть, в которой генерятся фичи и решается задача предсказания вероятностности ответить на вопрос

Читаем данные и генерируем фичи: 
- характеристики игрока
  - сколько турниров cыграл игрок
  - средний процент верных ответов 
  - средний процент верных ответов по маскам из questionQty
  - relative_team_position - относительная позициия в турнире (относительная т.к. есть турниры на 20 команд и на 1000+ команд)

In [5]:
%%time
tournaments = pickle.load(open('./data/tournaments.pkl', 'rb'))
results = pickle.load(open('./data/results.pkl', 'rb'))
players = pickle.load(open('./data/players.pkl', 'rb'))

CPU times: user 13.6 s, sys: 2.65 s, total: 16.3 s
Wall time: 16.8 s


In [6]:
[ (k, v['name']) for k,v in tournaments.items() if v['dateStart'][:4] == '2019' ][:5]

[(4772, 'Синхрон северных стран. Зимний выпуск'),
 (4973, 'Балтийский Берег. 3 игра'),
 (4974, 'Балтийский Берег. 4 игра'),
 (4975, 'Балтийский Берег. 5 игра'),
 (4986, 'ОВСЧ. 6 этап')]

In [5]:
len([ (k, v['name']) for k,v in tournaments.items() if v['dateStart'][:4] == '2017' ]), \
len([ (k, v['name']) for k,v in tournaments.items() if v['dateStart'][:4] == '2018' ]), \
len([ (k, v['name']) for k,v in tournaments.items() if v['dateStart'][:4] == '2019' ]), \
len([ (k, v['name']) for k,v in tournaments.items() if v['dateStart'][:4] == '2020' ]),

(533, 622, 687, 418)

In [6]:
tournaments[5465]

{'dateEnd': '2019-05-19T18:00:00+03:00',
 'dateStart': '2019-05-18T12:00:00+03:00',
 'id': 5465,
 'name': 'Чемпионат России',
 'orgcommittee': [{'id': 31038,
   'name': 'Владимир',
   'patronymic': 'Владимирович',
   'surname': 'Сушков'},
  {'id': 26469,
   'name': 'Алексей',
   'patronymic': 'Владимирович',
   'surname': 'Рабин'},
  {'id': 25882,
   'name': 'Максим',
   'patronymic': 'Оскарович',
   'surname': 'Поташев'},
  {'id': 144,
   'name': 'Сергей',
   'patronymic': 'Леонидович',
   'surname': 'Абрамов'}],
 'questionQty': {'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, '6': 15},
 'season': '/seasons/52',
 'synchData': None,
 'type': {'id': 2, 'name': 'Обычный'}}

In [7]:
results[5465][0]['mask'], [p['player']['id'] for p in results[5465][0]['teamMembers']]

('010111101111010110001000111101011010111011000111111111110011110100001010101111111111011110',
 [28751, 30152, 30270, 27822, 27403, 4270])

In [8]:
# for team in results[5465]:
#     summ = sum([int(s) for s in team['mask']])
#     print(team['mask'], [p['player']['id'] for p in team['teamMembers']], summ)

----

In [9]:
%%time
rows = []
for game_id in results:

    if tournaments[game_id]['dateStart'][:4] not in ['2019']:
        continue

    try:
        for team in results[game_id]:
            team_name = team['team']['name']
            position = team['position']
            team_id = team['team']['id']
            player_ids = sorted([p['player']['id'] for p in team['teamMembers']])
            team_mask = team['mask']
            tournaments_type = tournaments[game_id]['type']['name']
            tournaments_questionQty = tournaments[game_id]['questionQty']
            date_start = tournaments[game_id]['dateStart'][:10]
            
            rows.append((game_id, date_start, team_id, player_ids, team_mask, position, tournaments_type, tournaments_questionQty))
            
    except Exception as e:
        #print(game_id, e)
        continue

CPU times: user 3.79 s, sys: 1.68 s, total: 5.47 s
Wall time: 5.68 s


In [10]:
# 

In [11]:
df = pd.DataFrame(rows,)

In [12]:
df.shape

(86638, 8)

In [13]:
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,4772,2019-01-05,45556,"[6212, 15456, 18036, 18332, 22799, 26089]",111111111011111110111111111100010010,1.0,Синхрон,"{'1': 12, '2': 12, '3': 12}"
1,4772,2019-01-05,1030,"[1584, 1585, 10998, 16206, 40840]",111111111011110100101111011001011010,5.5,Синхрон,"{'1': 12, '2': 12, '3': 12}"
2,4772,2019-01-05,4252,"[10187, 18168, 21060, 23513, 31332, 35850]",111111111011110101101111001011110000,5.5,Синхрон,"{'1': 12, '2': 12, '3': 12}"


In [14]:
df.columns = ['game_id', 'date_start', 'team_id', 'team', 'tmask', 'team_position', 'type', 'questionQty']

In [15]:
df.head(3)

Unnamed: 0,game_id,date_start,team_id,team,tmask,team_position,type,questionQty
0,4772,2019-01-05,45556,"[6212, 15456, 18036, 18332, 22799, 26089]",111111111011111110111111111100010010,1.0,Синхрон,"{'1': 12, '2': 12, '3': 12}"
1,4772,2019-01-05,1030,"[1584, 1585, 10998, 16206, 40840]",111111111011110100101111011001011010,5.5,Синхрон,"{'1': 12, '2': 12, '3': 12}"
2,4772,2019-01-05,4252,"[10187, 18168, 21060, 23513, 31332, 35850]",111111111011110101101111001011110000,5.5,Синхрон,"{'1': 12, '2': 12, '3': 12}"


#### Немного чистки: 

заменить '?' на 0, 'X' на ''

Команды, у которых длина маски меньше положенного - заполняем знаком 'N' ('no data')

то есть неполную маску 101101010111001 мы превращаем в 101101010111001NNNNNNNNNNNNNNNNNNNNN

In [16]:
%%time
df = df[df.team.apply(len) != 0]
df = df[df.tmask.notnull()]

df.date_start = pd.to_datetime(df.date_start)

df.tmask = df.tmask.str.replace('?', '0')    # заменяем на ноль 
df.tmask = df.tmask.str.replace('X', '')     # 'X' - удаляем, т.к. у всех других команд на этом месте всегда тоже 'X'

df['tmask_sum'] = df.tmask.apply(lambda x: sum([int(s) for s in x ]))

# будем использовать как фичу
dd = {'Синхрон': '1', 'Асинхрон': '2', 'Обычный': '3', 'Общий зачёт': '3', 'Строго синхронный': '1'}
df.type = df.type.map(dd).fillna('1').astype(int)

# # 16 турниров, где у команд неодинаковая длина маски
# # турниры: [5462, 5553, 5554, 5703, 5705, 5760, 5864, 6026, 6085, 6090, 6249, 6254, 6255, 6265, 6307, 6308]
df['tmask_len'] = df.tmask.apply(len) 
zz = df.groupby('game_id').tmask_len.nunique().loc[lambda x: x != 1]
# можно удалить, но тогда теряем 10% датасета!
# df = df[~df.game_id.isin(zz.index)]
# поэтому мы будем заполнять символом N - No data
# то есть неполную маску 101101010111001 мы превращаем в 101101010111001NNNNNNNNNNNNNNNNNNNNN
df['tmask_max_len'] = df.game_id.map( df.groupby('game_id').tmask_len.max() )
condition = df.game_id.isin(zz.index)
df.loc[condition, 'tmask'] = df[condition].tmask.str.pad(width=999, side='right', fillchar='N')
df.loc[condition, 'tmask'] = df[condition].apply(lambda x: x.tmask[:x.tmask_max_len], axis=1)

CPU times: user 1.76 s, sys: 56.2 ms, total: 1.81 s
Wall time: 1.94 s


In [17]:
df.head()

Unnamed: 0,game_id,date_start,team_id,team,tmask,team_position,type,questionQty,tmask_sum,tmask_len,tmask_max_len
0,4772,2019-01-05,45556,"[6212, 15456, 18036, 18332, 22799, 26089]",111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36
1,4772,2019-01-05,1030,"[1584, 1585, 10998, 16206, 40840]",111111111011110100101111011001011010,5.5,1,"{'1': 12, '2': 12, '3': 12}",25,36,36
2,4772,2019-01-05,4252,"[10187, 18168, 21060, 23513, 31332, 35850]",111111111011110101101111001011110000,5.5,1,"{'1': 12, '2': 12, '3': 12}",25,36,36
3,4772,2019-01-05,5444,"[15381, 27375, 28939, 36742, 54289]",101111101111111110001101011001111010,5.5,1,"{'1': 12, '2': 12, '3': 12}",25,36,36
4,4772,2019-01-05,40931,"[12400, 17720, 26988, 28689, 30597, 69476]",111111101011111101000111001001111110,5.5,1,"{'1': 12, '2': 12, '3': 12}",25,36,36


In [18]:
%%time

# df2 = df[['game_id', 'date_start', 'team_id', 'team', 'tmask', 'team_position']].copy()
df2 = df.copy()

df2 = df2.explode('team')

df2 = df2.rename(columns={'team': 'player_id'})

df2['tmask_list'] = df2.tmask.apply(lambda x: [(question_id, y) for question_id, y in enumerate(x)])
# df2 = df2.drop(['tmask'], axis=1)

CPU times: user 4.64 s, sys: 1.69 s, total: 6.33 s
Wall time: 6.43 s


In [19]:
df2.head()

Unnamed: 0,game_id,date_start,team_id,player_id,tmask,team_position,type,questionQty,tmask_sum,tmask_len,tmask_max_len,tmask_list
0,4772,2019-01-05,45556,6212,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."
0,4772,2019-01-05,45556,15456,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."
0,4772,2019-01-05,45556,18036,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."
0,4772,2019-01-05,45556,18332,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."
0,4772,2019-01-05,45556,22799,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."


In [20]:
def tmask_1_rate(x):
    try: 
        mask_to_int = [int(i) for i in x if i != 'N']
        sum_ones = sum(mask_to_int)
        len_mask = len(mask_to_int)
        return sum_ones / len_mask
    except: 
        return np.NaN

In [21]:
def tmask_N_rate(x):
    mask_to_int = sum([1 for i in x if i == 'N'])
    return mask_to_int / len(x)

In [22]:
%%time
df2['questions_hit_rate'] = df2.tmask.apply(tmask_1_rate)
df2['questions_N_rate'] = df2.tmask.apply(tmask_N_rate)

CPU times: user 8.29 s, sys: 116 ms, total: 8.41 s
Wall time: 8.51 s


In [23]:
df2['relative_team_position'] = df2.team_position / df2.game_id.map( df2.groupby('game_id').team_id.nunique() )

In [24]:
df2.head()

Unnamed: 0,game_id,date_start,team_id,player_id,tmask,team_position,type,questionQty,tmask_sum,tmask_len,tmask_max_len,tmask_list,questions_hit_rate,questions_N_rate,relative_team_position
0,4772,2019-01-05,45556,6212,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.777778,0.0,0.004329
0,4772,2019-01-05,45556,15456,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.777778,0.0,0.004329
0,4772,2019-01-05,45556,18036,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.777778,0.0,0.004329
0,4772,2019-01-05,45556,18332,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.777778,0.0,0.004329
0,4772,2019-01-05,45556,22799,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.777778,0.0,0.004329


### questionQty

Вопросы разделены на несколько ступеней, обычно 12 + 12 + 12 , но иногда бывает 15+15+15 или 36 + 36 + 36 + 36 + 36 + 36 и др.

Посчитаем по каждой стадии процент верных ответов т.к. есть гипотеза что первая стадия проще чем последняя.

In [25]:
df2.questionQty.astype(str).value_counts()[:20]

{'1': 12, '2': 12, '3': 12}                                        305758
{'1': 15, '2': 15, '3': 15}                                         27348
{'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, '6': 36}              17545
{'1': 12, '2': 12, '3': 12, '4': 12}                                16358
{'1': 36, '2': 36, '3': 36, '4': 36}                                11399
{'1': 13, '2': 13, '3': 13}                                         10524
{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15, '6': 15}               6638
{'1': 36, '2': 45, '3': 45, '4': 36, '5': 36, '6': 36}               6086
{'1': 15, '2': 15, '3': 15, '4': 15, '5': 15}                        5646
{'1': 18, '2': 18}                                                   4818
{'1': 36, '2': 36, '3': 36, '4': 36, '5': 36, '6': 36, '7': 36}      4753
{'1': 12, '2': 12, '3': 12, '4': 12, '5': 12, '6': 12}               4331
{'1': 15, '2': 15, '3': 15, '4': 15}                                 3695
{'1': 14, '2': 14, '3': 14}           

In [26]:
condition = df2.questionQty.apply(len) > 0
df2.loc[condition, 'tmask_p1'] = df2.loc[condition, 'tmask'].apply(lambda x: x[: len(x) // 3])
df2.loc[condition, 'tmask_p2'] = df2.loc[condition, 'tmask'].apply(lambda x: x[len(x) // 3 : 2 * len(x) // 3])
df2.loc[condition, 'tmask_p3'] = df2.loc[condition, 'tmask'].apply(lambda x: x[2 * len(x) // 3 :])

condition = df2.questionQty.apply(len) % 2
df2.loc[condition, 'tmask_p1'] = df2.loc[condition, 'tmask'].apply(lambda x: x[: len(x) //2 ])
df2.loc[condition, 'tmask_p3'] = df2.loc[condition, 'tmask'].apply(lambda x: x[len(x) // 2 :])

condition = df2.questionQty.apply(len) % 4 == 0
df2.loc[condition, 'tmask_p1'] = df2.loc[condition, 'tmask'].apply(lambda x: x[: len(x) // 4])
df2.loc[condition, 'tmask_p2'] = df2.loc[condition, 'tmask'].apply(lambda x: x[len(x) // 4 : 2 * len(x) // 4])
df2.loc[condition, 'tmask_p2'] = df2.loc[condition, 'tmask'].apply(lambda x: x[len(x) // 4 : 3 * len(x) // 4])
df2.loc[condition, 'tmask_p3'] = df2.loc[condition, 'tmask'].apply(lambda x: x[3 * len(x) // 4 :])

condition = df2.questionQty.apply(len) % 3 == 0
df2.loc[condition, 'tmask_p1'] = df2.loc[condition, 'tmask'].apply(lambda x: x[: len(x) // 3])
df2.loc[condition, 'tmask_p2'] = df2.loc[condition, 'tmask'].apply(lambda x: x[len(x) // 3 : 2 * len(x) // 3])
df2.loc[condition, 'tmask_p3'] = df2.loc[condition, 'tmask'].apply(lambda x: x[2 * len(x) // 3 :])

condition = df2.questionQty.apply(len) % 5 == 0
df2.loc[condition, 'tmask_p1'] = df2.loc[condition, 'tmask'].apply(lambda x: x[: len(x) // 5 ])
df2.loc[condition, 'tmask_p1'] = df2.loc[condition, 'tmask'].apply(lambda x: x[len(x) // 5 : 2 * len(x) // 5 ])
df2.loc[condition, 'tmask_p2'] = df2.loc[condition, 'tmask'].apply(lambda x: x[2 * len(x) // 5 : 3 * len(x) // 5 ])
df2.loc[condition, 'tmask_p3'] = df2.loc[condition, 'tmask'].apply(lambda x: x[3 * len(x) // 5 : 4 * len(x) // 5 ])
df2.loc[condition, 'tmask_p3'] = df2.loc[condition, 'tmask'].apply(lambda x: x[4 * len(x) // 5  :])

In [27]:
df2['tmask_p1_hit_rate'] = df2.tmask_p1.apply(tmask_1_rate)
df2['tmask_p2_hit_rate'] = df2.tmask_p2.apply(tmask_1_rate)
df2['tmask_p3_hit_rate'] = df2.tmask_p3.apply(tmask_1_rate)

In [28]:
df2['tmask_len'] = df2.tmask.apply(len)

### Ключевой момент: джойн сам с собой + условие на дату, чтобы не заглядывать в будущее

In [29]:
%%time
dff = pd.merge(df2[['date_start', 'player_id', 'game_id']], df2, on='player_id')
dff = dff[dff.date_start_y < dff.date_start_x]

CPU times: user 12.1 s, sys: 4.67 s, total: 16.8 s
Wall time: 17.3 s


In [30]:
dff.head(5)

Unnamed: 0,date_start_x,player_id,game_id_x,game_id_y,date_start_y,team_id,tmask,team_position,type,questionQty,tmask_sum,tmask_len,tmask_max_len,tmask_list,questions_hit_rate,questions_N_rate,relative_team_position,tmask_p1,tmask_p2,tmask_p3,tmask_p1_hit_rate,tmask_p2_hit_rate,tmask_p3_hit_rate
8,2019-01-05,6212,4772,5083,2019-01-04,68894,001111011011100010101110101110001001,2.0,1,"{'1': 12, '2': 12, '3': 12}",20,36,36,"[(0, 0), (1, 0), (2, 1), (3, 1), (4, 1), (5, 1...",0.555556,0.0,0.026316,1111011011,100010101110,101110001001,0.666667,0.5,0.5
82,2019-01-25,6212,4973,4772,2019-01-05,45556,111111111011111110111111111100010010,1.0,1,"{'1': 12, '2': 12, '3': 12}",28,36,36,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.777778,0.0,0.004329,111111111011,111110111111,111100010010,0.916667,0.916667,0.5
88,2019-01-25,6212,4973,5055,2019-01-11,59865,110111111111111111010100111010110100101,5.0,1,"{'1': 13, '2': 13, '3': 13}",28,39,39,"[(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1...",0.717949,0.0,0.037037,1101111111111,1111101010011,1010110100101,0.923077,0.692308,0.538462
90,2019-01-25,6212,4973,5083,2019-01-04,68894,001111011011100010101110101110001001,2.0,1,"{'1': 12, '2': 12, '3': 12}",20,36,36,"[(0, 0), (1, 0), (2, 1), (3, 1), (4, 1), (5, 1...",0.555556,0.0,0.026316,1111011011,100010101110,101110001001,0.666667,0.5,0.5
91,2019-01-25,6212,4973,5097,2019-01-11,59865,111111111011111101111111110111111,5.5,1,"{'1': 12, '2': 12, '3': 12}",30,33,33,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.909091,0.0,0.01585,11111111101,11111011111,11110111111,0.909091,0.909091,0.909091


In [31]:
%%time
player_feats = \
    dff.groupby(['player_id', 'date_start_x', 'game_id_x'])\
       .agg(
            {'game_id_y': ['count']
            , 'team_position': ['max', 'min', 'mean']
            , 'relative_team_position': ['max', 'min', 'mean', 'std']
            , 'questions_hit_rate': ['min', 'max', 'mean', 'std']
            , 'questions_N_rate': ['min', 'max', 'mean', 'std']
            , 'tmask_p1_hit_rate': ['min', 'max', 'mean', 'std']
            , 'tmask_p2_hit_rate': ['min', 'max', 'mean', 'std']
            , 'tmask_p3_hit_rate': ['min', 'max', 'mean', 'std']
           })

CPU times: user 4.92 s, sys: 335 ms, total: 5.26 s
Wall time: 5.29 s


In [32]:
player_feats.columns = [i + '_' + j for i,j in player_feats.columns]

In [33]:
player_feats = player_feats.reset_index()

In [34]:
player_feats = player_feats.drop_duplicates(['player_id', 'date_start_x'])
player_feats = player_feats.drop('game_id_x', axis=1)

In [35]:
player_feats.head(11)

Unnamed: 0,player_id,date_start_x,game_id_y_count,team_position_max,team_position_min,team_position_mean,relative_team_position_max,relative_team_position_min,relative_team_position_mean,relative_team_position_std,questions_hit_rate_min,questions_hit_rate_max,questions_hit_rate_mean,questions_hit_rate_std,questions_N_rate_min,questions_N_rate_max,questions_N_rate_mean,questions_N_rate_std,tmask_p1_hit_rate_min,tmask_p1_hit_rate_max,tmask_p1_hit_rate_mean,tmask_p1_hit_rate_std,tmask_p2_hit_rate_min,tmask_p2_hit_rate_max,tmask_p2_hit_rate_mean,tmask_p2_hit_rate_std,tmask_p3_hit_rate_min,tmask_p3_hit_rate_max,tmask_p3_hit_rate_mean,tmask_p3_hit_rate_std
0,15,2019-02-08,2,513.5,38.0,275.75,0.513514,0.5,0.506757,0.009555,0.472222,0.527778,0.5,0.039284,0.0,0.0,0.0,0.0,0.333333,0.416667,0.375,0.058926,0.5,0.666667,0.583333,0.117851,0.5,0.583333,0.541667,0.058926
2,15,2019-04-12,4,513.5,38.0,190.875,0.664384,0.47807,0.538992,0.084861,0.371429,0.611111,0.495635,0.100571,0.0,0.0,0.0,0.0,0.333333,0.75,0.511364,0.18145,0.333333,0.666667,0.479167,0.142319,0.25,0.666667,0.5,0.180021
3,15,2019-05-17,5,513.5,38.0,170.4,0.903061,0.47807,0.611806,0.178635,0.305556,0.611111,0.457619,0.121704,0.0,0.0,0.0,0.0,0.333333,0.75,0.509091,0.157222,0.166667,0.666667,0.416667,0.186339,0.25,0.666667,0.45,0.191848
5,15,2019-06-28,7,513.5,38.0,199.642857,0.903061,0.47807,0.664194,0.175259,0.222222,0.611111,0.402268,0.139246,0.0,0.0,0.0,0.0,0.25,0.75,0.458874,0.161706,0.166667,0.666667,0.380952,0.1791,0.083333,0.666667,0.369048,0.214396
6,15,2019-07-26,8,513.5,38.0,184.9375,0.903061,0.47807,0.676965,0.16623,0.222222,0.611111,0.379762,0.143776,0.0,0.0,0.0,0.0,0.25,0.75,0.443182,0.156152,0.083333,0.666667,0.34375,0.196383,0.083333,0.666667,0.354167,0.202905
7,15,2019-08-30,9,513.5,38.0,186.888889,0.903061,0.47807,0.700865,0.171229,0.166667,0.611111,0.356085,0.152095,0.0,0.0,0.0,0.0,0.083333,0.75,0.403199,0.189006,0.083333,0.666667,0.333333,0.186339,0.083333,0.666667,0.333333,0.199826
8,15,2019-09-06,10,513.5,38.0,188.05,0.903061,0.47807,0.714182,0.166838,0.166667,0.611111,0.35381,0.143577,0.0,0.0,0.0,0.0,0.083333,0.75,0.396212,0.179561,0.083333,0.666667,0.333333,0.175682,0.083333,0.666667,0.333333,0.188398
10,15,2019-09-27,12,513.5,38.0,218.333333,0.933892,0.47807,0.750149,0.172722,0.155556,0.611111,0.321062,0.15072,0.0,0.832714,0.069393,0.240384,0.083333,0.75,0.355044,0.188763,0.083333,0.666667,0.315152,0.17724,0.083333,0.666667,0.321212,0.183196
11,15,2019-10-04,13,513.5,33.5,204.115385,0.933892,0.424051,0.725064,0.188485,0.155556,0.611111,0.33269,0.15027,0.0,0.832714,0.064055,0.230953,0.083333,0.75,0.359785,0.181533,0.083333,0.666667,0.323611,0.171514,0.083333,0.666667,0.343056,0.190356
12,15,2019-10-17,14,513.5,33.5,199.928571,0.933892,0.424051,0.733003,0.183511,0.155556,0.611111,0.337497,0.145491,0.0,0.832714,0.05948,0.222552,0.083333,0.75,0.362657,0.174742,0.083333,0.666667,0.321795,0.164343,0.083333,0.666667,0.362821,0.195689


In [36]:
player_feats = player_feats.rename(columns={'date_start_x':'date_start'})

### Создадим матрицу игрок-вопрос и заджойним к ней фичи игрока

In [37]:
%%time
usecols = ['game_id', 'date_start', 'team_id', 'player_id', 'tmask', 'tmask_list', 'team_position', 'type']
X = df2[usecols].explode('tmask_list')

# т.к. tmask_list это tuple (номер вопроса, результат) то расщепим это на question_id и y (наш таргет)
X['question_id'] = X.tmask_list.apply(lambda x: x[0])
X['y'] = X.tmask_list.apply(lambda x: x[1])

X = X.drop(['tmask_list', 'tmask'], axis=1)

CPU times: user 26.9 s, sys: 5.2 s, total: 32 s
Wall time: 32.3 s


In [38]:
# удалить из обучения вопросы где не было ответа (No data)
X = X[X.y != 'N']
X.y = X.y.astype(int)

In [39]:
X.head(15)

Unnamed: 0,game_id,date_start,team_id,player_id,team_position,type,question_id,y
0,4772,2019-01-05,45556,6212,1.0,1,0,1
0,4772,2019-01-05,45556,6212,1.0,1,1,1
0,4772,2019-01-05,45556,6212,1.0,1,2,1
0,4772,2019-01-05,45556,6212,1.0,1,3,1
0,4772,2019-01-05,45556,6212,1.0,1,4,1
0,4772,2019-01-05,45556,6212,1.0,1,5,1
0,4772,2019-01-05,45556,6212,1.0,1,6,1
0,4772,2019-01-05,45556,6212,1.0,1,7,1
0,4772,2019-01-05,45556,6212,1.0,1,8,1
0,4772,2019-01-05,45556,6212,1.0,1,9,0


In [40]:
X.shape

(20911025, 8)

In [41]:
%%time
X = pd.merge(X, player_feats, on=['player_id', 'date_start'], how='left')

CPU times: user 6.77 s, sys: 3.11 s, total: 9.87 s
Wall time: 10.1 s


In [42]:
import gc; gc.collect()

0

In [43]:
X.shape

(20911025, 36)

In [44]:
X.rename(columns={'player_id': 'ID'}, inplace=True)  
# X.columns = [i if i != 'player_id' else 'ID' for i in X.columns]

In [45]:
X.head()

Unnamed: 0,game_id,date_start,team_id,ID,team_position,type,question_id,y,game_id_y_count,team_position_max,team_position_min,team_position_mean,relative_team_position_max,relative_team_position_min,relative_team_position_mean,relative_team_position_std,questions_hit_rate_min,questions_hit_rate_max,questions_hit_rate_mean,questions_hit_rate_std,questions_N_rate_min,questions_N_rate_max,questions_N_rate_mean,questions_N_rate_std,tmask_p1_hit_rate_min,tmask_p1_hit_rate_max,tmask_p1_hit_rate_mean,tmask_p1_hit_rate_std,tmask_p2_hit_rate_min,tmask_p2_hit_rate_max,tmask_p2_hit_rate_mean,tmask_p2_hit_rate_std,tmask_p3_hit_rate_min,tmask_p3_hit_rate_max,tmask_p3_hit_rate_mean,tmask_p3_hit_rate_std
0,4772,2019-01-05,45556,6212,1.0,1,0,1,1.0,2.0,2.0,2.0,0.026316,0.026316,0.026316,,0.555556,0.555556,0.555556,,0.0,0.0,0.0,,0.666667,0.666667,0.666667,,0.5,0.5,0.5,,0.5,0.5,0.5,
1,4772,2019-01-05,45556,6212,1.0,1,1,1,1.0,2.0,2.0,2.0,0.026316,0.026316,0.026316,,0.555556,0.555556,0.555556,,0.0,0.0,0.0,,0.666667,0.666667,0.666667,,0.5,0.5,0.5,,0.5,0.5,0.5,
2,4772,2019-01-05,45556,6212,1.0,1,2,1,1.0,2.0,2.0,2.0,0.026316,0.026316,0.026316,,0.555556,0.555556,0.555556,,0.0,0.0,0.0,,0.666667,0.666667,0.666667,,0.5,0.5,0.5,,0.5,0.5,0.5,
3,4772,2019-01-05,45556,6212,1.0,1,3,1,1.0,2.0,2.0,2.0,0.026316,0.026316,0.026316,,0.555556,0.555556,0.555556,,0.0,0.0,0.0,,0.666667,0.666667,0.666667,,0.5,0.5,0.5,,0.5,0.5,0.5,
4,4772,2019-01-05,45556,6212,1.0,1,4,1,1.0,2.0,2.0,2.0,0.026316,0.026316,0.026316,,0.555556,0.555556,0.555556,,0.0,0.0,0.0,,0.666667,0.666667,0.666667,,0.5,0.5,0.5,,0.5,0.5,0.5,


In [46]:
drops = ['ID', 'y', 'game_id', 'date_start', 'team_id' ,'player_id', 'team_position', 'question_id']
use_cols = [c for c in X.columns if c not in drops]
all_features = list(X.head()[use_cols].columns)

In [47]:
X.sample(15)

Unnamed: 0,game_id,date_start,team_id,ID,team_position,type,question_id,y,game_id_y_count,team_position_max,team_position_min,team_position_mean,relative_team_position_max,relative_team_position_min,relative_team_position_mean,relative_team_position_std,questions_hit_rate_min,questions_hit_rate_max,questions_hit_rate_mean,questions_hit_rate_std,questions_N_rate_min,questions_N_rate_max,questions_N_rate_mean,questions_N_rate_std,tmask_p1_hit_rate_min,tmask_p1_hit_rate_max,tmask_p1_hit_rate_mean,tmask_p1_hit_rate_std,tmask_p2_hit_rate_min,tmask_p2_hit_rate_max,tmask_p2_hit_rate_mean,tmask_p2_hit_rate_std,tmask_p3_hit_rate_min,tmask_p3_hit_rate_max,tmask_p3_hit_rate_mean,tmask_p3_hit_rate_std
760760,4986,2019-02-15,69492,191326,905.5,1,25,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10299314,5742,2019-06-28,55867,85216,1.0,1,20,1,71.0,455.0,1.0,28.190141,0.781787,0.005618,0.142554,0.150609,0.277778,0.909091,0.632818,0.123203,0.0,0.846154,0.021127,0.126027,0.25,0.916667,0.653611,0.157038,0.166667,1.0,0.617785,0.164061,0.083333,1.0,0.620011,0.162977
3130624,5217,2019-02-01,49888,6523,65.5,1,17,1,6.0,203.0,5.5,86.666667,0.637931,0.02381,0.392617,0.289822,0.333333,0.694444,0.459877,0.148341,0.0,0.307692,0.051282,0.125615,0.333333,0.75,0.498932,0.148376,0.166667,0.75,0.435897,0.212024,0.333333,0.583333,0.430556,0.110763
6020518,5511,2019-03-15,66744,185531,52.0,1,19,0,12.0,787.5,9.0,207.625,0.994208,0.428571,0.711248,0.171654,0.0,0.527778,0.294393,0.139884,0.0,0.0,0.0,0.0,0.0,0.727273,0.331731,0.180114,0.0,0.666667,0.267857,0.185716,0.0,0.666667,0.300595,0.202929
10342780,5744,2019-08-23,38598,18200,179.5,1,34,0,9.0,631.0,10.5,141.277778,0.676471,0.139319,0.461807,0.182695,0.333333,0.611111,0.451852,0.109326,0.0,0.0,0.0,0.0,0.25,0.666667,0.475926,0.139471,0.166667,0.75,0.414815,0.176864,0.266667,0.583333,0.464815,0.129219
6956529,5554,2019-02-01,40400,92090,108.0,3,70,1,1.0,191.5,191.5,191.5,0.186465,0.186465,0.186465,,0.638889,0.638889,0.638889,,0.0,0.0,0.0,,0.75,0.75,0.75,,0.5,0.5,0.5,,0.666667,0.666667,0.666667,
7503512,5590,2019-03-21,54157,129068,4.0,3,4,1,1.0,9.5,9.5,9.5,0.678571,0.678571,0.678571,,0.472222,0.472222,0.472222,,0.0,0.0,0.0,,0.583333,0.583333,0.583333,,0.5,0.5,0.5,,0.333333,0.333333,0.333333,
143915,4973,2019-01-25,42722,67404,513.5,1,23,1,3.0,237.5,101.0,178.166667,0.684438,0.322368,0.481345,0.185022,0.5,0.638298,0.55115,0.075852,0.0,0.0,0.0,0.0,0.727273,0.818182,0.765152,0.047311,0.416667,0.583333,0.484848,0.087368,0.333333,0.583333,0.426768,0.136434
19695702,6255,2019-09-20,3173,102635,154.0,3,24,1,4.0,373.5,20.0,179.375,0.76,0.253165,0.425861,0.233375,0.322222,0.583333,0.472619,0.112781,0.0,0.0,0.0,0.0,0.3,0.833333,0.546591,0.23662,0.233333,0.5,0.35,0.129815,0.433333,0.666667,0.525,0.099536
2814772,5161,2019-03-29,3133,24543,11.5,1,14,0,9.0,117.0,3.0,24.611111,0.308824,0.010252,0.093831,0.103111,0.416667,0.861111,0.714152,0.151366,0.0,0.846154,0.166667,0.334195,0.4,1.0,0.768613,0.205147,0.333333,0.866667,0.630506,0.20832,0.416667,0.833333,0.636054,0.180714


In [48]:
X.game_id_y_count = X.game_id_y_count.fillna(0)

Убираем из обучения игроков, которые ранее не сыграли ни одной игры.

In [49]:
X = X[X.game_id_y_count != 0]

In [50]:
X.sample(15)

Unnamed: 0,game_id,date_start,team_id,ID,team_position,type,question_id,y,game_id_y_count,team_position_max,team_position_min,team_position_mean,relative_team_position_max,relative_team_position_min,relative_team_position_mean,relative_team_position_std,questions_hit_rate_min,questions_hit_rate_max,questions_hit_rate_mean,questions_hit_rate_std,questions_N_rate_min,questions_N_rate_max,questions_N_rate_mean,questions_N_rate_std,tmask_p1_hit_rate_min,tmask_p1_hit_rate_max,tmask_p1_hit_rate_mean,tmask_p1_hit_rate_std,tmask_p2_hit_rate_min,tmask_p2_hit_rate_max,tmask_p2_hit_rate_mean,tmask_p2_hit_rate_std,tmask_p3_hit_rate_min,tmask_p3_hit_rate_max,tmask_p3_hit_rate_mean,tmask_p3_hit_rate_std
17383272,6048,2019-11-21,57218,163352,144.5,1,0,1,29.0,666.0,59.5,237.224138,0.95977,0.1334,0.592536,0.218258,0.111111,0.666667,0.397659,0.144124,0.0,0.846154,0.094492,0.258535,0.0,0.666667,0.401732,0.195943,0.083333,0.833333,0.386951,0.209797,0.166667,0.666667,0.405778,0.131003
5154708,5451,2019-03-16,50821,36385,59.5,3,22,0,23.0,291.5,6.0,98.043478,0.822581,0.081996,0.330208,0.215587,0.244444,0.742857,0.526274,0.154247,0.0,0.692308,0.07979,0.212788,0.25,0.916667,0.563168,0.219711,0.0,0.833333,0.500816,0.231918,0.2,0.75,0.528013,0.155651
5560028,5483,2019-03-23,42158,36813,71.5,1,13,0,3.0,264.0,84.0,175.0,0.808219,0.477273,0.652416,0.166319,0.177778,0.371429,0.247884,0.107317,0.0,0.0,0.0,0.0,0.166667,0.272727,0.213131,0.054236,0.133333,0.416667,0.294444,0.145615,0.083333,0.416667,0.233333,0.169148
6809635,5553,2019-01-26,65915,157615,252.5,3,11,0,6.0,456.0,20.0,185.25,0.620178,0.27027,0.408674,0.133246,0.297297,0.638889,0.538005,0.130752,0.0,0.0,0.0,0.0,0.166667,0.818182,0.594988,0.227845,0.416667,0.636364,0.536082,0.098713,0.307692,0.583333,0.48174,0.102388
13595334,5819,2019-10-11,6767,73885,288.5,1,8,1,22.0,634.5,3.0,178.25,0.900316,0.139727,0.584738,0.201442,0.145833,0.666667,0.398312,0.151794,0.0,0.330233,0.015011,0.070406,0.083333,0.733333,0.463775,0.198425,0.166667,0.75,0.382828,0.18534,0.133333,1.0,0.365152,0.216508
16238284,5928,2019-11-21,42167,78384,13.5,1,22,0,55.0,1270.5,2.0,94.463636,0.916667,0.008916,0.250638,0.188586,0.333333,0.805556,0.569042,0.131812,0.0,0.846154,0.044158,0.185811,0.083333,1.0,0.607451,0.206986,0.133333,0.916667,0.560302,0.180171,0.2,0.833333,0.536538,0.164165
5517671,5480,2019-03-15,39018,91977,93.5,1,16,0,12.0,630.0,18.5,220.333333,0.804348,0.183673,0.523816,0.22429,0.243243,0.666667,0.439652,0.152063,0.0,0.807692,0.120879,0.284492,0.291667,0.75,0.481772,0.163058,0.142857,0.666667,0.437229,0.182814,0.153846,0.75,0.456456,0.202961
6307341,5526,2019-03-09,69849,48319,1.0,3,42,0,4.0,61.5,3.0,25.0,0.078947,0.007166,0.043559,0.032031,0.616667,0.857143,0.757341,0.10097,0.0,0.0,0.0,0.0,0.8,1.0,0.866667,0.090267,0.533333,0.75,0.695833,0.108333,0.6,0.833333,0.733333,0.097183
5681126,5491,2019-03-29,66129,157867,156.0,1,19,1,4.0,687.0,70.5,303.625,0.668939,0.48834,0.599905,0.085376,0.222222,0.472222,0.340278,0.111976,0.0,0.0,0.0,0.0,0.083333,0.5,0.291667,0.173472,0.25,0.833333,0.458333,0.259094,0.083333,0.583333,0.270833,0.239357
16928011,5980,2019-10-19,64457,170771,7.5,3,8,0,5.0,16.5,2.0,7.9,0.423077,0.023342,0.165641,0.164233,0.52,0.777778,0.662889,0.095567,0.0,0.0,0.0,0.0,0.466667,0.866667,0.71,0.15438,0.466667,0.833333,0.65,0.150923,0.333333,0.916667,0.583333,0.258736


для фич std заполним пропуски нулем

In [51]:
std_cols = [i for i in X.columns if i.endswith('std')]

for col in std_cols:
    X[col] = X[col].fillna(0)

для остальных колонок заполним пропуски средним значением

In [52]:
for col in all_features:
    mean = X[col].mean()
    X[col] = X[col].fillna(mean)

In [53]:
%%time
X.to_hdf('./data/X_logreg.h5', key='qwerty')

CPU times: user 2.72 s, sys: 5.64 s, total: 8.36 s
Wall time: 15.2 s


In [54]:
%%time
X = pd.read_hdf('./data/X_logreg.h5')

CPU times: user 10.2 s, sys: 6.29 s, total: 16.4 s
Wall time: 2min 45s


In [55]:
drops = ['ID', 'y', 'game_id', 'date_start', 'team_id' ,'player_id', 'team_position', 'question_id']
use_cols = [c for c in X.columns if c not in drops]
all_features = list(X.head()[use_cols].columns)

In [56]:
from sklearn.linear_model import LogisticRegression

In [57]:
%%time
lr = LogisticRegression(max_iter=10000, n_jobs=1)
lr.fit(X[all_features], X.y)

CPU times: user 13min 11s, sys: 51.1 s, total: 14min 2s
Wall time: 14min 23s


In [58]:
list(zip(lr.coef_.tolist()[0], all_features))

[(0.03604726713535464, 'type'),
 (0.001630718087039463, 'game_id_y_count'),
 (2.7510195857435734e-05, 'team_position_max'),
 (0.00021724462756133857, 'team_position_min'),
 (-0.00040347685207986733, 'team_position_mean'),
 (-0.18271284262628643, 'relative_team_position_max'),
 (-0.38280507725048457, 'relative_team_position_min'),
 (-0.5542492004639924, 'relative_team_position_mean'),
 (0.0218652800752021, 'relative_team_position_std'),
 (0.029997184923498052, 'questions_hit_rate_min'),
 (0.11387193608901616, 'questions_hit_rate_max'),
 (0.1627114229791831, 'questions_hit_rate_mean'),
 (0.012150117649977947, 'questions_hit_rate_std'),
 (-0.010215136234209034, 'questions_N_rate_min'),
 (-0.0198180296284516, 'questions_N_rate_max'),
 (-0.02601840096865586, 'questions_N_rate_mean'),
 (-0.021378376017272892, 'questions_N_rate_std'),
 (0.04063644275039678, 'tmask_p1_hit_rate_min'),
 (0.13698332667951518, 'tmask_p1_hit_rate_max'),
 (0.1730194942641991, 'tmask_p1_hit_rate_mean'),
 (0.009311301

### II часть, в которой я понял, что на тесте будут ровно те же игроки, что и на трейне, а значит можно переобучаться сколько душе угодно

обучили логистическую регрессию, теперь делаем predict_proba сам на себя (на игроков из трейна) и строим рейтинг игрока как мат.ожидание набранных баллов на "среднем" турнире со "средними" по сложности вопросами

формируем "средний" турнир со "средними" вопросами. 

На самом деле нужен даже не целый турнир, а просто один "средний" вопрос. 

Мы будем предсказывать вероятность дать правильный ответ - это и будет рейтинг игрока.

In [68]:
# фичи берем по последней игре данного игрока т.к. там информация о всех прошлых играх
player_feats['max_date_start'] = player_feats.player_id.map(player_feats.groupby('player_id').date_start.max())
XX = player_feats[player_feats.date_start == player_feats.max_date_start]

In [69]:
# вместо фич по вопросам подставляем "средние" значения

In [70]:
XX['type'] = 2   # тип соревнования 

In [71]:
# аналогично заполняем пропуски

std_cols = [i for i in XX.columns if i.endswith('std')]
for col in std_cols:
    XX[col] = XX[col].fillna(0)
    
for col in all_features:
    mean = XX[col].mean()
    XX[col] = XX[col].fillna(mean)

In [74]:
rating = XX[['player_id']].rename(columns={'player_id': 'ID'})

In [75]:
rating['rating'] = lr.predict_proba(XX[all_features])[:, 1]

In [76]:
rating['player'] = rating.ID.map( {k: v.get('name', '') + ' '+ v.get('surname', '') for k,v in players.items()} )

In [85]:
rating = rating.sort_values('rating', ascending=False).reset_index(drop=True)

In [86]:
rating.head(20)

Unnamed: 0,ID,rating,player
0,30152,0.679776,Артём Сорожкин
1,28751,0.676234,Иван Семушин
2,110920,0.67329,Алексей Дворянчиков
3,27403,0.672669,Максим Руссо
4,30270,0.67067,Сергей Спешков
5,121433,0.669187,София Савенко
6,27822,0.668848,Михаил Савченков
7,7008,0.668762,Алексей Гилёв
8,26640,0.668067,Вадим Раскумандрин
9,4270,0.666419,Александра Брутер


In [89]:
top100_real = ['Артём Сорожкин',
'Михаил Савченков',
'Иван Семушин',
'Сергей Спешков',
'Максим Руссо',
'Александра Брутер',
'Александр Либер',
'Михаил Левандовский',
'Ким Галачян',
'Сергей Николенко',
'Тимур Кафиатуллин',
'Юрий Выменец',
'Антон Чернин',
'Наиль Фарукшин',
'Сергей Коновалов',
'Андрей Островский',
'Игорь Тюнькин',
'Татьяна Фёдорова',
'Екатерина Лобкова',
'Андрей Волыхов',
'Кирилл Чернышёв',
'Дмитрий Ожигов',
'Наталья Кудряшова',
'Дмитрий Петров',
'Руслан Хаиткулов',
'Елизавета Овдеенко',
'Дмитрий Карякин',
'Сергей Терентьев',
'Егор Дружинин',
'Алексей Гилёв',
'Максим Поташев',
'Дмитрий Великов',
'Вадим Яковлев',
'Евгений Коватенков',
'Николай Некрылов',
'Инна Семёнова',
'Александр Салита',
'Станислав Мереминский',
'Серафим Шибанов',
'Михаил Малкин',
'Алексей Дворянчиков',
'Ольга Сарницкая',
'Владимир Степанов',
'Александр Карчевский',
'Екатерина Новикова',
'Ринат Сибаев',
'Сергей Иванченко',
'Наталия Рыжанова',
'Александр Мартынов',
'Михаил Новосёлов',
'Александр Марков',
'Иван Ефремов',
'Рузель Халиуллин',
'Алексей Шередега',
'Эльдар Эльман',
'Никита Поверинов',
'Ася Самойлова',
'Мария Подрядчикова',
'Александр Печеный',
'Юлия Архангельская',
'Глеб Николаев',
'Александр Фингеров',
'Владислав Декалов',
'Павел Ершов',
'Ирина Прокофьева',
'Андрей Белов',
'Анвар Мухаметкалиев',
'Иделия Айзятулова',
'Дмитрий Тарарыков',
'Наталья Комар',
'Вячеслав Колосов',
'Юлия Дидбаридзе',
'Владислав Харитонов',
'Игорь Мокин',
'Евгений Перпер',
'Леонид Михлин',
'Николай Крапиль',
'Тимур Боков',
'Владимир Сушков',
'Николай Порцель',
'Ирина Проскурина',
'Карина Файзуллина',
'Анастасия Шестакова',
'Дмитрий Литвинов',
'Игорь Биткин',
'Андрей Цепаев',
'Екатерина Шевцова',
'Станислав Мальчёнков',
'Егор Кузьменко',
'Вадим Раскумандрин',
'Эльмира Гулуева',
'Валерия Кудрявцева',
'Сергей Евсеев',
'Алексей Чебыкин',
'Антон Бочкарёв',
'Денис Галиакберов',
'Мария Кленницкая',
'Анастасия Рубашкина',
'Алексей Шуб',
'Антон Пинчук', ]

top100_real_enumerated = {j:i for i,j in enumerate(top100_real, start=1)}

In [91]:
top100 = rating.head(100)

In [92]:
top100['is_in_real_top100'] = top100.player.isin(top100_real).astype(int)
top100['real_rating'] = top100.player.map(top100_real_enumerated)

In [93]:
top100.head(20)

Unnamed: 0,ID,rating,player,is_in_real_top100,real_rating
0,30152,0.679776,Артём Сорожкин,1,1.0
1,28751,0.676234,Иван Семушин,1,3.0
2,110920,0.67329,Алексей Дворянчиков,1,41.0
3,27403,0.672669,Максим Руссо,1,5.0
4,30270,0.67067,Сергей Спешков,1,4.0
5,121433,0.669187,София Савенко,0,
6,27822,0.668848,Михаил Савченков,1,2.0
7,7008,0.668762,Алексей Гилёв,1,30.0
8,26640,0.668067,Вадим Раскумандрин,1,90.0
9,4270,0.666419,Александра Брутер,1,6.0


In [95]:
top100.is_in_real_top100.sum()

34

### 34 игрока из топ-100 официального рейтинга ЧГК попали в топ-100 по модели.

----

### III часть, где предсказываются результаты турниров 2020 года с известными составами, но неизвестными вопросами

Качество рейтинг-системы оценивается качеством предсказаний результатов турниров. 

Поэтому предложите способ предсказать результаты нового турнира с известными составами, но неизвестными вопросами, в виде ранжирования команд;

Мат.ожадание набранных командой баллов это просто сумма вероятностей ответить на вопросы. Так как заранее мы не знаем сложность вопросов, можно просто оценить вероятность ответить на "средний" вопрос. Рейтинг команды считаем как вероятность, что команда ответит на этот "средний" вопрос. Она равняется тому, что хотя бы один член команды ответит правильно. (сделаем упрощающее предположение, что если хотя бы один член команды дает верный ответ, то и вся команда дает верный ответ)

$$P_{team}(y=1)= 1 - \prod	P_i(y=0)$$

где P_i(0) - индивидуальная вероятность, что член команды не дал правильного ответа на вопрос

----

### IV часть, где вводится ЕМ-алгоритм для учета влияния скрытой переменной "сила команды"


Теперь главное: ЧГК — это всё-таки командная игра. Поэтому:

предложите способ учитывать то, что на вопрос отвечают сразу несколько игроков; скорее всего, понадобятся скрытые переменные; не стесняйтесь делать упрощающие предположения, но теперь переменные “игрок X ответил на вопрос Y” при условии данных должны стать зависимыми для игроков одной и той же команды;


- Е-шаг: по вероятностям (рейтингам) игроков считаем рейтинг команды - вероятность команды верно ответить на вопрос = 1 - П P(y = 0)
- М-шаг: с помощью вероятности команды пересчитываем вероятность игрока ответить на вопрос (то есть теперь уже при условии команды)

----

### V часть, с рейтинг-листом турниров по сложности вопросов.