# Что, где, когда?

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix
from scipy.stats import kendalltau
from scipy.stats import spearmanr

## 1. Обработка данных

In [2]:
players = pd.read_pickle('/content/drive/MyDrive/adv-ml/players.pkl')
tournaments = pd.read_pickle('/content/drive/MyDrive/adv-ml/tournaments.pkl')
results = pd.read_pickle('/content/drive/MyDrive/adv-ml/results.pkl')

In [3]:
# Превью загруженных данных
print('Игроки')
print(players[1], players[22799], sep='\n')
print('Турниры')
print(tournaments[1], tournaments[6483], sep='\n')
print('Результаты')
print(results[1], results[3232], sep='\n')

Игроки
{'id': 1, 'name': 'Алексей', 'patronymic': None, 'surname': 'Абабилов'}
{'id': 22799, 'name': 'Сергей', 'patronymic': 'Игоревич', 'surname': 'Николенко'}
Турниры
{'id': 1, 'name': 'Чемпионат Южного Кавказа', 'dateStart': '2003-07-25T00:00:00+04:00', 'dateEnd': '2003-07-27T00:00:00+04:00', 'type': {'id': 2, 'name': 'Обычный'}, 'season': '/seasons/1', 'orgcommittee': [], 'synchData': None, 'questionQty': None}
{'id': 6483, 'name': 'Онлайн: 19:00 (а)Синхрон-lite. Лига старта. Эпизод XV (NEW!)', 'dateStart': '2020-05-08T19:00:00+03:00', 'dateEnd': '2020-05-08T21:30:00+03:00', 'type': {'id': 2, 'name': 'Обычный'}, 'season': '/seasons/53', 'orgcommittee': [{'id': 7533, 'name': 'Денис', 'patronymic': 'Андреевич', 'surname': 'Гончар'}], 'synchData': None, 'questionQty': {'1': 12, '2': 12, '3': 12}}
Результаты
[{'team': {'id': 242, 'name': 'Команда Азимова', 'town': {'id': 21, 'name': 'Баку'}}, 'mask': None, 'current': {'name': 'Команда Азимова', 'town': {'id': 21, 'name': 'Баку'}}, 'que

In [4]:
players_df = pd.DataFrame.from_dict(players, orient='index')
tournaments_df = pd.DataFrame.from_dict(tournaments, orient='index')
results_df = pd.DataFrame.from_dict(results, orient='index')

In [5]:
tournaments_df['dateStart'] = pd.to_datetime(tournaments_df['dateStart'], utc=True)
tournaments_df['dateEnd'] = pd.to_datetime(tournaments_df['dateEnd'], utc=True)

In [6]:
players_df.head(3)

Unnamed: 0,id,name,patronymic,surname
1,1,Алексей,,Абабилов
10,10,Игорь,,Абалов
11,11,Наталья,Юрьевна,Абалымова


In [7]:
tournaments_df.tail(3)

Unnamed: 0,id,name,dateStart,dateEnd,type,season,orgcommittee,synchData,questionQty
6483,6483,Онлайн: 19:00 (а)Синхрон-lite. Лига старта. Эп...,2020-05-08 16:00:00+00:00,2020-05-08 18:30:00+00:00,"{'id': 2, 'name': 'Обычный'}",/seasons/53,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",,"{'1': 12, '2': 12, '3': 12}"
6484,6484,"Онлайн: 22:00 Не числом, а умением - 2 (NEW!)",2020-05-04 19:00:00+00:00,2020-05-04 20:40:00+00:00,"{'id': 2, 'name': 'Обычный'}",/seasons/53,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",,"{'1': 12, '2': 12}"
6485,6485,"Онлайн: 19:00 Не числом, а умением",2020-05-06 16:00:00+00:00,2020-05-06 17:45:00+00:00,"{'id': 2, 'name': 'Обычный'}",/seasons/53,"[{'id': 7533, 'name': 'Денис', 'patronymic': '...",,"{'1': 12, '2': 12}"


### Отбор

In [8]:
train_tournaments = tournaments_df[(tournaments_df['dateStart'] >= '2019-01-01') & (tournaments_df['dateStart'] <= '2019-12-31')]
test_tournaments = tournaments_df[(tournaments_df['dateStart'] >= '2020-01-01') & (tournaments_df['dateStart'] <= '2020-12-31')]

In [9]:
def clean_tournaments(tournaments: pd.DataFrame, results: pd.DataFrame) -> pd.DataFrame:
  cleaned_tournaments = tournaments.copy()
  for id in cleaned_tournaments['id']:
    if not results.loc[id, 0] or ('mask' in results.loc[id, 0].keys() and not results.loc[id, 0]['mask']):
      cleaned_tournaments.drop([id], inplace=True)
  
  return cleaned_tournaments

In [10]:
train_tournaments_cleaned = clean_tournaments(train_tournaments, results_df)
test_tournaments_cleaned = clean_tournaments(test_tournaments, results_df)

In [11]:
print('Shape train до чистки:', train_tournaments.shape, 'после', train_tournaments_cleaned.shape)
print('Shape test до чистки:', test_tournaments.shape, 'после', test_tournaments_cleaned.shape)

Shape train до чистки: (687, 9) после (675, 9)
Shape test до чистки: (415, 9) после (191, 9)


### Вспомогательные функции

In [12]:
def get_tournaments_results(tournaments: pd.DataFrame, results: dict) -> dict:
  tournaments_results = {}

  for id in tournaments['id']:
    tournament_result = pd.DataFrame(columns=['team_id', 'team_name', 'team_members', 'mask', 'position'])
    teams_results = results[id]
    tournament_result['team_id'] = [team_result['team']['id'] for team_result in teams_results]
    tournament_result['team_name'] = [team_result['team']['name'] for team_result in teams_results]
    tournament_result['team_members'] = [[player['player']['id'] for player in team_result['teamMembers']] 
                                         for team_result in teams_results] # if team_result['teamMembers']
    tournament_result['mask'] = [team_result['mask'] if 'mask' in team_result.keys() else '' for team_result in teams_results]
    tournament_result['position'] = [team_result['position'] if 'position' in team_result.keys() else 100000 for team_result in teams_results]

    tournaments_results[id] = tournament_result
  
  return tournaments_results

In [13]:
def get_players_info(tournaments_results: dict) -> tuple:
  tournaments_of_player = {}
  all_players = set()

  for id in tournaments_results.keys():
    tournament_result = tournaments_results[id]
    players = [player for players in tournament_result['team_members'] for player in players]
    all_players.update(players)
    for player in players:
      if player not in tournaments_of_player:
        tournaments_of_player[player] = [id]
      else:
        tournaments_of_player[player].append(id)
  
  return tournaments_of_player, all_players

## 2. Baseline-модель

* Baseline-модель на основе логистической регрессии.
* Повопросные результаты команды относятся к каждому из её игроков.
* Учитываем сложность вопросов (по уровню сложности турнира).

После обучения модели получим:
* веса навыков $s_i$,
* веса сложности вопросов $c_j$,
* общее смещение $b$. 

Вероятность правильного ответа на вопрос будет равна: $$ P(s_i, c_j) = \sigma(s_i + c_j + b)$$

In [14]:
train_tournaments_results = get_tournaments_results(train_tournaments_cleaned, results)
train_tournaments_of_player, train_all_players = get_players_info(train_tournaments_results)
sorted_players = sorted(train_tournaments_of_player.keys(), key=lambda id: len(train_tournaments_of_player[id]), reverse=True)
players_indexes = {id: idx for idx, id in enumerate(sorted_players)}

In [134]:
# Готовим матрицу
def create_matrix(tournaments_results: dict,
                  sorted_players: list,
                  tournaments_of_player: dict,
                  players_indexes: dict):

  tournaments = []
  for player in sorted_players:
    for tournament in tournaments_of_player[player]:
      if tournament not in tournaments:
        tournaments.append(tournament)

  combinations = 0
  rows, columns, data = [], [], []
  target = []
  t_cnt, c_cnt = len(sorted_players), 0
  cur_mask_len = 0
  reverse = {}
  
  for id in tournaments:
    tournament_result = tournaments_results[id]
    
    for _, team in tournament_result.iterrows():
      players = team['team_members']
      mask = team['mask']
      if not players or not mask:
        continue
      cur_mask_len = len(mask)
      # cur_mask_len = sum(train_tournaments.loc[id, 'questionQty'].values())
      # combinations += len(players) * cur_mask_len

      for player in players:
        for q_num in range(cur_mask_len):
          columns.append(players_indexes[player])
          rows.append(c_cnt)
          data.append(1)
          columns.append(t_cnt + q_num)
          rows.append(c_cnt)
          data.append(1)

          # if len(mask) < cur_mask_len:
          #   mask += '0' * (cur_mask_len - len(mask))

          t = mask[q_num]
          t = int(t if t in '01' else 0)
          target.append(t)
          reverse[(players_indexes[player], t_cnt + q_num)] = c_cnt
          c_cnt += 1
    t_cnt += cur_mask_len
  
  matrix = csr_matrix((data, (rows, columns)))

  return matrix, target, combinations, reverse

In [135]:
matrix, target, combinations, reverse = create_matrix(train_tournaments_results, sorted_players,
                                                      train_tournaments_of_player, players_indexes)

In [136]:
model = LogisticRegression(solver='saga')
model.fit(matrix, target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
with open('/content/drive/MyDrive/adv-ml/model.pkl','wb') as f:
    pickle.dump(model, f)

In [None]:
with open('/content/drive/MyDrive/adv-ml/model.pkl', 'rb') as f:
  model = pickle.load(f)

## 3. Качество baseline-модели

Для оценки качества воспльзуемся ранговыми корреляциями Спирмена и Кендалла.

Отсортируем команды по вероятности правильного ответа команды на вопрос.

Вероятность правильного ответа команды на вопрос с учетом силы (навыков) каждого игрока и сложности вопроса q:
$$P = 1 - \prod\limits_{i = 1}(1 - \sigma(s_i + c + b))$$
Сложность вопроса для baseline-модели выберем любое число от 0 до 1.





In [137]:
test_tournaments_results = get_tournaments_results(test_tournaments_cleaned, results)
test_tournaments_of_player, test_all_players = get_players_info(test_tournaments_results)

In [138]:
def answer_prob(s, c, b):
  return 1 - (1 / (1 + np.exp(-(s + c + b))))

In [139]:
def get_coefs(tournaments_results: dict, players_indexes: dict, skills: np.ndarray) -> tuple:
  kendal_coefs = []
  spearman_coefs = []

  for tournament_id, tournament_result in tournaments_results.items():
    true_pos = list(tournament_result['position'])
    if np.mean(true_pos) == true_pos[0]:
      true_pos[-1] += 0.00000000001
    preds = []

    for _, team_result in tournament_result.iterrows():
      team_members = list(team_result['team_members'])
    
      command_skill = 1
      for player in team_members:
        if player in players_indexes:
          command_skill *=  answer_prob(skills[players_indexes[player]], 0, model.intercept_[0])
      preds.append(command_skill)
    
    spearman_coef, _ = spearmanr(true_pos, preds)
    kendal_coef, _ = kendalltau(true_pos, preds)
   
    if not (np.isnan(kendal_coef) or np.isnan(spearman_coef)):
      spearman_coefs.append(spearman_coef)
      kendal_coefs.append(kendal_coef)

  return spearman_coefs, kendal_coefs

In [181]:
skills = model.coef_[0][0:len(sorted_players)]
spearman_coefs, kendal_coefs = get_coefs(test_tournaments_results, players_indexes, skills)
print('_________________________')
print('mean spearman coef:', round(np.mean(spearman_coefs), 4))
print('mean kendall coef:', round(np.mean(kendal_coefs), 4))

_________________________
mean spearman coef: 0.7393
mean kendall coef: 0.5946


Также для оценки качества рейтинг-системы можем сравнивать рейтинг игроков предсказанный моделью с рейтингом игроков из результатов ЧГК по результатам 2019 года.

In [141]:
players_rating = pd.read_csv('/content/drive/MyDrive/adv-ml/players-rating.csv')
new_cols = {' ИД': 'id', 'Имя': 'name', 'Отчество': 'patronymic', 'Фамилия': 'surname',
            'ИД базовой команды': 'base_team_id', 'Базовая команда': 'base_team',
            'Место': 'place', 'Рейтинг': 'rating'}
players_rating.rename(columns=new_cols, inplace=True)

In [142]:
players_rating.head(3)

Unnamed: 0,id,name,patronymic,surname,base_team_id,base_team,place,rating
0,30152,Артём,Сергеевич,Сорожкин,,"Москва, Долгопрудный, Санкт-Петербург, Калуга,...",1,14897
1,28751,Иван,Николаевич,Семушин,,"Москва, Долгопрудный, Санкт-Петербург, Киров, ...",2,14789
2,27822,Михаил,Владимирович,Савченков,,"Москва, Могилёв, Серпухов, Минск, Калининград,...",3,14726


In [143]:
def get_coefs_for_players(players_indexes: dict,
                          players_rating: pd.DataFrame,
                          model: LogisticRegression,
                          players_info: dict):
  true_rating = []
  pred_rating = []
  players_ids = []

  model_coef = model.coef_
  if model_coef.shape[0] == 1:
    model_coef = model_coef[0]

  for i, player in players_rating.iterrows():
    if player['id'] in players_indexes:
        pred_rating.append(model_coef[players_indexes[player['id']]])
        true_rating.append(player['rating'])
        players_ids.append(player['id'])

  spearman_coef, _ = spearmanr(true_rating, pred_rating) 
  kendal_coef, _ = kendalltau(true_rating, pred_rating)

  spearman_coef_100, _ = spearmanr(true_rating[:100], pred_rating[:100]) 
  kendal_coef_100, _ = kendalltau(true_rating[:100], pred_rating[:100])

  pred_table = pd.DataFrame(columns=['pred_player_id', 'pred_rating'])
  pred_table['pred_player_id'] = players_ids
  pred_table['pred_rating'] = pred_rating
  pred_table.sort_values(by=['pred_rating'], ascending=False, inplace=True)
  pred_names = [players_info[id]['name'] + ' ' + players_info[id]['surname']
                for id in pred_table['pred_player_id']]
  
  true_table = pd.DataFrame(columns=['player_id', 'name'])
  true_table['player_id'] = players_ids
  names = [players_info[id]['name'] + ' ' + players_info[id]['surname']
           for id in true_table['player_id']]

  comp_table = pd.DataFrame(columns=['pred_id', 'pred_name', 'id', 'name'])
  comp_table['pred_id'] = pred_table['pred_player_id'].tolist()
  comp_table['pred_name'] = pred_names
  comp_table['id'] = true_table['player_id'].tolist()
  comp_table['name'] = names

  return spearman_coef, kendal_coef, spearman_coef_100, kendal_coef_100, comp_table

In [144]:
sp_coef, kend_coef, sp_coef_100, kend_coef_100, comp_table = get_coefs_for_players(players_indexes,
                                                                                   players_rating,
                                                                                   model,
                                                                                   players)
print('Для всей базы игроков:')
print('spearman coef:', sp_coef)
print('kendal coef:', kend_coef)
print('______________________')
print('Для топ-100 игроков:')
print('spearman coef:', sp_coef_100)
print('kendal coef:', kend_coef_100)

Для всей базы игроков:
spearman coef: 0.8241117487052292
kendal coef: 0.632759451707202
______________________
Для топ-100 игроков:
spearman coef: 0.4709680725748712
kendal coef: 0.33299650405553305


Посмотрим сравнительную таблицу

In [145]:
top_n = 40
comp_table.head(top_n)

Unnamed: 0,pred_id,pred_name,id,name
0,27403,Максим Руссо,30152,Артём Сорожкин
1,4270,Александра Брутер,28751,Иван Семушин
2,28751,Иван Семушин,27822,Михаил Савченков
3,30270,Сергей Спешков,30270,Сергей Спешков
4,27822,Михаил Савченков,27403,Максим Руссо
5,30152,Артём Сорожкин,4270,Александра Брутер
6,18036,Михаил Левандовский,18332,Александр Либер
7,20691,Станислав Мереминский,7008,Алексей Гилёв
8,22799,Сергей Николенко,6212,Юрий Выменец
9,26089,Ирина Прокофьева,15456,Сергей Коновалов


In [146]:
intersect_10 = pd.Index(comp_table['id'][:10]).intersection(pd.Index(comp_table['pred_id'][:10])).shape[0]
intersect_20 = pd.Index(comp_table['id'][:20]).intersection(pd.Index(comp_table['pred_id'][:20])).shape[0]
intersect_50 = pd.Index(comp_table['id'][:50]).intersection(pd.Index(comp_table['pred_id'][:50])).shape[0]
intersect_100 = pd.Index(comp_table['id'][:100]).intersection(pd.Index(comp_table['pred_id'][:100])).shape[0]
print('Пересечения в топ-10:', intersect_10)
print('Пересечения в топ-20:', intersect_20)
print('Пересечения в топ-50:', intersect_50)
print('Пересечения в топ-100:', intersect_100)

Пересечения в топ-10: 6
Пересечения в топ-20: 11
Пересечения в топ-50: 19
Пересечения в топ-100: 40


## Модель с EM-алгоритмом

Построим модель со скрытыми переменными:
* $z_{ij}$ — игрок $i$ из команды $n$ ответил на вопрос $j$.

Видимая переменная:
* $x_{nj}$ — команда $n$ ответила на вопрос $j$.

Параметры модели — прежние:
* веса навыков $s_i$,
* веса сложности вопросов $c_j$,
* общее смещение $b$. 

**EM-схема:**
1. **E-шаг.** Зафиксируем начальные параметры модели $s_i$ и $c_j$ (используем параметры baseline-модели) и вычислим ожидание скрытых переменных $z_{ij}$.

 $E[z_{ij}]$ =
 * 0, при $x_{nj} = 0$
 * $p(z_{ij}=1|x_{nj} = 1) = \frac{\sigma(s_i + c_j + b)}{1 - \prod\limits_{i=1}{(1 - \sigma(s_i + c_j + b))}}$, при $x_{nj} = 1$
2. **M-шаг.** Зафиксируем полученные значения и обучим логистическую регрессию со значениями, полученных на предыдущем шаге:

 $E[z_{ij}]$ ~ $\sigma(s_i + c_j + b)$

In [112]:
def inverse_sigma(sigma):
  if sigma > 1 or abs(sigma - 1) < 1 - 1/(1 + np.exp(-6)): 
    return 6
  if abs(sigma) < 1 / (1 + np.exp(6)):  
    return -6
  return -np.log(1 / sigma - 1)

In [173]:
def em_alg(sigma_preds: list,
           tournaments_results: dict,
           sorted_players: list,
           tournaments_of_player: dict,
           players_indexes: dict,
           reverse: dict):
  
  t_cnt, c_cnt = len(sorted_players), 0
  cur_mask_len = 0
  target = []
  preds = []

  tournaments = []
  for player in sorted_players:
    for tournament in tournaments_of_player[player]:
      if tournament not in tournaments:
        tournaments.append(tournament)
  
  for id in tournaments:
    tournament_result = tournaments_results[id]
    
    for _, team in tournament_result.iterrows():
      players = team['team_members']
      mask = team['mask']
      if not players or not mask:
        continue
      cur_mask_len = len(mask)
      # cur_mask_len = sum(train_tournaments.loc[id, 'questionQty'].values())
      # if len(mask) < cur_mask_len:
      #   mask += '0' * (cur_mask_len - len(mask))

      q_preds = []
      for q_num in range(cur_mask_len):
        p = 1
        if mask[q_num] == '1':
          for player in players:
            p *= (1 - sigma_preds[reverse[players_indexes[player], t_cnt + q_num]])
          preds.append(1 - p)
        q_preds.append(1 - p)

      for player in players:
        for q_num in range(cur_mask_len):
          if mask[q_num] == '1':
            s_pred = sigma_preds[reverse[players_indexes[player], t_cnt + q_num]] / q_preds[q_num]
            if abs(s_pred) > 1.000001:
              s_pred = inverse_sigma(s_pred)
          else:
            s_pred = -6
          target.append(s_pred)
          c_cnt += 1

    t_cnt += cur_mask_len

  p_mean = np.mean(preds)

  return preds, p_mean, target

In [174]:
sigma_preds = model.predict_proba(matrix)[:, 1]

In [159]:
em_preds, em_mean, em_target = em_alg(sigma_preds, train_tournaments_results,
                                      sorted_players, train_tournaments_of_player,
                                      players_indexes, reverse)



In [115]:
def sigma(x):
  return 1 / (1 + np.exp(-x))

In [175]:
def em_iterator(n: int,
                model,
                sigma_preds,
                matrix,
                train_tournaments_results,
                test_tournaments_results,
                players_indexes,
                reverse,
                sorted_players):
  
  all_models = [model]
  
  iter_log = {}
  coef_preds = model.coef_[0]

  for i in range(1, n):
    em_preds, em_mean, em_target = em_alg(sigma_preds, train_tournaments_results,
                                      sorted_players, train_tournaments_of_player,
                                      players_indexes, reverse)
    model_1 = LinearRegression()
    model_1.fit(matrix, em_target)
    all_models.append(model_1)

    m1_preds = model_1.predict(matrix)
    sigma_preds = list(map(sigma, m1_preds))
    sigma_preds_1 = list(map(sigma, em_target))

    skills = model_1.coef_[0:len(sorted_players)]
    complexity = model_1.coef_[len(sorted_players):]

    spearman_coefs, kendal_coefs = get_coefs(test_tournaments_results, players_indexes, skills)
    mse_1 = metrics.mean_squared_error(sigma_preds, sigma_preds_1)
    mse_2 = metrics.mean_squared_error(coef_preds, model_1.coef_)

    iter_log[i] = {
        'team_right_ans_prob': em_mean,
        'spearman_coefs': np.mean(spearman_coefs),
        'kendal_coefs': np.mean(kendal_coefs),
        'mse_target': mse_1,
        'mse_weights': mse_2,
    }

    coef_preds = model_1.coef_
    print(f'model {i} - done')

  return iter_log, all_models

In [None]:
results = 0
results_df = 0

In [176]:
n = 21
log, models = em_iterator(n, model, sigma_preds, matrix, 
                train_tournaments_results,
                test_tournaments_results,
                players_indexes, reverse, sorted_players)

model 1 - done
model 2 - done
model 3 - done
model 4 - done
model 5 - done
model 6 - done
model 7 - done
model 8 - done
model 9 - done
model 10 - done
model 11 - done
model 12 - done
model 13 - done
model 14 - done
model 15 - done
model 16 - done
model 17 - done
model 18 - done
model 19 - done
model 20 - done


In [180]:
with open('/content/drive/MyDrive/adv-ml/models2.pkl','wb') as f:
    pickle.dump(models, f)

with open('/content/drive/MyDrive/adv-ml/log2.pkl','wb') as f:
    pickle.dump(log, f)

In [185]:
with open('/content/drive/MyDrive/adv-ml/models2.pkl', 'rb') as f:
  models = pickle.load(f)

with open('/content/drive/MyDrive/adv-ml/log2.pkl', 'rb') as f:
  log = pickle.load(f)

In [188]:
models

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='saga', tol=0.0001, verbose=0,
                    warm_start=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegres

In [189]:
compare = pd.DataFrame(columns=['param'] + ['iter_' + str(i) for i in range(1, n)])
compare['param'] = list(log[1].keys())
for i in range(1, n):
  compare['iter_' + str(i)] = list(map(lambda x: round(x, 6), log[i].values()))

In [219]:
compare

Unnamed: 0,param,iter_1,iter_2,iter_3,iter_4,iter_5,iter_6,iter_7,iter_8,iter_9,iter_10,iter_11,iter_12,iter_13,iter_14,iter_15,iter_16,iter_17,iter_18,iter_19,iter_20
0,team_right_ans_prob,0.940845,0.355276,0.317376,0.314632,0.314371,0.314343,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339,0.314339
1,spearman_coefs,0.494258,0.489808,0.489338,0.489314,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311,0.489311
2,kendal_coefs,0.389902,0.386041,0.385622,0.385596,0.385594,0.385593,0.385594,0.385594,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593,0.385593
3,mse_target,0.129143,0.098233,0.097687,0.097657,0.097655,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654,0.097654
4,mse_weights,1.797659,0.011227,0.000129,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Видим, что целевые метрики растут на первых 5 итерациях, после чего метрики не изменяются. Поэтому выберем лучшую модель под номером 5.

In [231]:
best_model = models[4]

## 5. Рейтинг-лист турниров

Для составления рейтинга возьмем турниры 2019 года. Сложность турнира будем оценивать как среднюю сложность вопросов этого турнира.

In [232]:
def count_tournament_complexity(tournaments_results,
                                train_tournaments,
                                model,
                                tournaments_of_player,
                                sorted_players):
  
  start = len(sorted_players)
  complexity = model.coef_
    
  tournaments_complexity_list = []
  tournaments = []
  tournaments_names = []

  for player in sorted_players:
    for tournament in tournaments_of_player[player]:
      if tournament not in tournaments:
        tournaments.append(tournament)
        tournaments_names.append(train_tournaments.loc[tournament, 'name'])

  tournaments_complexity = pd.DataFrame(columns=['id', 'tournament_name', 'complexity'])
  tournaments_complexity['id'] = tournaments
  tournaments_complexity['tournament_name'] = tournaments_names

  for id in tournaments:
    mask = tournaments_results[id].loc[0, 'mask']
    n_q = len(mask)
    end = start + n_q
    tournaments_complexity_list.append(np.mean(complexity[start:end]))
    start += n_q

  tournaments_complexity['complexity'] = tournaments_complexity_list

  return tournaments_complexity

In [233]:
tournaments_complexity = count_tournament_complexity(train_tournaments_results, 
                                                     train_tournaments,
                                                     best_model, 
                                                     train_tournaments_of_player, sorted_players)

tournaments_complexity.dropna(subset=["complexity"], inplace=True)
tournaments_complexity = tournaments_complexity.sort_values(by='complexity', ascending=False)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [234]:
# топ-50 сложных турниров
tournaments_complexity.head(40)

Unnamed: 0,id,tournament_name,complexity
444,5537,Полесские хроники,5.473292
441,6018,Синхрон ОК СПбГУ,5.473292
439,5916,Кубок красной кнопки,5.389485
426,5584,Гран-при Бауманки. 4 этап,3.507585
427,5698,(а)Синхрон-lite. Лига старта. Эпизод VII,3.425007
602,5458,Чемпионат школы №1568,3.405038
464,5726,Первый турнир имени Джоуи Триббиани,3.288335
465,6003,Второй тематический турнир имени Джоуи Триббиани,3.271169
327,5495,Столикий синхрон,3.26547
425,5533,Саратов,3.250454


In [235]:
# 50 "средних" по сложности турниров
mid = tournaments_complexity.shape[0] // 2 - 20
mid_tournaments = tournaments_complexity.iloc[mid:, :]

In [236]:
mid_tournaments.head(40)

Unnamed: 0,id,tournament_name,complexity
344,6049,Третя октава. Ліга націй: Україна,-0.048887
79,5517,Летний Синхронный Умлаут,-0.051197
402,5632,"Кубок Победы, или Ультиматум - 17 (КВ)",-0.056447
297,5606,Беларускае люстэрка. Дзень другi,-0.06194
410,6011,Щит и Меч,-0.063852
375,5185,Игра Первопрестольной. Четвёртый сезон,-0.076103
130,5690,Пущинские Дали,-0.078766
561,6051,Трэцяя актава. Ліга нацый: Беларусь,-0.085864
520,5630,Чемпионат МГУ. Открытая лига. Четвёртый игрово...,-0.088337
511,5521,Гран-при Текстильной столицы,-0.103707


In [237]:
# 50 легких турниров (последние 50)
tournaments_complexity.tail(40)

Unnamed: 0,id,tournament_name,complexity
388,5978,Кубок соседней галактики. Бета,-1.733449
250,5557,Гран-при Славянки. 3 этап,-1.736543
62,5432,Крутое пике,-1.748967
501,5945,Чемпионат Мира. Этап 3. Группа А,-1.75145
279,6009,Синхронный турнир Mediterranean Cup,-1.763135
430,5983,5 o'clock (зеркало),-1.792482
251,5591,Зефир,-1.803172
257,5686,"Хороший, плохой, синхрон",-1.844001
273,5894,Осенняя поляна,-1.89309
494,5652,Чемпионат Минска. Лига А. Тур шестой,-1.902696


Если к сложным турнирам относить крупные (региональные, государственного масштаба), то их можно встретить как в начале, так и в конце списка.
Это можно объяснить тем, что сложность турниров кореллирует с навыками игроков. 

## 6. Топ-игроков с новой моделью

In [238]:
sp_coef_new, kend_coef_new, sp_coef_100_new, kend_coef_100_new, comp_table_new = get_coefs_for_players(players_indexes,
                                                                                   players_rating,
                                                                                   best_model,
                                                                                   players)
print('Для всей базы игроков:')
print('spearman coef:', sp_coef)
print('kendal coef:', kend_coef)
print('______________________')
print('Для топ-100 игроков:')
print('spearman coef:', sp_coef_100)
print('kendal coef:', kend_coef_100)

Для всей базы игроков:
spearman coef: 0.8241117487052292
kendal coef: 0.632759451707202
______________________
Для топ-100 игроков:
spearman coef: 0.4709680725748712
kendal coef: 0.33299650405553305


In [239]:
comp_table_new = comp_table_new.drop(columns=['id', 'name'])

In [240]:
def add_question_count(rating,
                       train_tournaments,
                       tournaments_of_player):
  
  questions_nums = []
  for player_id in rating['pred_id']:
    q_s = 0
    for tournament_id in tournaments_of_player[player_id]:
      q_s += sum(train_tournaments.loc[tournament_id, 'questionQty'].values())
    questions_nums.append(q_s)
  
  rating['questions_num'] = questions_nums

  return rating

In [241]:
top_rating_with_q = add_question_count(comp_table_new, train_tournaments, train_tournaments_of_player)

In [247]:
top_rating_with_q.head(40)

Unnamed: 0,pred_id,pred_name,questions_num
1,27403,Максим Руссо,3059
3,4270,Александра Брутер,3791
4,28751,Иван Семушин,4883
5,27822,Михаил Савченков,4460
6,30152,Артём Сорожкин,6109
7,30270,Сергей Спешков,4991
9,18036,Михаил Левандовский,1740
10,20691,Станислав Мереминский,1955
11,21698,Александр Мосягин,1393
12,26089,Ирина Прокофьева,1351


Действительно, в топе присутствуют игроки с небольшим количеством ответов на вопросы.
Попробуем сделать отсечку по мин. количеству игр = 5, то есть минимальное количество вопросов = 200 при среднем 40 за игру.

In [243]:
min_q = 200

In [244]:
def cut_noobs(rating, min_q):
  new_rating = rating

  for idx, player in rating.iterrows():
    if player['questions_num'] < 200:
      new_rating.drop(index=idx, inplace=True)

  return new_rating

In [245]:
cleaned_top_rating_with_q = cut_noobs(top_rating_with_q, min_q)

In [246]:
cleaned_top_rating_with_q.reset_index(drop=True).head(40)

Unnamed: 0,pred_id,pred_name,questions_num
0,27403,Максим Руссо,3059
1,4270,Александра Брутер,3791
2,28751,Иван Семушин,4883
3,27822,Михаил Савченков,4460
4,30152,Артём Сорожкин,6109
5,30270,Сергей Спешков,4991
6,18036,Михаил Левандовский,1740
7,20691,Станислав Мереминский,1955
8,21698,Александр Мосягин,1393
9,26089,Ирина Прокофьева,1351


После отсечки по количество ответов — в топ-10 присутствуют игроки только с количеством ответов 1000, а в топ-5 - более 3000.

Выкинуть игроков, не прошедших минимальный порог и просто переобучить модель - не выйдет, так как изменится размерность матрицы.
Если вместо отсечения игроков просто обнулять результаты данных игроков, то изменится качество модели — вклад игроков с небольшим количеством ответов на вопросы может значительным, за счет количество таких игроков.

Оптимальным решением будет увеличение весов навыков у игроков учитывая количество ответов на вопросы и количество правильных ответов на вопросы.