In [151]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
import pickle

## Task 1

In [2]:
players = pd.read_pickle('./data_hw3/players.pkl')
tournaments = pd.read_pickle('./data_hw3/tournaments.pkl')
results = pd.read_pickle('./data_hw3/results.pkl')

In [3]:
def has_teamMembers_and_mask(tournament):
    flg = True
    for team in tournament:
        if (len(team['teamMembers']) == 0) or (team.get('mask') is None):
            flg = False
    return flg

In [4]:
# поправка на наличие инфо о составе и ответах по вопросам
results_corrected  = {i : results[i] for i in results.keys() if has_teamMembers_and_mask(results[i])}

In [5]:
# берем турниры 2019 в train 2020 в test
tournaments_2019 = {i : tournaments[i] for i in results_corrected.keys() if tournaments[i]['dateStart'][:4] == '2019'}
tournaments_2020 = {i : tournaments[i] for i in results_corrected.keys() if tournaments[i]['dateStart'][:4] == '2020'}

results_2019  = {i : results_corrected[i] for i in results_corrected.keys() if tournaments[i]['dateStart'][:4] == '2019'}
results_2020  = {i : results_corrected[i] for i in results_corrected.keys() if tournaments[i]['dateStart'][:4] == '2020'}
print('число турниров в 2019:', len(tournaments_2019))
print('число турниров в 2029:', len(tournaments_2020))

число турниров в 2019: 657
число турниров в 2029: 386


## Task 2

Как обучаем рейтинг игроков: 
    
    1) Обучаем модель повопросных результатов для каждого из играков
    2) Добавим в признаки среднее число правильных ответов на вопрос и число правильных ответов на турнире
    3) Закодируем игроков - onehot вектором
    4) Обучаем на этих признаках
    5) Используем веса при onehot векторе игрока, как его силу
    6) Команду будем понимать как сумма весов игроков в этой команде
    7) И посмотрим метрики ранговой коррелиации Спирмана и Кенделла для 2020 года

In [6]:
def to_int(x):
    try: 
        return int(x)
    except:
        return 0

In [7]:
data = {
    'tournament_id': [],
    'team_id': [],
    'player_id': [],
    'question_num': [],
    'score': []
}

for i in results_2019.keys():
    for team in results_2019[i]:
        for player in team['teamMembers']: 
            for j in range(len(team['mask'])):
                data['tournament_id'].append(i)
                data['team_id'].append(team['team']['id'])
                data['question_num'].append(j+1)
                data['player_id'].append(player['player']['id'])
                data['score'].append(to_int(team['mask'][j]))

data = pd.DataFrame.from_dict(data)

In [8]:
# team_player = {
#     'tournament_id': [],
#     'team_id': [],
#     'player_id': []
# }

# for i in results_2019.keys():
#     for team in results_2019[i]:
#         for player in team['teamMembers']:
#             team_player['tournament_id'].append(i)
#             team_player['team_id'].append(team['team']['id'])
#             team_player['player_id'].append(player['player']['id'])
            

# team_player = pd.DataFrame.from_dict(team_player)

In [9]:
# team_player.tail()

In [10]:
data.tail()

Unnamed: 0,tournament_id,team_id,player_id,question_num,score
15268625,6191,76301,217859,32,0
15268626,6191,76301,217859,33,0
15268627,6191,76301,217859,34,1
15268628,6191,76301,217859,35,0
15268629,6191,76301,217859,36,0


In [11]:
question_score = data.groupby(['tournament_id', 'question_num'])['score'].mean().reset_index()
question_score.head()

Unnamed: 0,tournament_id,question_num,score
0,4772,1,0.892295
1,4772,2,0.776305
2,4772,3,0.463132
3,4772,4,0.541839
4,4772,5,0.888981


In [12]:
tournament_score = data.groupby(['tournament_id'])['score'].mean().reset_index()
tournament_score.head()

Unnamed: 0,tournament_id,score
0,4772,0.475605
1,4973,0.480189
2,4974,0.474859
3,4975,0.444517
4,5000,0.606838


In [13]:
data = data.merge(tournament_score, on = 'tournament_id', suffixes = ('','_tournament') )
data = data.merge(question_score, on = ['tournament_id', 'question_num'], suffixes = ('','_question'))

In [14]:
data.tail()

Unnamed: 0,tournament_id,team_id,player_id,question_num,score,score_tournament,score_question
15268625,6191,76301,217855,36,0,0.28252,0.243902
15268626,6191,76301,217856,36,0,0.28252,0.243902
15268627,6191,76301,217857,36,0,0.28252,0.243902
15268628,6191,76301,217858,36,0,0.28252,0.243902
15268629,6191,76301,217859,36,0,0.28252,0.243902


In [15]:
encoder = OneHotEncoder().fit(data['player_id'].to_frame())

In [16]:
X_train = hstack([csr_matrix(data.drop(columns = ['tournament_id', 'team_id', 'player_id', 'question_num', 'score']).values),
        encoder.transform(data['player_id'].to_frame())])
y_train = data['score']

In [17]:
model = LogisticRegression(solver = 'saga').fit(X_train, y_train)

In [18]:
model.coef_

array([[ 0.36454272,  5.62393033, -0.08720916, ..., -1.04266369,
        -0.03922646,  1.26033551]])

In [19]:
with open('baseline.pkl', 'wb') as file:
    pickle.dump(model, file)

## Task 3

In [20]:
from scipy.stats import spearmanr, kendalltau

In [23]:
weights = model.coef_[0][2:]

In [77]:
players = []
for i in results_2020.keys():
    for team in results_2020[i]:
        for player in team['teamMembers']: 
            players.append(player['player']['id'])
            
players = np.unique(players)

In [98]:
players_w = {}
for player_id in players:
    if player_id in encoder.categories_[0]:
        players_w[player_id] = weights[encoder.transform([[player_id]]).toarray()[0]==1.]
    else:
        players_w[player_id] = 0.

In [118]:
spearman = []
kendall = []

for i in results_2020.keys():
    positions = []
    predicted_pos = []
    for team in results_2020[i]:
        positions.append(team['position'])
        w = 0
        for player in team['teamMembers']: 
            w = w + players_w[player['player']['id']]
        predicted_pos.append(w)
    spearman.append(spearmanr(positions,predicted_pos)[0])
    kendall.append(kendalltau(positions,predicted_pos)[0])
    
spearman = [el for el in spearman if str(el) != 'nan']
kendall = [el for el in kendall if str(el) != 'nan']

In [119]:
np.mean(spearman)

-0.7407596025220449

In [120]:
np.mean(kendall)

-0.578495905718968

Замечание: почему коррелиация отрицательная

Веса игроков тем больше, чем лучше игрок отвечает на вопросы.
чем лучше игрок играет тем, он выше в рейтинге => его позиция в турнире меньше

## Task 4

Будем использовать метод Presence-Only метод и EM алгоритм:

    Если команда отвечает на вопрос -> кто-то ответил на вопрос и мы не можем сказать кто именно 
    Если команда не отвечает на вопрос -> никто не ответил на вопрос 

In [173]:
from sklearn.linear_model import LinearRegression

In [304]:
pi = 0.1
n_p = len(data['score'])-sum(data['score'])
n_u = sum(data['score'])

In [305]:
def em_step(X_train, y_train, y_true, first = True):
    y_train_mod = np.log(np.clip(y_train, 1e-6, 1-1e-6)/(1-np.clip(y_train, 1e-6, 1-1e-6)))
    if first:
        y_train = y_train.copy()
        y_train[y_true == 1.] = pi
    lm = LinearRegression().fit(X_train, y_train_mod)
    y_pred_mod = lm.predict(X_train) - np.log((n_p + pi * n_u)/(pi * n_u))
    y_pred = np.exp(y_pred_mod)/(1 + np.exp(y_pred_mod))
    y_pred[y_true == 0] = 0.
    return y_pred, lm.coef_

In [307]:
y, _ = em_step(X_train, y_train, y_train)

In [308]:
y, _ = em_step(X_train, y, y_train, first = False)

In [309]:
y, coef = em_step(X_train, y, y_train, first = False)

In [310]:
weights = coef[2:]

In [311]:
players_w = {}
for player_id in players:
    if player_id in encoder.categories_[0]:
        players_w[player_id] = weights[encoder.transform([[player_id]]).toarray()[0]==1.]
    else:
        players_w[player_id] = 0.

In [312]:
spearman = []
kendall = []

for i in results_2020.keys():
    positions = []
    predicted_pos = []
    for team in results_2020[i]:
        positions.append(team['position'])
        w = 0
        for player in team['teamMembers']: 
            w = w + players_w[player['player']['id']]
        predicted_pos.append(w)
    spearman.append(spearmanr(positions,predicted_pos)[0])
    kendall.append(kendalltau(positions,predicted_pos)[0])
    
spearman = [el for el in spearman if str(el) != 'nan']
kendall = [el for el in kendall if str(el) != 'nan']

In [313]:
np.mean(spearman)

-0.7125747310536112

In [314]:
np.mean(kendall)

-0.5528752732449508

In [None]:
## Task 5