# Load data

In [None]:
!wget -O chgk.zip https://www.dropbox.com/s/s4qj0fpsn378m2i/chgk.zip?dl=0
!mkdir chgk
!unzip chgk.zip -d chgk

In [2]:
import pickle
import pandas as pd

In [3]:
def load_data(path: str):
    with open(path, 'rb') as fin:
        data = pickle.load(fin)
    return data

def load_to_df(path: str):
    data = load_data(path)
    df = pd.DataFrame(data).T.set_index('id')
    return df

In [None]:
players = load_to_df('chgk/players.pkl')
players

In [None]:
results = load_data('chgk/results.pkl')
tournaments = load_to_df('chgk/tournaments.pkl')
tournaments['year'] = tournaments['dateStart'].apply(lambda x: int(x[:4]))
tournaments = tournaments[tournaments['year'].isin([2019, 2020])]
tournaments

# Data Preparation

In [7]:
# let's consider ? and X in the mask as 0

def fix_mask(mask, mask_len):
    fixed_mask = mask.replace('X', '0').replace('?', '0')
    if len(fixed_mask) < mask_len:
      fixed_mask += '0' * (mask_len - len(fixed_mask))
    assert all(x == '0' or x == '1' for x in fixed_mask) and len(fixed_mask) == mask_len
    return fixed_mask

In [8]:
import numpy as np

def calculate_questions_scores(tournament_id: int):
    """Calculates difficulty scores for questions. 
        Ranges from 0 to 1 for each question where 0 is the easiest question and 1 is the most difficult one.
        question_score = 1 - len(teams_answered_correctly) / len(teams)

    UPD. Please, do not pay attention to this function, it's a part of a failed experiment
    """
    tournament = results[tournament_id]
    max_mask_len = 0
    for team in tournament:
        mask = team.get('mask')
        if not mask:
          continue
        max_mask_len = max(max_mask_len, len(mask))
    if not max_mask_len:
      return None

    answered_correctly = np.zeros(max_mask_len, dtype=np.float32)

    for team in tournament:
        mask = team.get('mask')
        if not mask:
          continue
        for i, result in enumerate(fix_mask(mask, max_mask_len)):
          answered_correctly[i] += int(result)
    scores = 1 - answered_correctly / len(tournament)
    return scores

In [9]:
# from tqdm.notebook import tqdm
from tqdm import tqdm

relevant_results = []
for tournament_id in tqdm(tournaments.index):
    question_scores = calculate_questions_scores(tournament_id)
    if question_scores is None:
      continue
    for team in results[tournament_id]:
        if not team.get('mask'):
            continue
        team_id = team['team']['id']
        position = team['position']
        mask = fix_mask(team['mask'], len(question_scores))
        for player in team['teamMembers']:
          for question in range(len(mask)):
            question_id = f'{tournament_id}_{question}'
            player_id = player['player']['id']
            question_score = question_scores[question]
            question_answered = int(question_score != 0)
            data = (tournament_id, player_id, team_id, position, question_id, question_score, question_answered)
            relevant_results.append(data)

100%|██████████| 1105/1105 [02:52<00:00,  6.42it/s]


In [10]:
columns = ['tournament_id', 'player_id', 'team_id', 'position', 'question_id', 'question_score', 'question_answered']
results_df = pd.DataFrame(relevant_results, columns=columns)
results_df

Unnamed: 0,tournament_id,player_id,team_id,position,question_id,question_score,question_answered
0,4772,6212,45556,1.0,4772_0,0.116883,1
1,4772,6212,45556,1.0,4772_1,0.220779,1
2,4772,6212,45556,1.0,4772_2,0.554113,1
3,4772,6212,45556,1.0,4772_3,0.480519,1
4,4772,6212,45556,1.0,4772_4,0.121212,1
...,...,...,...,...,...,...,...
30338754,6456,224329,63129,7.0,6456_34,1.000000,1
30338755,6456,224329,63129,7.0,6456_35,0.285714,1
30338756,6456,224329,63129,7.0,6456_36,1.000000,1
30338757,6456,224329,63129,7.0,6456_37,0.285714,1


In [11]:
PREPARED_RESULTS_PATH = 'prepared_results.csv'
results_df.to_csv(PREPARED_RESULTS_PATH, index=False)
del relevant_results

In [12]:
results_df = pd.read_csv(PREPARED_RESULTS_PATH, index_col=None)

In [13]:
train_tournaments_id = set(tournaments[tournaments['year'] == 2019].index)
test_tournaments_id = set(tournaments[tournaments['year'] == 2020].index)
train_df = results_df[results_df['tournament_id'].isin(train_tournaments_id)]
test_df = results_df[results_df['tournament_id'].isin(test_tournaments_id)]

In [14]:
train_df

Unnamed: 0,tournament_id,player_id,team_id,position,question_id,question_score,question_answered
0,4772,6212,45556,1.0,4772_0,0.116883,1
1,4772,6212,45556,1.0,4772_1,0.220779,1
2,4772,6212,45556,1.0,4772_2,0.554113,1
3,4772,6212,45556,1.0,4772_3,0.480519,1
4,4772,6212,45556,1.0,4772_4,0.121212,1
...,...,...,...,...,...,...,...
29461801,6255,217156,76130,2019.0,6255_211,0.941060,1
29461802,6255,217156,76130,2019.0,6255_212,0.882120,1
29461803,6255,217156,76130,2019.0,6255_213,0.900446,1
29461804,6255,217156,76130,2019.0,6255_214,0.984151,1


# Baseline Model

In [15]:
from sklearn.preprocessing import OneHotEncoder

ohe_columns = ['player_id', 'question_id']

ohe = OneHotEncoder(dtype=int, handle_unknown='ignore')
X_train = ohe.fit_transform(train_df[ohe_columns])
X_test = ohe.transform(test_df[ohe_columns])
Y_train = train_df['question_answered'].tolist()
Y_test = test_df['question_answered'].tolist()

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=2, penalty='l2', n_jobs=-1, fit_intercept=False).fit(X_train, Y_train)

In [None]:
players_scores = {
    int(column.replace('player_id_', '')) : model.coef_[0][i]
    for i, column in enumerate(ohe.get_feature_names(ohe_columns))
    if column.startswith('player_id_') 
}

In [18]:
baseline_players_table = players.loc[players_scores.keys()]
baseline_players_table['score'] = players_scores.values()
baseline_players_table.sort_values('score', ascending=False)

Unnamed: 0_level_0,name,patronymic,surname,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
131908,Иван,Сергеевич,Киселёв,5.056806
149724,Дмитрий,Михайлович,Пелихов,4.969398
171798,Александр,Анатольевич,Стеценко,4.889345
149723,Валерий,Алексеевич,Виноградов,4.879081
78609,Ольга,Александровна,Багина,4.875633
...,...,...,...,...
204237,Елена,Александровна,Шорник,0.124108
127014,Елизавета,Халимовна,Куценко,0.124108
204482,Александр,Андреевич,Каляев,0.124108
224403,Анастасия,Евгеньевна,Кузнецова,0.109862


In [19]:
baseline_players_table[baseline_players_table.surname == 'Николенко'].sort_values('score', ascending=False)

Unnamed: 0_level_0,name,patronymic,surname,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22799,Сергей,Игоревич,Николенко,4.278325
115591,Александр,Владимирович,Николенко,2.340919
174391,Антонина,Александровна,Николенко,2.154724
22797,Любовь,Владимировна,Николенко,1.956845
190298,Дмитрий,Олегович,Николенко,0.893401
115351,Кристина,Юрьевна,Николенко,0.619676
179372,Олег,,Николенко,0.487939


# Predictions and evaluation

In [20]:
test_df

Unnamed: 0,tournament_id,player_id,team_id,position,question_id,question_score,question_answered
43452,4957,30152,49804,1.0,4957_0,0.717391,1
43453,4957,30152,49804,1.0,4957_1,0.880435,1
43454,4957,30152,49804,1.0,4957_2,0.141304,1
43455,4957,30152,49804,1.0,4957_3,0.423913,1
43456,4957,30152,49804,1.0,4957_4,0.739130,1
...,...,...,...,...,...,...,...
30338754,6456,224329,63129,7.0,6456_34,1.000000,1
30338755,6456,224329,63129,7.0,6456_35,0.285714,1
30338756,6456,224329,63129,7.0,6456_36,1.000000,1
30338757,6456,224329,63129,7.0,6456_37,0.285714,1


In [None]:
from scipy.stats.mstats import spearmanr, kendalltau


def team_score_evaluation(tournament_id, team_id):
    team_members = test_df[(test_df['tournament_id'] == tournament_id) & (test_df['team_id'] == team_id)]['player_id'].unique()
    score = 0
    unknown_players = 0
    for player in team_members:
        if player in players_scores:
            score += players_scores[player]
        else:
            unknown_players += 1
    if len(team_members) == unknown_players:
        return team_members, 0
    average_player_score = score / (len(team_members) - unknown_players)
    team_score = average_player_score * len(team_members)
    return team_members, team_score

tournament_table_columns = ['player_id', 'position']

spearman = []
kendall = []

for tournament_id in tqdm(test_tournaments_id):
    tournament_df = test_df[test_df['tournament_id'] == tournament_id]
    scores = []
    for team_id in tournament_df['team_id'].unique():
        score = team_score_evaluation(tournament_id, team_id)
        scores.append(score)
    
    predicted_positions = []
    for i, (team_members, team_score) in enumerate(sorted(scores, key=lambda x: x[1], reverse=True)):
        for player in team_members:
            predicted_positions.append((player, i + 1))
   
    predicted_tournament_table = pd.DataFrame(
        data=predicted_positions,
        columns=['player_id', 'predicted_position'],
    ).set_index('player_id')
    true_tournament_table = tournament_df[tournament_table_columns].drop_duplicates().set_index('player_id')    
    tournament_table = true_tournament_table.join(predicted_tournament_table)
    true_values = tournament_table['position']
    predicted_values = tournament_table['predicted_position']
    spearman.append(spearmanr(true_values, predicted_values).correlation)
    kendall.append(kendalltau(true_values, predicted_values).correlation)


In [72]:
spearman_mean = np.array(spearman)
spearman_mean = spearman_mean[~np.isnan(spearman_mean)].mean()
kendall_mean = np.array(kendall)
kendall_mean = kendall_mean[~np.isnan(kendall_mean)].mean()
print('Spearman correlation:', spearman_mean)
print('Kendall correlation:', kendall_mean)

Spearman correlation: 0.6064074906340875
Kendall correlation: 0.4618415794096175
