# Necessary imports

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt, rcParams
# Set matplotlib's default font size to 21 for every figure created below.
rcParams.update({'font.size': 21})

# Constants and helper functions

In [None]:
# Month names substituted into the leaderboard CSV file names. The cells below
# loop over all but the last entry and handle the last entry separately.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July']

def create_pivot(data, xcol, ycol, bins = 10):
    """Count rows of ``data`` per (ycol, xcol) pair of quantile bins.

    Both columns are cut into quantile-based bins with ``pd.qcut``; the
    result is a table with the ``ycol`` bins as rows, the ``xcol`` bins as
    columns and the number of rows falling into each pair as values.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is not modified.
    xcol, ycol : str
        Names of the numeric columns of ``data`` to bin.
    bins : int, default 10
        Requested number of quantile bins per column. Fewer bins may be
        produced because duplicate bin edges are dropped.

    Returns
    -------
    pandas.DataFrame
        Integer counts; combinations with no rows contain 0.
    """
    # Bin the columns of the frame that was passed in (not a notebook
    # global) and keep the helper columns in a scratch frame, so the
    # caller's DataFrame is never mutated.
    binned = pd.DataFrame(index=data.index)
    binned['xcol'] = pd.qcut(data[xcol], bins, duplicates='drop')
    binned['ycol'] = pd.qcut(data[ycol], bins, duplicates='drop')
    binned['ones'] = 1

    # observed=False is stated explicitly: it is the behaviour the original
    # call relied on and avoids the pandas warning about the changing default.
    counts = binned.pivot_table(index='ycol', columns='xcol', values='ones',
                                aggfunc=len, observed=False)
    return counts.fillna(0).astype(int)

def _plot_pivot_heatmap(LB, xcol, ycol, title, xlabel, ylabel):
    """Draw one annotated heatmap of row counts per (ycol, xcol) quantile bin."""
    plt.figure(figsize=(20, 10))
    sns.heatmap(create_pivot(LB, xcol, ycol, bins=10), annot=True, fmt='d')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def create_plots_for_data(LB, title_prefix):
    """Create the standard set of EDA figures for one leaderboard table.

    Five figures are created; nothing is shown or returned, so the caller
    decides when to call ``plt.show()``:

    1. bar chart of ``TeamMembersCnt`` value counts,
    2. histogram of ``ChangeWithPublic``,
    3. heatmap of ``SubmissionsCount`` bins vs. ``ChangeWithPublic`` bins,
    4. heatmap of ``Score`` bins vs. ``ChangeWithPublic`` bins,
    5. heatmap of ``Score`` bins vs. ``DaysDiffBetweenLastSubAndDeadline`` bins.

    Parameters
    ----------
    LB : pandas.DataFrame
        Leaderboard table containing the columns named above.
    title_prefix : str
        Text prepended to the title of every figure.
    """
    # Name both columns explicitly: the column names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    team_sizes = (LB['TeamMembersCnt'].value_counts()
                  .rename_axis('TeamSize')
                  .reset_index(name='Teams'))
    plt.figure(figsize=(20, 10))
    sns.barplot(data=team_sizes, x='TeamSize', y='Teams')
    plt.title(title_prefix + 'How many members were inside teams?')
    plt.xlabel('teamMembersCnt')
    plt.ylabel('Count')
    plt.grid(True)

    plt.figure(figsize=(20, 10))
    sns.histplot(data=LB, x='ChangeWithPublic', bins=100)
    plt.title(title_prefix + 'LB Shakeup')
    plt.xlabel('# positions win/lose on private LB')
    plt.ylabel('Count')
    plt.grid(True)

    _plot_pivot_heatmap(LB, 'ChangeWithPublic', 'SubmissionsCount',
                        title_prefix + 'Submissions count vs. shakeup',
                        '# positions win/lose on private LB',
                        'Submissions count')

    _plot_pivot_heatmap(LB, 'ChangeWithPublic', 'Score',
                        title_prefix + 'Submissions scores vs. shakeup',
                        '# positions win/lose on private LB',
                        'Submission scores')

    # The y axis of this figure is Score, so it is labelled as scores
    # (it was previously mislabelled 'Submissions count').
    _plot_pivot_heatmap(LB, 'DaysDiffBetweenLastSubAndDeadline', 'Score',
                        title_prefix + 'Submission scores vs. difference between last submit and deadline',
                        'Difference between last submit and deadline (in days)',
                        'Submission scores')
    
    
def get_change_with_public_lb(data):
    """Expand a team-level leaderboard into one row per team member.

    ``TeamMembersList`` is split on ';' and every resulting name is paired
    with the team's ``ChangeWithPublic`` value. Rows whose member list is
    missing (NaN) are skipped.

    Returns a DataFrame with the columns ``User`` and ``ChangeWithPublic``.
    """
    member_lists = data['TeamMembersList'].str.split(';').values
    shifts = data['ChangeWithPublic']
    rows = [
        [member, shift]
        for members, shift in zip(member_lists, shifts)
        if str(members) != 'nan'  # a missing member list shows up as NaN
        for member in members
    ]
    return pd.DataFrame(rows, columns=['User', 'ChangeWithPublic'])

# EDA for the private leaderboards

The graphs for all datasets are collapsed - open the output of the cell below to see them:

In [None]:
# Per-competition user tables; one DataFrame is appended for every month.
change_with_public = []

for month in months[:-1]:
    print(month)

    # Load this month's private leaderboard and draw the standard figures.
    LB = pd.read_csv(f'../input/tps-competitions-private-leaderboards/TPS_{month}_leaderboard.csv')
    create_plots_for_data(LB, f'TPS {month}: ')
    plt.show()

    # Numeric summary of the key columns at every 10th percentile.
    summary_columns = [
        'ChangeWithPublic',
        'Score',
        'SubmissionsCount',
        'TeamMembersCnt',
        'DaysDiffBetweenLastSubAndDeadline',
    ]
    deciles = np.arange(0.1, 1, 0.1)
    print(LB[summary_columns].describe(percentiles=deciles))

    change_with_public.append(get_change_with_public_lb(LB))

    print('=' * 30)

### For the latest competition we create the same EDA in a separate cell below:

In [None]:
# Same EDA as in the loop above, run for the last entry of `months` only.
month = months[-1]
print(month)

LB = pd.read_csv(f'../input/tps-competitions-private-leaderboards/TPS_{month}_leaderboard.csv')
create_plots_for_data(LB, f'TPS {month}: ')
plt.show()

# Numeric summary of the key columns at every 10th percentile.
summary_columns = [
    'ChangeWithPublic',
    'Score',
    'SubmissionsCount',
    'TeamMembersCnt',
    'DaysDiffBetweenLastSubAndDeadline',
]
deciles = np.arange(0.1, 1, 0.1)
print(LB[summary_columns].describe(percentiles=deciles))

change_with_public.append(get_change_with_public_lb(LB))

# It's time to build the aggregated table for each user who participated in TPS competitions

In [None]:
# Stack the per-competition user tables into one long table.
change_df = pd.concat(change_with_public).reset_index(drop=True)

# One row per user: number of appearances plus the median, minimum and
# maximum of ChangeWithPublic. String aggregation names are used instead of
# len / np.median / np.min / np.max: they select the same pandas reductions
# but do not trigger the pandas 2.x warning about NumPy callables in agg().
agg_df = (
    change_df.groupby('User')['ChangeWithPublic']
    .agg(CompetitionsCnt='size',
         MedianChange='median',
         WorstDrop='min',
         BestJump='max')
    .reset_index()
)
agg_df

### Cool. 6888 users have already done that (some of them several times). But how many of them participated in a given number of competitions?

In [None]:
# Percentage of users per number of competitions entered.
# Both output columns are named explicitly: the column names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x, and the
# barplot below needs 'CompsCnt' and 'CompetitionsCnt' to exist.
vc = (
    (agg_df['CompetitionsCnt'].value_counts() / len(agg_df) * 100)
    .rename_axis('CompsCnt')
    .reset_index(name='CompetitionsCnt')
)

print(vc)

plt.figure(figsize=(20, 10))
sns.barplot(data=vc, x='CompsCnt', y='CompetitionsCnt')
plt.title('What percent of users participate in specific number of TPS competitions?')
plt.xlabel('Number of TPS competitions')
plt.ylabel('Users percent')
plt.grid(True)

### Only ~15% participated in 2 competitions; the higher the number of competitions, the fewer users reach it.

# Let's check different user tops

In [None]:
# Order all users by MedianChange, ascending, and show the first ten rows.
median_change_sort = agg_df.sort_values(by='MedianChange')
print(median_change_sort.shape)
print('Worst-10 median changes:')
median_change_sort.iloc[:10]

In [None]:
# Last ten rows of the ascending sort, flipped so the largest value comes first.
print('Top-10 median changes:')
median_change_sort.iloc[-10:].iloc[::-1]

In [None]:
# Same ranking, restricted to users with CompetitionsCnt of 3 or more.
median_change_sort_3 = (
    agg_df.loc[agg_df['CompetitionsCnt'].ge(3)]
    .sort_values(by='MedianChange')
    .reset_index(drop=True)
)
print(median_change_sort_3.shape)
print('Worst-10 median changes for users with 3 comps and more:')
median_change_sort_3.iloc[:10]

In [None]:
# Last ten rows of the filtered ascending sort, largest value first.
print('Top-10 median changes for users with 3 comps and more:')
median_change_sort_3.iloc[-10:].iloc[::-1]

In [None]:
# Look up a single user's row in the 3+ competitions ranking.
median_change_sort_3.loc[median_change_sort_3['User'].eq('alexryzhkov')]

In [None]:
# Rank users by their single best jump (largest BestJump first).
best_jumpers = agg_df.sort_values('BestJump', ascending=False).reset_index(drop=True)
print(best_jumpers.shape)
# The heading now matches the ten rows actually displayed (it used to say 15).
print('Top-10 jumpers on Private LB:')
best_jumpers.head(10)

In [None]:
# Look up a single user's row in the best-jump ranking.
best_jumpers.loc[best_jumpers['User'].eq('alexryzhkov')]