# User/Competition Submission Traces

This notebook shows public/private submission scores over time for nearly all competitions a user has submitted to. (Xmas optimization competitions don't have scores available in Meta Kaggle, and competitions with few submissions are skipped.)

Evaluation metrics vary between competitions, and so does the direction of a 'better' score (higher for some metrics, lower for others). To indicate this, the best public score and the best private score in each competition are each shown as a dotted line.

Public leaderboard scores are in blue and private in red; and the submissions that are used for the public and private LB are marked with a dot.

So, the name of the game when Kaggling is to get the red line, the red dotted line, and the red point all to coincide at the same point, that is: to select your submission with the best private test set score.

I'm using my own username as a demo, and if you want to ask about any of the traces shown, feel free! But the real purpose is for you to see your own submission history: fork the notebook with the "Copy and edit" button, set your username below, and commit it to see your own competition history :)

See also [this notebook][1] that generates these traces for the winning team of each competition.

 [1]: https://www.kaggle.com/jtrotman/winning-team-submission-traces



In [1]:
# Change this to your own Kaggle username. It is matched against the
# UserName column of Users.csv and must identify exactly one row there.
USERNAME = 'jtrotman'

In [2]:
%matplotlib inline
import gc, os, sys, time
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from itertools import combinations
from IPython.display import HTML, display

# Line/marker colour used for each leaderboard's score trace.
COLORS = {'Public': 'blue', 'Private': 'red'}

# Root folder that the CSV files are read from.
IN_DIR = '../input'

# HTML entities for the three medals, in gold / silver / bronze order.
_MEDAL_ENTITIES = ('&#129351;', '&#129352;', '&#129353;')

# A medal can be looked up either by name or by its numeric code
# (1., 2., 3. map to the same entities as gold, silver, bronze).
MEDALS = dict(zip(('gold', 'silver', 'bronze'), _MEDAL_ENTITIES))
MEDALS.update(zip((1., 2., 3.), _MEDAL_ENTITIES))

def read_csv_filtered(csv, col, values):
    """Load the rows of ``IN_DIR/csv`` whose ``col`` value is in ``values``.

    The file is read in chunks of 100000 rows; every chunk is filtered as
    soon as it is read and the surviving rows are concatenated at the end.

    Parameters:
        csv: file name, relative to ``IN_DIR``.
        col: name of the column to filter on.
        values: collection of accepted values (anything ``Series.isin`` takes).

    Returns:
        A DataFrame holding the matching rows of every chunk.
    """
    reader = pd.read_csv(f'{IN_DIR}/{csv}', chunksize=100000, low_memory=False)
    kept = []
    for chunk in reader:
        wanted = chunk[col].isin(values)
        kept.append(chunk.loc[wanted])
    return pd.concat(kept, axis=0)

# Look up the Users.csv row for USERNAME to obtain the numeric user id.
users = read_csv_filtered('Users.csv', 'UserName', {USERNAME})
if users.shape[0] != 1:
    # Raise explicitly rather than `assert`: asserts are skipped under
    # `python -O` and would not tell the reader what to fix.
    raise ValueError(
        f'Expected exactly one user named {USERNAME!r} in Users.csv, '
        f'found {users.shape[0]} - check the USERNAME setting.')
USERID = users.iloc[0].Id

# Team memberships of the user, the teams themselves, and the competitions
# those teams took part in (teams and comps are indexed by their Id).
tmemb = read_csv_filtered('TeamMemberships.csv', 'UserId', {USERID})
teams = read_csv_filtered('Teams.csv', 'Id', tmemb.TeamId).set_index('Id')

comps = read_csv_filtered('Competitions.csv', 'Id', teams.CompetitionId).set_index('Id')

# Where the metric has no name, fall back to its abbreviation.
idx = comps.EvaluationAlgorithmName.isnull()
comps.loc[idx, 'EvaluationAlgorithmName'] = comps.loc[idx, 'EvaluationAlgorithmAbbreviation']
comps['Year'] = pd.to_datetime(comps.DeadlineDate).dt.year

# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: the in-place form acts on an intermediate object,
# which warns in pandas 2.x and leaves `comps` unchanged under Copy-on-Write.
comps['RewardQuantity'] = comps['RewardQuantity'].fillna('')

# Every submission made by any of the user's teams.
subs = read_csv_filtered('Submissions.csv', 'TeamId', tmemb.TeamId)

# Score columns to cast to float: public/private x display/full precision.
asfloats = [f'{board}Score{precision}'
            for board in ('Public', 'Private')
            for precision in ('LeaderboardDisplay', 'FullPrecision')]
subs[asfloats] = subs[asfloats].astype(float)

# Drop submissions flagged IsAfterDeadline, then attach each submission's
# competition (via its team) and reduce the timestamp to a calendar date.
subs = subs.query('not IsAfterDeadline').copy()
subs['CompetitionId'] = subs['TeamId'].map(teams['CompetitionId'])
subs['SubmissionDate'] = pd.to_datetime(subs['SubmissionDate']).dt.date

# Only teams that have both a public and a private leaderboard submission.
teams = teams.dropna(subset=['PublicLeaderboardSubmissionId', 'PrivateLeaderboardSubmissionId'])

# Sort key per competition, "<private rank> <competition id>", so that the
# competitions are later displayed with the best results first.
teams_by_comp = teams.set_index('CompetitionId')
c2key = teams_by_comp.apply(
    lambda row: f'{row.PrivateLeaderboardRank:4.0f} {row.name}', axis=1)
subs['Key'] = subs['CompetitionId'].map(c2key.get)

def comp_id_for_field(value, field='Slug'):
    """Return the index label of the first ``comps`` row where ``field == value``.

    Parameters:
        value: the value to search for.
        field: name of the ``comps`` column to search in (default ``'Slug'``).

    Returns:
        The index label of the first matching row, or -1 when nothing matches.
    """
    hits = comps.index[(comps[field] == value).to_numpy()]
    return hits[0] if len(hits) else -1

# Mercari: a private score of exactly 99 is replaced with NaN.
scols = ['PrivateScoreFullPrecision', 'PrivateScoreLeaderboardDisplay']
idx = subs['CompetitionId'].eq(comp_id_for_field('mercari-price-suggestion-challenge'))
subs.loc[idx, scols] = subs.loc[idx, scols].replace(99.0, np.nan)

# Passed as `figsize` to every score plot.
FIGSIZE = (14, 6)

from html import escape


def _esc(value):
    """Return ``value`` converted to text and HTML-escaped (quotes included)."""
    return escape(str(value), quote=True)


# One section per competition, in 'Key' order: an HTML summary of the
# competition and the team, followed by a plot of the public and private
# score of every submission.
for _key, df in subs.groupby('Key'):
    # Skip competitions with fewer than 3 submissions, or with no public or
    # no private score at all.
    if df.shape[0] < 3:
        continue
    if df.PublicScoreFullPrecision.count() < 1:
        continue
    if df.PrivateScoreFullPrecision.count() < 1:
        continue

    # Submissions are plotted in order of their Id.
    df = df.sort_values('Id').reset_index()
    comp_id = df.iloc[0].CompetitionId
    scores = (df.PublicScoreFullPrecision.dropna().tolist() +
              df.PrivateScoreFullPrecision.dropna().tolist())
    c = comps.loc[comp_id]

    # y-limits: the best score on one side and the 5%/95% quantile on the
    # other, so that a few very poor submissions do not flatten the plot.
    higher_is_better = bool(c.EvaluationAlgorithmIsMax)
    if higher_is_better:
        top = max(scores)
        bottom = np.quantile(scores, 0.05)
    else:
        top = np.quantile(scores, 0.95)
        bottom = min(scores)

    # Pad both limits by 1/20 of the range.
    rg = (top - bottom)
    top += rg / 20
    bottom -= rg / 20

    xs = np.arange(df.shape[0])

    team = teams.query(f'CompetitionId=={c.name}').iloc[0]
    nusers = df.SubmittedUserId.nunique()

    # Free-text fields are escaped before being placed in the HTML so that
    # characters such as <, & or " cannot break the markup (the metric
    # description sits inside a double-quoted attribute).
    display(HTML(f'<h1 id="{_esc(c.Slug)}">{_esc(c.Title)}</h1>'))

    display(HTML(f'Type: {_esc(c.HostSegmentTitle)} &mdash; <i>{_esc(c.Subtitle)}</i> &mdash; <a href="https://www.kaggle.com/c/{_esc(c.Slug)}/leaderboard">Leaderboard</a><br/>'
             f'Dates: <b>{c.EnabledDate}</b> &mdash; <b>{c.DeadlineDate}</b><br/>'
             f'<b>{c.TotalTeams}</b> teams; <b>{c.TotalCompetitors}</b> competitors; <b>{c.TotalSubmissions}</b> submissions<br/>'
             f'Leaderboard percentage: <b>{c.LeaderboardPercentage}</b><br/>'
             f'Evaluation: <a title="{_esc(c.EvaluationAlgorithmDescription)}">{_esc(c.EvaluationAlgorithmName)}</a><br/>'
             f'Reward: <b>{_esc(c.RewardType)}</b> {_esc(c.RewardQuantity)} [{c.NumPrizes} prizes]<br/>'))

    display(HTML(
        f' <h3>Team "{_esc(team.TeamName)}"</h3>'
        f' Submissions: <b>{df.shape[0]}</b> by <b>{nusers}</b> user{"s" if nusers>1 else ""}'
        f'; From kernels: <b>{df.SourceKernelVersionId.count()}</b>'
        f' (Kernel versions: <b>{df.SourceKernelVersionId.nunique()}</b>)'
        '<br/>'
        f' First sub: <b>{df.SubmissionDate.min()}</b>'
        f'; Last sub: <b>{df.SubmissionDate.max()}</b>'
        f'; Duration: <b>{(df.SubmissionDate.max()-df.SubmissionDate.min()).days}</b>'
        f'; Days active: <b>{df.SubmissionDate.nunique()}</b>'
        '<br/>'
        f'  Public rank: <b>{team.PublicLeaderboardRank:.0f}</b>'
        f'; Private rank: <b>{team.PrivateLeaderboardRank:.0f}</b>'
        # MEDALS values are HTML entities already, so they are not escaped.
        f' <b>{MEDALS.get(team.Medal, "")}</b>'
    ))

    title = f'{c.Title} - "{team.TeamName}" - [public {team.PublicLeaderboardRank:.0f} | private {team.PrivateLeaderboardRank:.0f}]'

    for t in ['Public', 'Private']:
        col = f'{t}ScoreFullPrecision'
        ax = df[col].plot(legend=True, color=COLORS[t], figsize=FIGSIZE)

        # Dot on the submission(s) recorded as a team's leaderboard
        # submission for this leaderboard.
        ser = df.Id.isin(teams[f'{t}LeaderboardSubmissionId'])
        plt.scatter(np.where(ser)[0], df.loc[ser, col], color=COLORS[t])

        # Dotted line at the best score (NaN scores are ignored).
        best = df[col].max() if higher_is_better else df[col].min()
        plt.plot(xs, np.full(len(xs), best), linestyle=':', color=COLORS[t])

    # With very few submissions, keep matplotlib's automatic y-limits.
    if df.shape[0] > 4:
        plt.ylim(bottom, top)
    plt.title(title, fontsize=16)
    plt.ylabel(c.EvaluationAlgorithmName, fontsize=14)
    plt.xlabel('Submission Index', fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlim(-1, df.shape[0])
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.show()