# How many submissions are best?

This notebook simulates, under a few simplifying assumptions, how the number of final submissions allowed affects the role of "luck" in the leaderboard ranking.

Assumptions used:
- Competition with 1010 participants
- 1000 "public kernel" participants with avg score of 0.950, std 0.005
- 10 "smart" participants with avg score 0.955, std 0.005
- Simulate with 1, 2, and 3 potential submissions
- Each participant's highest-scoring submission is kept, and participants are ranked by that score.
- Bootstrap (repeat) this simulation 1000x for each submission count (1, 2, or 3)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm import tqdm

## One Final Submission Allowed

In [None]:
# Simulate the leaderboard when each participant has exactly one final
# submission. Every trial draws a fresh score for 1000 "public kernel"
# participants and 10 "smart" participants, ranks all 1010 together, and
# stores the per-trial frame in `one_sub`.
one_sub = []
bootstrap = 1000
score_columns = [0]  # one column per allowed submission
for trial in tqdm(range(bootstrap)):
    # Draw order (public-kernel first, then smart) is kept so the random
    # stream is consumed in the same sequence on every run.
    public_scores = np.random.normal(0.950, 0.005, 1000)
    smart_scores = np.random.normal(0.955, 0.005, 10)

    # Rows are participants, columns are submissions; flag each group.
    public_frame = pd.DataFrame(np.column_stack([public_scores])) \
        .assign(usedPublicKernel=True)
    smart_frame = pd.DataFrame(np.column_stack([smart_scores])) \
        .assign(usedPublicKernel=False)

    results = pd.concat([public_frame, smart_frame])
    # Best submission per participant; rank 1 is the highest score.
    results['highest_scoring_sub'] = results[score_columns].max(axis=1)
    results['rank'] = results['highest_scoring_sub'].rank(ascending=False)
    one_sub.append(results)

In [None]:
# Overlay the kernel density estimate of rank for each group
# (usedPublicKernel True vs False) across all single-submission trials.
one_sub_ranks = pd.concat(one_sub).groupby('usedPublicKernel')['rank']
one_sub_ranks.plot(
    kind='kde',
    title='Public Kernel vs Smart - 1 sub 1000x bootstrap',
    figsize=(10, 5),
)
plt.legend()
plt.show()

In [None]:
# Summary statistics of rank for each group over all single-submission trials.
(
    pd.concat(one_sub)
    .groupby('usedPublicKernel')['rank']
    .describe()
)

## Two Final Submissions Allowed

In [None]:
# Simulate the leaderboard when each participant may pick two final
# submissions. Every trial draws two independent scores per participant,
# keeps the better one, ranks all 1010 participants together, and stores
# the per-trial frame in `two_subs`.
two_subs = []
bootstrap = 1000
score_columns = [0, 1]  # one column per allowed submission
for trial in tqdm(range(bootstrap)):
    # Draw order (all public-kernel submissions, then all smart submissions)
    # is kept so the random stream is consumed in the same sequence.
    public_scores = [np.random.normal(0.950, 0.005, 1000) for _sub in score_columns]
    smart_scores = [np.random.normal(0.955, 0.005, 10) for _sub in score_columns]

    # Rows are participants, columns are submissions; flag each group.
    public_frame = pd.DataFrame(np.column_stack(public_scores)) \
        .assign(usedPublicKernel=True)
    smart_frame = pd.DataFrame(np.column_stack(smart_scores)) \
        .assign(usedPublicKernel=False)

    results = pd.concat([public_frame, smart_frame])
    # Best submission per participant; rank 1 is the highest score.
    results['highest_scoring_sub'] = results[score_columns].max(axis=1)
    results['rank'] = results['highest_scoring_sub'].rank(ascending=False)
    two_subs.append(results)

In [None]:
# Overlay the kernel density estimate of rank for each group
# (usedPublicKernel True vs False) across all two-submission trials.
two_sub_ranks = pd.concat(two_subs).groupby('usedPublicKernel')['rank']
two_sub_ranks.plot(
    kind='kde',
    title='Public Kernel vs Smart - 2 subs 1000x bootstrap',
    figsize=(10, 5),
)
plt.legend()
plt.show()

In [None]:
# Summary statistics of rank for each group over all two-submission trials.
(
    pd.concat(two_subs)
    .groupby('usedPublicKernel')['rank']
    .describe()
)

## Three Final Submissions Allowed

In [None]:
# Simulate the leaderboard when each participant may pick three final
# submissions. Every trial draws three independent scores per participant,
# keeps the best one, ranks all 1010 participants together, and stores
# the per-trial frame in `three_subs`.
three_subs = []
bootstrap = 1000
score_columns = [0, 1, 2]  # one column per allowed submission
for trial in tqdm(range(bootstrap)):
    # Draw order (all public-kernel submissions, then all smart submissions)
    # is kept so the random stream is consumed in the same sequence.
    public_scores = [np.random.normal(0.950, 0.005, 1000) for _sub in score_columns]
    smart_scores = [np.random.normal(0.955, 0.005, 10) for _sub in score_columns]

    # Rows are participants, columns are submissions; flag each group.
    public_frame = pd.DataFrame(np.column_stack(public_scores)) \
        .assign(usedPublicKernel=True)
    smart_frame = pd.DataFrame(np.column_stack(smart_scores)) \
        .assign(usedPublicKernel=False)

    results = pd.concat([public_frame, smart_frame])
    # Best submission per participant; rank 1 is the highest score.
    results['highest_scoring_sub'] = results[score_columns].max(axis=1)
    results['rank'] = results['highest_scoring_sub'].rank(ascending=False)
    three_subs.append(results)

In [None]:
# Overlay the kernel density estimate of rank for each group
# (usedPublicKernel True vs False) across all three-submission trials.
# The title now says "3 subs"; it was previously copy-pasted from the
# two-submission plot and mislabelled this figure as "2 subs".
pd.concat(three_subs).groupby('usedPublicKernel')['rank'] \
    .plot(kind='kde',
          title='Public Kernel vs Smart - 3 subs 1000x bootstrap',
          figsize=(10, 5))
plt.legend()
plt.show()

In [None]:
# Summary statistics of rank for each group over all three-submission trials.
(
    pd.concat(three_subs)
    .groupby('usedPublicKernel')['rank']
    .describe()
)

## Compare Ranks of Non-Public Kernel People

In [None]:
# Flatten each experiment's per-trial frames into one frame and tag every
# row with the number of submissions that experiment allowed.
one_s = pd.concat(one_sub).assign(subs=1)
two_s = pd.concat(two_subs).assign(subs=2)
three_s = pd.concat(three_subs).assign(subs=3)

In [None]:
# Stack the three tagged experiments (1, 2 and 3 submissions) into one frame.
combined = pd.concat(
    [
        one_s,
        two_s,
        three_s,
    ]
)

In [None]:
# Keep only the rows where usedPublicKernel is False, then overlay one rank
# density curve per number of allowed submissions.
smart_ranks_by_subs = combined.query('not usedPublicKernel').groupby('subs')['rank']
smart_ranks_by_subs.plot(
    kind='kde',
    figsize=(10, 5),
    title='Distribution of Simulated Rank of Non-Public Kernel Teams by # of Subs Allowed',
)
plt.legend()
plt.show()