In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bokeh.io import show,output_notebook
from bokeh.layouts import column
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, CustomJS, TextInput
# Configure Bokeh so that show() renders inside the notebook output cells.
output_notebook()

# Public EDA

In [None]:
def process_df(df):
    """Add derived leaderboard columns to ``df`` in place.

    ``df`` must contain ``SubmissionDate`` and ``Score`` columns. The
    following columns are added or overwritten:

    * ``SubmissionDate`` -- parsed with ``pd.to_datetime``.
    * ``Rank`` -- ``max(r) - r + 1`` where ``r`` is the ascending rank of
      ``Score`` (ties share their average rank), so the highest score gets 1.
    * ``SubmissionDays`` -- whole days between the row's date and the latest
      date in the frame.
    * ``color`` -- ``'blue'`` when the score occurs exactly once, ``'yellow'``
      when the row is dated later than the earliest row with the same score,
      ``'red'`` otherwise.

    Returns:
        The same ``df`` object (mutated), so the call can be chained. Callers
        that ignore the return value are unaffected.
    """
    df['SubmissionDate'] = pd.to_datetime(df['SubmissionDate'])
    submitted = df['SubmissionDate']

    # Rank the scores once and flip the result so that larger scores get smaller ranks.
    ascending_rank = df['Score'].rank()
    df['Rank'] = ascending_rank.max() - ascending_rank + 1

    df['SubmissionDays'] = (submitted.max() - submitted).dt.days

    dates_by_score = df.groupby('Score')['SubmissionDate']
    firstScoreDate = dates_by_score.transform('min')
    uniqueScore = dates_by_score.transform('count') == 1

    # Assignment order matters: default first, then the two overrides.
    df['color'] = 'red'
    df.loc[uniqueScore, 'color'] = 'blue'
    df.loc[firstScoreDate < submitted, 'color'] = 'yellow'
    return df

In [None]:
# Public leaderboard: read the CSV export, then let process_df add its derived columns in place
leaderboard_public = pd.read_csv(
    '../input/jane-street-market-prediction-leaderboards/jane-street-market-prediction-publicleaderboard.csv'
)
process_df(leaderboard_public)

leaderboard_public.head()

In [None]:
# Histogram of every public leaderboard score (50 bins)
leaderboard_public['Score'].plot.hist(
    bins=50,
    color='green',
    figsize=(10, 3),
);

In [None]:
# Number of distinct public scores, and that number as a share of all rows
public_unique_scores = leaderboard_public['Score'].nunique()

public_unique_scores, public_unique_scores / len(leaderboard_public)

In [None]:
# Public scores above 10000: row count, distinct count, and distinct share
public_above_10k = leaderboard_public['Score'] > 10000
topscores = leaderboard_public.loc[public_above_10k, 'Score']

len(topscores), topscores.nunique(), topscores.nunique() / len(topscores)

In [None]:
# Histogram of public scores above 10000 (rows at or below the cut-off are masked to NaN)
public_score = leaderboard_public['Score']
public_score.where(public_score > 10000).plot.hist(bins=50, color='green', figsize=(10, 3));

In [None]:
# Distinct SubmissionDays values, in order of first appearance
pd.unique(leaderboard_public['SubmissionDays'])

In [None]:
# Histogram of SubmissionDays; xlim is (max, min), so the x axis runs from the largest value down
public_days = leaderboard_public['SubmissionDays']
public_days.plot.hist(
    bins=25,
    color='green',
    figsize=(10, 3),
    xlim=(public_days.max(), public_days.min()),
);

In [None]:
# Public Score against SubmissionDate; point colours are read from the 'color' column
leaderboard_public.plot.scatter(
    x='SubmissionDate',
    y='Score',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
);

In [None]:
# The 20 most frequent public scores
leaderboard_public['Score'].value_counts().iloc[:20]

In [None]:
# Public Rank against SubmissionDate; ylim is (max, min), so the smallest Rank sits at the top
public_rank = leaderboard_public['Rank']
leaderboard_public.plot.scatter(
    x='SubmissionDate',
    y='Rank',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
    ylim=(public_rank.max(), public_rank.min()),
);

# 2021-03-05 (First Private LB)

In [None]:
# First private leaderboard: read the CSV export, then let process_df add its derived columns in place
# NOTE(review): the section heading says 2021-03-05 but the file name reads 20200305 -- confirm the dataset file name
private1 = pd.read_csv(
    '../input/jane-street-market-prediction-leaderboards/jane-street-market-prediction-20200305.csv'
)
process_df(private1)

private1.head()

In [None]:
# Histogram of every score on the first private leaderboard (50 bins)
private1['Score'].plot.hist(
    bins=50,
    color='green',
    figsize=(10, 3),
);

In [None]:
# Number of distinct private scores, and that number as a share of all rows
private1_unique_scores = private1['Score'].nunique()

private1_unique_scores, private1_unique_scores / len(private1)

In [None]:
# Private scores above 4000: row count, distinct count, and distinct share
private1_above_4k = private1['Score'] > 4000
topscores = private1.loc[private1_above_4k, 'Score']

len(topscores), topscores.nunique(), topscores.nunique() / len(topscores)

In [None]:
# The 20 most frequent scores on the first private leaderboard
private1['Score'].value_counts().iloc[:20]

# Private/Public results

In [None]:
# Copy each team's public leaderboard values onto private1, matched on TeamId
public_by_team = leaderboard_public.set_index('TeamId')
for past_col, public_col in [
    ('PastScore', 'Score'),
    ('PastRank', 'Rank'),
    ('PastSubmissionDate', 'SubmissionDate'),
    ('PastSubmissionDays', 'SubmissionDays'),
]:
    private1[past_col] = private1['TeamId'].map(public_by_team[public_col])

In [None]:
# Private Score (y) against public PastScore (x)
private1.plot.scatter(
    x='PastScore',
    y='Score',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
);

In [None]:
# Private Rank (y) against public PastRank (x); both axes run from max to min
private1.plot.scatter(
    x='PastRank',
    y='Rank',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
    ylim=(private1['Rank'].max(), private1['Rank'].min()),
    xlim=(private1['PastRank'].max(), private1['PastRank'].min()),
);

In [None]:
# Private Score against private1's own SubmissionDate
# NOTE(review): the original caption read "Past Submission Date", yet the plotted column is
# 'SubmissionDate' rather than 'PastSubmissionDate' -- confirm which one was intended
private1.plot.scatter(
    x='SubmissionDate',
    y='Score',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
);

In [None]:
# Running extremes of the public ("Past") columns, accumulated down the rows of private1
# (presumably the CSV is ordered by private rank -- confirm).
private1['PastScoreMin'] = private1['PastScore'].cummin()
private1['PastRankMax'] = private1['PastRank'].cummax()
# Built from PastSubmissionDays (the public value) so the column agrees with its name and
# with the two Past* columns above; the original read private1's own 'SubmissionDays'.
private1['PastSubmissionDaysMax'] = private1['PastSubmissionDays'].cummax()

private1.head()

In [None]:
# What is the minimum public Score needed to reach a given private Score?
private1.plot(
    x='Score',
    y='PastScoreMin',
);

In [None]:
# What is the worst public Rank seen for a given private Score? (y axis flipped, padded by 200)
past_rank_max = private1['PastRankMax']
private1.plot(
    x='Score',
    y='PastRankMax',
    ylim=(past_rank_max.max() + 200, past_rank_max.min() - 200),
);

In [None]:
# What is the maximum number of days for getting a high score? (PastSubmissionDaysMax against Score)
private1.plot(
    x='Score',
    y='PastSubmissionDaysMax',
);

# Correlation matrices

In [None]:
# Correlation between private Score and public PastScore
private1.loc[:, ['Score', 'PastScore']].corr()

In [None]:
# Correlation between private Rank and public PastRank
private1.loc[:, ['Rank', 'PastRank']].corr()

In [None]:
# Correlation between private Score and public PastSubmissionDays
private1.loc[:, ['Score', 'PastSubmissionDays']].corr()

In [None]:
# Bokeh visualization of Private/Public Scores
#
# Two data sources are built from private1: `source` keeps every row, while
# `source_visible` feeds the glyphs and is replaced by the text filter below.

source = ColumnDataSource(private1)
source_visible = ColumnDataSource(private1)

# NOTE(review): Bokeh 3.x renamed plot_width/plot_height to width/height -- confirm against
# the installed Bokeh version before changing these keywords.
plot = figure(
    x_axis_label = "Public Score",
    y_axis_label = "Private Score",
    tools="pan,wheel_zoom,zoom_in,zoom_out,box_zoom,reset",
    plot_width=800,
    plot_height=1000,
)
plot.circle(x="PastScore",y="Score",source = source_visible, radius=3, alpha=0.5, color='color')
plot.text(x='PastScore',y='Score', text='TeamName',source  = source_visible,
       text_baseline="middle", text_align="left", text_font_size='8pt', text_font='Arial', alpha=0.5)

hover = HoverTool(tooltips = [
    ('Team', '@TeamId / @TeamName'), 
    ('Public Score/ Private Score', '@PastScore{i} / @Score{i}')])
plot.add_tools(hover)

# Browser-side filter: keep only rows whose TeamName contains the typed text.
# Fixes over the original: the 'TeamId' key was missing its opening quote (a JavaScript
# syntax error that stopped the callback from running), and a missing TeamName is now
# treated as an empty string instead of throwing on `.includes`.
callback = CustomJS(args=dict(source_visible=source_visible,
              source=source), code="""
        var f = cb_obj.value;
        var data = source.data;

        // Only the columns used by the glyphs and the hover tool are copied.
        var columns = ['SubmissionDate', 'TeamId', 'TeamName', 'Score', 'PastScore', 'color'];
        var data_visible = {};
        for (var c = 0; c < columns.length; c++) {
            data_visible[columns[c]] = [];
        }

        for (var i = 0; i < data['TeamId'].length; i++) {
            var raw_name = data['TeamName'][i];
            var name = (raw_name == null) ? '' : String(raw_name);
            if (name.includes(f)) {
                for (var k = 0; k < columns.length; k++) {
                    data_visible[columns[k]].push(data[columns[k]][i]);
                }
            }
        }

        source_visible.data = data_visible;
        source_visible.change.emit();
    """)

text_input = TextInput(value="", title="Filter by TeamName:")
text_input.js_on_change("value", callback)

show(column(text_input, plot))

In [None]:
# How many TeamIds occur once, twice, ... in private1
private1_rows_per_team = private1.groupby('TeamId').size()
private1_rows_per_team.value_counts()

# Latest Private LB (2021-03-17)

In [None]:
# Latest private leaderboard: read the CSV export, then let process_df add its derived columns in place
private2 = pd.read_csv(
    '../input/jane-street-market-prediction-leaderboards/jane-street-market-prediction-20210317.csv'
)
process_df(private2)

private2.head()

In [None]:
# Histogram of every score on the latest private leaderboard (50 bins)
private2['Score'].plot.hist(
    bins=50,
    color='green',
    figsize=(10, 3),
);

In [None]:
# The 20 most frequent scores on the latest private leaderboard
private2['Score'].value_counts().iloc[:20]

# First/Last Private Analysis

In [None]:
# Copy each team's latest private Score / Rank onto private1 (matched on TeamId),
# then take latest-minus-first differences
latest_by_team = private2.set_index('TeamId')
private1['LastScore'] = private1['TeamId'].map(latest_by_team['Score'])
private1['LastRank'] = private1['TeamId'].map(latest_by_team['Rank'])

private1['ScoreDiff'] = private1['LastScore'].sub(private1['Score'])
private1['RankDiff'] = private1['LastRank'].sub(private1['Rank'])

In [None]:
# Number of teams whose score went down between the first and latest private run
private1['ScoreDiff'].lt(0).sum()

In [None]:
# Latest private score (y) against first private score (x)
private1.plot.scatter(
    x='Score',
    y='LastScore',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
);

In [None]:
# Latest private rank (y) against first private rank (x); both axes run from max to min
private1.plot.scatter(
    x='Rank',
    y='LastRank',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
    ylim=(private1['LastRank'].max(), private1['LastRank'].min()),
    xlim=(private1['Rank'].max(), private1['Rank'].min()),
);

In [None]:
# Score change (y) against latest private score (x)
private1.plot.scatter(
    x='LastScore',
    y='ScoreDiff',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
);

In [None]:
# Histogram of rank changes between the first and latest private run (200 bins)
private1['RankDiff'].plot.hist(
    bins=200,
    figsize=(15, 3),
)

In [None]:
# Histogram of score changes between the first and latest private run (200 bins)
private1['ScoreDiff'].plot.hist(
    bins=200,
    figsize=(15, 3),
)

In [None]:
# Summary statistics of the score change over all teams
score_diff_stats = ['min', 'mean', 'median', 'max']
private1['ScoreDiff'].apply(score_diff_stats)

In [None]:
# Teams with the highest diff (looks like they had some failed kernels in the first run)
private1.sort_values(by='ScoreDiff', ascending=False).iloc[:20]

In [None]:
# Teams with the lowest diff (looks like they had some failed kernels in the last run)
# NaN diffs (teams with no LastScore) are sorted to the front so that tail(20) holds the
# lowest real values instead of the missing ones; the display order is otherwise unchanged.
private1.sort_values('ScoreDiff', ascending=False, na_position='first').tail(20)

In [None]:
# ScoreDiff histogram for the first 200 rows of private1
# ("top 200" presumes the rows are ordered by first private rank -- confirm)
private1['ScoreDiff'].iloc[:200].plot.hist(
    bins=20,
    figsize=(15, 3),
)

In [None]:
# Summary statistics of the score change over the first 200 rows of private1
top200_stats = ['min', 'mean', 'median', 'max']
private1['ScoreDiff'].iloc[:200].apply(top200_stats)

In [None]:
# The 50 teams with the smallest LastRank (best on the latest private run)
private1.sort_values(by='LastRank').iloc[:50]

# Naive score prediction after 12 reruns (6 months = 24 weeks = 12 runs) :)

In [None]:
# Band around the local ScoreDiff trend: centred 50-row rolling mean +/- half a rolling std
score_diff = private1['ScoreDiff']
score_diff_window = score_diff.rolling(50, center=True, min_periods=1)
rolling_mean = score_diff_window.mean()
rolling_std = score_diff_window.std()
high_level = rolling_mean + 0.5*rolling_std
low_level = rolling_mean - 0.5*rolling_std

# Pull diffs that fall outside the band back onto its edge; missing diffs become 0
above_band = score_diff > high_level
below_band = score_diff < low_level
private1['ScoreDiff_cutted'] = (
    score_diff
    .mask(above_band, high_level)
    .mask(below_band, low_level)
    .fillna(0)
)

# Extrapolation: the larger of the first score and (latest score - one clipped step),
# plus twelve clipped steps
first_score_filled = private1['Score'].fillna(0)
latest_minus_step = private1['LastScore'].fillna(0) - private1['ScoreDiff_cutted']
private1['Score12runs'] = np.maximum(first_score_filled, latest_minus_step) + 12 * private1['ScoreDiff_cutted']

In [None]:
# Top 50 teams by extrapolated score, with in-cell bars on the rank and score columns
score12_bar_columns = ['Rank', 'LastRank', 'Score', 'LastScore', 'Score12runs']
top50_by_score12runs = private1.sort_values('Score12runs', ascending=False).head(50)
top50_by_score12runs.style.bar(subset=score12_bar_columns, color='#d65f5f')

In [None]:
# Extrapolated score (y) against first private score (x)
private1.plot.scatter(
    x='Score',
    y='Score12runs',
    c='color',
    alpha=0.3,
    figsize=(10, 10),
);