# Leaderboard Probing

This notebook is directly or indirectly based on work by @jmcslk, @criskiev, @grayjay, @javiervallejos, @adityasharma01, @sfktrkl, @motloch, @chaudharypriyanshu and others.

We first model the target distribution as a partition of the test set into 18 half-chunks, where each half-chunk has a fixed probability determined by [leaderboard probing](https://www.kaggle.com/ambrosm/tpsnov21-012-leaderboard-probing), and then blend the resulting model with the output of two [postprocessed](https://www.kaggle.com/ambrosm/tpsnov21-007-postprocessing) high-scoring public notebooks using the weights 91 : 8 : 1.


In [None]:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.svm import LinearSVC

In [None]:
# Read the competition's train/test tables and the separate `november21`
# training table (the one postprocess_separate() fits its hyperplane on).
train_df, test_df, pure_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/november21/train.csv',
    )
)

# Group the test rows into consecutive blocks of 60000 ids.
test_df['chunk'] = test_df['id'].floordiv(60000)

In [None]:
def postprocess_separate(submission_df, test_df=None, pure_df=None):
    """Update submission_df so that the predictions for the two sides of the hyperplane don't overlap.

    A linear SVM is fitted to pure_df in two steps to find a hyperplane that
    separates its two classes. The test rows are then split by the side of
    that hyperplane they fall on. If the predictions of the two sides overlap,
    submission_df['target'] is shifted per side so that the largest side-0
    prediction becomes -1 and the smallest side-1 prediction becomes +1.
    The order of the predictions within each side is unchanged.

    Parameters
    ----------
    submission_df : pandas DataFrame with columns 'id' and 'target'
        Modified in place. Its rows must be in the same order as test_df,
        because the side mask is applied positionally.
    test_df : the competition's test data (read from disk if None)
    pure_df : the competition's original training data (read from disk if None)

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If pure_df does not have shape (600000, 102), or if test_df does not
        have 101 columns and as many rows as submission_df.

    From https://www.kaggle.com/ambrosm/tpsnov21-007-postprocessing
    """
    if pure_df is None:
        pure_df = pd.read_csv('../input/november21/train.csv')
    if pure_df.shape != (600000, 102):
        raise ValueError("pure_df has the wrong shape")
    if test_df is None:
        test_df = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
    if test_df.shape[0] != submission_df.shape[0] or test_df.shape[1] != 101:
        raise ValueError("test_df has the wrong shape")

    # Feature matrix of the pure training data, built once and reused below.
    pure_features = pure_df.drop(columns=['id', 'target'])

    # Find the separating hyperplane for pure_df, step 1
    # Use an SVM with almost no regularization
    model1 = make_pipeline(StandardScaler(), LinearSVC(C=1e5, tol=1e-7, penalty='l2', dual=False, max_iter=2000, random_state=1))
    model1.fit(pure_features, pure_df.target)
    # Per the original author's check, model1 is not perfect:
    # it predicts the wrong class for 1 of 600000 samples.

    # Find the separating hyperplane for pure_df, step 2
    # Fit a second SVM to a subset of the points which contains the support vectors
    margin = model1.decision_function(pure_features)
    subset_df = pure_df[(margin > -5) & (margin < 0.9)]
    model2 = make_pipeline(StandardScaler(), LinearSVC(C=1e5, tol=1e-7, penalty='l2', dual=False, max_iter=2000, random_state=1))
    model2.fit(subset_df.drop(columns=['id', 'target']), subset_df.target)
    # Per the original author's check, model2 is perfect:
    # it predicts the correct class for all 600000 training samples.

    # Side of the hyperplane for every test row (0 = left, 1 = right).
    pure_test_pred = model2.predict(test_df.drop(columns=['id', 'target'], errors='ignore'))
    is_left = pure_test_pred == 0
    is_right = pure_test_pred == 1

    # If one side is empty its max/min would be NaN; the comparison below would
    # then be False and NaN would be subtracted from the targets. Two sides
    # cannot overlap when one of them has no rows, so stop here.
    if not is_left.any() or not is_right.any():
        print("There is no overlap. No postprocessing needed.")
        return

    lmax = submission_df.loc[is_left, 'target'].max()
    rmin = submission_df.loc[is_right, 'target'].min()
    if lmax < rmin:
        print("There is no overlap. No postprocessing needed.")
        return

    # There is overlap. Remove this overlap:
    # the left side is moved so its maximum is -1, the right side so its minimum is +1.
    submission_df.loc[is_left, 'target'] -= lmax + 1
    submission_df.loc[is_right, 'target'] -= rmin - 1
    left_target = submission_df.loc[is_left, 'target']
    right_target = submission_df.loc[is_right, 'target']
    print(left_target.min(), left_target.max(),
          right_target.min(), right_target.max())


In [None]:
# Leaderboard-probing results, one table per side and one row per half-chunk.
# Columns:
#   name  - name of chunk as in https://www.kaggle.com/ambrosm/tpsnov21-012-leaderboard-probing
#   len   - number of samples in this half-chunk (all 18 values add up to 540000)
#   auc   - public leaderboard score of this chunk, in units of 1e-5
#   diff  - this chunk's auc minus the baseline of 74723 (i.e. 0.74723)
#           = area of the added triangle
#   ratio - unused

probes_l = '''name	len	auc	diff	ratio
10H0	30343	74653	-70	 -0.00231 
17H0	36335	74671	-52	 -0.00143 
16H0	41892	74720	-3	 -0.00007 
13H0	36383	74724	1	 0.00003 
18H0	23746	74729	6	 0.00025 
11H0	20501	74732	9	 0.00044 
14H0	25270	74747	24	 0.00095 
12H0	40308	74763	40	 0.00099 
15H0	25965	74762	39	 0.00150 
'''

probes_r = '''
name	len	auc	diff	ratio
10H1	29657	74695	-28	 -0.00094 
11H1	39499	74760	37	 0.00094 
12H1	19692	74750	27	 0.00137 
13H1	23617	74690	-33	 -0.00140 
14H1	34730	74735	12	 0.00035 
15H1	34035	74815	92	 0.00270 
16H1	18108	74682	-41	 -0.00226 
17H1	23665	74694	-29	 -0.00123 
18H1	36254	74683	-40	 -0.00110 
'''


def _parse_probe_table(tsv_text):
    """Parse one tab-separated probe table (header row included) into a DataFrame."""
    return pd.read_csv(io.StringIO(tsv_text), sep='\t')


probes_l_df = _parse_probe_table(probes_l)
probes_r_df = _parse_probe_table(probes_r)



In [None]:
# Left side
# For every H0 half-chunk, locate its point on the ROC plane as the intersection
# of two lines derived from the probe (AUC difference and half-chunk size), and
# store the ratio x / (x + y) of that point in l_dict, keyed by chunk number.
l_dict = {}
show_full_range = False  # True: plot the whole unit square; False: zoom into the lower-left corner
plt.figure(figsize=(10, 10))
for row in probes_l_df.itertuples():
    size = row.len / 270000
    y0 = row.diff / 100000 * 8
    # Red: y = 3*x + y0, parallel for all points with this auc difference
    plt.plot([0, 0.25], [y0, y0 + 0.75], color='r')
    # Green: x + y = size, all points for this row.len
    plt.plot([0, size], [size, 0], color='g')
    # Black dot: intersection of the red and the green line
    x = (size - y0) / 4
    y = 3 * x + y0
    plt.scatter([x], [y], color='k')
    chunk = int(row.name[:2])
    l_dict[chunk] = x / (x + y)
    print(f"{row.name[:2]}: {l_dict[chunk]:.5f},")
# The two reference curves are labelled so that plt.legend() has entries to show.
plt.plot([0, 0.25, 1], [0, 0.75, 1], color="y", lw=1,
         label="baseline roc curve (two segments)")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="diagonal")
plt.gca().set_aspect('equal')
if show_full_range:
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
else:
    plt.xlim([0.0, 0.2])
    plt.ylim([0.0, 0.2])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
l_dict

In [None]:
# Right side
# For every H1 half-chunk, locate its point on the ROC plane as the intersection
# of two lines derived from the probe (AUC difference and half-chunk size), and
# store (1 - x) / size of that point in r_dict, keyed by chunk number.
r_dict = {}
show_full_range = False  # True: plot the whole unit square; False: zoom into the upper-right corner
plt.figure(figsize=(10, 10))
for row in probes_r_df.itertuples():
    size = row.len / 270000
    y0 = row.diff / 100000 * 8
    # Red: line of slope 1/3 through (0.25 - y0, 0.75), parallel for all points with this auc difference
    plt.plot([0.25 - y0, 1 - y0], [0.75, 1], color='r')
    # Green: x + y = 2 - size, all points for this row.len
    plt.plot([1 - size, 1], [1, 1 - size], color='g')
    # Black dot: intersection of the red and the green line
    x = (4 - 3 * row.len / 270000 - y0) / 4
    y = 2 - size - x
    plt.scatter([x], [y], color='k')
    chunk = int(row.name[:2])
    r_dict[chunk] = (1 - x) / size
    print(f"{row.name[:2]}: {r_dict[chunk]:.5f},")
# The two reference curves are labelled so that plt.legend() has entries to show.
plt.plot([0, 0.25, 1], [0, 0.75, 1], color="y", lw=1,
         label="baseline roc curve (two segments)")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="diagonal")
plt.gca().set_aspect('equal')
if show_full_range:
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
else:
    plt.xlim([0.8, 1])
    plt.ylim([0.8, 1])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
r_dict

In [None]:
# Start from a constant prediction of 0 for every test id. All predictions are
# equal, so postprocess_separate() treats the two sides as overlapping and
# shifts them apart in place.
baseline = test_df[['id']].assign(target=0)
postprocess_separate(
    baseline,
    pure_df=pure_df,
    test_df=test_df.drop(columns='chunk'),
)


In [None]:
# 18 probabilities for the 18 half-chunks -> lb 0.75110
sub = baseline.copy()
# baseline.target is an integer column (it was created from the constant 0).
# Cast it to float before writing fractional probabilities into it, so the
# assignments below do not depend on pandas silently upcasting the column
# (deprecated since pandas 2.1).
sub['target'] = sub['target'].astype(float)

# The sign of the baseline prediction tells which side of the hyperplane a row is on.
on_left = baseline.target < 0
on_right = baseline.target >= 0
for chunk in range(10, 19):
    in_chunk = test_df.chunk == chunk
    sub.loc[in_chunk & on_left, 'target'] = l_dict[chunk]
    sub.loc[in_chunk & on_right, 'target'] = r_dict[chunk]

# Replace the probabilities by their percentile rank (ties get the average rank).
sub['target'] = sub['target'].rank(pct=True)
sub.to_csv('submission_probed.csv', index=False)
sub.head(20)

In [None]:
# 8 % of @jmcslk's submission (which has lb 0.74996) -> lb 0.75209
# NOTE(review): the percentile rank is added below with weight 1, not 0.08 —
# confirm which blend weight is intended.
jmcslk_submission = pd.read_csv('../input/tps-nov-2021-simple-single-nn-3/submission.csv')
postprocess_separate(
    jmcslk_submission,
    pure_df=pure_df,
    test_df=test_df.drop(columns='chunk'),
)
jmcslk_rank = jmcslk_submission['target'].rank(pct=True)

# Blend: probed submission plus the percentile rank of the postprocessed predictions.
sub_8b = sub.copy()
sub_8b['target'] = sub_8b['target'] + jmcslk_rank

sub_8b.to_csv('submission_probed_blended_8b.csv', index=False)
sub_8b.head(20)

In [None]:
# 8 % of @sfktrkl's submission (which has lb 0.75002) -> lb 0.75204
# NOTE(review): the percentile rank is added below with weight 1, not 0.08 —
# confirm which blend weight is intended.
sfktrkl_submission = pd.read_csv('../input/tps-nov-2021-power-averaging/submission.csv')
postprocess_separate(
    sfktrkl_submission,
    pure_df=pure_df,
    test_df=test_df.drop(columns='chunk'),
)
sfktrkl_rank = sfktrkl_submission['target'].rank(pct=True)

# Blend: probed submission plus the percentile rank of the postprocessed predictions.
sub_8c = sub.copy()
sub_8c['target'] = sub_8c['target'] + sfktrkl_rank
sub_8c.to_csv('submission_probed_blended_8c.csv', index=False)
sub_8c.head(20)
