In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# Directory holding the competition files; the trailing slash lets file names
# be appended to it directly.
d = '/kaggle/input/jane-street-market-prediction/'

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from scipy.stats import norm, laplace
import scipy.stats as stats
import matplotlib.pyplot as plt

In [None]:
# Read the complete training set from the competition directory into memory.
train = pd.read_csv(f"{d}train.csv")

First let's gather some basic information about the data and save it so we don't have to recompute it every time:

In [None]:
# (min, max, mean, std) for every column whose name contains 'feature'.
gauss = {
    name: (train[name].min(), train[name].max(), train[name].mean(), train[name].std())
    for name in train
    if 'feature' in name
}

# For every day number from the first to the last 'date' value, plus one
# past-the-end sentinel, find the first row whose date is >= that day.
# searchsorted assumes the 'date' column is sorted ascending — TODO confirm.
start = int(train['date'].iloc[0])
end = int(train['date'].iloc[-1])
days = list(range(start, end + 2))
batch_idx = train['date'].values.searchsorted(days)

# (median, mean absolute deviation around the median) for every column whose
# name contains 'resp'.
laplacians = {}
for name in train:
    if 'resp' in name:
        centre = train[name].median()
        laplacians[name] = (centre, (train[name] - centre).abs().mean())

# from_dict yields one column per key and one row per tuple position;
# give those positional rows readable labels.
gaussians_df = pd.DataFrame.from_dict(gauss).rename(
    index={0: 'min', 1: 'max', 2: 'mean', 3: 'std'}
)
laplacians_df = pd.DataFrame.from_dict(laplacians).rename(
    index={0: 'median', 1: 'mad'}
)
batch_idx_df = pd.DataFrame(batch_idx)

# Save everything to the working directory so it does not have to be recomputed.
for frame, target in (
    (laplacians_df, 'laplacians.csv'),
    (gaussians_df, 'gaussians.csv'),
    (batch_idx_df, 'batch_idx.csv'),
):
    frame.to_csv(target)

The first thing I'd like to do is find out which distribution the 'resp' data follows. If X is our data and F is the CDF of our guessed distribution, we can check the guess by plotting a histogram of F(X): if X really has CDF F, then F(X) is uniformly distributed on [0, 1], so a histogram that is close to flat means the guess was good. We can verify this method by applying it to some samples from a standard normal distribution:

In [None]:
# Draw one million standard-normal samples from a seeded generator, so the
# uniformity check produces the same histogram on every Restart & Run All
# and the global NumPy random state is left untouched.
rand_normal = np.random.default_rng(seed=0).normal(size=1000000)

In [None]:
# Map the samples through the standard normal CDF and histogram the result;
# the bars should all be roughly the same height.
pit_values = norm.cdf(rand_normal)
_ = plt.hist(pit_values, bins=100)

As we can see, these values are approximately uniformly distributed, which confirms that the method works. Now let's look at the 'resp' data:

In [None]:
# Two histograms of 'resp' side by side: plain counts on the left,
# log-scaled counts on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
axs1, axs2 = axes.ravel()
axs1.hist(train['resp'], bins=100)
_ = axs2.hist(train['resp'], bins=100, log=True)

Above are the histograms of the 'resp' values. The second one uses a logarithmic count axis so that the tails are easier to see. At this point my guess is that the data has a log-Laplace distribution, but there are also other possibilities. For now let's plot the empirical CDF of 'resp':

In [None]:
# Empirical CDF: the i-th smallest 'resp' value (0-based) is plotted against i / n.
n_rows = len(train)
sorted_resp = train['resp'].sort_values().values
_ = plt.plot(sorted_resp, np.arange(n_rows) / n_rows)

Aside from the graph being very steep around 0 (approximately the mean of 'resp') I don't see any characteristics that would point towards a particular distribution. Let's implement some known CDFs for now:

In [None]:
def laplace_cdf(x, mu, b):
    """Evaluate the CDF of a Laplace(mu, b) distribution at x.

    Parameters
    ----------
    x : scalar, numpy array or pandas Series
        Point(s) at which to evaluate the CDF. Only elementwise operations
        are applied, so the container type of x is preserved.
    mu : float
        Location (median) of the distribution.
    b : float
        Scale of the distribution; must be positive.

    Returns
    -------
    Values in [0, 1] with the same shape as x:
    exp((x - mu) / b) / 2 for x < mu, and 1 - exp(-(x - mu) / b) / 2 for x >= mu.
    """
    z = (x - mu) / b
    # Half of exp(-|z|) is the mass in the tail beyond x on x's own side of mu.
    # It is at most 0.5, so it can never overflow; a masked-out branch therefore
    # cannot contribute 0 * inf = nan for points far from mu.
    tail = np.exp(-np.abs(z)) / 2
    # x == mu is assigned to the upper branch so the CDF at the median is 0.5
    # (with two strict comparisons it would fall through both masks and give 0).
    return (x < mu) * tail + (x >= mu) * (1 - tail)

def log_laplace_cdf(y, mu, b):
    """Evaluate the CDF of a log-Laplace(mu, b) distribution at y.

    A variable is log-Laplace distributed when its logarithm is
    Laplace(mu, b) distributed, so this is the Laplace CDF applied to log(y):

        F(y) = (1 + sgn(log y - mu) * (1 - exp(-|log y - mu| / b))) / 2

    Parameters
    ----------
    y : scalar, numpy array or pandas Series
        Point(s) at which to evaluate the CDF. Values should be positive;
        y == 0 evaluates to 0 and negative values give nan (numpy emits a
        warning from log in both cases).
    mu : float
        Location of log(y); exp(mu) is the median of the distribution.
    b : float
        Scale of log(y); must be positive.

    Returns
    -------
    Values in [0, 1] with the same shape as y.
    """
    d = np.log(y) - mu
    # The sign must be +1 above the median and -1 below it so that the CDF
    # increases with y; np.sign also gives exactly 0.5 at log(y) == mu.
    return (1 + np.sign(d) * (1 - np.exp(-np.abs(d) / b))) / 2
def cauchy_cdf(x, x0, gamma):
    """Evaluate the CDF of a Cauchy distribution at x.

    x0 is the location and gamma the scale of the distribution. Only
    elementwise operations are used, so x may be a scalar, an array or a
    pandas Series, and the result has the same container type.
    """
    standardized = (x - x0) / gamma
    return 0.5 + np.arctan(standardized) / np.pi

In [None]:
# Histogram of 'resp' mapped through a Cauchy CDF with hard-coded location
# and scale; the flatter the histogram, the better the Cauchy guess.
_ = cauchy_cdf(
    train['resp'],
    x0=5.662163451492418e-05,
    gamma=0.00718,
).hist(bins=20)

So cauchy is not the worst guess, but we should be able to do better.

In [None]:
# Laplace guess: stored median of 'resp' as the location and its stored
# mean absolute deviation as the scale.
resp_median = laplacians_df.loc['median', 'resp']
resp_mad = laplacians_df.loc['mad', 'resp']
t1 = laplace_cdf(train['resp'], resp_median, resp_mad)
_ = t1.hist(bins=20)

That's a rather odd-looking histogram. Maybe we could flatten it out more with some other parameters?

In [None]:
# Same Laplace guess as before, but with the scale cut in half.
t2 = laplace_cdf(
    train['resp'],
    mu=laplacians_df.loc['median', 'resp'],
    b=laplacians_df.loc['mad', 'resp'] / 2,
)
_ = t2.hist(bins=20)

That looks somewhat better, but we still have these heavy tails near 0 and 1. Let's try log-laplace now

In [None]:
# Show the stored median of 'resp' shifted up by 1, and its mean absolute deviation.
median_plus_one = laplacians_df.loc['median', 'resp'] + 1
resp_scale = laplacians_df.loc['mad', 'resp']
print(median_plus_one, resp_scale)

In [None]:
# Histogram of log(resp + 1), drawn with log-scaled counts.
shifted_log_resp = np.log(train['resp'] + 1)
_ = plt.hist(shifted_log_resp, bins=100, log=True)

In [None]:
# Fit a Laplace distribution to log(resp + 1); the fitted parameters are the
# cell's last expression, so they are displayed as its output.
log_resp_shifted = np.log(train['resp'] + 1)
laplace.fit(log_resp_shifted)