# <u>Jane Street Market Prediction Competition</u>
*Lucian Craciun,*
*February 7th, 2021*

# I. Exploratory Data Analysis
(section based on the following notebook: https://www.kaggle.com/carlmcbrideellis/jane-street-eda-of-day-0-and-feature-importance)

## Initial Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt

import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn import manifold

import warnings
import pickle
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from hyperopt import Trials, STATUS_OK, hp, tpe, fmin
import catboost
from catboost import *
import lightgbm as lgb
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix, log_loss, precision_recall_curve, auc, roc_curve

In [None]:
experiment_name = 'janestreet_v1_new_wc'

In [None]:
sns.set_style("darkgrid")

plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['figure.titlesize'] = 20
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 20

In [None]:
# Folder that receives the pickled artefacts of this notebook.
# makedirs(exist_ok=True) creates any missing parent folders and is a no-op
# when the folder already exists, so the cell can be re-run safely.
pickle_dir = '/kaggle/working/pickle_files/'
os.makedirs(pickle_dir, exist_ok=True)

## Data Import

Using the `datatable` library, very suitable for large datasets (as the train data adds up to `~5.77GB`), the data import is done in less than 15 seconds (vs more than 2 minutes using the pandas `read_csv` usual way).

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, use_float16=False):
    """Downcast the numeric columns of ``df`` in place to the narrowest safe dtype.

    For every plain NumPy integer or floating column the smallest dtype whose
    representable range contains the column's minimum AND maximum is selected.
    Columns are only ever narrowed, never widened. Everything else (object,
    category, bool, datetimes, pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float16 as a target for floating columns. It is off by default
        because float16 keeps only about three significant decimal digits,
        which silently degrades later aggregations such as cumulative sums.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the function can be used in a
        ``.pipe(...)`` chain.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate targets, ordered from narrowest to widest.
    int_targets = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32, np.int64)]
    float_kinds = ((np.float16,) if use_float16 else ()) + (np.float32, np.float64)
    float_targets = [(t, np.finfo(t)) for t in float_kinds]

    for col in df.columns:
        dtype = df[col].dtype

        # Extension dtypes (category, tz-aware datetimes, nullable ints, ...)
        # are not plain NumPy dtypes and are skipped.
        if not isinstance(dtype, np.dtype):
            continue

        if np.issubdtype(dtype, np.integer):
            targets = int_targets
        elif np.issubdtype(dtype, np.floating):
            targets = float_targets
        else:
            # object, bool, datetime64, timedelta64, ...
            continue

        # The range check must use the extreme VALUES of the column; the sum
        # says nothing about whether individual values fit the target dtype.
        c_min = df[col].min()
        c_max = df[col].max()

        for target, limits in targets:
            if np.dtype(target).itemsize >= dtype.itemsize:
                # No narrower candidate left: keep the column as it is.
                break
            # Comparisons against NaN (empty / all-NaN column) are False, so
            # such columns fall through and keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
df_train = (
    dt.fread('../input/jane-street-market-prediction/train.csv')
      .to_pandas()
      # .query('weight > 0')
      .pipe(reduce_mem_usage)
)

feature_names = df_train.columns[df_train.columns.str.contains('feature')]

In [None]:
# df_example_test = (
#     dt.fread('../input/jane-street-market-prediction/example_test.csv')
#       .to_pandas()
#       # .query('weight > 0')
#       .pipe(reduce_mem_usage)
# )

In [None]:
# # Data loading function
# def load_data():
#     data_path = '/kaggle/input/jane-street-market-prediction/'
#     data_files_list = ['train', 'features', 'example_test', 'example_sample_submission']
    
#     df_train_datatable = dt.fread(data_path + data_files_list[0] + ".csv")
#     df_features_datatable = dt.fread(data_path + data_files_list[1] + ".csv")
#     df_example_test_datatable = dt.fread(data_path + data_files_list[2] + ".csv")
#     df_example_sample_submission_datatable = dt.fread(data_path + data_files_list[3] + ".csv")
    
#     df_train = df_train_datatable.to_pandas()
#     df_features = df_features_datatable.to_pandas()
#     df_example_test = df_example_test_datatable.to_pandas()
#     df_example_sample_submission = df_example_sample_submission_datatable.to_pandas()
    
#     return df_train, df_features, df_example_test, df_example_sample_submission

In [None]:
%%time
# df_train, df_features, df_example_test, df_example_sample_submission = load_data()

## Data Overview

In [None]:
# Dataset shape
print("Train Dataset Shape: " + str(df_train.shape))

In [None]:
# Dataset columns
df_train.columns

In [None]:
# Dataset fields and types
df_train.info()

In [None]:
df_train.dtypes[df_train.dtypes=='int32']

In [None]:
# Dataset head
df_train.head()

In [None]:
# Checking for duplicates
df_train.duplicated().sum() # 0 - no duplicated rows

In [None]:
df_train.describe(include=[np.number]).transpose()

## Validation & Cleaning

**Data Quantity**

There are 500 days in the data (roughly 2 years, assuming ~250 trading days per year on the NYSE). Moreover, some patterns seem to change after the 85th day, potentially reflecting a change in the Jane Street trading models; this is highlighted in the following sections.

In [None]:
id_field = 'ts_id'
date_field = 'date'

In [None]:
# Description of the no. of potential transactions per day (mean, std, quantiles etc.), by date
df_train.groupby(date_field).count()[id_field].describe()

In [None]:
# Number of entries recorded on each day
dft = df_train.set_index(df_train[date_field]).sort_index()
count = pd.DataFrame(dft.groupby(dft.index)[id_field].count())
count.columns = ['Number of daily entries']

# Every distinct ts_id is one observation, so the total is the unique count
# itself (subtracting 1 under-reported it by one).
ax = count.plot(title = str(df_train[id_field].nunique()) + ' total observations')
ax.set_xlabel("Date")
ax.set_ylabel("Number of entries")
ax.legend(loc="upper right")

# Mark day 85 and shade the period before it
ax.axvline(x=85, linestyle='--', alpha=0.3, c='red', lw=1)
ax.axvspan(0, 85 , color=sns.xkcd_rgb['grey'], alpha=0.1)

del dft
del count

plt.show()

In [None]:
dft = df_train.set_index(df_train[date_field]).sort_index()
count = pd.DataFrame(dft.groupby(dft.index)[id_field].count())

fig, ax = plt.subplots(figsize=(15, 5))
plt.plot(6.5 * 60 * 60 / count)
ax.set_xlabel ("Day")
ax.set_ylabel ("Av. time between trades (s)", fontsize=18)
ax.set_title ("Average time between trades for each day (assuming a 6.5h daily trading interval)")
ax.axvline(x=85, linestyle='--', alpha=0.3, c='red', lw=1)
ax.axvspan(0, 85 , color=sns.xkcd_rgb['grey'], alpha=0.1)
ax.set_xlim(xmin=0)
ax.set_xlim(xmax=500)
ax.set_ylim(ymin=0)
ax.set_ylim(ymax=20)
plt.show()

In [None]:
# Distribution of the daily number of entries: one histogram sample per day.
df_seas = df_train.groupby(date_field)[[id_field]].count()
df_seas.hist(bins=100)

# The x axis carries the daily entry count; the y axis counts how many days
# fall into each bin.
plt.xlabel("No. of daily entries")
plt.ylabel("No. of days")
plt.title("Histogram of the no. of daily entries")
plt.show()

In [None]:
# Considering a 'rolling' 30-days window and displaying the evolution over time of the no. of observations covered every day
dft = df_train.set_index(df_train[date_field]).sort_index()
count = pd.DataFrame(dft.groupby(dft.index)[id_field].count().rolling(30, min_periods=1).mean())
count.columns = ['30-day rolling mean of number of observations per day']

# Every distinct ts_id is one observation, so the total is the unique count
# itself (subtracting 1 under-reported it by one).
ax = count.plot(title = str(df_train[id_field].nunique()) + ' total observations')
ax.set_xlabel("Date")
ax.set_ylabel("Number of entries")
ax.legend(loc="upper right", prop={'size': 18})
# Mark day 85 and shade the period before it
ax.axvline(x=85, linestyle='--', alpha=0.3, c='red', lw=1)
ax.axvspan(0, 85 , color=sns.xkcd_rgb['grey'], alpha=0.1)

plt.show()

In [None]:
del dft
del count
del df_seas
gc.collect()

In [None]:
# Entries no. autocorrelation

from statsmodels.graphics.tsaplots import plot_acf
plot_acf(df_train.groupby(date_field)[id_field].count().values, lags=21) # monthly
plot_acf(df_train.groupby(date_field)[id_field].count().values, lags=252) # yearly
plt.show()

**Missing Rates**

In [None]:
# Checking the missing entries per field

# Percentage of rows with a missing value, per column
missings_df = df_train.isna().sum() * 100 / len(df_train)
ax = missings_df.plot(kind='bar', title='% of data points with missing values')
ax.set_xlabel("Data fields")
ax.set_ylabel('% of missing entries')

# An empty label list blanks the per-column tick labels on the x axis
ax.set_xticklabels([], rotation=30, horizontalalignment='right')

# Render the figure explicitly, as every other plotting cell does
# (a bare display() call shows nothing).
plt.show()

In [None]:
del missings_df
gc.collect()

## Profiling

**1. 'resp' fields**

In [None]:
plt.rcParams['figure.figsize'] = (15, 5)


def _color_bars_by_height(axis):
    """Paint every histogram bar of ``axis`` with a jet colour scaled by its height."""
    heights = np.array([bar.get_height() for bar in axis.patches])
    scale = plt.Normalize(heights.min(), heights.max())
    for bar, shade in zip(axis.patches, plt.cm.jet(scale(heights))):
        bar.set_color(shade)


# --- 'resp': histogram restricted to [-0.05, 0.05] ---
ax = sns.distplot(
    df_train['resp'],
    bins=3000,
    kde_kws={"clip":(-0.05,0.05)},
    hist_kws={"range":(-0.05,0.05)},
    color='darkcyan',
    kde=False,
)
_color_bars_by_height(ax)
plt.xlabel("Histogram of the 'resp' values")
plt.title("'resp' Distribution")
plt.show()

print("Minimum value: " + str(np.round(df_train['resp'].min(), 2)))
print("Maximum value: " + str(np.round(df_train['resp'].max(), 2)))


# --- 'resp_1' ... 'resp_4': same histogram on a shared-axis 2x2 grid ---
plt.rcParams['figure.figsize'] = (15, 5)
fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)

var_list = ['resp_1', 'resp_2', 'resp_3', 'resp_4']
ax_list = [ax[0,0], ax[0,1], ax[1,0], ax[1,1]]

for var, panel in zip(var_list, ax_list):
    sns.distplot(
        df_train[var],
        label="'" + str(var) + "' distn",
        ax=panel,
        bins=3000,
        kde_kws={"clip":(-0.05,0.05)},
        hist_kws={"range":(-0.05,0.05)},
        color='darkcyan',
        kde=False,
    )
    _color_bars_by_height(panel)

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))

# The columns are only read here, so they are taken straight from df_train
# instead of duplicating the whole (multi-GB) frame first.
balance = df_train['resp'].cumsum()

# Normalizing the longer horizons returns: resp_k is rescaled with the k-th
# root of (1 + resp_k) before being accumulated.
resp_1 = (np.power(df_train['resp_1'] + 1, 1) - 1).cumsum()
resp_2 = (np.power(df_train['resp_2'] + 1, 1/2) - 1).cumsum()
resp_3 = (np.power(df_train['resp_3'] + 1, 1/3) - 1).cumsum()
resp_4 = (np.power(df_train['resp_4'] + 1, 1/4) - 1).cumsum()
ax.set_xlabel ("Trade", fontsize=18)
ax.set_title ("Cumulative resp and time horizons 1, 2, 3, and 4 (500 days)")

# Each series keeps its column name, which becomes its legend entry
balance.plot(lw=3)
resp_1.plot(lw=3)
resp_2.plot(lw=3)
resp_3.plot(lw=3)
resp_4.plot(lw=3)
plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5));

# Release the full-length helper series
del balance
del resp_1
del resp_2
del resp_3
del resp_4

gc.collect()

Considering a cumulative sum of daily returns, it seems that best returns are achieved for shorter horizons.

**2. 'weight'**

In [None]:
plt.rcParams['figure.figsize'] = (15, 5)

ax = sns.distplot(df_train['weight'], 
             bins=3000, 
             color='darkcyan', 
             kde=False);
# ax.set(xscale='log')
ax.set(yscale='log')

values = np.array([rec.get_height() for rec in ax.patches])
norm = plt.Normalize(values.min(), values.max())
colors = plt.cm.jet(norm(values))
for rec, col in zip(ax.patches, colors):
    rec.set_color(col)
plt.xlabel("Histogram of the 'weight' values")
plt.title("'weight' Distribution")
plt.show()

print("Minimum value: " + str(np.round(df_train['weight'].min(), 2)))
print("Maximum value: " + str(np.round(df_train['weight'].max(), 2)))

print("% of 0.0 values: " + 
      str(np.round((df_train['weight'].values == 0).sum() / len(df_train) * 100, 2)) + "%")

In [None]:
dft = df_train.set_index(df_train[date_field]).sort_index()
count = pd.DataFrame(dft.groupby(dft.index)[id_field].count())
count.columns = ['Number of daily entries']

from scipy.stats import pearsonr

df_corr = df_train.groupby(date_field)[['weight']].sum().merge(count, on='date')
df_corr['ratio'] = df_corr['weight'] / df_corr['Number of daily entries']

corr, _ = pearsonr(df_corr['weight'], df_corr['Number of daily entries'])
print('Pearsons correlation: %.3f' % corr)

plt.rcParams['figure.figsize'] = (7, 7)

plt.scatter(df_corr['weight'], df_corr['Number of daily entries'])
plt.xlabel('cummulative weight')
plt.ylabel('no. of daily entries')
plt.show()

del dft
del count

The sum of daily weights looks very correlated with the no. of daily potential trades, with a pearsonr coefficient of 0.877.

In [None]:
plt.rcParams['figure.figsize'] = (15, 5)
# Plot the per-day 'ratio' column of df_corr (title typo "ration" fixed)
df_corr['ratio'].plot(title="Daily ratio between cumulative weight and # potential trades")
plt.show()

In [None]:
plt.figure(figsize = (15,5))
ax = sns.distplot(df_train['weight'], 
             bins=1400, 
             kde_kws={"clip":(0.001,1.4)}, 
             hist_kws={"range":(0.001,1.4)},
             color='darkcyan', 
             kde=False);
values = np.array([rec.get_height() for rec in ax.patches])
norm = plt.Normalize(values.min(), values.max())
colors = plt.cm.jet(norm(values))
for rec, col in zip(ax.patches, colors):
    rec.set_color(col)
plt.xlabel("Histogram of non-zero weights (up to 1.4)")
plt.show();

del values
del df_corr
gc.collect()

In [None]:
df_train_nonZero = df_train.query('weight > 0').reset_index(drop = True)
plt.figure(figsize = (10,4))
ax = sns.distplot(np.log(df_train_nonZero['weight']), 
             bins=1000, 
             kde_kws={"clip":(-4,5)}, 
             hist_kws={"range":(-4,5)},
             color='darkcyan', 
             kde=False);
values = np.array([rec.get_height() for rec in ax.patches])
norm = plt.Normalize(values.min(), values.max())
colors = plt.cm.jet(norm(values))
for rec, col in zip(ax.patches, colors):
    rec.set_color(col)
plt.xlabel("Histogram of the logarithm of the non-zero weights")
plt.show()

del df_train_nonZero
gc.collect()

In [None]:
df_train.groupby(date_field)['weight'].sum().plot(title='Cummulative daily weight')
plt.show()

**3. Cumulative return: 'weight' * 'resp'**

In [None]:
# 'weight' * 'resp' for the trades with a positive weight. Only these two
# columns are needed, so the product is built directly rather than from a
# full copy of df_train.
traded = df_train['weight'] > 0
wr = (df_train.loc[traded, 'weight'] * df_train.loc[traded, 'resp']).rename('wr')

plt.rcParams['figure.figsize'] = (15, 5)

# Histogram restricted to [-0.02, 0.02]
ax = sns.distplot(wr,
             bins=2000,
             kde_kws={"clip":(-0.02,0.02)},
             hist_kws={"range":(-0.02,0.02)},
             color='darkcyan',
             kde=False);
# ax.set(yscale='log')
# Colour every bar by its height
values = np.array([rec.get_height() for rec in ax.patches])
norm = plt.Normalize(values.min(), values.max())
colors = plt.cm.jet(norm(values))
for rec, col in zip(ax.patches, colors):
    rec.set_color(col)
plt.xlabel("Histogram of the 'weight' * 'resp' values")
plt.title("'weight' * 'resp' Distribution")
plt.show()

print("Minimum value: " + str(np.round(wr.min(), 2)))
print("Maximum value: " + str(np.round(wr.max(), 2)))

del traded
del wr
gc.collect()

In [None]:
# Weighted returns per horizon, computed as stand-alone series so df_train
# is neither copied nor modified.
weights = df_train['weight']
dates = df_train['date']

weighted_returns = {'resp': weights * df_train['resp']}
for horizon in (1, 2, 3, 4):
    name = 'resp_{}'.format(horizon)
    # resp_k is rescaled with the k-th root of (1 + resp_k)
    weighted_returns[name] = weights * (np.power(df_train[name] + 1, 1 / horizon) - 1)

fig, ax = plt.subplots(figsize=(15, 5))

ax.set_xlabel ("Day", fontsize=18)
ax.set_title ("Cumulative daily return for resp and time horizons 1, 2, 3, and 4 (500 days)", fontsize=18)

# Daily mean of the weighted return, compounded over the days
for name, series in weighted_returns.items():
    daily_curve = (1 + series.groupby(dates).mean()).cumprod()
    daily_curve.plot(lw=3, label=name + ' x weight')

# day 85 marker
ax.axvline(x=85, linestyle='--', alpha=0.3, c='red', lw=1)
ax.axvspan(0, 85 , color=sns.xkcd_rgb['grey'], alpha=0.1)
plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5));

del weighted_returns
del daily_curve
gc.collect()

**4. Features**

*> feature_0*

In [None]:
df_train['feature_0'].value_counts()

In [None]:
# One panel per value of feature_0: cumulative 'resp' and cumulative
# 'resp' * 'weight' over the matching rows.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))

panels = (
    (ax1, 1, "feature 0 = 1", "lower left"),
    (ax2, -1, "feature 0 = -1", "upper left"),
)
for axis, sign, panel_title, legend_loc in panels:
    subset = df_train.loc[df_train['feature_0'] == sign].reset_index(drop = True)
    axis.plot(subset['resp'].cumsum(), lw=3, label='resp')
    axis.plot((subset['resp'] * subset['weight']).cumsum(), lw=3, label='return')
    axis.set_title (panel_title, fontsize=18)
    axis.legend(loc=legend_loc)

del subset
gc.collect()

It seems that this feature clearly differentiates the type of trades, as they represent distinct return dynamics. One assumption is that it labels a 'long' trade vs a 'short' one.

*> features_{1...129}*

There seem to be 4 different feature types, judging by the structure of their values.

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20,10))

ax1.plot((pd.Series(df_train['feature_1']).cumsum()), lw=3, color='red')
ax1.set_title ("Linear", fontsize=20);
ax1.axvline(x=514052, linestyle='--', alpha=0.3, c='green', lw=2)
ax1.axvspan(0, 514052 , color=sns.xkcd_rgb['grey'], alpha=0.1)
ax1.set_xlim(xmin=0)
ax1.set_ylabel ("feature_1", fontsize=14);

ax2.plot((pd.Series(df_train['feature_3']).cumsum()), lw=3, color='green')
ax2.set_title ("Noisy", fontsize=20);
ax2.axvline(x=514052, linestyle='--', alpha=0.3, c='red', lw=2)
ax2.axvspan(0, 514052 , color=sns.xkcd_rgb['grey'], alpha=0.1)
ax2.set_xlim(xmin=0)
ax2.set_ylabel ("feature_3", fontsize=14);

ax3.plot((pd.Series(df_train['feature_55']).cumsum()), lw=3, color='darkorange')
ax3.set_title ("Hybrid (Tag 21)", fontsize=20);
ax3.set_xlabel ("Trade", fontsize=18)
ax3.axvline(x=514052, linestyle='--', alpha=0.3, c='green', lw=2)
ax3.axvspan(0, 514052 , color=sns.xkcd_rgb['grey'], alpha=0.1)
ax3.set_xlim(xmin=0)
ax3.set_ylabel ("feature_55", fontsize=14);

ax4.plot((pd.Series(df_train['feature_73']).cumsum()), lw=3, color='blue')
ax4.set_title ("Negative", fontsize=20)
ax4.set_xlabel ("Trade", fontsize=18)
ax4.set_ylabel ("feature_73", fontsize=14);
gc.collect();

Features 41, 42 and 43 (tag 14) look 'stratified':

In [None]:
day_0 = df_train.loc[df_train['date'] == 0]
day_1 = df_train.loc[df_train['date'] == 1]
day_3 = df_train.loc[df_train['date'] == 3]
three_days = pd.concat([day_0, day_1, day_3])
three_days.plot.scatter(x='ts_id', y='feature_41', s=0.5, figsize=(15,3));
three_days.plot.scatter(x='ts_id', y='feature_42', s=0.5, figsize=(15,3));
three_days.plot.scatter(x='ts_id', y='feature_43', s=0.5, figsize=(15,3));

del day_0
del day_1
del day_3
gc.collect();

*Features distributions*

In [None]:
featstr = [i for i in df_train.columns if 'feature_' in i]

In [None]:
import matplotlib.gridspec as gridspec

plt.rcParams['font.size'] = 14
fig = plt.figure(figsize=(20,80))
fig.suptitle('Features Box plot with 0.1% 99.9% whiskers',fontsize=20, y=.89)
grid =  gridspec.GridSpec(33,4,figure=fig,hspace=.5,wspace=.05)

# 130 features laid out row by row on the 33 x 4 grid
for counter in range(130):
    i, j = divmod(counter, 4)
    feature = featstr[counter]
    column = df_train[feature]

    subf = fig.add_subplot(grid[i, j])
    sns.boxplot(x=column, saturation=.5, color='blue', ax=subf, width=.5, whis=(.1,99.9))
    # dotted line at the feature mean
    subf.axvline(column.mean(), color='darkorange', label='Mean', linestyle=':', linewidth=3)
    subf.set_xlabel('')
    subf.set_title('{}'.format(feature), fontsize=16)
    gc.collect()
plt.show()

*Features growth*

In [None]:
plt.rcParams['font.size'] = 14
featstr = [i for i in df_train.columns if 'feature_' in i]
df_train.groupby('date')[featstr].mean().cumsum().plot(layout=(33,4),subplots=True,figsize=(20,82),xlabel='')
fig = plt.gcf()
fig.text(0.5, 0.19, 'Date', ha='center', fontsize=12)
fig.suptitle('Cumulative features means per day', fontsize=20, y=.886);

*Correlation matrix*

In [None]:
%%time
corr = df_train.sample(100000).loc[:, [c for c in df_train.columns if 'feature_' in str(c)]] \
                 .rank().corr(method='pearson')

In [None]:
plt.figure(figsize=(20, 16))

sns.heatmap(corr,
            cmap='RdBu_r', vmin=-1, vmax=1, square=True)
plt.show()

In [None]:
del corr
gc.collect()

**t-SNE**

In [None]:
%%time 

# Positional indices of the 130 feature columns (the offset of 7 is carried
# over unchanged from the original cell)
all_features    = [i for i in range(0,130)]
train_features  = [x+7 for x in all_features]

tsne    = manifold.TSNE(n_components=2, perplexity=50, learning_rate=20)
# Draw the 10,000 rows first and only then select/fill the features, so the
# NaN filling works on the sample instead of a copy of the whole training frame.
tsne_sample = df_train.sample(10000).iloc[:, train_features].fillna(0) # increase the sample size
tsne_2D = tsne.fit_transform(tsne_sample)
del tsne_sample

In [None]:
x, y = pd.DataFrame(tsne_2D).values.T
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(x, y, s=3, c=x, cmap=plt.cm.plasma)
ax.set_title('t-SNE plot for all 130 features', fontsize=18)
plt.show();

In [None]:
del tsne
del tsne_2D

gc.collect()

**5. Action**

The target of the competition is represented by the 'action' binary variable:
* 1 if the trade is taken
* 0 if the trade is not performed

From the objective function it follows that the action should be 1 when the return is positive ('resp' > 0), and 0 otherwise.

In [None]:
df_train['action'] = np.where(df_train['resp'] > 0,1,0)

*Discarding "weight = 0" trades*

In [None]:
# Keep only the trades with a positive weight. The explicit copy makes
# df_train an independent frame, so the later in-place edits (fillna,
# column re-assignment) do not act on a slice of the original data.
df_train = df_train[df_train.weight > 0.0].copy()

In [None]:
# Per-day number of positive actions, number of rows, and their quotient
actions_by_day     = df_train.groupby('date')['action']
daily_action_sum   = actions_by_day.sum()
daily_action_count = actions_by_day.count()
daily_ratio        = daily_action_sum / daily_action_count

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(daily_ratio)
ax.set_xlabel ("Day", fontsize=18)
ax.set_ylabel ("ratio", fontsize=18)
ax.set_title ("Daily ratio of action to inaction", fontsize=18)
# reference line at 0.5
ax.axhline(0.5, linestyle='--', alpha=0.85, c='r');
ax.set_xlim(xmin=0, xmax=500)
plt.show();

In [None]:
daily_ratio_mean = daily_ratio.mean()
print('The mean daily ratio is %.3f' % daily_ratio_mean)

In [None]:
daily_ratio_max = daily_ratio.max()
print('The maximum daily ratio is %.3f' % daily_ratio_max)

In [None]:
del daily_action_sum
del daily_action_count
del daily_ratio
gc.collect()

In [None]:
def show_values_on_bars(axs):
    """Annotate every bar with its height and its share of the `action` rows.

    ``axs`` is either a single Axes or a NumPy array of Axes. For each bar the
    height is written just above it (black), and the height as a percentage
    of the number of non-null ``action`` rows of the global ``df_train`` is
    written at mid-height (white).
    """
    def _show_on_single_plot(ax):
        bars = ax.patches

        # First pass: absolute value, 2000 units above the top of the bar
        for bar in bars:
            height = bar.get_height()
            x_mid = bar.get_x() + bar.get_width() / 2
            y_top = bar.get_y() + height + 2000
            ax.text(x_mid, y_top, '{:.0f}'.format(height),
                    ha="center", fontsize=12, color='black', fontweight='bold')

        # Second pass: percentage, centred vertically inside the bar
        for bar in bars:
            height = bar.get_height()
            x_mid = bar.get_x() + bar.get_width() / 2
            y_mid = bar.get_y() + height / 2
            share = 100 * height / len(df_train[['action']].dropna())
            ax.text(x_mid, y_mid, '{:.1f}%'.format(share),
                    ha="center", fontsize=12, color='white', fontweight='bold')

    if not isinstance(axs, np.ndarray):
        _show_on_single_plot(axs)
        return

    for single_ax in axs.flat:
        _show_on_single_plot(single_ax)

In [None]:
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 20
ax = df_train['action'].value_counts().sort_index().plot(kind='bar', 
                                            title= "Data imbalance - action")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, horizontalalignment='center')
show_values_on_bars(ax)

plt.show()

**Data adjustments**

Since the majority of feature values are heavily centered around the mean, the null values are filled using the mean.

In [None]:
df_train.fillna(df_train.mean(axis=0), inplace=True)

In [None]:
df_train.action = df_train.action.astype('category')

In [None]:
%%time
import warnings
warnings.filterwarnings("ignore")

import matplotlib.gridspec as gridspec

plt.rcParams['axes.titlesize'] = 12
plt.rcParams['font.size'] = 16
fig = plt.figure(figsize=(20,80))

fig.suptitle('KDE plot of Features', fontsize=24, transform=fig.transFigure, y=.89)
grid = gridspec.GridSpec(33,4,figure=fig,hspace=.5,wspace=.01)
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
counter = 0

featstr = [i for i in df_train.columns if 'feature_' in i]

# The class masks do not change between features, so they are built once
# and applied to a single column at a time (instead of filtering the whole
# frame twice per feature).
is_negative = df_train.action == 0
is_positive = ~is_negative

for i in range(33):
    for j in range(4):
        if counter < 130:
            feature = featstr[counter]
            feature_values = df_train[feature]

            subf = fig.add_subplot(grid[i, j]);
            sns.distplot(feature_values[is_negative],bins= 100,label='Negative',
                         color='darkorange', kde_kws={'linewidth':4},ax=subf)
            sns.distplot(feature_values[is_positive],bins= 100,label='Positive',
                         color='blue', kde_kws={'alpha':.9,'linewidth':2},hist_kws={'alpha':.3},ax=subf)
            # dotted markers at the 99.5th and 0.5th percentiles
            subf.axvline(np.percentile(feature_values,99.5),
                         color= 'darkblue', label='99.5%', linestyle=':',linewidth=2)
            subf.axvline(np.percentile(feature_values,.5),
                         color= 'red', label='0.5%', linestyle=':',linewidth=2)
            subf.legend().set_visible(False)
            subf.set_xlabel('')
            subf.set_title('{}'.format(feature),fontsize=16)
            try:
                kurt=round(feature_values.kurt(),2)
                skew=round(feature_values.skew(),2)
                subf.text(.6,.92,'Kurt = {:.2f}\nSkew = {:.2f}'.format(kurt ,skew),
                      transform=subf.transAxes, verticalalignment='top', bbox=props, fontsize=10)
            except Exception as err:
                # The annotation is best-effort: keep plotting, but report
                # the failure instead of swallowing it silently.
                print('Kurt/Skew annotation skipped for {}: {}'.format(feature, err))
            counter += 1

gc.collect()
# One shared legend for the figure, taken from the last subplot drawn
handles, labels = subf.get_legend_handles_labels()
fig.legend(handles, labels,ncol=4, bbox_to_anchor=(0.86, 0.893),fontsize=10,
           title= 'Resp',title_fontsize=14,bbox_transform=fig.transFigure);

plt.show()

**'Resp'-features correlation**

In [None]:
# Correlation of 'resp' with every feature, as a two-column frame
respcorr = pd.DataFrame({
    'feature': featstr,
    'coeff': [df_train.resp.corr(df_train[name]) for name in featstr],
})

fig, ax = plt.subplots()
sns.barplot(data=respcorr, x='feature', y='coeff', ax=ax)

ax.set_xticklabels(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
# Show only every 5th tick label among the first 130
for position, label in enumerate(ax.get_xticklabels()[:130]):
    if position % 5:
        label.set_visible(False)

plt.show()

In [None]:
del respcorr
gc.collect()

# II. Model Development

In [None]:
df_train.set_index(['ts_id', 'date'], inplace=True)
# df_example_test.set_index(['ts_id', 'date'], inplace=True)

In [None]:
mean_resp = np.mean(df_train['resp'])
std_resp = np.std(df_train['resp'])

In [None]:
target = 'action'

In [None]:
features_list = ['weight'] + [feat for feat in df_train.columns if 'feature_' in str(feat)]
# features_list

In [None]:
fillna_dict = df_train.mean(axis=0).to_dict()
# df_example_test.fillna(value = fillna_dict, inplace=True)

### Train-validation-test split

In [None]:
# assign the indices of the full dataset to do multiple subsets later
# `action` only takes the values 0 and 1, so quantile-binning it with
# pd.qcut(..., duplicates='drop') collapses to a single bucket and cannot
# balance the classes. The label itself (indexed like df_train) is used as
# the stratification key instead.
split_buckets = df_train[target]
print('The number of distinct target buckets:', split_buckets.nunique())

In [None]:
from sklearn.model_selection import train_test_split

# X_train, X_devtest, y_train, y_devtest = train_test_split(df_train[features_list],
#                                                           df_train[target],
#                                                           train_size=0.70, 
#                                                           random_state=27, 
#                                                           stratify=split_buckets)
# X_val, X_test, y_val, y_test = train_test_split(X_devtest, 
#                                                 y_devtest, 
#                                                 train_size=0.5,
#                                                 stratify=split_buckets.loc[X_devtest.index], 
#                                                 random_state=27)

In [None]:
from sklearn.model_selection import train_test_split

# Chronological split on index level 1:
#   train: <= trainval_limit, validation: (trainval_limit, valtest_limit], test: > valtest_limit
trainval_limit = 300
valtest_limit = 400

split_day = df_train.index.get_level_values(1)
in_train = split_day <= trainval_limit
in_val = (split_day > trainval_limit) & (split_day <= valtest_limit)
in_test = split_day > valtest_limit

X_train, y_train = df_train.loc[in_train, features_list], df_train.loc[in_train, target]

X_val, y_val = df_train.loc[in_val, features_list], df_train.loc[in_val, target]

X_test, y_test = df_train.loc[in_test, features_list], df_train.loc[in_test, target]

In [None]:
# Shape and class counts of each split
for header, X_part, y_part in (
    ('Train shape:', X_train, y_train),
    ('Validation shape:', X_val, y_val),
    ('Test shape:', X_test, y_test),
):
    print(header)
    print(X_part.shape)
    print(y_part.value_counts())

In [None]:
# Reduce the samples size for running faster
# Each split keeps a stratified 50% of its rows; the settings are shared.
halving = dict(train_size = 0.5, test_size = 0.5, random_state = 27)

X_train, _, y_train, _ = train_test_split(X_train, y_train, stratify = y_train, **halving)
X_val, _, y_val, _ = train_test_split(X_val, y_val, stratify = y_val, **halving)
X_test, _, y_test, _ = train_test_split(X_test, y_test, stratify = y_test, **halving)

In [None]:
# Shape and class counts of each split after the subsampling
for header, X_part, y_part in (
    ('Train shape:', X_train, y_train),
    ('Validation shape:', X_val, y_val),
    ('Test shape:', X_test, y_test),
):
    print(header)
    print(X_part.shape)
    print(y_part.value_counts())

### Benchmark model

In [None]:
%%time

from sklearn.linear_model import LogisticRegression

# Benchmark: logistic regression without a penalty term, fitted on the training split.
# NOTE(review): the string form penalty='none' is scikit-learn version dependent — confirm
# it is accepted by the installed release before changing it.
# NOTE(review): the features are passed unscaled; presumably fine for a benchmark, but the
# 'sag' solver may converge slowly on unscaled data — verify convergence warnings.
baseline_model = LogisticRegression(penalty='none', solver='sag')
baseline_model.fit(X_train, y_train)

In [None]:
def utility_function(X, model, reference=None):
    """Compute the competition-style utility score of ``model`` on ``X``.

    Each row is turned into an action by rounding the predicted probability of
    the positive class. With ``p_d`` the sum of ``weight * resp * action`` over
    the rows of date ``d``::

        t = sum(p_d) / sqrt(sum(p_d ** 2)) * sqrt(250 / number_of_dates)
        u = min(max(t, 0), 6) * sum(p_d)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame whose index contains the levels ``ts_id`` and ``date``.
        If it has a ``weight`` column, that column is used as the trade weight.
    model : object
        Fitted classifier exposing ``predict_proba``.
    reference : pandas.DataFrame, optional
        Frame that provides ``resp`` (and ``weight`` when ``X`` lacks it) for
        every ``ts_id`` after ``reset_index()``. Defaults to the notebook-level
        ``df_train``.

    Returns
    -------
    float
        The utility ``u``. It is ``0.0`` when no row contributes any profit or
        loss (for example when the model never trades); the original formula
        would divide by zero there and return NaN.
    """
    if reference is None:
        reference = df_train

    data = X.reset_index()
    data['action'] = np.round(model.predict_proba(X)[:, 1])

    # Fetch everything that is missing from the reference frame in one merge.
    wanted = ['ts_id', 'resp']
    if 'weight' not in data.columns:
        wanted.append('weight')
    data = data.merge(reference.reset_index()[wanted], how='left', on='ts_id')

    data['prod'] = data['weight'] * data['resp'] * data['action']
    daily_pnl = data.groupby('date')['prod'].sum()

    total = daily_pnl.sum()
    norm = np.sqrt(np.power(daily_pnl, 2).sum())
    if not norm > 0:
        # No trades (or no rows at all): the ratio is undefined, utility is 0.
        return 0.0

    t = total / norm * np.sqrt(250 / len(daily_pnl))
    return min(max(t, 0), 6) * total

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix, log_loss, precision_recall_curve

# Score the benchmark model on each split: ROC-AUC of the positive-class
# probability, followed by the utility function.
for split_title, X_part, y_part in (("Train: ", X_train, y_train),
                                    ("Validation: ", X_val, y_val),
                                    ("Test: ", X_test, y_test)):
    print(split_title)
    benchmark_proba = baseline_model.predict_proba(X_part)[:, 1]
    print("   ROC-AUC score: " + str(roc_auc_score(y_part, benchmark_proba)))
    print("   Utility function: " + str(utility_function(X_part, baseline_model)))

### CatBoost Classification

In [None]:
# Compute devices handed to the boosting libraries: the first value is passed
# to LightGBM as `device_type`, the second to CatBoost as `task_type`.
lgb_default_device = 'cpu' # must be lowercase
catboost_default_device = 'CPU' # must be uppercase

In [None]:
# Frames used by the hyperparameter search. The models are fitted on the
# training split and scored on the *validation* split, even though the
# variables are named `*_test_hyperopt`.
X_train_hyperopt = X_train.copy()
y_train_hyperopt = y_train.copy()
X_test_hyperopt = X_val.copy()
y_test_hyperopt = y_val.copy()

In [None]:
# Columns passed to CatBoost / LightGBM as categorical features.
categ_vars = ['feature_0']

We are now ready to make the optimization step.
Hyperopt is a Python library for hyperparameter optimization (http://hyperopt.github.io/hyperopt/). In this way we can let an algorithm detect the best model and the best hyperparameter values for the task.
Above, we subsampled the data for this task. 
The steps required are:

1. Define an objective function. That is, at each step, we compute the loss of each model. This loss is left for us to define; we can define it in terms of precision, recall, AUC or a combination of these. The algorithm will try to minimize the loss.
2. Define the search space. We need to define the arrays of models and hyperparameters. We are also required to specify the a priori distribution of each hyperparameter
3. Choose an optimization algorithm and perform the run.

The algorithm we choose is TPE (Tree-structured Parzen Estimator), a Bayesian optimization algorithm; for more details see https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf

In [None]:
def objective(params):
    """Hyperopt objective: fit one sampled configuration and score it.

    ``params`` is a point of the search space: ``params['model']`` is the
    estimator class and ``params['param']`` its keyword arguments. The model is
    fitted on ``X_train_hyperopt`` / ``y_train_hyperopt`` and the loss is the
    negated utility on ``X_test_hyperopt``. As side effects the loss is
    appended to the global ``loss_list`` and the configuration is stored in
    the global ``hyperparameter_set`` under its loss.

    Returns the result dictionary expected by ``fmin``.
    """
    estimator_cls = params['model']
    estimator_kwargs = params['param']

    print("Fitting " + str(estimator_cls))
    model = estimator_cls(**estimator_kwargs)

    # Estimator-specific progress message and extra ``fit`` arguments.
    if estimator_cls == CatBoostClassifier:
        print(str(estimator_kwargs['iterations']) + ' boosting iterations')
        fit_options = {'cat_features': categ_vars}
    elif estimator_cls == lgb.LGBMClassifier:
        print(str(estimator_kwargs['n_estimators']) + ' boosting iterations')
        print('Boosting type ' + str(estimator_kwargs['boosting_type']))
        fit_options = {'categorical_feature': categ_vars}
    else:
        if estimator_cls == RandomForestClassifier:
            print(str(estimator_kwargs['n_estimators']) + ' trees')
        fit_options = {}

    model.fit(X_train_hyperopt, y_train_hyperopt, **fit_options)

    # Hyperopt minimises, so the utility is negated.
    loss = 0 - utility_function(X_test_hyperopt, model)
    hyperparameter_set[loss] = params

    print('Loss = ' + str(loss) + '\n')
    loss_list.append(loss)

    return {'loss': loss, 'params': params, 'status': STATUS_OK}

We will define a function that creates a back-up for the hyperopt trials after a set number of Bayesian search iterations. This is helpful for situations when the kernel crashes, or when more iterations are required at a later point in time. Given the at least partially deterministic nature of the Bayesian search process, this back-up will essentially act as a warm start.

In [None]:
def run_trials(pickling_freq, initial_max_trials, filename):
    """Run (or resume) a hyperopt search and back the trials up to disk.

    If ``filename`` holds a pickled ``Trials`` object it is reloaded and the
    search continues for ``pickling_freq`` more evaluations; otherwise a new
    search of ``initial_max_trials`` evaluations is started. After ``fmin``
    returns, the best configuration recorded in the global
    ``hyperparameter_set`` (filled by ``objective``) is printed and the
    ``Trials`` object is pickled back to ``filename``.

    Parameters
    ----------
    pickling_freq : int
        How many additional trials to run after loading saved trials
        (1 = save after each iteration).
    initial_max_trials : int
        How many trials to run before the first save when no backup exists.
    filename : str
        Path of the pickle file used as backup. Only point this at files
        written by this function: unpickling executes code from the file.

    Notes
    -----
    ``UserWarning``s are silenced only while ``fmin`` runs. The original code
    called ``warnings.warn`` here, which emitted a warning instead of
    disabling them.
    """
    max_trials = initial_max_trials

    ############################## This is where you can change the optimization space ##############################

    # NOTE(review): LightGBM documents 'colsample_bytree' and 'feature_fraction'
    # as aliases, and 'class_weight' overlaps with 'is_unbalance' — confirm that
    # sampling both of each pair is intended before enabling this model.
    dictionar_lgbm = {'model':lgb.LGBMClassifier, 
                      'param': {
                            'class_weight': {0:1, 1:hp.uniform('class_weight_1', 2, 50)},
                            'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0, 1.0),
                            'max_bin': hp.choice('max_bin', np.arange(50, 750, 25, dtype=int)),
                            'num_leaves': hp.choice('num_leaves', np.arange(4, 256, dtype=int)),
                            'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
                            'subsample_for_bin': hp.choice('subsample_for_bin', np.arange(1000, X_train_hyperopt.shape[0], dtype=int)),
                            'min_child_samples': hp.choice('min_child_samples', np.arange(20, 500,5, dtype=int)),
                            'is_unbalance': hp.choice('is_unbalance', np.array([True, False], dtype = bool)), 
                            'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
                            'feature_fraction': hp.uniform('feature_fraction', 1/X_train_hyperopt.shape[1], 1.0),        
                            'max_depth': hp.choice('max_depth', np.arange(2, 21,1, dtype=int)),    
                            'lambda_l1': hp.uniform('lambda_l1', 0.0, 10.0),
                            'lambda_l2': hp.uniform('lambda_l2', 0.0, 10.0),
                            'bagging_fraction': hp.uniform('bagging_fraction',1/X_train_hyperopt.shape[0]*10,1.0),
                            'bagging_freq': hp.choice('bagging_freq', np.arange(1, 11,1, dtype=int)),
                            'objective' : 'binary',
                            # 'boost_from_average': False ,
                            'boost_from_average': hp.choice('boost_from_average', np.array([True, False], dtype=bool)),
                            'boosting_type': hp.choice('boosting_type', np.array(['gbdt','dart'], dtype = str)),
                            'n_estimators' : hp.choice('n_estimators', np.arange(50, 5000, 25, dtype=int)),
                            'device_type': lgb_default_device,
                        }}

    dictionar_catboost = {'model':CatBoostClassifier, 
                          'param':{
                                'iterations': hp.choice('iterations', np.arange(50, 3000, 25, dtype=int)),
                                'depth': hp.choice('depth', np.arange(2, 11, 1, dtype=int)),
                                'learning_rate': hp.loguniform('learning_rate_2', np.log(0.001), np.log(0.2)),
                                'class_weights': [1, hp.uniform('class_weight_3',0.1,4)],
                                'border_count': hp.choice('border_count', np.arange(1, 255, 1, dtype=int)),
                                'l2_leaf_reg': hp.uniform('l2_leaf_reg',0,100),
                                'logging_level': 'Silent',
                                'task_type': catboost_default_device
                            }} 

    # NOTE(review): the objective scores models through predict_proba; confirm
    # that every 'loss' option below exposes it before enabling this model.
    dictionar_sgd = {'model':SGDClassifier, 
                     'param':{
                                 'loss': hp.choice('loss_5', np.array(['hinge', 'modified_huber', 'squared_hinge', 'perceptron'], dtype = str)),
                                 'penalty': hp.choice('penalty_5', np.array(['l1', 'l2', 'none', 'elasticnet'], dtype = str)),
                                 # 'alpha': hp.uniform('alpha', 0, 0.001),
                                 'l1_ratio': hp.uniform('l1_ratio_5', 0, 1),
                                 'max_iter': hp.choice('max_iter_7', np.arange(5, 100, 5, dtype=int)),
                                 'learning_rate': hp.choice('learning_rate_7', np.array(['constant', 'optimal', 'invscaling', 'adaptive'], dtype = str)),
                                 'eta0': hp.uniform('eta0', 0, 0.1),
                                 'power_t': hp.uniform('power_t', 0, 1),
                                 'class_weight': {0:1, 1:hp.uniform('class_weight_8', 2, 50)},
                                 'n_jobs': -1
                    }} 

    #############################################################################################################################

    ######### Comment out the models which you do not want to use #########
    tested_models = []
    # tested_models.append(dictionar_lgbm)
    tested_models.append(dictionar_catboost)
    # tested_models.append(dictionar_sgd)

    space = hp.choice('classifier', tested_models)

    ############################## Checking if there are any existing Trials to reload ##############################
    try:
        # Try to load an already saved trials object; the context manager
        # guarantees that the file handle is closed.
        with open(filename, "rb") as backup:
            trials = pickle.load(backup)
    except FileNotFoundError:
        # No backup yet: start a new search.
        trials = Trials()
    except (pickle.UnpicklingError, AttributeError, EOFError, ImportError, IndexError) as err:
        # Unreadable backup: say so instead of discarding it silently.
        print("Could not reload saved Trials ({!r}); starting a new search".format(err))
        trials = Trials()
    else:
        max_trials = len(trials.trials) + pickling_freq
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, pickling_freq))

    ###################################### Hyperopt optimization ######################################
    with warnings.catch_warnings():
        # Do not show UserWarnings while hyperopt runs; the previous filters
        # are restored when the block exits.
        warnings.simplefilter("ignore", category=UserWarning)
        fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = max_trials, trials = trials)

    if hyperparameter_set:
        # Keys are losses, so the smallest key is the best configuration.
        best = hyperparameter_set[min(hyperparameter_set)]
        print("Best model so far:", best, "\n")

    ######### Saving the Trials object once optimization is finished #########
    with open(filename, "wb") as f:
        pickle.dump(trials, f)
    print("Saved Trials! \n")

**Start the optimization**

In [None]:
%%time
# Shared state written by objective(): `hyperparameter_set` maps each loss to
# the configuration that produced it, `loss_list` keeps the losses in order.
hyperparameter_set = {}
loss_list = []
i = 0  
save_frequency = 5   # How many trials to run between two back-ups of the Trials object
initial_trials = 1    # How many Hyperopt iterations do you want to run before the first back up, if none exists?
hyperopt_iters = 320   # How many optimization iterations do you want to run (on top of the backed-up ones, if they exist)? Will be a multiple of save_frequency
save_path = "/kaggle/working/my_optimization_prod_" + experiment_name + ".hyperopt"

# The search loop is disabled; the model fitted further below uses hard-coded
# hyperparameters instead. Uncomment to run the search.
# while i <= hyperopt_iters/save_frequency:
#     i += 1
#     with warnings.catch_warnings():
#         warnings.simplefilter("ignore")  # Do not show any UserWarnings, see def run_trials; they can become annoying if you run many iterations :)
#         run_trials(pickling_freq = save_frequency, initial_max_trials = initial_trials, filename = save_path)

**Load the best model with the best hyperparameters**

In [None]:
# trials = pickle.load(open(save_path, 'rb'))
# loss = trials.best_trial['result']['loss']
# save_path = "/kaggle/working/my_optimization_prod_" + experiment_name + ".hyperopt"

# Hard-coded CatBoost configuration, used instead of reloading the pickled
# hyperopt trials (the reload code above is commented out).
# NOTE(review): presumably these values were copied from the best trial of an
# earlier search — confirm.
model = CatBoostClassifier(border_count=145, 
                           class_weights=(1, 0.8763763487405303),
                           depth=5,
                           iterations=2325,
                           l2_leaf_reg=19.8650097679303,
                           learning_rate=0.020187221403617027,
                           logging_level='Silent',
                           task_type=catboost_default_device)

# Loss attached to this configuration. The objective defines the loss as the
# negated utility, so a more negative value is better.
best_loss = -956.9680483209429

display(model.get_params())
display(best_loss)

In [None]:
# for i in range(len(trials.trials)):    
#     if trials.trials[i]['result']['loss'] == loss:
#         print("Loss: " + str(trials.trials[i]['result']['loss']))
#         # print(trials.trials[i]['result']['params'])
#         itera = i
#         print(itera)
# best = trials.trials[itera]['result']['params']
# best

In [None]:
# test = hyperparameter_set[sorted(hyperparameter_set)[0]]
# test

In [None]:
# # Plotting the losses of each hyperopt iteration
# plt.figure(figsize=(15, 5))
# plt.plot(trials.losses())
# plt.title("Hyperopt loss evolution")
# plt.show()

**Fitting the model**

In [None]:
%%time
# model = best['model'](**best['param'])

# Fit the selected model on the hyperopt training frames. The keyword
# arguments depend on the estimator type.
if isinstance(model, CatBoostClassifier):
    # CatBoost receives the categorical column names; the evaluation set and
    # early stopping are disabled.
    model.fit(X_train_hyperopt, 
              y_train_hyperopt, 
              # eval_set = (X_test_hyperopt, np.ravel(y_test_hyperopt)), 
              # early_stopping_rounds=60,
              cat_features = categ_vars) 
    
elif isinstance(model, lgb.LGBMClassifier):
    # LightGBM additionally monitors the validation frames through eval_set.
    model.fit(X_train_hyperopt, 
              np.ravel(y_train_hyperopt), 
              eval_set = (X_test_hyperopt, np.ravel(y_test_hyperopt)), 
              # early_stopping_rounds=60,
              categorical_feature = categ_vars)

else:
    # Any other estimator: plain fit on a flattened target.
    model.fit(X_train_hyperopt, 
              np.ravel(y_train_hyperopt))

In [None]:
# Check: utility of the fitted model on the frames used to score the
# hyperparameter search (a copy of the validation split).
utility_function(X_test_hyperopt, model)

### Model performance

In [None]:
def predict_compute_metrics(X, y, model):
    """Print ROC-AUC, precision and utility of ``model`` on ``(X, y)``.

    ROC-AUC uses the predicted probability of the positive class, precision
    uses the hard labels from ``model.predict`` and the utility comes from
    ``utility_function``. Every value is rounded to 4 decimals before printing.

    Returns the positive-class probabilities.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    hard_labels = model.predict(X)

    # Each metric is evaluated right before it is printed.
    report = (
        ("ROC-AUC score: ", lambda: roc_auc_score(y, positive_proba)),
        ("Precision score: ", lambda: precision_score(y, hard_labels)),
        ("Utility function: ", lambda: utility_function(X, model)),
    )
    for caption, compute in report:
        print(caption + str(np.round(compute(), decimals=4)))

    return positive_proba

In [None]:
# Metrics of the fitted model on the training split.
pred_train = predict_compute_metrics(X_train, y_train, model)
# identical with predict_compute_metrics(X_train_hyperopt, y_train_hyperopt, model) - same data

In [None]:
# Metrics on the validation split.
pred_val = predict_compute_metrics(X_val, y_val, model)
# identical with predict_compute_metrics(X_test_hyperopt, y_test_hyperopt, model) - same data

In [None]:
# Metrics on the test split, which is used neither for fitting nor for the
# hyperparameter search.
pred_test = predict_compute_metrics(X_test, y_test, model)

*Confusion matrices*

In [None]:
def confusion_matrix_plot(X, y, model):
    """Draw the confusion matrix of ``model`` on ``(X, y)`` as two heatmaps.

    The left panel shows raw counts, the right panel shows every row divided
    by its total (the share of each true class per predicted label).
    """
    sns.set(rc={'figure.figsize': (15, 5)})

    counts = confusion_matrix(y, model.predict(X))
    row_shares = counts.astype('float') / counts.sum(axis=1)[:, np.newaxis]

    panels = ((counts, '.0f', 'Confusion Matrix'),
              (row_shares, '.2f', 'Confusion Matrix %'))
    for position, (matrix, number_format, heading) in enumerate(panels, start=1):
        axis = plt.subplot(1, 2, position)
        sns.heatmap(matrix, annot=True, ax=axis, fmt=number_format, cmap='Blues')

        # labels, title and ticks
        axis.set_xlabel('Predicted labels')
        axis.set_ylabel('True labels')
        axis.set_title(heading)

In [None]:
# Confusion matrices on the training split.
confusion_matrix_plot(X_train, y_train, model)

In [None]:
# Confusion matrices on the validation split.
confusion_matrix_plot(X_val, y_val, model)

In [None]:
# Confusion matrices on the test split.
confusion_matrix_plot(X_test, y_test, model)

*ROC curves*

In [None]:
def roc_curve_plot(X, y, model):
    """Plot the ROC curve of ``model`` on ``(X, y)``, with its AUC in the legend.

    The curve is built from the predicted probability of the positive class;
    the dashed red diagonal marks a classifier without skill.
    """
    sns.set(rc={'figure.figsize': (10, 5)})

    positive_scores = model.predict_proba(X)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    plt.figure(figsize=(15, 5))
    plt.title('Receiver Operating Characteristic')
    # Plot ROC
    plt.plot(false_pos_rate, true_pos_rate, 'b',
             label='AUC = %0.3f' % area_under_curve)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')

    # Axes: both rates live in [0, 1].
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0, 1.0])
    plt.ylim([0, 1.00])
    plt.show()

In [None]:
# ROC curve on the training split.
roc_curve_plot(X_train, y_train, model)

In [None]:
# ROC curve on the validation split.
roc_curve_plot(X_val, y_val, model)

In [None]:
# ROC curve on the test split.
roc_curve_plot(X_test, y_test, model)

*Probabilities histograms*

In [None]:
def prob_histogram(X, y, model):
    """Plot, per true class, the distribution of the predicted P(class 1).

    Two overlaid histograms are drawn, one for the rows whose true label is 0
    ('Zero') and one for the rows whose true label is 1 ('One'). Bar heights
    are percentages of the respective group. The black vertical line marks the
    0.5 decision threshold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.Series
        True binary labels, aligned with ``X``.
    model : object
        Fitted classifier exposing ``predict_proba``.
    """
    # 21 edges give 20 bins of width 0.05, so bin borders coincide with the
    # x ticks and with the 0.5 threshold line. The previous 20 edges produced
    # 19 bins, one of which straddled the threshold.
    bins = np.linspace(0, 1, 21)

    x_locs = [i/20 for i in range(21)]
    x_labs = [str(i/20) for i in range(21)]

    y_locs = [i for i in range(20) if i%2 ==0]
    y_labs = [str(i) +'%' for i in range(20) if i%2 ==0]

    plt.figure(figsize=(15, 5))
    for true_label, legend_name, colour in ((0, 'Zero', 'r'), (1, 'One', 'b')):
        group = X.loc[y == true_label]
        if len(group) == 0:
            # Nothing to draw; also avoids dividing by a group size of zero.
            continue
        proba = model.predict_proba(group)[:, 1]
        # Equal weights summing to 100 turn counts into percentages of the group.
        plt.hist(proba, bins, weights=np.zeros_like(proba) + 100. / proba.size,
                 alpha=0.5, label=legend_name, color=colour)

    plt.axvline(x=0.5, color = 'black')
    # The x axis is the probability of the positive class (column 1 of
    # predict_proba), not of class zero as the old label stated.
    plt.xlabel('Predicted probability of class 1')
    plt.ylabel('Percentage in group')
    plt.xticks(x_locs, x_labs)
    plt.yticks(y_locs, y_labs)
    plt.xlim(0,1)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Predicted-probability histograms on the training split.
prob_histogram(X_train, y_train, model)

In [None]:
# Predicted-probability histograms on the validation split.
prob_histogram(X_val, y_val, model)

In [None]:
# Predicted-probability histograms on the test split.
prob_histogram(X_test, y_test, model)

*Precision-recall curve*

In [None]:
def precision_recall_curve_plot(X, y, model):
    """Plot the precision-recall curve of ``model`` on ``(X, y)``.

    The title reports the probability threshold with the highest F1-score and
    that score. Grey iso-F1 curves (F1 = 0.00, 0.05, ..., 0.95) are drawn in
    the background and the model's curve is drawn in red.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y, positive_scores)

    # F1 at every operating point; 0/0 cases become 0 through nan_to_num.
    f1_curve = np.nan_to_num(2 * precision * recall / (precision + recall))
    best_f1 = np.max(f1_curve)
    best_threshold = thresholds[np.argmax(f1_curve)]

    plt.figure(figsize=(10, 10))

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve - Optimal probability threshold = ' +str(np.round(best_threshold, 3)) + ', Maximal F1-score = ' + str(np.round(best_f1, 3) ))

    tick_values = [0.05*i for i in range(21)]
    tick_text = [str(round(0.05*i, 2)) for i in range(21)]
    plt.xticks(ticks = tick_values, labels = tick_text)
    plt.yticks(ticks = tick_values, labels = tick_text)
    sns.set_context('notebook', font_scale=1)

    # Iso-F1 curves: precision as a function of recall for a fixed F1 value.
    n_points = 1000
    recall_grid = np.linspace(0.0001, 1, num=n_points)
    for iso_f1 in np.array([i*0.05 for i in range(20)]):
        iso_precision = iso_f1 * recall_grid / (2 * recall_grid - iso_f1)
        inside_unit_box = (iso_precision >= 0) & (iso_precision <= 1)
        plt.plot(recall_grid[inside_unit_box], iso_precision[inside_unit_box],
                 alpha=0.3, color = 'grey')
        plt.annotate('f1={0:0.2f}'.format(iso_f1), xy=(0.95, iso_precision[n_points-10]))

    plt.plot(recall, precision, color='red', alpha=0.6)

    plt.show()

In [None]:
# Precision-recall curve on the training split.
precision_recall_curve_plot(X_train, y_train, model)

In [None]:
# Precision-recall curve on the validation split.
precision_recall_curve_plot(X_val, y_val, model)

In [None]:
# Precision-recall curve on the test split.
precision_recall_curve_plot(X_test, y_test, model)

*T-SNE*

t-distributed Stochastic Neighbour Embedding (t-SNE) is a non-linear dimensionality reduction technique which we can use to visualise high-dimensional data. In this way we can visualise what the model sees in the data.

In [None]:
def tsne_plot(X, n_samples=5000):
    """Scatter-plot a 2-D t-SNE embedding of a random sample of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to embed. The previous implementation ignored this argument and
        always embedded the global ``X_train``, so the validation and test
        plots actually showed training data.
    n_samples : int, default=5000
        Maximum number of rows to embed. Frames with fewer rows are embedded
        in full instead of raising in ``sample``.
    """
    subset = X if len(X) <= n_samples else X.sample(n_samples)

    tsne = manifold.TSNE(n_components=2, perplexity=50, learning_rate=20)
    tsne_2D = tsne.fit_transform(subset)

    # One coordinate array per embedding dimension; points are coloured by
    # their first coordinate.
    x, y = tsne_2D.T
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(x, y, s=2, c=x, cmap = plt.cm.plasma)
    ax.set_title('t-SNE plot', fontsize=18)
    plt.show()

In [None]:
%%time 
# t-SNE embedding for the training split.
tsne_plot(X_train)

In [None]:
%%time 
# t-SNE embedding for the validation split.
tsne_plot(X_val)

In [None]:
%%time 
# t-SNE embedding for the test split.
tsne_plot(X_test)

### Model interpretability

**Permutation importance**

In the Permutation Importance (model agnostic feature importance method) we determine what are the most useful features, the "drivers" of the target. 

The procedure is as follows:
1.  Fix a metric (e.g. AUC / F1-score)
2.  For each predictor column, shuffle the values. Re-score the already fitted model on the data with the shuffled column (the model is not retrained) and record the metric delta. Sort the features in descending order of the delta.

The idea behind the method is that features that are important to the prediction, when shuffled, will worsen the performance of the model. The degree of worsening is proportional to the importance of the feature.

In [None]:
def permutation_importances_f1(X, y, model, n_samples=100000):
    """Permutation importance of every column of ``X`` for a fitted model.

    Despite the name, the metric is not the F1-score: it is the mean of the
    diagonal of the row-normalised confusion matrix, i.e. the average of the
    two per-class recalls. For each column the values are shuffled, the model
    re-scores the data (it is not refitted), and the drop of the metric
    relative to the unshuffled baseline is recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.Series
        Binary labels indexed like ``X``.
    model : object
        Fitted classifier exposing ``predict``.
    n_samples : int, default=100000
        Maximum number of rows to evaluate on. Smaller frames are used in full
        instead of raising in ``sample``.

    Returns
    -------
    numpy.ndarray
        One importance per column, in the order of ``X.columns``. Larger
        values mean shuffling the column hurt the metric more.
    """
    # `sample` returns a new frame, so the caller's data is never modified.
    X = X.sample(min(n_samples, len(X)))
    y = y[X.index]

    def mean_class_recall(frame):
        cm = confusion_matrix(y, model.predict(frame)).astype('float')
        cm = cm / cm.sum(axis=1)[:, np.newaxis]
        return (cm[0, 0] + cm[1, 1]) / 2

    baseline = mean_class_recall(X)
    imp = []

    for col in X.columns:
        save = X[col].copy()
        X[col] = np.random.permutation(X[col])
        imp.append(baseline - mean_class_recall(X))
        X[col] = save  # restore the column before shuffling the next one

    return np.array(imp)

In [None]:
%%time
imp_perm = permutation_importances_f1(X_train, y_train, model)

imp_df = pd.DataFrame()
imp_df["feature"] = list(X_train)
imp_df["permutation_importance"] = imp_perm.copy()
imp_df.sort_values(['permutation_importance'], ascending=False).head(10)

In [None]:
%%time
# Horizontal bar chart of all features, ordered from the most to the least
# important according to the permutation importance computed above.
imp_df_2 = imp_df.sort_values(['permutation_importance'], ascending=False)
sns.set(rc={'figure.figsize': (20, 20)})
sns.set_context("paper", font_scale=1)    
sns.barplot(x = 'permutation_importance',
            y = 'feature',
            data = imp_df_2)
plt.show()

In [None]:
def permutation_importances_ut(X, y, model, n_samples=100000):
    """Permutation importance of every column of ``X`` w.r.t. the utility.

    For each column the values are shuffled, ``utility_function`` is
    re-evaluated with the already fitted model, and the drop relative to the
    unshuffled baseline is recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame, in the layout expected by ``utility_function``.
    y : pandas.Series
        Unused; kept so the signature matches ``permutation_importances_f1``.
    model : object
        Fitted classifier exposing ``predict_proba``.
    n_samples : int, default=100000
        Maximum number of rows to evaluate on. Smaller frames are used in full
        instead of raising in ``sample``.

    Returns
    -------
    numpy.ndarray
        One importance per column, in the order of ``X.columns``. Positive
        values mean shuffling the column lowered the utility.
    """
    # `sample` returns a new frame, so the caller's data is never modified.
    X = X.sample(min(n_samples, len(X)))

    baseline = utility_function(X, model)
    imp = []

    for col in X.columns:
        save = X[col].copy()
        X[col] = np.random.permutation(X[col])
        imp.append(baseline - utility_function(X, model))
        X[col] = save  # restore the column before shuffling the next one

    return np.array(imp)

In [None]:
%%time
imp_perm_ut = permutation_importances_ut(X_train, y_train, model)

imp_df_ut = pd.DataFrame()
imp_df_ut["feature"] = list(X_train)
imp_df_ut["permutation_importance"] = imp_perm_ut.copy()
imp_df_ut.sort_values(['permutation_importance'], ascending=False).head(10)

In [None]:
%%time
# Horizontal bar chart of all features, ordered from the most to the least
# important according to the utility-based permutation importance.
imp_df_2_ut = imp_df_ut.sort_values(['permutation_importance'], ascending=False)
sns.set(rc={'figure.figsize': (20, 20)})
sns.set_context("paper", font_scale=1)    
sns.barplot(x = 'permutation_importance',
            y = 'feature',
            data = imp_df_2_ut)
plt.show()

In [None]:
# Features whose shuffling lowered the utility (strictly positive importance).
ut_pos_features = imp_df_ut.loc[imp_df_ut.permutation_importance > 0, 'feature']

**K-fold cross-validation grouping the data chronologically**

In order to perform a cross-validation on the data, one must be certain that there is no look-ahead (forward) bias; this cannot be guaranteed with a standard KFold split (or even a Group KFold) from sklearn, for example. Moreover, even the TimeSeriesSplit can cause problems, as it does not ensure a separation following the last trade of the day (we may end up with some trades as part of the train data and the rest as part of the test data, all occurring on the same date).

Therefore, the following implementation of the GroupTimeSeriesSplit forces a chronological selection of the data, while splitting trades exactly at the end of the day. 

In [None]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_size : int, default=None
        Maximum size for a single training set.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupTimeSeriesSplit
    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',\
                           'b', 'b', 'b', 'b', 'b',\
                           'c', 'c', 'c', 'c',\
                           'd', 'd', 'd'])
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    ...     print("TRAIN GROUP:", groups[train_idx],\
                  "TEST GROUP:", groups[test_idx])
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']\
    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']\
    TEST GROUP: ['c' 'c' 'c' 'c']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\
    TEST: [15, 16, 17]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']\
    TEST GROUP: ['d' 'd' 'd']
    """
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        # Shuffling is switched off: the order of the groups is what makes the
        # split chronological.
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : list of int
            The training set indices for that split.
        test : list of int
            The testing set indices for that split.
        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer distinct groups than
            ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        # n_splits train/test pairs require n_splits + 1 consecutive folds.
        n_folds = n_splits + 1
        group_dict = {}
        # Distinct group labels, ordered by their first occurrence in `groups`.
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        # Map every group label to the list of sample indices that carry it.
        # NOTE(review): `groups[idx]` uses the container's own indexing; for a
        # pandas Series with a non-default index this is a label lookup rather
        # than a positional one — confirm against the callers.
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        # Every test window spans `group_test_size` groups. The groups left
        # over by the integer division end up in the first training window.
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            # Training indices: all samples of all groups before the test window.
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            # Optionally keep only the last `max_train_size` training indices.
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            # Test indices: all samples of the groups inside the test window.
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            # np.concatenate with the initial empty list produces float arrays,
            # hence the conversion back to plain ints.
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [None]:
def classification_kfold(X, y, model):
    """Refit ``model`` on successive chronological folds and print metrics.

    The rows of ``X`` are grouped by their ``date`` and split with
    ``GroupTimeSeriesSplit``.  For every fold the model is fitted on the
    training dates and then scored with ``predict_compute_metrics`` on both
    the training and the validation dates.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.  ``X.reset_index()`` must expose ``ts_id`` and ``date``
        columns, and index level 1 is matched against the fold dates.
    y : pandas object aligned row-for-row with ``X``
        Target; filtered with the same boolean masks as ``X``.
    model : estimator
        Must accept ``fit(X, y, cat_features=...)``.  It is refitted in place
        on every fold.

    Returns
    -------
    The same ``model`` object, as left by the fit on the last fold.
    """
    date_frame = X.reset_index()[['ts_id', 'date']].sort_values(by=['date', 'ts_id'])

    # feature_0 is the only column handed to the model as categorical.
    categ_vars = ['feature_0'] if 'feature_0' in list(X.columns) else []

    row_dates = X.index.get_level_values(1)
    folds = GroupTimeSeriesSplit().split(date_frame, groups=date_frame['date'])

    for fold_no, (train_index, valid_index) in enumerate(folds, start=1):
        print('Fold {}'.format(fold_no))
        train_dates = date_frame.loc[train_index, 'date'].unique()
        valid_dates = date_frame.loc[valid_index, 'date'].unique()

        first_train, last_train = str(np.min(train_dates)), str(np.max(train_dates))
        first_valid, last_valid = str(np.min(valid_dates)), str(np.max(valid_dates))
        print("      [Train dates: " + first_train + " ---- " + last_train + "]")
        print("      [Validation dates: " + first_valid + " ---- " + last_valid + "]")

        # One boolean mask per side, shared by the features and the target.
        in_train = row_dates.isin(train_dates)
        in_valid = row_dates.isin(list(valid_dates))
        train_cv, y_train_cv = X[in_train], y[in_train]
        valid_cv, y_valid_cv = X[in_valid], y[in_valid]

        model.fit(train_cv,
                  y_train_cv,
                  cat_features=categ_vars)

        print("   Train: ")
        predict_compute_metrics(train_cv, y_train_cv, model)
        print("   Validation: ")
        predict_compute_metrics(valid_cv, y_valid_cv, model)
        print('')

    return model

In [None]:
%%time
# classification_kfold refits the estimator it is given and returns that same
# object, so after this cell `model_k` and `model` are one estimator, as left
# by the fit on the last fold's training dates.
model_k = classification_kfold(X_train, y_train, model)

In [None]:
# Score the fold-refitted estimator (model_k) on the train, validation and
# test splits; each call prints its metrics and returns the class-1 probabilities.
print("Train: ")
pred_train = predict_compute_metrics(X_train, y_train, model_k)
print("Validation: ")
pred_val = predict_compute_metrics(X_val, y_val, model_k)
print("Test: ")
pred_test = predict_compute_metrics(X_test, y_test, model_k)

In [None]:
%%time

if 'feature_0' in list(X_train.loc[:,ut_pos_features].columns):
    categ_vars = ['feature_0']
else:
    categ_vars = []

# model_pos = best['model'](**best['param'])
model_pos = CatBoostClassifier(border_count=145, 
                           class_weights=(1, 0.8763763487405303),
                           depth=5,
                           iterations=2325,
                           l2_leaf_reg=19.8650097679303,
                           learning_rate=0.020187221403617027,
                           logging_level='Silent',
                           task_type=catboost_default_device)


model_pos.fit(X_train.loc[:,ut_pos_features], 
              y_train, 
              # eval_set = (X_val, np.ravel(y_val)), 
              # early_stopping_rounds=60,
              cat_features = categ_vars) 

In [None]:
print("Train: ")
pred_train = predict_compute_metrics(X_train.loc[:,ut_pos_features], y_train, model_pos)
print("Validation: ")
pred_val = predict_compute_metrics(X_val.loc[:,ut_pos_features], y_val, model_pos)
print("Test: ")
pred_test = predict_compute_metrics(X_test.loc[:,ut_pos_features], y_test, model_pos)

In [None]:
%%time
# classification_kfold returns the estimator it was given, so `model_k_pos`
# is `model_pos` itself, as left by the fit on the last fold's training dates.
model_k_pos = classification_kfold(X_train.loc[:,ut_pos_features], y_train, model_pos)

In [None]:
# Score the fold-refitted reduced-feature estimator (model_k_pos) on the
# train, validation and test splits, using only the ut_pos_features columns.
print("Train: ")
pred_train = predict_compute_metrics(X_train.loc[:,ut_pos_features], y_train, model_k_pos)
print("Validation: ")
pred_val = predict_compute_metrics(X_val.loc[:,ut_pos_features], y_val, model_k_pos)
print("Test: ")
pred_test = predict_compute_metrics(X_test.loc[:,ut_pos_features], y_test, model_k_pos)

So far, the best model (judged by the utility function) seems to be the initial one, fitted on the 'train' data.

In [None]:
%%time
# model = best['model'](**best['param'])

model = CatBoostClassifier(border_count=145, 
                           class_weights=(1, 0.8763763487405303),
                           depth=5,
                           iterations=2325,
                           l2_leaf_reg=19.8650097679303,
                           learning_rate=0.020187221403617027,
                           logging_level='Silent',
                           task_type=catboost_default_device)

if 'feature_0' in list(X_train.columns):
    categ_vars = ['feature_0']
else:
    categ_vars = []

if isinstance(model, CatBoostClassifier):
    model.fit(X_train, 
              y_train, 
              # eval_set = (X_test_hyperopt, np.ravel(y_test_hyperopt)), 
              # early_stopping_rounds=60,
              cat_features = categ_vars) 

**Partial dependence plots**

Having determined which features are important, we also want to assess how these features influence the outcome. This can be done using Partial Dependence Plots (PDPs for short).
In a PDP, each curve represents a model point, the x-axis denotes the feature value and the y-axis the change (delta) in predicted probability.

In [None]:
from pdpbox import pdp

# Take the first column of the ten rows with the highest permutation importance.
features_for_pdp = list(
    imp_df_2_ut
    .sort_values(['permutation_importance'], ascending=False)
    .iloc[:10, 0]
)
features_for_pdp

In [None]:
%%time

sns.set_context("paper", font_scale=1)    

temp = X_val.iloc[:2000, :]

# def plot_pdp(feat, df, clusters=None, feat_name=None, dummy=False):
#     feat_name = feat_name or feat
#     if(dummy == True):
#         p = pdp.pdp_isolate(model, df, df.columns, feat, percentile_range=(0, 100), grid_type='equal', 
#                             cust_grid_points=df[feat].unique(), n_jobs=-1)
#         return pdp.pdp_plot(p,
#                         feat_name,
#                         plot_lines=True,
#                         cluster = clusters is not None,
#                         n_cluster_centers=clusters,
#                         # plot_pts_dist=True,
#                         # x_quantile=True,
#                         show_percentile=True,
#                            plot_params = {'fontsize':10})
#     else:    
#         p = pdp.pdp_isolate(model, df, df.columns, feat, percentile_range=(0, 100),  n_jobs=-1)
#         return pdp.pdp_plot(p,
#                         feat_name,
#                         plot_lines=True,
#                         cluster=clusters is not None,
#                         n_cluster_centers=clusters,
#                         plot_pts_dist=True,
#                         x_quantile=True,
#                         show_percentile=True,
#                         plot_params = {'fontsize':10})

# for i in features_for_pdp:
#     print('feature: ' + i)
#     dummy=False
#     if temp[i].nunique()<=2:
#         dummy=True
            
#     plot_pdp(i, df=temp, dummy=dummy)
#     plt.tight_layout()
#     plt.show()


def partial_dependence_plots(df, feats, model):
    """Draw one partial dependence plot per feature in ``feats``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows used to compute the partial dependence; all of its columns are
        passed to pdpbox as the model features.
    feats : iterable of str
        Column names of ``df`` to plot.  An empty iterable plots nothing.
    model : fitted estimator
        Model handed to ``pdp.pdp_isolate``.

    Returns
    -------
    None.  Each figure is shown with ``plt.show()``.
    """
    # Iterate the argument, not the notebook-level `pdp_features` global, so
    # the function plots exactly what the caller asked for.
    for f in feats:
        values = df[f]
        if values.nunique() <= 25:
            # Few distinct values: evaluate the model at each of them.
            grid = np.sort(values.unique())
        else:
            # Many distinct values: 25 equal steps starting at the minimum.
            low, high = values.min(), values.max()
            grid = np.arange(low, high, (high - low) / 25)

        pdp_response = pdp.pdp_isolate(model=model, dataset=df, model_features=df.columns, feature=f,
                                       grid_type='equal', cust_grid_points=grid)

        pdp.pdp_plot(pdp_response, f, figsize=(10, 7))
        plt.show()
        
# Plot the features selected in `features_for_pdp`, evaluated on the first
# 2000 validation rows held in `temp`.
pdp_features = features_for_pdp
partial_dependence_plots(temp, pdp_features, model)

**Shap values**

This is another interpretation technique inspired by cooperative game theory.

In [None]:
%%time

def shap_plot(X, y, model, sample_size=5000, max_display=20):
    """Draw a SHAP summary plot for ``model`` on a random sample of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; a random sample of at most ``sample_size`` rows is used.
    y : pandas object indexed like ``X``
        Labels; the entries matching the sampled rows are selected by index.
    model : fitted CatBoost model
        Must support ``get_feature_importance(pool, type='ShapValues')``.
    sample_size : int, default 5000
        Maximum number of rows to sample.
    max_display : int, default 20
        Number of features shown in the summary plot.
    """
    # DataFrame.sample raises when asked for more rows than the frame holds,
    # so cap the request at the number of rows available.
    X = X.sample(min(sample_size, len(X)))
    y = y[X.index]

    # Same rule the fitting cells use, derived locally so the function does
    # not depend on whichever cell last assigned the global `categ_vars`.
    cat_features = ['feature_0'] if 'feature_0' in list(X.columns) else []

    pool = Pool(data=X, label=y, cat_features=cat_features)
    shap_values = model.get_feature_importance(pool, type='ShapValues')

    plt.figure(figsize=(20, 5))

    # Keep one SHAP column per feature column of X; any trailing column is dropped.
    shap.summary_plot(shap_values[:, :X.shape[1]],
                      X,
                      max_display=max_display)

In [None]:
%time
shap_plot(X_train, y_train, model)

### Model fitting on train & validation, then evaluated on test set

In [None]:
# Rows whose index level 1 (presumably `date`) is at or before valtest_limit
# form the combined train + validation set.
is_trainval = df_train.index.get_level_values(1) <= valtest_limit
X_trainval = df_train[is_trainval][features_list]
y_trainval = df_train[is_trainval][target]

In [None]:
# Free memory: drop the separate train and validation splits, then ask the
# garbage collector to reclaim them.

del X_train, y_train, X_val, y_val

gc.collect()

In [None]:
%%time
# model_tv = best['model'](**best['param'])
model_tv = CatBoostClassifier(border_count=145, 
                           class_weights=(1, 0.8763763487405303),
                           depth=5,
                           iterations=2325,
                           l2_leaf_reg=19.8650097679303,
                           learning_rate=0.020187221403617027,
                           logging_level='Silent',
                           task_type=catboost_default_device)

if isinstance(model_tv, CatBoostClassifier):
    model_tv.fit(X_trainval, 
              y_trainval, 
              # eval_set = (X_test, np.ravel(y_test)), 
              # early_stopping_rounds=60,
              cat_features = categ_vars) 
    
elif isinstance(model_tv, lgb.LGBMClassifier):
    model_tv.fit(X_trainval, 
              np.ravel(y_trainval), 
              eval_set = (X_test, np.ravel(y_test)), 
              # early_stopping_rounds=60,
              categorical_feature = categ_vars)

else:
    model_tv.fit(X_trainval, 
              np.ravel(y_train_hyperopt))

In [None]:
print("Train & val: ")
pred_trainval = predict_compute_metrics(X_trainval, y_trainval, model_tv)
print("Test: ")
pred_test = predict_compute_metrics(X_test, y_test, model_tv)

In [None]:
print("Train & val: ")
confusion_matrix_plot(X_trainval, y_trainval, model_tv)

In [None]:
print("Test: ")
confusion_matrix_plot(X_test, y_test, model_tv)

In [None]:
print("Train & val: ")
roc_curve_plot(X_trainval, y_trainval, model_tv)
print("Test: ")
roc_curve_plot(X_test, y_test, model_tv)

In [None]:
print("Train & val: ")
prob_histogram(X_trainval, y_trainval, model_tv)
print("Test: ")
prob_histogram(X_test, y_test, model_tv)

In [None]:
print("Train & val: ")
precision_recall_curve_plot(X_trainval, y_trainval, model_tv)
print("Test: ")
precision_recall_curve_plot(X_test, y_test, model_tv)

*K-fold cross-validation grouping the data chronologically*

In [None]:
def utility_function(X, model):
    """Return the utility score of ``model``'s 0/1 actions on the rows of ``X``.

    The action for each row is the class-1 probability rounded to 0 or 1.
    ``resp`` (and ``weight``, when ``X`` does not already carry it) are looked
    up in the notebook-level ``df_train`` by ``ts_id``.  With
    ``p_i = sum(weight * resp * action)`` over the rows of date ``i``::

        t = sum(p_i) / sqrt(sum(p_i ** 2)) * sqrt(250 / number_of_dates)
        u = min(max(t, 0), 6) * sum(p_i)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; ``X.reset_index()`` must expose ``ts_id`` and ``date``.
    model : fitted classifier
        Must provide ``predict_proba``.

    Returns
    -------
    float
        The utility ``u``.  It is ``0.0`` when there are no dates or when
        every daily ``p_i`` is zero (for example, no trade is taken), where
        the formula would otherwise be 0/0.
    """
    data = X.copy()
    data = data.reset_index().set_index('ts_id')

    data['action'] = np.round(model.predict_proba(X)[:, 1])

    # Fetch everything that is missing in a single merge, so that
    # df_train.reset_index() is materialised only once.
    lookup_cols = ['ts_id', 'resp']
    if 'weight' not in list(data.columns):
        lookup_cols.append('weight')
    data = (data.reset_index()
                .merge(df_train.reset_index()[lookup_cols], how='left', on='ts_id')
                .set_index('ts_id'))

    data['prod'] = data['weight'] * data['resp'] * data['action']
    data_agg = data.groupby('date')['prod'].sum()

    total = data_agg.sum()
    spread = np.sqrt(np.power(data_agg, 2).sum())

    # No dates or an all-zero daily P&L: the ratio below is undefined and the
    # utility is zero by construction (it is multiplied by `total`, which is 0).
    if len(data_agg) == 0 or spread == 0:
        return 0.0

    t = total / spread * np.sqrt(250 / len(data_agg))

    u = min(max(t, 0), 6) * total

    return u


def predict_compute_metrics(X, y, model):
    """Print ROC-AUC, precision and utility of ``model`` on ``(X, y)``.

    ROC-AUC is computed from the class-1 probabilities, precision from the
    labels returned by ``model.predict``, and the utility by
    ``utility_function``.  Every value is rounded to four decimals for display.

    Returns
    -------
    The class-1 probabilities (column 1 of ``model.predict_proba(X)``).
    """
    positive_proba = model.predict_proba(X)[:, 1]
    hard_labels = model.predict(X)

    def _report(label, value):
        # One metric per line: label followed by the value rounded to 4 decimals.
        print(label + str(np.round(value, decimals=4)))

    _report("ROC-AUC score: ", roc_auc_score(y, positive_proba))
    _report("Precision score: ", precision_score(y, hard_labels))
    _report("Utility function: ", utility_function(X, model))

    return positive_proba


def classification_kfold(X, y, model):
    """Refit ``model`` on successive chronological folds and print metrics.

    The rows of ``X`` are grouped by their ``date`` and split with
    ``GroupTimeSeriesSplit``.  For every fold the model is fitted on the
    training dates and then scored with ``predict_compute_metrics`` on both
    the training and the validation dates.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.  ``X.reset_index()`` must expose ``ts_id`` and ``date``
        columns, and index level 1 is matched against the fold dates.
    y : pandas object aligned row-for-row with ``X``
        Target; filtered with the same boolean masks as ``X``.
    model : estimator
        Must accept ``fit(X, y, cat_features=...)``.  It is refitted in place
        on every fold.

    Returns
    -------
    The same ``model`` object, as left by the fit on the last fold.
    """
    date_frame = X.reset_index()[['ts_id', 'date']].sort_values(by=['date', 'ts_id'])

    # feature_0 is the only column handed to the model as categorical.
    categ_vars = ['feature_0'] if 'feature_0' in list(X.columns) else []

    row_dates = X.index.get_level_values(1)
    folds = GroupTimeSeriesSplit().split(date_frame, groups=date_frame['date'])

    for fold_no, (train_index, valid_index) in enumerate(folds, start=1):
        print('Fold {}'.format(fold_no))
        train_dates = date_frame.loc[train_index, 'date'].unique()
        valid_dates = date_frame.loc[valid_index, 'date'].unique()

        first_train, last_train = str(np.min(train_dates)), str(np.max(train_dates))
        first_valid, last_valid = str(np.min(valid_dates)), str(np.max(valid_dates))
        print("      [Train dates: " + first_train + " ---- " + last_train + "]")
        print("      [Validation dates: " + first_valid + " ---- " + last_valid + "]")

        # One boolean mask per side, shared by the features and the target.
        in_train = row_dates.isin(train_dates)
        in_valid = row_dates.isin(list(valid_dates))
        train_cv, y_train_cv = X[in_train], y[in_train]
        valid_cv, y_valid_cv = X[in_valid], y[in_valid]

        model.fit(train_cv,
                  y_train_cv,
                  cat_features=categ_vars)

        print("   Train: ")
        predict_compute_metrics(train_cv, y_train_cv, model)
        print("   Validation: ")
        predict_compute_metrics(valid_cv, y_valid_cv, model)
        print('')

    return model

In [None]:
%%time
# classification_kfold returns the estimator it was given, so `model_tvk` is
# `model_tv` itself, as left by the fit on the last fold's training dates.
model_tvk = classification_kfold(X_trainval, y_trainval, model_tv)

In [None]:
# Score the fold-refitted estimator (model_tvk) on the combined
# train + validation rows and on the held-out test rows.
print("Train & val: ")
pred_trainval = predict_compute_metrics(X_trainval, y_trainval, model_tvk)
print("Test: ")
pred_test = predict_compute_metrics(X_test, y_test, model_tvk)

In [None]:
%%time
# model_tv_pos = best['model'](**best['param'])
model_tv_pos = CatBoostClassifier(border_count=145, 
                           class_weights=(1, 0.8763763487405303),
                           depth=5,
                           iterations=2325,
                           l2_leaf_reg=19.8650097679303,
                           learning_rate=0.020187221403617027,
                           logging_level='Silent',
                           task_type=catboost_default_device)

if 'feature_0' in list(X_trainval.loc[:,ut_pos_features].columns):
    categ_vars = ['feature_0']
else:
    categ_vars = []
    
model_tv_pos.fit(X_trainval.loc[:,ut_pos_features], 
              y_trainval, 
              # eval_set = (X_val, np.ravel(y_val)), 
              # early_stopping_rounds=60,
              cat_features = categ_vars) 

In [None]:
print("Train & val: ")
pred_train = predict_compute_metrics(X_trainval.loc[:,ut_pos_features], y_trainval, model_tv_pos)
print("Test: ")
pred_test = predict_compute_metrics(X_test.loc[:,ut_pos_features], y_test, model_tv_pos)

In [None]:
%%time
# classification_kfold returns the estimator it was given, so `model_tvk_pos`
# is `model_tv_pos` itself, as left by the fit on the last fold's training
# dates (the last fold's validation dates are not part of that fit).
# This is the estimator the submission loop later predicts with.
model_tvk_pos = classification_kfold(X_trainval.loc[:,ut_pos_features], y_trainval, model_tv_pos)

In [None]:
# Score the fold-refitted reduced-feature estimator (model_tvk_pos) on the
# combined train + validation rows and on the held-out test rows.
print("Train & val: ")
pred_train = predict_compute_metrics(X_trainval.loc[:,ut_pos_features], y_trainval, model_tvk_pos)
print("Test: ")
pred_test = predict_compute_metrics(X_test.loc[:,ut_pos_features], y_test, model_tvk_pos)

### Predictions submission

In [None]:
import janestreet
from tqdm.notebook import tqdm

env = janestreet.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set

for test_df, sample_prediction_df in tqdm(iter_test):
    # Cast feature_0 to an integer-valued categorical before predicting.
    feature_0_as_category = test_df['feature_0'].astype('int').astype('category')
    test_df['feature_0'] = feature_0_as_category

    # The model predicts on the ut_pos_features columns only.
    submission_features = test_df.loc[:, ut_pos_features]
    sample_prediction_df.action = model_tvk_pos.predict(submission_features)  # make your 0/1 prediction here
    env.predict(sample_prediction_df)

In [None]:
!rm -r /kaggle/working/catboost_info
!rm -r /kaggle/working/pickle_files/

In [None]:
# # np.round(model.predict_proba(df_example_test)[:, 1])
# # np.round(model_tvk_pos.predict_proba(df_example_test.loc[:,ut_pos_features])[:, 1])

In [None]:
# df_example_test_submission = df_example_test.reset_index()[['ts_id']].copy()
# df_example_test_submission['action'] = np.round(model_tvk_pos.predict_proba(df_example_test.loc[:,ut_pos_features])[:, 1]).astype('bool')

In [None]:
# # Final dataframe to submit
# display(df_example_test_submission)