In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# in the order os.walk visits them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Import

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that is safe.

    Columns whose dtype name is 'object', 'category' or 'datetime64[ns, UTC]'
    are skipped. For every other column the minimum, maximum and sum are
    computed, and the column is cast to the first candidate dtype whose range
    strictly contains all three:

    * dtype names starting with 'int': int8, int16, int32, int64
      (left unchanged if none fits);
    * anything else: float16, float32, falling back to float64.

    The sum is part of the test as well as the extremes; presumably this is so
    that later aggregations carried out in the reduced dtype do not overflow.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, which makes the function usable with
        ``DataFrame.pipe``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    def _fits(limits, c_min, c_max, c_sum):
        # Strict bounds on both extremes and on the sum. NaN statistics
        # (e.g. an all-NaN column) compare False and therefore never fit.
        return (
            c_min > limits.min
            and c_max < limits.max
            and limits.min < c_sum < limits.max
        )

    for col in df.columns:
        col_type = df[col].dtype.name
        if col_type in ['object', 'category', 'datetime64[ns, UTC]']:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        c_sum = df[col].sum()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                if _fits(np.iinfo(candidate), c_min, c_max, c_sum):
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                if _fits(np.finfo(candidate), c_min, c_max, c_sum):
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower is safe.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
import datatable as dt

def _read_weighted_train(csv_path):
    """Read ``csv_path`` with datatable, keep rows with weight > 0, then downcast dtypes."""
    frame = dt.fread(csv_path).to_pandas()
    # Rebinding the local name lets the unfiltered frame be released.
    frame = frame.query('weight > 0')
    return reduce_mem_usage(frame)


df_train = _read_weighted_train('../input/jane-street-market-prediction/train.csv')

# Column labels that contain the substring 'feature'.
train_columns = df_train.columns
feature_names = train_columns[train_columns.str.contains('feature')]

# Data Adjustments

In [None]:
# Names of the identifier/date columns and the list of 'feature_*' columns.
id_field = 'ts_id'
date_field = 'date'
featstr = [col for col in df_train.columns if 'feature_' in col]

# Binary target: 1 when resp is strictly positive, otherwise 0.
df_train['action'] = np.where(df_train['resp'] > 0, 1, 0)
df_train = df_train[df_train.weight > 0.0]

# Impute missing values with the column means. numeric_only=True keeps this
# working when the cell is re-run after feature_0 has become categorical
# (pandas >= 2.0 raises TypeError for a mean over categorical columns).
# In-place fill is kept so the large frame is not copied once more.
df_train.fillna(df_train.mean(axis=0, numeric_only=True), inplace=True)

# Bracket assignment instead of attribute assignment: explicit column writes.
df_train['action'] = df_train['action'].astype('category')
df_train['feature_0'] = df_train['feature_0'].astype('category')

In [None]:
# Index rows by (ts_id, date). Skipped when the frame is already indexed that
# way, so re-running the cell does not raise KeyError.
if list(df_train.index.names) != ['ts_id', 'date']:
    df_train.set_index(['ts_id', 'date'], inplace=True)

target = 'action'
features_list = ['weight'] + [feat for feat in df_train.columns if 'feature_' in str(feat)]

# Column means keyed by column name. 'action' and 'feature_0' are categorical
# at this point; numeric_only=True excludes them explicitly instead of relying
# on the silent nuisance-column dropping that pandas 2.0 removed (it now
# raises TypeError).
fillna_dict = df_train.mean(axis=0, numeric_only=True).to_dict()

# Train, Validation & Test Split

In [None]:
# Date thresholds. Only valtest_limit is used by the split below.
trainval_limit = 300
valtest_limit = 400

# Second index level (date), compared once per partition and reused for X and y.
date_level = df_train.index.get_level_values(1)
in_trainval = date_level <= valtest_limit
in_test = date_level > valtest_limit

X_trainval = df_train.loc[in_trainval, features_list]
y_trainval = df_train.loc[in_trainval, target]

X_test = df_train.loc[in_test, features_list]
y_test = df_train.loc[in_test, target]

In [None]:
# Initial categorical-feature list. NOTE: this value is reassigned further
# down in this cell, once ut_pos_features is known.
categ_vars = ['feature_0']

# Feature columns the model is fit, evaluated and submitted on.
# Per the author's note, these were selected based on positive feature
# importance; the selection run itself is not part of this notebook.
ut_pos_features = ['feature_1', 'feature_2', 'feature_3', 'feature_4', 
                   'feature_5', 'feature_6', 'feature_7', 'feature_8', 
                   'feature_10', 'feature_11', 'feature_12', 'feature_14', 
                   'feature_15', 'feature_16', 'feature_17', 'feature_18', 
                   'feature_19', 'feature_20', 'feature_21', 'feature_22', 
                   'feature_23', 'feature_24', 'feature_25', 'feature_26', 
                   'feature_27', 'feature_28', 'feature_29', 'feature_30', 
                   'feature_31', 'feature_32', 'feature_34', 'feature_35', 
                   'feature_36', 'feature_37', 'feature_38', 'feature_39', 
                   'feature_40', 'feature_41', 'feature_42', 'feature_43', 
                   'feature_44', 'feature_45', 'feature_46', 'feature_47', 
                   'feature_48', 'feature_49', 'feature_51', 'feature_52', 
                   'feature_54', 'feature_55', 'feature_56', 'feature_57', 
                   'feature_59', 'feature_60', 'feature_61', 'feature_62', 
                   'feature_63', 'feature_64', 'feature_66', 'feature_68', 
                   'feature_69', 'feature_70', 'feature_71', 'feature_72', 
                   'feature_73', 'feature_74', 'feature_75', 'feature_76', 
                   'feature_77', 'feature_78', 'feature_79', 'feature_82', 
                   'feature_83', 'feature_86', 'feature_87', 'feature_89', 
                   'feature_90', 'feature_91', 'feature_92', 'feature_95', 
                   'feature_96', 'feature_97', 'feature_98', 'feature_99', 
                   'feature_100', 'feature_101', 'feature_102', 'feature_103', 
                   'feature_104', 'feature_105', 'feature_106', 'feature_107', 
                   'feature_108', 'feature_109', 'feature_111', 'feature_112', 
                   'feature_113', 'feature_115', 'feature_116', 'feature_118', 
                   'feature_120', 'feature_121', 'feature_123', 'feature_124', 
                   'feature_125', 'feature_126', 'feature_127', 'feature_129']

# feature_0 is declared categorical only when it is among the selected features.
categ_vars = ['feature_0'] if 'feature_0' in ut_pos_features else []

# Model Evaluation Functions

In [None]:
def utility_function(X, model):
    """Score ``model`` on ``X`` with a date-aggregated, weighted-return utility.

    Steps, as implemented below:

    1. ``action`` is the positive-class probability from
       ``model.predict_proba(X)`` rounded to 0/1.
    2. ``resp`` (and ``weight``, when ``X`` does not carry it) are left-joined
       from the module-level ``df_train`` on ``ts_id``.
    3. ``p`` is the per-date sum of ``weight * resp * action``.
    4. ``t = sum(p) / sqrt(sum(p**2)) * sqrt(250 / n_dates)`` and the result
       is ``min(max(t, 0), 6) * sum(p)``.

    This appears to follow the Jane Street Market Prediction utility
    definition; confirm against the competition page if exactness matters.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows. ``X.reset_index()`` must expose ``ts_id`` and ``date``
        columns.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    float
        The utility ``u``. ``0.0`` is returned when there are no dates or
        every daily total is zero (previously NaN / ZeroDivisionError).
    """
    data = X.copy()
    data = data.reset_index().set_index('ts_id')

    # Hard 0/1 decision from the positive-class probability.
    data['action'] = np.round(model.predict_proba(X)[:, 1])

    # Select the needed columns BEFORE reset_index so the full training frame
    # is not copied on every call. 'ts_id' is picked up whether it is an index
    # level or a regular column of df_train.
    lookup_cols = ['resp'] if 'weight' in data.columns else ['resp', 'weight']
    key_cols = ['ts_id'] if 'ts_id' in df_train.columns else []
    lookup = df_train[key_cols + lookup_cols].reset_index()[['ts_id'] + lookup_cols]
    data = data.reset_index().merge(lookup, how='left', on='ts_id').set_index('ts_id')

    # Cast to float64 first: weight/resp may have been downcast by
    # reduce_mem_usage, and the product should not be formed in a narrow dtype.
    data['prod'] = (
        data['weight'].astype(np.float64)
        * data['resp'].astype(np.float64)
        * data['action']
    )
    data_agg = data.groupby('date')['prod'].sum()

    norm = np.sqrt(np.power(data_agg, 2).sum())
    if len(data_agg) == 0 or norm == 0:
        # No trades taken (or no rows at all): the utility is zero, not NaN.
        return 0.0

    t = data_agg.sum() / norm * np.sqrt(250 / len(data_agg))

    u = min(max(t, 0), 6) * data_agg.sum()

    return u

# Check
# utility_function(X_test_hyperopt, model)

In [None]:
def predict_compute_metrics(X, y, model):
    """Print the utility score of ``model`` on ``X`` and return its probabilities.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; passed to ``model.predict_proba`` and to
        ``utility_function``.
    y : array-like
        True labels. Not used by the current implementation; kept so existing
        callers keep working.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        Second column of ``model.predict_proba(X)`` (positive-class probability).
    """
    predictions = model.predict_proba(X)[:, 1]

    # The former `model.predict(X)` call was removed: its result was never
    # used and it cost a full extra prediction pass over X.
    print("Utility function: " + str(np.round(utility_function(X, model), decimals=4)))

    return predictions

# Model Fitting & Results

In [None]:
from catboost import CatBoostClassifier

# Hyper-parameters following a HyperOpt run.
catboost_params = {
    'border_count': 70,
    'class_weights': (1, 0.9710117916727243),
    'depth': 5,
    'iterations': 875,
    'l2_leaf_reg': 37.89277395675028,
    'learning_rate': 0.0683720060874196,
    'logging_level': 'Silent',
    'task_type': 'CPU',
}
model_tv = CatBoostClassifier(**catboost_params)

# Fit on the selected feature columns of the train+validation partition.
model_tv.fit(
    X_trainval.loc[:, ut_pos_features],
    y_trainval,
    cat_features=categ_vars,
)

In [None]:
def _score_split(header, X_part, y_part):
    """Print ``header``, then score ``model_tv`` on the selected feature columns."""
    print(header)
    return predict_compute_metrics(X_part.loc[:, ut_pos_features], y_part, model_tv)


pred_train = _score_split("Train & validation: ", X_trainval, y_trainval)
pred_test = _score_split("Test: ", X_test, y_test)

In [None]:
import janestreet
from tqdm.notebook import tqdm

env = janestreet.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # Mirror the training preprocessing: the model was fit on mean-imputed
    # features, so missing test values are filled with the same training
    # column means (fillna_dict). Keys without a matching column are ignored
    # by DataFrame.fillna.
    test_features = test_df.loc[:, ut_pos_features].fillna(fillna_dict)
    sample_prediction_df['action'] = model_tv.predict(test_features)  # 0/1 prediction
    env.predict(sample_prediction_df)

In [None]:
!rm -r /kaggle/working/catboost_info