This notebook was implemented by following the official CatBoost documentation:

https://catboost.ai/

# Kaggle Code

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import janestreet
# Create the competition environment and its test iterator up front;
# both are used again by the submission loop at the end of the notebook.
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

# Sys

The commands used below are designed for a GPU environment; I collected them from various notebooks.

Kaggle provides free access to NVIDIA K80 GPUs in kernels.

A GPU kernel gives you a Tesla P100 GPU with 16 GB of VRAM, plus 13 GB of RAM and a 2-core Intel Xeon CPU. The no-GPU option gives you 4 cores and 16 GB of RAM, hence more CPU power.

In [None]:
import subprocess
from ast import literal_eval
import multiprocessing
from pynvml import *
import torch



def run(command):
    """Run a shell command and print what it wrote to standard output.

    Parameters
    ----------
    command : str
        Command line passed to the shell (``shell=True``), so pipes and
        redirections work. Only pass trusted, hard-coded strings.

    Returns
    -------
    None
        The command's stdout is decoded as UTF-8, stripped and printed.
        Anything the command wrote to stderr is echoed to ``sys.stderr``
        so that failures are visible instead of being silently lost.
    """
    import sys  # local import: `sys` is not among the cell's imports

    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,  # without this pipe the error stream is never captured
    )

    # errors='replace' keeps unexpected bytes from raising UnicodeDecodeError
    print(completed.stdout.decode('utf-8', errors='replace').strip())

    error_text = completed.stderr.decode('utf-8', errors='replace').strip()
    if error_text:
        print(error_text, file=sys.stderr)
    

    
# CPU summary: the first "model name" and "cpu cores" entries of /proc/cpuinfo
# plus the CPU count reported by multiprocessing.
# `grep -E` is used instead of `egrep` (obsolescent; recent GNU grep prints a
# warning for it), and grep reads the file directly instead of through `cat`.
print('### CPU ###')
run('grep -E -m 1 "^model name" /proc/cpuinfo')
print("cpu count       :", multiprocessing.cpu_count())
#run('grep -E -m 1 "^cpu MHz" /proc/cpuinfo')
run('grep -E -m 1 "^cpu cores" /proc/cpuinfo')



# This section runs only in a GPU environment: it queries NVML for the memory
# of GPU 0. The values are GPU memory, not system RAM, hence the header.
print('\n### GPU RAM ###')
#run('cat /proc/meminfo | egrep "^MemTotal"')
BYTES_PER_GB = 1024 ** 3  # same divisor as the literal 1073741824
nvmlInit()
try:
    h = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(h)
    print(f'Total    : {info.total / BYTES_PER_GB} GB')
    print(f'Used     : {info.used  / BYTES_PER_GB} GB')
    print(f'Free     : {info.free  / BYTES_PER_GB} GB')
finally:
    # pair every nvmlInit() with nvmlShutdown(), even if a query above fails
    nvmlShutdown()



# Kernel / operating system identification
print('\n### OS ###')
run('uname -a')



def report_gpu(device_index=0):
    """Print the name and current PyTorch memory usage of a CUDA device.

    Parameters
    ----------
    device_index : int, default 0
        Index of the CUDA device to describe.

    Returns
    -------
    None
        The report is printed. When PyTorch sees no CUDA device, a short
        notice is printed instead of raising, so the cell also runs on a
        CPU-only session.
    """
    print('\n### GPU ###')
    #run('lspci | grep VGA')
    if not torch.cuda.is_available():
        print('Device name      : no CUDA device available')
        return

    bytes_per_gb = 1024 ** 3
    print('Device name      :', torch.cuda.get_device_name(device_index))
    print('Memory Allocated :', round(torch.cuda.memory_allocated(device_index) / bytes_per_gb, 1), 'GB')
    print('Memory Cached    :', round(torch.cuda.memory_reserved(device_index) / bytes_per_gb, 1), 'GB')


report_gpu()

# Libs

In [None]:
%%time

import cudf as cu
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool
from sklearn.feature_selection import SelectFromModel
from tqdm.notebook import tqdm
import gc

# Data

In [None]:
%%time

# Parse the training CSV with cuDF, then convert the result to a pandas DataFrame.
# Author's note: cuDF is sensitive about dtypes and uses `at`/`iat` instead of `loc`/`iloc`.
train_csv_path = '/kaggle/input/jane-street-market-prediction/train.csv'
data = cu.read_csv(train_csv_path).to_pandas()  # the temporary cuDF frame is not kept around

#backup = data
#train = backup_train

# NaN

In [None]:
# # drop the columns that have more than 10% NaN

# print(train.columns.shape)
# missing_val = pd.DataFrame(train.isna().sum().sort_values(ascending=True)*100/train.shape[0],columns=['missing %'])[:138-14]
# missing_val.style.background_gradient(cmap='Oranges_r')
# features = missing_val.index
# print(features.shape)

# train = train[features]
# print(train.shape)

In [None]:
# Every column whose name contains 'feature' is a model input;
# missing values in each of them are replaced by that column's mean.
features = [column for column in data.columns if 'feature' in column]
for column in features:
    data[column] = data[column].fillna(data[column].mean())

# Split

In [None]:
print(data.shape)

# Prepare the data before splitting it into train and validation sets.
# The label is added to the full frame *before* filtering: assigning a new
# column to the filtered slice is what triggers pandas' SettingWithCopyWarning.
data['action'] = (data['resp'] > 0).astype('int64')  # 1 when resp is positive, else 0
data = data[data['weight'] != 0]                      # keep only rows with non-zero weight
print(data.shape)
#features = data.columns.str.contains('feature')

# The model is fitted on a reproducible random 75% sample of the rows.
train = data.sample(frac=0.75, random_state=73)
print(train.shape)
# Do not reset the index before the drop below: the validation rows are
# identified through the original index labels of the sampled rows.

valid = data.drop(index=train.index)  # rows that were not sampled into train
print(valid.shape)
assert len(train) + len(valid) == len(data), "train/valid split lost or duplicated rows"


# Fresh 0..n-1 indices (rebinding instead of inplace=True)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

# it will help us track the model
#test  = data.tail(500).reset_index(drop=True)

# features / label of the training set
X_train = train.loc[:, features]
y_train = train.loc[:, 'action']

# features / label of the (currently unused) test set
#X_test = test.loc[:, features]
#y_test = test.loc[:, 'action']


# features / label of the validation set
X_valid = valid.loc[:, features]
y_valid = valid.loc[:, 'action']

# Feature Selection
### using CatBoostClassifier

In [None]:
%%time

# Fit a default CatBoost classifier on the GPU only to rank the features by importance.
# Author's timing note: without task_type='GPU' the fit takes 42min, with GPU 46s.
selector = CatBoostClassifier(thread_count=-1, task_type="GPU", devices='-1', random_seed=73)
selector.fit(Pool(X_train, y_train), verbose=100)

# Pair each column name with its importance score
list_of_tuples = list(zip(X_train.columns.values, selector.get_feature_importance()))

# Table of features ordered from least to most important
df = pd.DataFrame(list_of_tuples, columns=['feat_labels', 'feature_importances'])
df = df.sort_values(by='feature_importances').reset_index(drop=True)
df.head()

# Update Split Data

In [None]:
# `df` is sorted by ascending importance, so its last 111 rows are the top-ranked features
# (author's note: features importance > 0.003).
df1 = df.tail(111)
features = df1["feat_labels"].tolist()

# Restrict both feature matrices to the selected columns.
X_train = X_train[features]
#X_test = X_test.loc[:, features]
#print(train.shape)
print(X_train.shape)
#print(X_test.shape)

X_valid = X_valid[features]
print(X_valid.shape)

# Hyperparameter Tuning
### using CatBoostClassifier Randomized Search
##### Don't run this cell; it takes a lot of time. The result is used in the next cell.
### V1: I have changed some grid params

In [None]:
#%%time

#model = CatBoostClassifier(
#                thread_count = -1, task_type = "GPU", devices = '-1', random_seed = 73, 
#                bootstrap_type = 'Poisson', verbose = 10, name = 'V1')

# my core number is 4
# I have only one GPU, so no need to -1 for activating all the gpu's but instead 0:1
# save_snapshot=True, snapshot_file="V0", snapshot_interval=600 not supported for randomized search

#grid = {  'depth'           :[3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
#          'iterations'      :[1000, 250, 100, 500],
#          'learning_rate'   :[0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
#          'l2_leaf_reg'     :[3, 1, 5, 10, 100],
#          'border_count'    :[32, 5, 10, 20, 50, 100, 200],
#          }
# 'loss_function'   :['Logloss', 'CrossEntropy'], currently not supported in grid search
# 'thread_count'    :4 error non iterable
# 'ctr_border_count':[50,5,10,20,100,200] error not a map

#grid_search_result = model.randomized_search(  grid, 
#                                               Pool(X_train, y_train), # X = X_train, y = y_train
#                                               cv         = 3,
#                                               n_iter     = 10,
#                                               refit      = False,
#                                               shuffle    = True,
#                                               stratified = True,
#                                               train_size = 0.75,
#                                               plot       = True)
# search_by_train_test_split = Split the source dataset into train and test parts. 
# Models are trained on the train part, while parameters are compared by the loss function score on the test dataset.


#print(grid_search_result['params'])

# Model

In [None]:
# Parameters noted from the randomized search (kept for reference):
# {'border_count': 200, 'depth': 10, 'l2_leaf_reg': 1, 'iterations': 250, 'learning_rate': 0.5}

# Wrap features and labels in CatBoost Pools (author's note: the fastest way to pass the data).
train_data = Pool(data=X_train, label=y_train)  # weight=[0.1, 0.2...]
valid_data = Pool(data=X_valid, label=y_valid)

# All constructor arguments in one place so they are easy to read and tune.
model_params = {
    'border_count': 32,
    'depth': 5,
    'l2_leaf_reg': 3.5,
    'thread_count': -1,
    'iterations': 100,
    'learning_rate': 0.5,
    'task_type': "GPU",
    # NOTE(review): per the CatBoost docs '0:1' lists device IDs 0 and 1;
    # for a single GPU '0' would be the matching value - confirm.
    'devices': '0:1',
    'bootstrap_type': 'Poisson',
    'random_seed': 73,
    'verbose': 100,
    'name': 'V3',
    # use_best_model=True requires a validation dataset: it is used to pick the
    # iteration with the optimal value of the metric given in eval_metric.
    'use_best_model': True,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
}
# rsm=0.98 not supported on GPU
model = CatBoostClassifier(**model_params)

model.fit(train_data, eval_set=valid_data)  # eval_set=(X_test, y_test) Pool(X_train, y_train)
#y_pred = model.predict(X_test)

#print("\n\n",model.predict(X_test))
#print("\n\n",classification_report(y_test, y_pred))

# Save and Load

In [None]:
#model.save_model(fname = "/model_v2",
#                   format="cbm",
#                   export_parameters=None, # Additional format-dependent parameters for Apple CoreML ONNX-ML PMML
#                   pool=None) # This parameter is required if the model contains categorical features and the output format is cpp, python, or JSON.

In [None]:
#model = CatBoostClassifier()      
#model.load_model("/model_v1")

# Submission

In [None]:
# Fill values are computed once from the training features so that inference
# uses the same imputation as training. Taking the mean of each incoming test
# batch instead gives batch-dependent values and leaves NaN in place whenever
# every value of a column in the batch is missing.
train_feature_means = X_train[features].mean()

for (test_df, sample_prediction_df) in tqdm(iter_test):

    # One vectorized fill; .fillna returns a new frame, so test_df itself is not modified.
    X_test = test_df.loc[:, features].fillna(train_feature_means)

    y_preds = model.predict(X_test)

    # Write the predicted actions into the template and hand it back to the API.
    sample_prediction_df.action = y_preds
    env.predict(sample_prediction_df)