In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Optimizing compiler for NumPy functions
from numba import njit

# To manipulate dataframes with speed and big data support
import datatable as dtable

# make your loops show a smart progress meter
from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score
import gc
from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the read-only input
# directory, then print the paths one per line in walk order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Steps Ahead**
1. Loading the data with less storage memory
2. Data Cleaning
    1. Eliminating unnecessary rows and selecting necessary columns 
    2. Handling Missing Values
    3. Creating the target variable, i.e. `action`.
3. Modeling
4. Fitting the best hyperparameters on the purged group time-series splits
5. Submission

# Data Loading with Reduced Memory Storage

In [None]:
%%time
# Parse the CSV with datatable, then hand the result to pandas, which the
# rest of the notebook uses for data manipulation.
train_original = dtable.fread('../input/jane-street-market-prediction/train.csv').to_pandas()

# Downcast every float64 column to float32 so the frame takes less RAM.
float64Cols = train_original.select_dtypes(include=['float64']).columns
dict_astype = dict.fromkeys(float64Cols, 'float32')
train = train_original.astype(dict_astype)

# The float64 copy is no longer needed once the downcast frame exists.
del train_original

train.head()

# Data Cleaning
1. Eliminating unnecessary rows and selecting necessary columns 
2. Handling Missing Values
3. Creating the target variable, i.e. `action`.

In [None]:
# Selecting the feature columns used for modeling
features = [col for col in train.columns if 'feature' in col]

# Eliminating rows with weight = 0 as these are excluded in scoring evaluation
train = train[train['weight'] > 0].reset_index(drop = True)

# Count the columns containing NA's BEFORE filling them: once the fill has
# run the count is trivially 0 and tells the reader nothing.
print("Number of features with NA's: ", int(train.isna().any().sum()))

# Replace NA's with the sentinel value -999 (not the mean). The XGBoost
# parameters used later in the notebook set 'missing': -999, so the two
# values must stay in sync.
train.fillna(-999, inplace=True)
assert not train.isna().any().any(), "NA's remain after filling"

# Deciding the target column: action = 1 only for trades with a positive return
train['action'] = (train['resp'] > 0).astype(int)

In [None]:
@njit(fastmath = True)
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score u = min(max(t, 0), 6) * sum(Pi).

    Pi is the per-date sum of weight * resp * action (built with
    np.bincount, so `date` must hold non-negative integers) and
    t = sum(Pi) / sqrt(sum(Pi ** 2)) * sqrt(250 / len(Pi)).

    Parameters
    ----------
    date : 1-D integer array of date ids, one per row.
    weight, resp, action : 1-D arrays aligned with `date`.

    Returns
    -------
    float
        The utility score; 0.0 when every Pi is zero (e.g. no trade is
        taken) or when the input is empty.
    """
    Pi = np.bincount(date, weight * resp * action)
    sum_sq = np.sum(Pi ** 2)
    # Guard the division: with all-zero (or empty) Pi both the denominator
    # sqrt(sum_sq) and possibly len(Pi) are zero, which would raise a
    # ZeroDivisionError inside the jitted function. The utility is 0 then.
    if sum_sq == 0:
        return 0.0
    total = np.sum(Pi)
    t = total / np.sqrt(sum_sq) * np.sqrt(250 / len(Pi))
    u = min(max(t, 0), 6) * total
    return u

from datetime import datetime
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without a (truthy) ``start_time`` it returns the current
    ``datetime``. Called with that value later, it prints the elapsed time
    split into hours, minutes and seconds and returns ``None``.
    """
    # Guard clause: nothing to measure yet, so hand back the starting point.
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# weighted average as per Donate et al.'s formula
# https://doi.org/10.1016/j.neucom.2012.02.053
def weighted_average(a):
    """Return the average of ``a`` with weights that double towards the end.

    For a sequence of length n, position j (1-based) gets weight
    1 / 2**(n + 1 - j); position 1 reuses the weight of position 2, so the
    first two entries are weighted equally and the last entry weighs most.
    """
    n = len(a)
    # max(j, 2) makes the first position share the exponent of the second.
    w = [1 / (2 ** (n + 1 - max(j, 2))) for j in range(1, n + 1)]
    return np.average(a, weights=w)

In [None]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://www.kaggle.com/marketneutral/purged-rolling-time-series-cv-split
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.

    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Groups are ordered by their first appearance in ``groups``; the test
    windows are taken from the end of that ordering and every training
    window ends ``group_gap`` groups before its test window starts.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    group_gap : int, default=None
        Number of groups left out between the end of the training window
        and the start of the test window. ``None`` is treated as 0.
        The same number is also used to drop that many leading *samples*
        from each test set.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size

    @staticmethod
    def _gather(index_chunks):
        """Merge per-group index arrays into one ascending index array."""
        if len(index_chunks) == 0:
            return np.empty(0, dtype=np.intp)
        return np.sort(np.concatenate(index_chunks))

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : list of int
            The training set indices for that split, in ascending order.
        test : list of int
            The testing set indices for that split, in ascending order.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer groups than folds, or
            if ``group_gap`` leaves no training groups before the first test
            window.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        # The constructor default is None; treat it as "no gap" so the index
        # arithmetic below does not fail with a TypeError.
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        groups = np.asarray(groups)
        sorted_groups, first_seen, counts = np.unique(
            groups, return_index=True, return_counts=True)
        n_groups = len(sorted_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # A stable argsort places the sample indices of each group next to
        # each other (ascending inside a group); splitting at the cumulative
        # counts yields one index array per group in sorted-label order.
        order = np.argsort(groups, kind='stable')
        per_sorted_group = np.split(order, np.cumsum(counts)[:-1])
        # Re-order the per-group arrays by first appearance of the group.
        group_indices = [per_sorted_group[i] for i in np.argsort(first_seen)]

        group_test_size = int(min(n_groups // n_folds, max_test_group_size))
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)

        # A non-positive end of the first training window would either give
        # an empty training set or, as a negative slice end, silently pull
        # later groups into the training set.
        if group_test_starts[0] - group_gap <= 0:
            raise ValueError(
                ("group_gap={0} leaves no training groups before the first"
                 " test window, which starts at group position {1}").format(
                     group_gap, group_test_starts[0]))

        for group_test_start in group_test_starts:
            train_stop = group_test_start - group_gap
            train_start = int(max(0, train_stop - max_train_group_size))
            train_array = self._gather(group_indices[train_start:train_stop])

            test_array = self._gather(
                group_indices[group_test_start:
                              group_test_start + group_test_size])
            # NOTE(review): this drops the first `group_gap` SAMPLES (not
            # groups) of the test window; kept as-is so the produced splits
            # do not change - confirm this is intended.
            test_array = test_array[group_gap:]

            yield train_array.tolist(), test_array.tolist()

In [None]:

# Splitter used for the preview: 3 splits, a 63-group gap between train and
# test, and test windows capped at 63 groups.
cv = PurgedGroupTimeSeriesSplit(
    n_splits=3,
    #max_train_group_size=150,#208, # 262
    group_gap=63, #84,#105
    max_test_group_size=63# 108 #131
)

# Print, for every split, the dates that end up in the training part and
# then the dates that end up in the test part.
split_iter = cv.split(
    train.loc[:, features].values,
    train['action'].values,
    groups=train['date'].values,
)
for tr, te in split_iter:
    for part_idx in (tr, te):
        print(train.loc[part_idx, 'date'].unique())
    


# Modeling

**Modeling Steps**

1. Split numbers have been derived from another script.
2. The hyperparameters below were derived with Optuna, a Bayesian optimizer.
3. Fit the hyperparameters to 3 splits. 
4. Evaluate the utility scores for train and test data for each split. 
5. Perform the weighted average of training and test scores across splits.


In [None]:
params = {
                'random_state': 13,
                'objective':'binary:logistic',
                'missing': -999,
                'tree_method': 'gpu_hist',
                'n_estimators': 507,
                'max_depth': 8,
                'learning_rate': 0.06992689459063349,
                'subsample': 0.8012753784867586,
                'colsample_bytree': 0.8419498887494685,
                'gamma': 9
}

# NOTE: no model is fitted on the full training data here. Such a fit was
# overwritten by the first per-fold fit and only cost training time.

cv = PurgedGroupTimeSeriesSplit(
    n_splits=3,
    #max_train_group_size=499,
    group_gap=63,
    max_test_group_size=63
)

auc_scores = []
utilityscores_train = []
predited_utilityscores_test = []
actual_utilityscores_test = []
models = []

start_time = timer(None)
for fold, (tr, te) in enumerate(cv.split(train.loc[:,features].values, train['action'].values, train['date'].values)):
    X_tr, X_val = train.loc[tr, features].values, train.loc[te, features].values
    y_tr, y_val = train.loc[tr, 'action'].values, train.loc[te, 'action'].values

    # A fresh estimator per fold: re-fitting one shared object would make
    # every entry of `models` point to the model of the last fold.
    clf = XGBClassifier(**params)
    clf.fit(X_tr, y_tr)
    models.append(clf)

    # Hard 0/1 decisions drive the utility score ...
    val_pred = clf.predict(X_val)
    # ... while AUC is a ranking metric and needs the class-1 probabilities.
    val_proba = clf.predict_proba(X_val)[:, 1]

    # Calculating Metrics AUC & Utility score
    score = roc_auc_score(y_val, val_proba)
    auc_scores.append(score)

    # Utility obtained on the validation window with the predicted actions
    predicted_utilityscore_test = utility_score_numba(train.loc[te,'date'].values,
                              train.loc[te,'weight'].values,
                              train.loc[te,'resp'].values,
                              val_pred)
    predited_utilityscores_test.append(predicted_utilityscore_test)

    # Utility on the same window when the true labels are used as actions
    actual_utilityscore_test = utility_score_numba(train.loc[te,'date'].values,
                              train.loc[te,'weight'].values,
                              train.loc[te,'resp'].values,
                              y_val)
    actual_utilityscores_test.append(actual_utilityscore_test)

    # Utility on the training window, again with the true labels as actions
    utilityscore_train = utility_score_numba(train.loc[tr,'date'].values,
                              train.loc[tr,'weight'].values,
                              train.loc[tr,'resp'].values,
                              train.loc[tr,'action'].values)
    utilityscores_train.append(utilityscore_train)

    # Free the per-fold arrays before the next (larger) fold is materialised
    del val_pred, val_proba, X_tr, X_val, y_tr, y_val, score, predicted_utilityscore_test, utilityscore_train, actual_utilityscore_test

#Calculating weighted averages of Utility scores through all the folds
utility_train_avg = weighted_average(utilityscores_train)
predicted_utility_test_avg = weighted_average(predited_utilityscores_test)
actual_utility_test_avg = weighted_average(actual_utilityscores_test)

print(f'Utility Score for train data is {utility_train_avg}')
print(f'Predicted Utility Score for test data is {predicted_utility_test_avg}')
print(f'Actual Utility Score for test data is {actual_utility_test_avg}')
print('Finished training the classifier.') 

print('Actual Test Utility scores: ',actual_utilityscores_test)
print('Predicted Test Utility scores: ',predited_utilityscores_test)

gc.collect()

timer(start_time)


**Training Results V6**

* Utility Score for train data is           98746.66189362355<br>
* Predicted Utility Score for test data is  510.0560209681661<br>
* Actual Utility Score for test data is     55089.58933249386<br>

Time taken: 0 hours 1 minutes and 27.91 seconds.

**Training Results V7**

* Utility Score for train data is           143333.9253399343<br>
* Predicted Utility Score for test data is  602.1926323968815<br>
* Actual Utility Score for test data is     27379.270573522677<br>

Time taken: 0 hours 1 minutes and 48.26 seconds.

# Submission

In [None]:
import janestreet
# Create the environment object exposed by the `janestreet` module
# (presumably the competition's submission API - confirm against its docs).
env = janestreet.make_env()
# Iterator consumed by the prediction loop further down; that loop unpacks
# each item into (test_df, sample_prediction_df).
env_iter_test = env.iter_test()

In [None]:
@njit
def fast_fillna(array, value):
    """Return ``array`` with every NaN replaced by ``value``.

    The sum of the array is NaN whenever at least one element is NaN, so a
    single reduction decides whether any replacement work is needed. When
    the sum is not NaN the input array itself is returned unchanged.
    """
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), value, array)

In [None]:
# The test set does not have a 'resp' column, so 'action' cannot be derived
# from it as in training. Instead the classifier trained on
# action = (resp > 0) predicts the action directly from the feature columns.

# NOTE(review): opt_th is defined but never applied; predictions come from
# `predict`, not from thresholding probabilities. Kept for reference.
opt_th = 0.503
for (test_df, sample_prediction_df) in tqdm(env_iter_test):
    if test_df['weight'].item() > 0:

        test = test_df.loc[:,features].values
        # Same -999 sentinel for missing values as used on the training data
        test[0, :] = fast_fillna(test[0, :], -999)

        # Predict with the model of the last fold. Indexing with -1 instead
        # of a fixed position keeps this valid if the number of CV splits
        # changes. (Voting across several fold models was an earlier idea
        # and is not used.)
        y_preds = models[-1].predict(test)

    else:
        # Rows with weight 0 are not traded
        y_preds = np.array([0])

    sample_prediction_df.action = y_preds

    env.predict(sample_prediction_df)

## References:

1. Time-efficient method for replacing missing values

https://www.kaggle.com/c/jane-street-market-prediction/discussion/201302

https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function?scriptVersionId=48926407

Decommissioning the operation below, as it is time-consuming.<br>
test.fillna(-999,inplace=True)<br>
Using a numba-compiled version of np.where instead to speed up filling NAs with -999.

2. Group Time series split

https://www.kaggle.com/jorijnsmit/found-the-holy-grail-grouptimeseriessplit

3. Purged Group Time series split

https://www.kaggle.com/marketneutral/purged-time-series-cv-xgboost-optuna

4. Grabbed the ideas of classifier voting in test set from the below notebook

https://www.kaggle.com/isaienkov/jane-street-market-prediction-xgb-kfold-rfe
