In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import pandas as pd
import numpy as np

import fastai
from   fastai.callback import *
from   fastai.callback.all import *
from   fastai.callback.training import GradientClip
from   fastai.callback.all import SaveModelCallback, EarlyStoppingCallback, ReduceLROnPlateau 
from   fastai.tabular import *
from   fastai.tabular.data import *
from   fastai.tabular.all import *
from   fastai.tabular.all import TabularPandas, RandomSplitter, CategoryBlock, MultiCategoryBlock, range_of, accuracy, tabular_learner, TabularDataLoaders
from   fastai.learner import Learner
from   fastai.metrics import RocAucMulti

from   sklearn.pipeline import Pipeline, FeatureUnion
from   sklearn.impute import SimpleImputer
from   sklearn.preprocessing import PolynomialFeatures, StandardScaler
from   sklearn.compose import ColumnTransformer
from   sklearn.ensemble import RandomForestClassifier
from   sklearn.base import BaseEstimator, TransformerMixin

import torch.nn as nn
from   torch.nn import CrossEntropyLoss, MSELoss
from   torch.nn.modules.loss import _WeightedLoss

import pickle
from   functools import partial
import warnings
warnings.filterwarnings ("ignore")

In [None]:
# Global Vars
# Notebook-wide placeholders. preprocess_data () assigns PIPE and N_FEATURES;
# DF is assigned by the cell that calls it.
TP = DF = DLs = PIPE = None
BS          = 10000   # presumably the batch size -- not referenced in the cells shown here
N_FEATURES  = 0       # overwritten with the number of feature columns once the training data is processed

In [None]:
def preprocess_data (filename='../input/jane-street-market-prediction/train.csv', df=None, isTrainData=True):
    """
    Turn raw rows into an imputed, standardised feature frame.

    filename:    CSV to read; only used when isTrainData is True
    df:          raw frame to transform; only used when isTrainData is False
    isTrainData: selects the training path (read + fit) or the inference path (transform only)

    Training path: reads the CSV, keeps rows with date > 85, builds the target as the
    row-wise median of the five resp columns, fits a mean-imputer + standard-scaler
    pipeline on the remaining columns and stores it in the global PIPE, and records the
    number of feature columns in the global N_FEATURES. Returns a DataFrame with columns
    feature_0..feature_{n-1} followed by 'Y'.

    Inference path: drops 'weight' and 'date' from df, pushes the rest through the
    already-fitted global PIPE and returns a DataFrame with columns feature_0..feature_{n-1}.
    """
    global PIPE, N_FEATURES

    if not isTrainData:
        raw_features = df.drop (columns=['weight', 'date']).reset_index (drop = True)
        scaled       = PIPE.transform (raw_features)
        result       = pd.DataFrame (scaled)
        result.columns = [f"feature_{i}" for i in range (scaled.shape[1])]
        return result

    # Explicit dtypes keep the large CSV in float32 instead of pandas' default float64.
    dtype = {
        'date'   : 'int64',
        'weight' : 'float32',
        'resp'   : 'float32',
        'ts_id'  : 'int64',
    }
    dtype.update ({'feature_' + str (i): 'float32' for i in range (130)})

    resp_cols = ['resp_1', 'resp_2', 'resp_3','resp_4', 'resp']
    raw       = pd.read_csv (filename, dtype=dtype)
    raw       = raw.query ('date > 85').reset_index (drop = True)

    # Regression target: median over the five response horizons, one value per row.
    target    = np.median (np.stack ([raw[c] for c in resp_cols]).T, axis=1)
    raw.drop (columns=['weight', 'date', 'ts_id']+resp_cols, inplace=True)

    PIPE = Pipeline ([
        ("imputer", SimpleImputer (missing_values=np.nan, strategy='mean')),
        ("stndard", StandardScaler ()),
    ])
    scaled    = PIPE.fit_transform (raw)
    f_columns = [f"feature_{i}" for i in range (scaled.shape[1])]

    result         = pd.DataFrame (np.hstack ((scaled, target.reshape ((-1,1)))))
    result.columns = f_columns + ['Y']
    N_FEATURES     = len (f_columns)
    return result

In [None]:
# Build the scaled training frame; preprocess_data () also fits the global PIPE as a side effect.
DF = preprocess_data ()
# Open with "wb", not "ab": in append mode every re-run of this cell stacks another pickle
# behind the previous one, and a later pickle.load () only returns the first object in the
# file -- i.e. the stale pipeline from the oldest run.
with open ("PIPE.bin", "wb") as f:
    pickle.dump (PIPE, f)
# DF = DF.sample (DF.shape[0]//10)
# Split the target off so DF holds features only.
Y = DF.pop ('Y')
gc.collect ()

In [None]:
# Show the shapes of the feature frame and of the target split off from it.
DF.shape, Y.shape

In [None]:
from scipy.stats import pearsonr
# colname -> index into the unaryFuncs list of the transform that replaced that column.
# Filled in by replaceFeature ().
UNARY_TRACKER = dict ()

def replaceFeature (unaryFuncs, df, y):
    """
    Replace each column of df by its best-correlating element-wise transform.

    unaryFuncs: list of functions, each mapping a 1-D array to an array of the same length
    df:         DataFrame of features; modified in place
    y:          target values, aligned with df's rows

    For every column the absolute Pearson correlation with y is computed for the raw
    values and for each transform of the raw values. If some transform is strictly
    better than the raw column, the column is overwritten with the best one and its
    index in unaryFuncs is stored in the module-level UNARY_TRACKER under the column name.
    An undefined (NaN) correlation counts as 0.

    Returns a list with the best absolute correlation found per column, in df.columns order.
    """
    col_corrs = []
    for colname in df.columns:

        # Snapshot the column: .values may share memory with the frame, and every candidate
        # must be evaluated on the untouched original so that the single index recorded in
        # UNARY_TRACKER fully describes what was done to the column.
        data = df[colname].values.copy ()
        max_corr = abs (pearsonr (data, y)[0])
        # Same NaN guard as for the candidates below; without it a constant column would
        # put NaN into col_corrs and poison any mean / quantile taken over the result.
        if np.isnan (max_corr):
            max_corr = 0.0

        best_i, best_data = -1, None
        for i, unaryFunc in enumerate (unaryFuncs):

            transformed_data = unaryFunc (data)
            corr = abs (pearsonr (transformed_data, y)[0])
            if np.isnan (corr):
                corr = 0.0
            if corr > max_corr:
                max_corr, best_i, best_data = corr, i, transformed_data

        # Write the winner once, after all candidates have been scored.
        if best_i > -1:
            df[colname] = best_data
            UNARY_TRACKER[colname] = best_i
        col_corrs.append (max_corr)
    return col_corrs

In [None]:
# Candidate element-wise transforms. UNARY_TRACKER stores positions in this list,
# so the order must not change once the tracker has been written.
UNARY_FUNCS = [
    lambda x: np.sign(x)*x**2,                                          # signed square
    lambda x: x**3,                                                     # cube
    lambda x: np.sign(x)*np.sqrt(np.abs(x)),                            # signed square root
    lambda x: np.sign(x)*np.log(np.abs(x) + (np.abs(x)<1e-4)*1e-4),     # signed log; 1e-4 added where |x| < 1e-4
    lambda x: 1/(x + (np.abs(x)<1e-4)*1e-4),                            # reciprocal; 1e-4 added where |x| < 1e-4
]
col_corrs   = replaceFeature (UNARY_FUNCS, DF, Y)
mean_feat_corr = np.mean (col_corrs)
# Open with 'wb', not 'ab': appending on every re-run leaves the oldest tracker at the front
# of the file, and that first object is the only one a single pickle.load () returns.
with open ('UNARY_TRACKER.bin', 'wb') as pfile:
    pickle.dump (UNARY_TRACKER, pfile)
mean_feat_corr

In [None]:
# Rank the per-column correlations from strongest to weakest and show the top 20.
# Note: after this sort, positions in col_corrs no longer line up with DF.columns.
col_corrs = sorted (col_corrs, reverse=True)
col_corrs[:20]

In [None]:
# CORR_THRESH = np.mean (col_corrs)
# Bar an interaction feature has to clear: the 97th percentile of the single-feature correlations.
CORR_THRESH = np.quantile (col_corrs, 0.97)

# Long-format table of absolute Pearson correlations between every pair of columns
# (features plus the target), strongest first. Y is attached only while the matrix is computed.
DF['Y'] = Y
corr = (DF.corr (method='pearson')
          .abs ()
          .unstack ()
          .sort_values (kind='quicksort', ascending=False)
          .reset_index ())
DF.drop (columns=['Y'], inplace=True)
corr = corr.rename (columns={'level_0':'feature_A', 'level_1':'feature_B', 0:'Corr_Coeff'})
# Keep only pairs with coefficient <= 0.8 (this also removes each column's pairing with itself)
# and drop undefined coefficients.
corr = corr[corr['Corr_Coeff']<=0.8].dropna ()
corr.head ()   # not rendered: only the last expression of a cell is displayed

# Which features correlate more with the target?
corr[corr['feature_A']=='Y'].head(10)

In [None]:
def add_interactions (binFuncs, df, y):
    """
    Score pairwise column interactions against y and return the names of the good ones.

    binFuncs: list of two-argument element-wise functions
    df:       DataFrame of candidate features (read only; no column is added here)
    y:        target values, aligned with df's rows

    Pairs are formed from each column and the columns that precede it in df.columns,
    with the later column passed as the first operand. A candidate is kept when the
    absolute Pearson correlation of binFunc (a, b) with y (NaN counted as 0) is
    greater than the global CORR_THRESH.

    Returns the kept names, formatted "<first col>#<index into binFuncs>#<second col>",
    ordered from highest to lowest correlation.
    """
    scored = []
    for colname1 in df.columns:
        for colname2 in df.columns:
            # Reaching the column itself ends the inner scan, so only earlier columns are paired with it.
            if colname1 == colname2:
                break
            first, second = df[colname1].values, df[colname2].values
            for i, binFunc in enumerate (binFuncs):
                strength = abs (pearsonr (binFunc (first, second), y)[0])
                if np.isnan (strength):
                    strength = 0.0
                if strength > CORR_THRESH:
                    scored.append ((strength, "#".join ((colname1, str (i), colname2))))

    # Tuples sort by correlation first, then by name; highest first.
    return [name for _, name in sorted (scored, reverse=True)]

In [None]:
# Candidate pairwise combinations. The names in BIN_COLS embed positions in this list,
# so the order must not change once BIN_COLS has been written.
BIN_FUNCS = [
    lambda x,y: x+y,
    lambda x,y: x-y,
    lambda x,y: x*y,
    lambda x,y: x/(y + (np.abs(y)<1e-4)*1e-4),   # x / y; 1e-4 added to the divisor where |y| < 1e-4
    lambda x,y: y/(x + (np.abs(x)<1e-4)*1e-4),   # y / x; 1e-4 added to the divisor where |x| < 1e-4
]
BIN_COLS  = add_interactions (BIN_FUNCS, DF, Y)

# Open with 'wb', not 'ab': appending on every re-run leaves the oldest list at the front
# of the file, and that first object is the only one a single pickle.load () returns.
with open('BIN_COLS.bin', 'wb') as pfile:
    pickle.dump (BIN_COLS, pfile)
BIN_COLS

In [None]:
# Number of interaction features whose correlation with the target exceeded CORR_THRESH.
len (BIN_COLS)

# Inference time usage

In [None]:
pickle_path_dir = "."

# The function lists are declared again here rather than loaded from disk. They have to match,
# position for position, the lists used when UNARY_TRACKER and BIN_COLS were produced, because
# those artefacts refer to the functions by list index.
UNARY_FUNCS = [
    lambda x: np.sign(x)*x**2,
    lambda x: x**3,
    lambda x: np.sign(x)*np.sqrt(np.abs(x)),
    lambda x: np.sign(x)*np.log(np.abs(x) + (np.abs(x)<1e-4)*1e-4),
    lambda x: 1/(x + (np.abs(x)<1e-4)*1e-4),
]
BIN_FUNCS = [
    lambda x,y: x+y,
    lambda x,y: x-y,
    lambda x,y: x*y,
    lambda x,y: x/(y + (np.abs(y)<1e-4)*1e-4),
    lambda x,y: y/(x + (np.abs(x)<1e-4)*1e-4),
]

def _unpickle (basename):
    """Return the first object pickled in <pickle_path_dir>/<basename>."""
    with open (pickle_path_dir+"/"+basename, "rb") as handle:
        return pickle.load (handle)

PIPE          = _unpickle ("PIPE.bin")
BIN_COLS      = _unpickle ("BIN_COLS.bin")
UNARY_TRACKER = _unpickle ("UNARY_TRACKER.bin")

    
class EDATransformer (BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that replays the feature engineering chosen during EDA.

    unary_tracker: dict mapping a column name "feature_<idx>" to an index into unary_funcs
    bin_cols:      list of names "feature_<a>#<k>#feature_<b>", where k indexes bin_funcs
    unary_funcs:   list of one-argument element-wise functions
    bin_funcs:     list of two-argument element-wise functions
    n_new_feat:    upper bound on the number of interaction columns appended;
                   capped at len (bin_cols) when the object is constructed
    """

    def __init__(self, unary_tracker, bin_cols, unary_funcs, bin_funcs, n_new_feat=500):

        super ().__init__()
        self.unary_tracker = unary_tracker
        self.bin_cols      = bin_cols
        self.unary_funcs   = unary_funcs
        self.bin_funcs     = bin_funcs
        self.n_new_feat    = min (n_new_feat, len (bin_cols))

    def fit (self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform (self, X, y=None):
        """
        Apply the recorded unary transforms, then append the first n_new_feat interaction columns.

        X: 2-D array-like whose column positions match the indices embedded in the
           "feature_<idx>" names. It is not modified.
        y: ignored

        Returns a new 2-D ndarray: the (transformed) input columns followed by one column
        per interaction, in bin_cols order.
        """
        # Work on a private copy. The unary step assigns into columns, which would otherwise
        # overwrite the caller's array; np.array () also lets array-likes such as a DataFrame in.
        X = np.array (X, copy=True)

        for colname, uf_idx in self.unary_tracker.items ():
            col_idx = int (colname.split ("_")[1])
            X[:, col_idx] = self.unary_funcs[uf_idx] (X[:, col_idx])

        for i in range (self.n_new_feat):
            parts    = self.bin_cols[i].split ('#')
            col_idx1 = int (parts[0].split ("_")[1])
            col_idx2 = int (parts[-1].split ("_")[1])
            binFunc  = self.bin_funcs[int (parts[1])]
            new_col  = binFunc (X[:, col_idx1], X[:, col_idx2]).reshape ((-1, 1))
            X        = np.hstack ((X, new_col))
        return X

    def set_params (self, **parameters):
        """Set each keyword as an attribute of the same name; returns self."""
        for parameter, value in parameters.items ():
            setattr (self, parameter, value)
        return self

    def get_params (self, deep=True):
        """
        Always returns an empty dict.

        NOTE(review): because no constructor argument is reported, sklearn.base.clone ()
        (and tools built on it) cannot rebuild this step -- confirm that is intended.
        """
        params = {}
        return params

# Append the EDA step to the loaded pipeline so that a single transform () call
# produces the final feature matrix.
edat = EDATransformer (unary_tracker=UNARY_TRACKER, bin_cols=BIN_COLS,
                       unary_funcs=UNARY_FUNCS, bin_funcs=BIN_FUNCS)
PIPE = Pipeline ([*PIPE.steps, ('eda', edat)])

In [None]:
# Display the assembled pipeline (the loaded steps followed by the 'eda' step).
PIPE

In [None]:
# Run the first 100 rows of DF through the whole pipeline and look at the output shape.
batch = PIPE.transform (DF[:100])
batch.shape

In [None]:
def read_data (filename='../input/jane-street-market-prediction/train.csv', df=None, isTrainData=True):
    """
    Load the training data into a fastai TabularPandas, or strip an inference frame.

    filename:    CSV to read; only used when isTrainData is True
    df:          raw frame to prepare; required when isTrainData is False, ignored otherwise
    isTrainData: selects the training path or the inference path

    Training path: reads the CSV, keeps rows with date > 85 and weight != 0, replaces the
    five resp columns by 0/1 indicators of "value > 0", and wraps the result in a
    TabularPandas with a random 5% validation split and the five indicators as a
    multi-label target. Returns (TabularPandas, DataFrame).

    Inference path: returns (None, df without the 'weight' and 'date' columns, index reset).
    The caller's frame is not modified.

    Raises ValueError when isTrainData is False and no df is given.
    """
    to = None
    if isTrainData:

        # Explicit dtypes keep the large CSV in float32 instead of pandas' default float64.
        dtype = {
            'date'      : 'int64',
            'weight'    : 'float32',
            'resp'      : 'float32',
            'ts_id'     : 'int64',
        }
        for i in range (130):
            dtype['feature_' + str (i)] = 'float32'

        df         = pd.read_csv (filename, dtype=dtype)
        df         = df.query ('date > 85')
        df         = df[df['weight'] != 0].reset_index (drop = True)
        resp_cols  = ['resp_1', 'resp_2', 'resp_3','resp_4', 'resp']
        # One 0/1 column per response horizon: 1 where the response is positive.
        y          = np.stack ([(df[c] > 0).astype ('int') for c in resp_cols]).T
        df.drop (columns=['weight', 'date', 'ts_id']+resp_cols, inplace=True)
        f_columns  = [c for c in df.columns if "feature" in c]
        df[resp_cols] = y
        # Fails loudly if a column other than the features and targets survived the drop above.
        df.columns = f_columns + resp_cols
        del y

        # NOTE: no seed is passed to RandomSplitter, so the validation rows differ between runs.
        splits     = RandomSplitter (valid_pct=0.05) (range_of (df))
        to         = TabularPandas (df, cont_names=f_columns, cat_names=None, y_names=resp_cols, y_block=MultiCategoryBlock(encoded=True, vocab=resp_cols), splits=splits)
    else:

        # The frame supplied by the caller is used as-is. The previous code reset df to None
        # before reaching this point, so this branch could never succeed.
        if df is None:
            raise ValueError ("read_data: `df` must be provided when isTrainData=False")
        df         = df.drop (columns=['weight', 'date']).reset_index (drop = True)
    return to, df