<a href="https://colab.research.google.com/github/pratik-poudel/jane-street/blob/main/3%20feature%20combination.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Sanity Check #

Click the _Runtime_ dropdown at the top of the page, then _Change Runtime Type_ and confirm the instance type is _GPU_.

Check the output of `!nvidia-smi` to make sure you've been allocated a Tesla T4, P4, or P100.

In [1]:
!nvidia-smi

Tue Feb 16 14:00:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#Setup:
Set up script installs
1. Install most recent Miniconda release compatible with Google Colab's Python install  (3.6.7)
1. removes incompatible files
1. Install RAPIDS 0.14 libraries including:
  1. cuDF
  1. cuML
  1. cuGraph
  1. cuSpatial
  1. cuSignal
  1. xgboost and dask-xgboost
1. Set necessary environment variables
1. Copy RAPIDS .so files into current working directory, a workaround for conda/colab interactions
1. If running v0.11 or higher, updates pyarrow library to 0.15.x.

In [2]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Executing transaction: / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
Copying shared object files to /usr/lib
Copying RAPIDS compatible xgboost

**************************

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c jane-street-market-prediction -f train.csv

import zipfile
zip_ref = zipfile.ZipFile('/content/train.csv.zip', 'r')
zip_ref.extractall('files')
zip_ref.close()

Downloading train.csv.zip to /content
100% 2.61G/2.61G [00:25<00:00, 30.1MB/s]
100% 2.61G/2.61G [00:25<00:00, 111MB/s] 


In [4]:
import pandas as pd
import cudf
import cupy
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings("ignore")

In [5]:
train = cudf.read_csv('/content/files/train.csv')

In [None]:
train = train.query('date > 85').reset_index(drop = True) 
train = train[train['weight'] != 0]
train['action'] = ((train['resp'].values) > 0).astype(int)

In [15]:
train = train.to_pandas()

In [17]:
train.fillna(train.mean(), inplace=True)

In [18]:
train = cudf.from_pandas(train)

# Three Features

In [111]:
to_combine = pd.read_csv('perm_imp.csv')['feature'][25:50].to_list()

In [122]:
to_combine = pd.read_csv('perm_imp.csv')['feature'][:9].to_list()

In [125]:
len(to_combine)

9

In [136]:
three_comb = []
from itertools import combinations
for i in combinations(to_combine, 5):
    three_comb.append(i)

In [137]:
len(three_comb)

126

In [None]:
from itertools import permutations,combinations_with_replacement, combinations
operators = ["+","-","*","/"]
for i in combinations_with_replacement(operators, 3):
    print(i)

In [72]:
%%time
th = 0.02
feature_name =[]
score = []
for i in three_comb:
    a = i[0]
    b = i[1]
    c = i[2]
    
    train['computed'] = train[a] + train[b] + train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b, c])
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] - train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"-"+str(c)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] * train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"*"+str(c)
        feature_name.append(f_name)
        score.append(cor)
    
    
    train['computed'] = train[a] + train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] - train[b] - train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b])+"-"+str(c)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] - train[b] * train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b])+"*"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] - train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] * train[b] * train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '*'.join([a, b])+"*"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] * train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '*'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] / train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '/'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)

CPU times: user 4min 53s, sys: 5min 5s, total: 9min 58s
Wall time: 9min 58s


In [73]:
pd.DataFrame(feature_name, score).shape

(757, 1)

In [75]:
results = pd.DataFrame(zip(feature_name, score), columns=['features', 'score']).sort_values(by='score',ascending=False)

In [97]:
results[results['score'] > 0.04]

Unnamed: 0,features,score
93,feature_11+feature_39*feature_44,0.047796
21,feature_70+feature_39*feature_44,0.0459
187,feature_12+feature_39*feature_44,0.045794
96,feature_11+feature_39*feature_108,0.045164
366,feature_39*feature_44*feature_108,0.044457
23,feature_70+feature_39*feature_108,0.04445
237,feature_18-feature_39*feature_44,0.044011
189,feature_12+feature_39*feature_108,0.043788
236,feature_18+feature_39*feature_44,0.043534
239,feature_18+feature_39*feature_108,0.043426


In [76]:
# train[to_combine + ['resp_3']].corr()['resp_3'].abs().sort_values(ascending=False)

In [79]:
df = train.copy()

In [86]:
for i in results[results['score'] > 0.04]['features']:
    print(i.split())

['feature_11+feature_39*feature_44']
['feature_70+feature_39*feature_44']
['feature_12+feature_39*feature_44']
['feature_11+feature_39*feature_108']
['feature_39*feature_44*feature_108']
['feature_70+feature_39*feature_108']
['feature_18-feature_39*feature_44']
['feature_12+feature_39*feature_108']
['feature_18+feature_39*feature_44']
['feature_18+feature_39*feature_108']
['feature_126+feature_44*feature_34']
['feature_128+feature_44*feature_34']
['feature_18-feature_39*feature_108']
['feature_11+feature_44*feature_34']
['feature_39+feature_44*feature_34']
['feature_18*feature_43*feature_44']
['feature_11-feature_39*feature_44']
['feature_52-feature_44*feature_34']
['feature_11-feature_39*feature_108']
['feature_12-feature_39*feature_108']
['feature_43+feature_44*feature_34']
['feature_49-feature_44*feature_34']
['feature_43*feature_44*feature_34']
['feature_12-feature_39*feature_44']
['feature_14+feature_44*feature_34']
['feature_12+feature_44*feature_34']


In [87]:
def combine(df):
    df['feature_11+feature_39*feature_44'] = df['feature_11'] + df['feature_39'] * df['feature_44']
    df['feature_70+feature_39*feature_44'] = df['feature_70'] + df['feature_39'] * df['feature_44']
    df['feature_12+feature_39*feature_44'] = df['feature_12'] + df['feature_39'] * df['feature_44']
    df['feature_11+feature_39*feature_108'] = df['feature_11'] + df['feature_39'] * df['feature_108']
    df['feature_39*feature_44*feature_108'] = df['feature_39'] + df['feature_44'] * df['feature_108']
    df['feature_70+feature_39*feature_108'] = df['feature_70'] + df['feature_39'] * df['feature_108']
# ['feature_18-feature_39*feature_44']
# ['feature_12+feature_39*feature_108']
# ['feature_18+feature_39*feature_44']
# ['feature_18+feature_39*feature_108']
# ['feature_126+feature_44*feature_34']
# ['feature_128+feature_44*feature_34']
# ['feature_18-feature_39*feature_108']
# ['feature_11+feature_44*feature_34']
# ['feature_39+feature_44*feature_34']
# ['feature_18*feature_43*feature_44']
# ['feature_11-feature_39*feature_44']
# ['feature_52-feature_44*feature_34']
# ['feature_11-feature_39*feature_108']
# ['feature_12-feature_39*feature_108']
# ['feature_43+feature_44*feature_34']
# ['feature_49-feature_44*feature_34']
# ['feature_43*feature_44*feature_34']
# ['feature_12-feature_39*feature_44']
# ['feature_14+feature_44*feature_34']
# ['feature_12+feature_44*feature_34']
    return df

In [81]:
df['computed'] = train['feature_11'] + train['feature_39'] * train['feature_44']
# feature_11+feature_39*feature_44	

In [82]:
df[['computed', 'feature_11', 'feature_39', 'feature_44']].corr()

Unnamed: 0,computed,feature_11,feature_39,feature_44
computed,1.0,0.286847,0.378842,-0.007032
feature_11,0.286847,1.0,0.190932,-0.020335
feature_39,0.378842,0.190932,1.0,-0.005566
feature_44,-0.007032,-0.020335,-0.005566,1.0


In [88]:
train = combine(train)

# Four Features

In [211]:
to_combine = pd.read_csv('perm_imp.csv')['feature'][40:60].to_list()

In [212]:
three_comb = []
from itertools import combinations
for i in combinations(to_combine, 4):
    three_comb.append(i)

In [219]:
three_comb

[('feature_108', 'feature_58', 'feature_63', 'feature_36'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_34'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_23'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_73'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_77'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_123'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_24'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_17'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_103'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_19'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_104'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_125'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_21'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_9'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_62'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_59'),
 ('feature_108', 'feature_58', 'featu

In [214]:
from itertools import permutations,combinations_with_replacement, combinations
operators = ["+","-","*","/"]
for i in combinations_with_replacement(operators, 3):
    print(i)

('+', '+', '+')
('+', '+', '-')
('+', '+', '*')
('+', '+', '/')
('+', '-', '-')
('+', '-', '*')
('+', '-', '/')
('+', '*', '*')
('+', '*', '/')
('+', '/', '/')
('-', '-', '-')
('-', '-', '*')
('-', '-', '/')
('-', '*', '*')
('-', '*', '/')
('-', '/', '/')
('*', '*', '*')
('*', '*', '/')
('*', '/', '/')
('/', '/', '/')


In [215]:
three_comb[:2]

[('feature_108', 'feature_58', 'feature_63', 'feature_36'),
 ('feature_108', 'feature_58', 'feature_63', 'feature_34')]

In [216]:
%%time
th = 0.02
feature_name =[]
score = []
for i in three_comb:
    a = i[0]
    b = i[1]
    c = i[2]
    d = i[3]
    
    train['computed'] = train[a] + train[b] + train[c] + train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b, c, d])
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] + train[c] - train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b, c])+"-"+str(d)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] + train[c] * train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b, c])+"*"+str(d)
        feature_name.append(f_name)
        score.append(cor)
    
    
    train['computed'] = train[a] + train[b] + train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b, c])+"/"+str(d)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] + train[b] - train[c] - train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"-"+str(c) + "-"+str(d)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] - train[c] * train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"-"+str(c)+"*"+str(d)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] + train[b] - train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"-"+str(c) + "/" + str(d)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] + train[b] * train[c] * train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"*"+str(c) + "*" + str(d)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] + train[b] * train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"*"+str(c) + "/"+str(d)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] + train[b] / train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '+'.join([a, b])+"/"+str(c) + "/"+str(d)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] - train[b] - train[c] - train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b, c, d])
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] - train[b] - train[c] * train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b, c])+"*"+str(d)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] - train[b] - train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b, c])+"/"+str(d)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] - train[b] * train[c] * train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b])+"*"+str(c)+ "*"+str(d)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] - train[b] * train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b])+"*"+str(c)+ "/"+str(d)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] - train[b] / train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '-'.join([a, b])+"/"+str(c)+ "/"+str(d)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] * train[b] * train[c] * train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '*'.join([a, b, c, d])
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] * train[b] * train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '*'.join([a, b, c])+ "/"+str(d)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] * train[b] / train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '*'.join([a, b])+ "/"+str(c) + "/" + str(d)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] / train[b] / train[c] / train[d]
    cor = np.abs(train['computed'].corr(train['resp_3']))
    if cor > th:
        f_name = '/'.join([a, b, c, d])
        feature_name.append(f_name)
        score.append(cor)

CPU times: user 1h 11min 19s, sys: 1min 21s, total: 1h 12min 40s
Wall time: 49min 21s
Parser   : 242 ms


In [218]:
pd.DataFrame(zip(feature_name, score), columns=['features', 'score']).sort_values(by='score',ascending=False).to_csv('four_features2.csv', index=False)

Unnamed: 0,features,score
55,feature_108+feature_34+feature_77*feature_17,0.032712
217,feature_63+feature_34+feature_77*feature_17,0.032569
295,feature_36+feature_34+feature_77*feature_17,0.032281
145,feature_58-feature_34-feature_77*feature_17,0.032236
40,feature_108+feature_36+feature_77*feature_17,0.031844
...,...,...
394,feature_34+feature_73+feature_19+feature_9,0.020029
574,feature_77*feature_24*feature_17*feature_21,0.020020
166,feature_58+feature_23-feature_125-feature_9,0.020017
265,feature_63+feature_77-feature_9-feature_62,0.020016


In [175]:
d1 = pd.DataFrame(zip(feature_name, score), columns=['features', 'score']).sort_values(by='score',ascending=False)