<a href="https://colab.research.google.com/github/pratik-poudel/jane-street/blob/main/4%20feautres_corr_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download gdonchyts/jane-street-market-prediction-parquet -f train.parquet
import zipfile
# Unpack the downloaded archive into ./files. The context manager closes the
# archive handle even when extraction raises part-way through.
with zipfile.ZipFile('/content/train.parquet.zip', 'r') as zip_ref:
    zip_ref.extractall('files')

Downloading train.parquet.zip to /content
 99% 1.06G/1.08G [00:09<00:00, 76.8MB/s]
100% 1.08G/1.08G [00:09<00:00, 116MB/s] 


In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import gc
# pd.options.display.float_format = "{:.2f}".format
pd.options.display.max_columns = 500

In [91]:
# Read the training parquet, keep rows with date > 85 (index renumbered),
# then discard zero-weight rows (the index is not renumbered after this step).
train = (
    pd.read_parquet('/content/files/train.parquet')
    .query('date > 85')
    .reset_index(drop=True)
    .loc[lambda frame: frame['weight'] != 0]
)
train.shape

(1571415, 138)

In [92]:
# Every column of `train` whose name contains the substring 'feature'.
original_features = [col for col in train.columns if 'feature' in col]

In [94]:
# Per-feature fill value: one percent of the observed range below the observed minimum.
feature_floor = train[original_features].min()
feature_ceiling = train[original_features].max()
val_range = feature_ceiling - feature_floor
filler = pd.Series(feature_floor - 0.01 * val_range, index=original_features)

In [97]:
def fill_missing(df):
    """Replace NaNs in the original feature columns with the per-column `filler` value.

    The feature columns of `df` are reassigned, so the frame passed in is
    modified; the same frame is returned so the call can be used in an
    assignment.
    """
    imputed = df[original_features].fillna(value=filler)
    df[original_features] = imputed
    return df

# Impute the missing feature values; fill_missing modifies and returns the same frame.
train = fill_missing(train)

# Summary of index, dtypes and memory footprint after imputation.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1571415 entries, 0 to 1862595
Columns: 138 entries, date to ts_id
dtypes: float32(135), int64(3)
memory usage: 857.2 MB


In [98]:
def feature_transforms(df, reference=None):
    """Append log and square-root versions of feature_1 .. feature_129 to `df`.

    Each feature is first shifted by ``abs(reference minimum) + 1``; for data
    that is no smaller than the reference minimum this keeps the shifted
    values at or above 1, so the logarithm and square root are defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding columns feature_1 .. feature_129. It is modified in place.
    reference : pandas.DataFrame, optional
        Frame whose per-column minima define the shift. When omitted, the
        notebook-level `train` frame is used, which is what this function
        always did before the parameter existed. Passing it explicitly lets
        the same shift be applied to another frame.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with float16 ``log_*`` and ``sqrt_*`` columns
        appended after the existing columns.
    """
    if reference is None:
        reference = train  # default: shift by the minima of the notebook's training frame

    names = [f'feature_{i}' for i in range(1, 130)]

    # Three separate passes keep the resulting column order stable:
    # all pos_* columns, then all log_* columns, then all sqrt_* columns.
    for f in names:
        # Linear shift by abs(reference minimum) + 1.
        df['pos_' + f] = (df[f] + abs(reference[f].min()) + 1).astype(np.float16)
    for f in names:
        # Natural log of the shifted values.
        df['log_' + f] = np.log(df['pos_' + f]).astype(np.float16)
    for f in names:
        # Square root of the shifted values.
        df['sqrt_' + f] = np.sqrt(df['pos_' + f]).astype(np.float16)

    # The shifted columns were only scaffolding: they are a linear shift of the
    # originals (hence perfectly correlated with them), so they are dropped.
    df.drop(columns=['pos_' + f for f in names], inplace=True)
    return df

In [99]:
# Snapshot of the imputed frame, taken before the derived columns are added below.
train_org = train.copy()

In [100]:
train = feature_transforms(train)
train.shape

(1571415, 396)

In [101]:
def other_trans(df):
    """Append squared (``quad_*``) and cubed (``cub_*``) copies of the original features.

    The first entry of `original_features` is skipped (presumably feature_0;
    verify against the column order). `df` is modified in place and returned.
    """
    for name in original_features[1:]:
        source = df[name]
        df['quad_' + name] = np.square(source)
        df['cub_' + name] = np.power(source, 3)
    return df

In [102]:
train = other_trans(train)

In [103]:
train.shape    

(1571415, 654)

In [104]:
# Group the derived columns by the substring that marks their transform.
cubic_features, quad_features, sqrt_features, log_features = (
    [col for col in train.columns if marker in col]
    for marker in ('cub_', 'quad_', 'sqrt_', 'log_')
)

In [105]:
len(cubic_features), len(quad_features), len(sqrt_features), len(log_features), len(original_features)

(129, 129, 129, 129, 130)

In [106]:
# Original and derived feature columns alike: anything with 'feature' in its name.
all_features = [col for col in train.columns if 'feature' in col]

In [107]:
def _abs_corr_ranking(target):
    """Absolute correlation of every feature column with `target`, strongest first."""
    return (
        train[all_features]
        .corrwith(train[target])
        .abs()
        .sort_values(ascending=False)
        .reset_index()
    )


corr_resp = _abs_corr_ranking('resp')
corr_resp3 = _abs_corr_ranking('resp_3')

In [108]:
# reset_index() left default column labels; give both rankings explicit ones.
corr_resp.columns = ['features', 'score_resp']
corr_resp3.columns = ['features', 'score_resp3']

In [109]:
# Persist both rankings next to the notebook, without the index column.
for ranking, csv_name in ((corr_resp, 'corr_resp.csv'), (corr_resp3, 'corr_resp3.csv')):
    ranking.to_csv(csv_name, index=False)

In [122]:
# Absolute correlation with resp for the untransformed features only, read from
# the pre-transform snapshot and sorted strongest first.
# NOTE(review): `org` is not referenced again in the cells visible here.
org = train_org[original_features].corrwith(train['resp']).abs().sort_values(ascending=False)

In [125]:
# Candidate pool for three-way combinations: features whose |corr| with resp exceeds 0.02.
strong_enough = corr_resp['score_resp'] > 0.02
to_combine = corr_resp.loc[strong_enough, 'features'].to_list()

In [126]:
len(to_combine)

18

In [130]:
from itertools import combinations

# Every unordered triple drawn from the candidate pool, in pool order.
three_comb = list(combinations(to_combine, 3))

In [131]:
len(three_comb)

816

In [132]:
three_comb[0]

('cub_feature_39', 'feature_39', 'log_feature_39')

In [134]:
from itertools import permutations, combinations_with_replacement, combinations

operators = ["+", "-", "*", "/"]
# Show every unordered pair of operators, repeats allowed.
for operator_pair in combinations_with_replacement(operators, 2):
    print(operator_pair)

('+', '+')
('+', '-')
('+', '*')
('+', '/')
('-', '-')
('-', '*')
('-', '/')
('*', '*')
('*', '/')
('/', '/')


In [136]:
%%time
from google.colab import output

th = 0.02
feature_name =[]
score = []
count = 0
for i in three_comb:
    a = i[0]
    b = i[1]
    c = i[2]
    
    train['computed'] = train[a] + train[b] + train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '+'.join([a, b, c])
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] - train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '+'.join([a, b])+"-"+str(c)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] + train[b] * train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '+'.join([a, b])+"*"+str(c)
        feature_name.append(f_name)
        score.append(cor)
    
    
    train['computed'] = train[a] + train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '+'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[a] - train[b] - train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '-'.join([a, b])+"-"+str(c)
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] - train[b] * train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '-'.join([a, b])+"*"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] - train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '-'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] * train[b] * train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '*'.join([a, b])+"*"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] * train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '*'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)
        
    train['computed'] = train[a] / train[b] / train[c]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '/'.join([a, b])+"/"+str(c)
        feature_name.append(f_name)
        score.append(cor)
   
    count += 1
    print(count, "done out of ", len(three_comb) )
    if count % 10 ==0:
        output.clear()

811 done out of  816
812 done out of  816
813 done out of  816
814 done out of  816
815 done out of  816
816 done out of  816
CPU times: user 7min 6s, sys: 13.7 s, total: 7min 20s
Wall time: 7min 5s


In [138]:
# Rank the recorded three-feature expressions by score and persist them.
(
    pd.DataFrame(list(zip(feature_name, score)), columns=['features', 'score'])
    .sort_values(by='score', ascending=False)
    .to_csv('three_features.csv', index=False)
)

In [200]:
three_features = pd.read_csv('three_features.csv')

In [153]:
# Expressions scoring above 0.063; these are parsed back into features and operators below.
high_scoring = three_features['score'] > 0.063
split = three_features.loc[high_scoring, 'features'].to_list()

In [205]:
three_features[three_features['score'] > 0.064]

Unnamed: 0,features,score
0,cub_feature_39-cub_feature_37/sqrt_feature_29,0.068198
1,cub_feature_39-cub_feature_37/sqrt_feature_40,0.066402
2,cub_feature_39-feature_37*sqrt_feature_29,0.065228
3,cub_feature_39-sqrt_feature_40*feature_37,0.065133
4,cub_feature_39-sqrt_feature_37*feature_37,0.064913
5,cub_feature_39-sqrt_feature_39*feature_37,0.064905
6,cub_feature_39-feature_37*sqrt_feature_38,0.06486
7,cub_feature_39-feature_37*log_feature_40,0.064653
8,cub_feature_39+feature_29*sqrt_feature_29,0.064638
9,cub_feature_39-log_feature_37*feature_37,0.064556


In [146]:
import re

# A single operator character separating feature names in a stored expression.
# One pattern is used for both splitting and extraction so the feature list and
# the operator list always line up (no stray spaces in the class, and no `+`
# quantifier that would fuse two adjacent operators into one token).
operator_pattern = re.compile(r'[-+*/]')

feature_split = []  # per expression: the feature names, in order
symbol_split = []   # per expression: the operators in order, followed by a 0 sentinel
all_name = []       # per expression: the expression string itself
for i in split:
    all_name.append(i)
    features_list = operator_pattern.split(i)
    maths = operator_pattern.findall(i)
    feature_split.append(features_list)
    # The trailing 0 keeps index 1 valid even when an expression has a single operator.
    symbol_split.append(maths + [0])

In [147]:
feature_split

[['cub_feature_39', 'cub_feature_37', 'sqrt_feature_29'],
 ['cub_feature_39', 'cub_feature_37', 'sqrt_feature_40'],
 ['cub_feature_39', 'feature_37', 'sqrt_feature_29'],
 ['cub_feature_39', 'sqrt_feature_40', 'feature_37']]

In [148]:
# Print one ready-to-paste assignment statement per selected expression.
# Loop-local names are used on purpose: the result list `feature_name` and the
# `count` counter produced by the search cell are not overwritten here.
for expr_name, names, symbols in zip(all_name, feature_split, symbol_split):
    print(
        f"train['{expr_name}']",
        f"= train['{names[0]}']",
        str(symbols[0]),
        f"train['{names[1]}']",
        str(symbols[1]),
        f"train['{names[2]}']",
    )

train['cub_feature_39-cub_feature_37/sqrt_feature_29'] = train['cub_feature_39'] - train['cub_feature_37'] / train['sqrt_feature_29']
train['cub_feature_39-cub_feature_37/sqrt_feature_40'] = train['cub_feature_39'] - train['cub_feature_37'] / train['sqrt_feature_40']
train['cub_feature_39-feature_37*sqrt_feature_29'] = train['cub_feature_39'] - train['feature_37'] * train['sqrt_feature_29']
train['cub_feature_39-sqrt_feature_40*feature_37'] = train['cub_feature_39'] - train['sqrt_feature_40'] * train['feature_37']


# Two Features

In [163]:
# Wider candidate pool for pairwise combinations: |corr| with resp above 0.005.
strong_enough = corr_resp['score_resp'] > 0.005
to_combine = corr_resp.loc[strong_enough, 'features'].to_list()

In [164]:
len(to_combine)

197

In [165]:
corr_resp.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,...,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645
features,cub_feature_39,feature_39,log_feature_39,sqrt_feature_39,cub_feature_37,log_feature_37,sqrt_feature_37,feature_40,sqrt_feature_40,feature_37,log_feature_40,feature_29,sqrt_feature_29,cub_feature_40,feature_0,cub_feature_38,log_feature_38,sqrt_feature_38,feature_38,feature_33,feature_35,feature_1,log_feature_29,log_feature_103,sqrt_feature_103,feature_53,sqrt_feature_1,feature_19,feature_69,feature_122,sqrt_feature_53,feature_124,feature_126,feature_103,sqrt_feature_122,feature_128,log_feature_91,sqrt_feature_124,log_feature_53,quad_feature_1,sqrt_feature_128,log_feature_97,log_feature_122,sqrt_feature_126,sqrt_feature_69,log_feature_85,feature_30,feature_9,sqrt_feature_91,log_feature_128,quad_feature_69,log_feature_1,log_feature_124,sqrt_feature_19,sqrt_feature_43,sqrt_feature_97,log_feature_43,feature_43,log_feature_126,sqrt_feature_85,sqrt_feature_42,quad_feature_53,feature_42,log_feature_42,sqrt_feature_33,feature_31,log_feature_69,sqrt_feature_9,feature_91,cub_feature_4,feature_120,log_feature_51,sqrt_feature_106,feature_25,sqrt_feature_30,feature_97,quad_feature_39,sqrt_feature_35,feature_5,quad_feature_122,log_feature_106,sqrt_feature_51,sqrt_feature_5,cub_feature_3,feature_73,feature_2,feature_106,feature_85,quad_feature_126,quad_feature_124,quad_feature_128,sqrt_feature_120,feature_51,log_feature_5,feature_13,sqrt_feature_2,quad_feature_5,cub_feature_53,feature_23,quad_feature_89,feature_101,feature_44,quad_feature_113,feature_71,log_feature_2,feature_83,feature_113,sqrt_feature_83,quad_feature_96,sqrt_feature_73,cub_feature_69,sqrt_feature_44,feature_79,feature_27,log_feature_88,log_feature_83,sqrt_feature_101,log_feature_19,feature_89,feature_50,quad_feature_95,sqrt_feature_50,feature_52,log_feature_120,quad_feature_101,sqrt_feature_88,log_feature_9,feature_21,feature_3,cub_feature_1,quad_feature_108,log_feature_94,sqrt_feature_52,log_feature_50,sqrt_feature_31,quad_feature_107,log_feature_100,quad_feature_119,sqrt_feature_71,feature_34,log_featur
e_44,log_feature_52,feature_15,cub_feature_122,cub_feature_126,sqrt_feature_3,cub_feature_96,sqrt_feature_113,sqrt_feature_100,quad_feature_29,feature_36,quad_feature_84,feature_96,cub_feature_108,feature_77,quad_feature_49,sqrt_feature_13,log_feature_33,feature_88,quad_feature_71,sqrt_feature_94,log_feature_101,sqrt_feature_25,feature_20,quad_feature_120,cub_feature_89,log_feature_71,log_feature_41,sqrt_feature_77,cub_feature_113,feature_49,sqrt_feature_89,quad_feature_2,quad_feature_44,feature_32,log_feature_3,sqrt_feature_41,cub_feature_84,sqrt_feature_49,cub_feature_128,log_feature_30,cub_feature_52,cub_feature_49,log_feature_113,cub_feature_101,log_feature_77,feature_6,log_feature_49,feature_100,feature_41,feature_10,sqrt_feature_6,feature_105,feature_108,log_feature_35,sqrt_feature_27,sqrt_feature_34,quad_feature_6,quad_feature_40,feature_107,sqrt_feature_105,feature_84,feature_119,quad_feature_93,quad_feature_117,sqrt_feature_79,cub_feature_124,feature_76,log_feature_6,sqrt_feature_20,quad_feature_42,sqrt_feature_96,sqrt_feature_15,cub_feature_95,feature_94,sqrt_feature_23,feature_110,feature_11,log_feature_89,sqrt_feature_36,sqrt_feature_21,feature_14,feature_98,log_feature_105,log_feature_13,quad_feature_98,log_feature_47,log_feature_73,quad_feature_92,quad_feature_3,sqrt_feature_110,quad_feature_48,quad_feature_70,sqrt_feature_107,log_feature_31,feature_28,sqrt_feature_119,sqrt_feature_10,sqrt_feature_47,cub_feature_119,sqrt_feature_32,quad_feature_110,quad_feature_4,quad_feature_37,quad_feature_116,feature_115,cub_feature_107,sqrt_feature_98,feature_95,feature_86,...,log_feature_58,sqrt_feature_67,quad_feature_41,sqrt_feature_12,feature_111,log_feature_21,feature_18,sqrt_feature_65,cub_feature_43,quad_feature_125,log_feature_11,cub_feature_111,sqrt_feature_117,sqrt_feature_121,feature_102,quad_feature_97,sqrt_feature_4,feature_67,cub_feature_41,log_feature_16,feature_65,quad_feature_100,quad_feature_118,sqrt_feature_26,quad_feature_25,log_feature_28,quad_
feature_33,sqrt_feature_102,log_feature_7,log_feature_121,quad_feature_127,quad_feature_58,feature_48,feature_45,sqrt_feature_8,feature_121,log_feature_82,quad_feature_62,quad_feature_115,sqrt_feature_57,log_feature_112,feature_93,quad_feature_85,log_feature_66,cub_feature_88,quad_feature_88,cub_feature_99,sqrt_feature_22,quad_feature_90,feature_57,log_feature_61,log_feature_95,cub_feature_31,sqrt_feature_55,feature_123,sqrt_feature_123,cub_feature_54,log_feature_99,cub_feature_62,cub_feature_35,sqrt_feature_99,cub_feature_29,log_feature_123,quad_feature_30,log_feature_102,cub_feature_15,log_feature_93,quad_feature_67,sqrt_feature_92,sqrt_feature_66,feature_46,feature_104,quad_feature_26,cub_feature_27,quad_feature_16,quad_feature_104,quad_feature_105,cub_feature_16,log_feature_55,log_feature_12,quad_feature_91,cub_feature_94,cub_feature_58,cub_feature_26,log_feature_24,cub_feature_36,sqrt_feature_112,cub_feature_25,quad_feature_82,log_feature_60,sqrt_feature_24,cub_feature_5,feature_66,quad_feature_63,log_feature_64,cub_feature_82,cub_feature_76,quad_feature_15,quad_feature_76,log_feature_8,quad_feature_68,quad_feature_36,sqrt_feature_48,cub_feature_127,feature_114,log_feature_4,feature_112,feature_116,sqrt_feature_114,cub_feature_85,sqrt_feature_61,sqrt_feature_58,cub_feature_63,log_feature_23,cub_feature_65,sqrt_feature_18,cub_feature_109,sqrt_feature_64,quad_feature_87,quad_feature_114,cub_feature_91,quad_feature_17,quad_feature_21,log_feature_114,quad_feature_80,cub_feature_114,cub_feature_80,sqrt_feature_104,log_feature_56,log_feature_80,cub_feature_22,log_feature_92,log_feature_127,log_feature_17,log_feature_116,quad_feature_106,cub_feature_28,quad_feature_22,quad_feature_123,feature_127,sqrt_feature_60,feature_59,log_feature_74,cub_feature_74,cub_feature_8,sqrt_feature_80,cub_feature_32,feature_64,log_feature_125,quad_feature_129,sqrt_feature_127,sqrt_feature_74,feature_74,cub_feature_18,feature_99,feature_80,cub_feature_50,quad_feature_18,log_feature_109,fe
ature_58,cub_feature_9,cub_feature_56,cub_feature_11,cub_feature_121,quad_feature_7,cub_feature_7,sqrt_feature_46,quad_feature_8,cub_feature_12,feature_61,log_feature_117,cub_feature_129,quad_feature_28,quad_feature_56,feature_129,quad_feature_109,log_feature_59,sqrt_feature_129,quad_feature_59,cub_feature_123,quad_feature_74,quad_feature_11,log_feature_48,cub_feature_97,log_feature_129,quad_feature_32,quad_feature_12,feature_78,cub_feature_66,feature_109,quad_feature_102,feature_72,cub_feature_83,sqrt_feature_93,cub_feature_102,sqrt_feature_56,cub_feature_64,cub_feature_105,cub_feature_115,cub_feature_68,sqrt_feature_72,cub_feature_106,quad_feature_19,quad_feature_10,cub_feature_21,log_feature_22,quad_feature_72,log_feature_46,sqrt_feature_78,feature_60,log_feature_104,cub_feature_87,sqrt_feature_125,quad_feature_66,quad_feature_35,sqrt_feature_59,quad_feature_27,cub_feature_59,cub_feature_30,log_feature_72,feature_56,cub_feature_19,cub_feature_10,sqrt_feature_109,quad_feature_31,quad_feature_65,quad_feature_78,sqrt_feature_90,feature_125,cub_feature_79,feature_90,cub_feature_20,cub_feature_72,log_feature_26,cub_feature_90,log_feature_78,log_feature_18,cub_feature_118,cub_feature_104,cub_feature_67,quad_feature_73,log_feature_90,quad_feature_20,cub_feature_78,cub_feature_73,cub_feature_103,cub_feature_17,feature_55,quad_feature_79,sqrt_feature_116
score_resp,0.0625061,0.048461,0.0482525,0.0482493,0.0317498,0.030865,0.0302389,0.0299992,0.0297919,0.029791,0.0294727,0.0284939,0.0251586,0.0241596,0.0237678,0.0220545,0.0206181,0.020174,0.0198655,0.0189919,0.0174111,0.0170646,0.0168442,0.0165132,0.0158959,0.0153868,0.0152845,0.0152566,0.0152414,0.014864,0.0147869,0.014565,0.0145349,0.0145062,0.0144566,0.0144119,0.0142334,0.0140215,0.0140192,0.0139716,0.0139577,0.0139168,0.0139035,0.0138358,0.0138113,0.0137932,0.0137515,0.013334,0.0133115,0.0132444,0.0132143,0.0131099,0.0130701,0.0129925,0.0128778,0.0128169,0.0127932,0.012709,0.0125709,0.0124935,0.0124721,0.0124024,0.0123653,0.0123532,0.0123021,0.0122098,0.0120517,0.0117454,0.0115518,0.0114647,0.0114362,0.0110836,0.0110291,0.0110123,0.0110034,0.0109635,0.0109052,0.0109018,0.0108788,0.0108697,0.0107319,0.0106781,0.0104929,0.0104792,0.0104738,0.0104232,0.0104096,0.0103359,0.0103064,0.0102465,0.0100929,0.00998084,0.00991751,0.00991014,0.00952358,0.00944122,0.00943295,0.00914976,0.00910901,0.0089664,0.00892145,0.00877591,0.00849061,0.00841262,0.00832231,0.00824922,0.00822843,0.00818103,0.0081238,0.00812128,0.00807628,0.00807457,0.00803334,0.00801621,0.00801618,0.00800142,0.00797405,0.00787085,0.00783542,0.00782825,0.00779946,0.00779715,0.00777091,0.00773588,0.00766866,0.00762988,0.00761681,0.00761436,0.00760684,0.00758152,0.00755639,0.00755537,0.00753775,0.00753476,0.00748873,0.00746168,0.00745439,0.0074075,0.00737848,0.0073433,0.0072277,0.00722313,0.00709892,0.00708495,0.00704899,0.00702523,0.00701703,0.00697328,0.00687389,0.00687181,0.00682198,0.00677812,0.00674731,0.00671106,0.00667501,0.00664144,0.00663728,0.00662026,0.00657849,0.00655618,0.00654487,0.00652483,0.00646051,0.00643517,0.00640649,0.00640371,0.00635902,0.00634746,0.00630453,0.00621109,0.00619791,0.00613315,0.00609915,0.00602274,0.00600933,0.0059849,0.00596938,0.00593496,0.00589162,0.00583146,0.00579476,0.00575401,0.00565193,0.00559245,0.00558358,0.00556716,0.0055246,0.00551379,0.00543585,0.00541135,0.005
25319,0.0052456,0.00513748,0.00507166,0.00507062,0.00503567,0.00501974,0.00497608,0.00497321,0.00496355,0.00493444,0.00490975,0.00489887,0.0048874,0.00488193,0.00485609,0.00477607,0.00477557,0.00475569,0.00473929,0.00473042,0.00471381,0.00466698,0.00464389,0.00460935,0.00448487,0.0044217,0.00439122,0.00435457,0.00426911,0.00425228,0.00423077,0.00422536,0.00422339,0.00410778,0.00408383,0.00407413,0.00405901,0.0039992,0.0039966,0.00396549,0.00393483,0.00390506,0.00382685,0.00381724,0.00379897,0.00377241,0.00368179,0.00365723,0.00362475,0.00361656,0.00356262,0.00355282,0.00354734,0.0035154,0.00351466,0.00346299,0.00344064,0.00338957,0.00335176,...,0.00163564,0.00160761,0.00160583,0.00160496,0.00160465,0.00159811,0.00159427,0.00158434,0.00156795,0.00156347,0.0015327,0.00152666,0.00151687,0.00146128,0.00146031,0.00141772,0.00141734,0.00140433,0.00140394,0.00140166,0.00139915,0.00139349,0.00137324,0.00135606,0.00134582,0.0013393,0.00133083,0.00132675,0.00131325,0.0013114,0.00128645,0.00128245,0.00127786,0.00127773,0.00127567,0.0012706,0.00126697,0.00125961,0.00124664,0.00123813,0.00123303,0.00119297,0.00117765,0.00116948,0.00116567,0.00115141,0.00115079,0.00112977,0.00112898,0.00109863,0.00109831,0.00109503,0.00109473,0.00108944,0.00108867,0.00107045,0.00106791,0.0010581,0.00105605,0.00105374,0.00104693,0.00104585,0.00104019,0.00102626,0.00102538,0.0010252,0.0010229,0.00102286,0.00102133,0.00101923,0.00101862,0.00101381,0.00101307,0.00101237,0.00101095,0.000992197,0.000980973,0.000963994,0.000954268,0.00093875,0.000937938,0.000936767,0.000936213,0.00092923,0.000927483,0.000927209,0.000922902,0.000906622,0.000903241,0.000898762,0.000898023,0.000893934,0.000885494,0.000884287,0.000879563,0.000877085,0.000863064,0.000844314,0.000843261,0.000839148,0.000828037,0.000807465,0.000806102,0.000806045,0.000801229,0.00079093,0.000783478,0.000766164,0.000765644,0.000764576,0.000756962,0.000745219,0.000739174,0.000732206,0.000723131,0.000718254,0.000717052,0.000689262,0.000671945,0.00
0669853,0.000637317,0.00062644,0.000612507,0.000587517,0.000580087,0.000575273,0.000571238,0.000569208,0.000565877,0.000561147,0.000556278,0.000556209,0.000555895,0.000555536,0.000552758,0.000551038,0.000544384,0.000539068,0.000533764,0.000527987,0.000527415,0.000517923,0.000517669,0.000513554,0.000512804,0.000512638,0.000509279,0.000506361,0.000493843,0.000485736,0.000479766,0.00047824,0.000475322,0.000471946,0.000462932,0.000455336,0.000453062,0.000451373,0.000450875,0.00044836,0.000447526,0.000433581,0.000424289,0.000422237,0.000419079,0.000416526,0.000409034,0.000400309,0.000398381,0.000398083,0.000395518,0.000393654,0.00038905,0.000388887,0.000385296,0.000374368,0.000374287,0.000346368,0.000338651,0.000336751,0.000328647,0.000324437,0.000323873,0.000307642,0.000307409,0.00030482,0.000294755,0.000289151,0.000288336,0.000282246,0.000277192,0.000275159,0.000265419,0.000253065,0.000223582,0.000222709,0.000220286,0.000192123,0.000190682,0.00018805,0.000183763,0.000183087,0.000178953,0.000170431,0.000163502,0.000160869,0.000158636,0.000155854,0.000148661,0.000138969,0.000137422,0.000129554,0.000129171,0.000128982,0.000122668,0.000122198,0.00012176,0.000109542,9.63809e-05,8.88703e-05,8.21037e-05,8.07153e-05,7.63383e-05,7.18452e-05,6.11815e-05,5.78797e-05,5.73353e-05,5.54154e-05,5.42295e-05,5.32757e-05,5.21781e-05,5.20847e-05,5.12426e-05,5.10471e-05,4.96979e-05,4.84707e-05,4.7851e-05,3.43227e-05,2.8449e-05,2.79721e-05,2.74244e-05,2.70446e-05,2.48111e-05,2.16913e-05,1.127e-05,7.43067e-06,4.61712e-06,1.45551e-06,1.21763e-06,1.04992e-06


In [166]:
from itertools import combinations

# Every unordered pair drawn from the candidate pool, in pool order.
two_comb = list(combinations(to_combine, 2))

In [167]:
len(two_comb)

19306

In [168]:
two_comb[0]

('cub_feature_39', 'feature_39')

In [169]:
from itertools import permutations, combinations_with_replacement, combinations

operators = ["+", "-", "*", "/"]
# Show each operator as a one-element tuple.
for single_operator in combinations_with_replacement(operators, 1):
    print(single_operator)

('+',)
('-',)
('*',)
('/',)


In [182]:
two_comb[:5]

[('cub_feature_39', 'feature_39'),
 ('cub_feature_39', 'log_feature_39'),
 ('cub_feature_39', 'sqrt_feature_39'),
 ('cub_feature_39', 'cub_feature_37'),
 ('cub_feature_39', 'log_feature_37')]

In [194]:
%%time
from google.colab import output

th = 0.02
feature_name =[]
score = []
count = 0
for i in two_comb:
    a = i[0]
    b = i[1]
    # c = i[2]
    
    train['computed'] = train[a] + train[b]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '+'.join([a, b])
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] - train[b]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '-'.join([a, b])
        feature_name.append(f_name)
        score.append(cor)
    
    train['computed'] = train[a] * train[b]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '*'.join([a, b])
        feature_name.append(f_name)
        score.append(cor)
    
    
    train['computed'] = train[a] / train[b]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '/'.join([a, b])
        feature_name.append(f_name)
        score.append(cor)

    train['computed'] = train[b] / train[a]
    cor = np.abs(train['computed'].corr(train['resp']))
    if cor > th:
        f_name = '/'.join([b, a])
        feature_name.append(f_name)
        score.append(cor)
   
    count += 1
    print(count, "done out of ", len(two_comb) )
    if count % 10 ==0:
        output.clear()

19301 done out of  19306
19302 done out of  19306
19303 done out of  19306
19304 done out of  19306
19305 done out of  19306
19306 done out of  19306
CPU times: user 1h 9min 28s, sys: 2min 26s, total: 1h 11min 55s
Wall time: 1h 10min 34s


In [196]:
# Recorded two-feature expressions with their scores, strongest first.
two_features = (
    pd.DataFrame(list(zip(feature_name, score)), columns=['features', 'score'])
    .sort_values(by='score', ascending=False)
)

In [197]:
two_features

Unnamed: 0,features,score
472,cub_feature_39*log_feature_41,0.063808
322,cub_feature_39/sqrt_feature_44,0.063706
52,cub_feature_39+feature_1,0.063581
496,cub_feature_39*sqrt_feature_41,0.063545
484,cub_feature_39/sqrt_feature_89,0.063487
...,...,...
2791,sqrt_feature_40+sqrt_feature_83,0.020005
5281,feature_38+log_feature_106,0.020004
4980,cub_feature_38+quad_feature_107,0.020004
5212,feature_38*log_feature_29,0.020002


In [198]:
two_features.to_csv('two_features.csv', index=False)

In [None]:
# pd.DataFrame(zip(feature_name, score), columns=['features', 'score'])
# .sort_values(by='score',ascending=False)
# .to_csv('three_features.csv', index=False)

In [206]:
two_features[two_features['score'] > 0.063]

Unnamed: 0,features,score
472,cub_feature_39*log_feature_41,0.063808
322,cub_feature_39/sqrt_feature_44,0.063706
52,cub_feature_39+feature_1,0.063581
496,cub_feature_39*sqrt_feature_41,0.063545
484,cub_feature_39/sqrt_feature_89,0.063487
370,cub_feature_39-feature_3,0.063384
24,cub_feature_39-feature_37,0.063382
95,cub_feature_39/sqrt_feature_122,0.063362
74,cub_feature_39+feature_69,0.063343
420,cub_feature_39/sqrt_feature_113,0.063343
