# <center>Data research</center>

In [1]:
import os
os.chdir("..")

%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
# Directory holding the pickled inputs, relative to the working directory
# set by os.chdir in the first cell.
data_path = Path("data")

In [4]:
# File names of the pickled frames stored inside the data directory.
train_features_name = "train_features.pkl"
train_outcomes_name = "train_outcomes.pkl"
test_features_name = "test_features.pkl"
example_predictions_name = "example_predictions.pkl"

In [105]:
# Show wide frames in full (up to 600 columns) but keep row previews short.
pd.options.display.max_columns = 600
pd.options.display.max_rows = 10

In [6]:
# Load the example predictions file and preview its first rows.
# NOTE: pd.read_pickle unpickles arbitrary objects — only load files from a trusted source.
example_predictions = pd.read_pickle(data_path.joinpath(example_predictions_name))
example_predictions.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,g__12m_binary
date,tradingitemid,Unnamed: 2_level_1
2006-12-29,2585893,0.541588
2006-12-29,2585895,0.522537
2006-12-29,2585941,0.500847
2006-12-29,2585945,0.496815
2006-12-29,2585957,0.552594


In [8]:
# Load the test features, training features and training outcomes, in that order.
# NOTE: pd.read_pickle unpickles arbitrary objects — only load files from a trusted source.
test_features, train_features, train_outcomes = (
    pd.read_pickle(data_path / file_name)
    for file_name in (test_features_name, train_features_name, train_outcomes_name)
)

In [74]:
from dataprocessing.process import StandardScaler, SmartLogtransformer, TrainTestSplit, Pipeline
from dataprocessing.impute import SimpleImputer

In [75]:
# Split the training features/outcomes into training and validation parts.
# NOTE(review): TrainTestSplit is a project-local class; test_size=0.2 presumably
# means a 20% validation share, and whether the split is random or chronological
# is not visible here — confirm.
train_test_split = TrainTestSplit(test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(train_features, train_outcomes)

### Pipeline for data processing

In [76]:
# Pre-processing steps, listed in the order given to the project-local Pipeline:
# median imputation, then the "smart" log transform, then standard scaling.
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("logtransformer", SmartLogtransformer()),
    ("scaler", StandardScaler()),
])
pipe

[(imputer, <dataprocessing.impute.SimpleImputer>), 
(logtransformer, <dataprocessing.process.SmartLogtransformer>), 
(scaler, <dataprocessing.process.StandardScaler>)]

In [77]:
%%time
# Fit the pipeline on the training split only, then apply the fitted pipeline
# to the validation split and to the test features.
# Caution: X_train and X_val are overwritten here, so re-running this cell without
# first re-running the split cell feeds already-transformed data back into the pipeline.
X_train = pipe.fit_transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(test_features)

Wall time: 5.89 s


In [78]:
# Display the transformed training features (rows are truncated by the
# display.max_rows option set earlier).
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,f__buytransactions_1m,f__selltransactions_1m,f__buyshares_1m,f__sellshares_1m,f__buysharespctsharesout_1m,f__sellsharespctsharesout_1m,f__uniquebuyers_1m,f__uniquesellers_1m,f__buytransactions_3m,f__selltransactions_3m,f__buyshares_3m,f__sellshares_3m,f__buysharespctsharesout_3m,f__sellsharespctsharesout_3m,f__uniquebuyers_3m,f__uniquesellers_3m,f__buytransactions_6m,f__selltransactions_6m,f__buyshares_6m,f__sellshares_6m,f__buysharespctsharesout_6m,f__sellsharespctsharesout_6m,f__uniquebuyers_6m,f__uniquesellers_6m,f__buytransactions_12m,f__selltransactions_12m,f__buyshares_12m,f__sellshares_12m,f__buysharespctsharesout_12m,f__sellsharespctsharesout_12m,f__uniquebuyers_12m,f__uniquesellers_12m
date,tradingitemid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
2004-08-06,2585893,1.124676,-0.745641,-0.443263,0.003256,-0.372335,0.000805,1.137242,-0.745811,0.667193,0.795100,-0.849923,0.004573,-0.885121,0.000811,0.689566,0.775471,0.430536,0.560260,-1.313943,0.007727,-1.489867,0.000819,0.471217,0.550649,0.273003,0.383375,-1.921067,0.007480,-2.278363,0.000844,0.338890,0.389804
2004-08-06,2585895,1.173487,1.348587,-0.288378,0.003541,0.005511,0.000805,1.186371,1.368299,0.637492,0.843336,-0.659300,0.005440,-0.421348,0.000813,0.638244,0.853088,0.396030,0.611525,-1.123632,0.008496,-0.966126,0.000821,0.411208,0.633622,0.234136,0.439430,-1.718303,0.007815,-1.676731,0.000846,0.270767,0.481275
2004-08-06,2585941,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2004-08-06,2585957,-0.821051,1.268505,0.027006,0.003540,0.087960,0.000805,-0.821200,1.287456,0.637492,0.795100,0.210314,0.005433,-0.230532,0.000813,0.608222,0.824442,0.396030,0.560260,-0.255439,0.008490,-0.750636,0.000821,0.376105,0.602999,0.234136,0.383375,-0.793301,0.007813,-1.429194,0.000846,0.230917,0.447516
2004-08-06,2585970,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-11-23,586093745,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2012-11-23,607745150,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2012-11-23,610592433,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2012-11-23,618650356,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768


In [82]:
# Keep a handle on the (date, tradingitemid) MultiIndex of the transformed training features.
tr_ind = X_train.index

In [83]:
# Tensor dimensions taken from the index levels: dates, trading items, features.
# levshape reports the size of each index level, which may be larger than the
# number of values actually present in X_train (levels can retain unused entries).
tr_shape1, tr_shape2 = tr_ind.levshape[0], tr_ind.levshape[1]
tr_shape3 = len(train_features.columns)

In [87]:
# Allocate an uninitialised (dates, trading items, features) array; its contents
# are arbitrary until overwritten.
train_features_3D = np.empty((tr_shape1, tr_shape2, tr_shape3))

In [88]:
# Check the shape of the freshly allocated array.
train_features_3D.shape

(595, 9695, 32)

In [91]:
# Mark every cell of the array as missing.
train_features_3D.fill(np.nan)

In [79]:
# Shorthand for building MultiIndex slices inside .loc[...].
idx = pd.IndexSlice

In [102]:
# Last rows of the transformed features for trading item 2585941 (all dates, all columns).
X_train.loc[idx[:, 2585941], :].tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,f__buytransactions_1m,f__selltransactions_1m,f__buyshares_1m,f__sellshares_1m,f__buysharespctsharesout_1m,f__sellsharespctsharesout_1m,f__uniquebuyers_1m,f__uniquesellers_1m,f__buytransactions_3m,f__selltransactions_3m,f__buyshares_3m,f__sellshares_3m,f__buysharespctsharesout_3m,f__sellsharespctsharesout_3m,f__uniquebuyers_3m,f__uniquesellers_3m,f__buytransactions_6m,f__selltransactions_6m,f__buyshares_6m,f__sellshares_6m,f__buysharespctsharesout_6m,f__sellsharespctsharesout_6m,f__uniquebuyers_6m,f__uniquesellers_6m,f__buytransactions_12m,f__selltransactions_12m,f__buyshares_12m,f__sellshares_12m,f__buysharespctsharesout_12m,f__sellsharespctsharesout_12m,f__uniquebuyers_12m,f__uniquesellers_12m
date,tradingitemid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
2011-06-17,2585941,1.23801,1.319031,0.980876,0.003537,0.955078,0.000805,1.251316,1.338463,0.653837,0.919789,0.320971,0.005266,0.554744,0.000812,0.676066,0.889178,0.443655,0.698697,-0.036215,0.003249,0.243511,0.000793,0.471217,0.685972,0.388414,0.561065,0.472682,0.0055,0.810183,0.000818,0.389095,0.538987
2011-06-24,2585941,1.23801,1.319031,0.980876,0.003537,0.955078,0.000805,1.251316,1.338463,0.653837,0.919789,0.320971,0.005266,0.554744,0.000812,0.676066,0.889178,0.443655,0.698697,-0.036215,0.003249,0.243511,0.000793,0.471217,0.685972,0.388414,0.561065,0.472682,0.0055,0.810183,0.000818,0.389095,0.538987
2011-07-01,2585941,1.23801,1.319031,0.980876,0.003537,0.955078,0.000805,1.251316,1.338463,0.653837,0.919789,0.320971,0.005266,0.554744,0.000812,0.676066,0.889178,0.443655,0.698697,-0.036215,0.003249,0.243511,0.000793,0.471217,0.685972,0.388414,0.561065,0.472682,0.0055,0.810183,0.000818,0.389095,0.538987
2011-07-08,2585941,1.23801,1.319031,0.980876,0.003537,0.955078,0.000805,1.251316,1.338463,0.653837,0.7951,0.320971,0.005438,0.554744,0.000813,0.676066,0.824442,0.443655,0.698697,-0.036215,0.003249,0.243511,0.000793,0.471217,0.685972,0.360837,0.561065,0.394655,0.0055,0.727258,0.000818,0.389095,0.538987
2011-07-15,2585941,1.23801,1.319031,0.980876,0.003537,0.955078,0.000805,1.251316,1.338463,0.653837,0.7951,0.320971,0.005438,0.554744,0.000813,0.676066,0.824442,0.443655,0.698697,-0.036215,0.003249,0.243511,0.000793,0.471217,0.685972,0.360837,0.561065,0.394655,0.0055,0.727258,0.000818,0.389095,0.538987


In [113]:
# All-zero template with one row per entry of the date level and one column per
# feature; adding a single item's frame to it aligns that item onto the full date axis.
comp_ts_templ = pd.DataFrame(0.0, index=tr_ind.levels[0], columns=X_train.columns)
comp_ts_templ.shape

(595, 32)

In [110]:
# Dates on which trading item 2585941 appears in the training split.
# Queried from X_train directly so this cell does not depend on `df_comp`,
# which is only defined in a later cell (NameError on Restart & Run All).
X_train.loc[idx[:, 2585941], :].index.get_level_values(0).unique()

DatetimeIndex(['2004-08-06', '2004-08-13', '2004-08-20', '2004-08-27',
               '2004-09-03', '2004-09-10', '2004-09-17', '2004-09-24',
               '2004-10-01', '2004-10-08',
               ...
               '2011-05-13', '2011-05-20', '2011-05-27', '2011-06-03',
               '2011-06-10', '2011-06-17', '2011-06-24', '2011-07-01',
               '2011-07-08', '2011-07-15'],
              dtype='datetime64[ns]', name='date', length=334, freq=None)

In [138]:
# Take one trading item, re-index it by its dates only, and add the zero template
# so the result spans the template's rows (rows missing from the item become NaN).
# NOTE(review): the saved output below mixes date and integer row labels, which
# suggests it was produced with the range-indexed template from a later cell —
# re-run the notebook in order to confirm.
df_comp = X_train.loc[idx[:, 2585941], :]
df_comp.index = df_comp.index.get_level_values(0).unique()
added_df_comp = df_comp.add(comp_ts_templ)
added_df_comp

Unnamed: 0,f__buytransactions_1m,f__selltransactions_1m,f__buyshares_1m,f__sellshares_1m,f__buysharespctsharesout_1m,f__sellsharespctsharesout_1m,f__uniquebuyers_1m,f__uniquesellers_1m,f__buytransactions_3m,f__selltransactions_3m,f__buyshares_3m,f__sellshares_3m,f__buysharespctsharesout_3m,f__sellsharespctsharesout_3m,f__uniquebuyers_3m,f__uniquesellers_3m,f__buytransactions_6m,f__selltransactions_6m,f__buyshares_6m,f__sellshares_6m,f__buysharespctsharesout_6m,f__sellsharespctsharesout_6m,f__uniquebuyers_6m,f__uniquesellers_6m,f__buytransactions_12m,f__selltransactions_12m,f__buyshares_12m,f__sellshares_12m,f__buysharespctsharesout_12m,f__sellsharespctsharesout_12m,f__uniquebuyers_12m,f__uniquesellers_12m
2004-08-06 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2004-08-13 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2004-08-20 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2004-08-27 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2004-09-03 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
591,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
592,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
593,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [122]:
# Presence mask over the template rows: 1 where the second feature column has a value, 0 where it is NaN.
added_df_comp.iloc[:, 1].notna().to_numpy().astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

#### Dates will be ignored; only the sequence order will be used

In [127]:
# Re-capture the index of the transformed training features (same assignment as in the earlier cell).
tr_ind = X_train.index

In [149]:
# Tensor dimensions for the sequence layout:
#   tr_shape1 - number of distinct trading items present in X_train
#   tr_shape2 - number of distinct dates present in X_train (used as the padded sequence length)
#   tr_shape3 - number of feature columns
tr_shape1 = X_train.index.get_level_values(1).nunique(dropna=False)
tr_shape2 = X_train.index.get_level_values(0).nunique(dropna=False)
tr_shape3 = len(train_features.columns)

In [150]:
# Allocate an uninitialised (trading items, time steps, features) array; every
# slot is expected to be overwritten by the padding loop further down.
train_features_3D = np.empty((tr_shape1, tr_shape2, tr_shape3))

In [151]:
# Check the shape of the freshly allocated array.
train_features_3D.shape

(8335, 434, 32)

In [152]:
# All-zero template with a default 0..tr_shape2-1 row index: one row per time step,
# one column per feature. Used to pad a single item's sequence to full length.
comp_ts_templ = pd.DataFrame(
    data=np.zeros(shape=(tr_shape2, tr_shape3), dtype=float),
    columns=X_train.columns,
)
comp_ts_templ.shape

(434, 32)

In [155]:
# Disabled single-item prototype of the padding performed by the loop in the next cell.
# df_comp = X_train.loc[idx[:, 2585941], idx[:]]
# df_comp.index = range(len(df_comp.index.get_level_values(0).unique()))
# added_df_comp = (df_comp + comp_ts_templ).fillna(0)
# added_df_comp

In [159]:
%%time
seq_lengths = []
for num, compID in enumerate(X_train.index.get_level_values(1).unique()):
    df_comp = X_train.loc[idx[:, compID], idx[:]]
    df_comp.index = range(len(df_comp.index.get_level_values(0).unique()))
    padded_df_comp = (df_comp + comp_ts_templ).fillna(0)
    train_features_3D[num, :, :] = padded_df_comp
    seq_lengths.append(len(df_comp))

Wall time: 2min 53s


In [162]:
# Sanity check: one sequence length should have been recorded per trading item.
len(seq_lengths)

8335

In [164]:
# Confirm the shape of the filled tensor.
train_features_3D.shape

(8335, 434, 32)