In [1]:
%%capture
!pip install sagemaker==1.72.0

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<h2>Lag Features</h2>

Lag features are created by using previous $w$ data points as additional columns. Since we will remove any rows with NA values, we will also be calculating mean and standard deviation in the same method. Mean and standard deviation are calculated using the last $w$ values of the given property.

In [2]:
import source.load
from source.load import *

data_dict = source.load.load_data()
data = data_dict['data']
symbols = data_dict['symbols']

In [3]:
PROPERTIES = ['market_cap', 'price', 'volume', 'rank', 'market_share', 'age', 'roi']

In [4]:
def create_features(data, W):
    features = []

    for p in PROPERTIES:
        for w in range(1, W+1):
            col_name = "{}_lag_{}".format(p, w)
            data[col_name] = data.groupby(['sym'])[p].shift(w)
            features.append(col_name)

        data[p + '_mean'] = data.groupby(['sym'])[p].shift(1) \
                                    .transform(lambda x: x.rolling(W, min_periods=1).mean())
        data[p + '_std'] = data.groupby(['sym'])[p].shift(1) \
                                    .transform(lambda x: x.rolling(W, min_periods=1).std(ddof=0))
    
    data.dropna(inplace=True)
    
    return {'data': data, 'features': features}

In [5]:
features_dict = create_features(data, W=5)

In [6]:
feat_df = features_dict['data']
features = features_dict['features']

In [7]:
feat_df.query("sym == 'BTC'")[[col for col in feat_df.columns if 'price' in col]].head(10)

Unnamed: 0,price,price_lag_1,price_lag_2,price_lag_3,price_lag_4,price_lag_5,price_mean,price_std
96947,97.75,105.21,116.99,139.0,144.54,134.21,127.99,14.654279
96948,112.5,97.75,105.21,116.99,139.0,144.54,120.698,18.350162
96949,115.91,112.5,97.75,105.21,116.99,139.0,114.29,13.979243
96950,112.3,115.91,112.5,97.75,105.21,116.99,109.672,7.245783
96951,111.5,112.3,115.91,112.5,97.75,105.21,108.734,6.503244
96952,113.57,111.5,112.3,115.91,112.5,97.75,109.992,6.30524
96953,112.67,113.57,111.5,112.3,115.91,112.5,113.156,1.527175
96954,117.2,112.67,113.57,111.5,112.3,115.91,113.19,1.514028
96955,115.24,117.2,112.67,113.57,111.5,112.3,113.448,1.990491
96956,115.0,115.24,117.2,112.67,113.57,111.5,114.036,1.998746


<h2>Feature Normalization</h2>

This part is meant to normalize the feature.

In [8]:
TARGET = 'price'

In [9]:
def scale_col(df, base, col):
    mean = df[base + '_mean']
    std = df[base + '_std']
    std = np.where(std == 0, 0.001, std)
    
    return (df[col] - mean)/std

In [10]:
cols = features + [TARGET]

In [11]:
def normalize(df, W):
    df_scaled = df.copy()
    for p in PROPERTIES:
        df_scaled[p] = scale_col(df_scaled, p, p)
        
        cols = [col for col in df_scaled.columns if p+'_lag' in col]
        
        for col in cols:
            df_scaled[col] = scale_col(df_scaled, p, col)
    
    return df_scaled

In [12]:
feat_normed = normalize(feat_df, W=5)

In [13]:
feat_normed.query("sym == 'BTC'")[[col for col in feat_normed.columns if 'price' in col]].head(10)

Unnamed: 0,price,price_lag_1,price_lag_2,price_lag_3,price_lag_4,price_lag_5,price_mean,price_std
96947,-2.063561,-1.554495,-0.750634,0.751316,1.129363,0.424449,127.99,14.654279
96948,-0.446754,-1.250561,-0.844025,-0.202069,0.997375,1.29928,120.698,18.350162
96949,0.115886,-0.128047,-1.183183,-0.649534,0.193144,1.767621,114.29,13.979243
96950,0.362694,0.860915,0.390296,-1.645371,-0.615806,1.009967,109.672,7.245783
96951,0.425326,0.548342,1.103449,0.579096,-1.689003,-0.541883,108.734,6.503244
96952,0.567464,0.239166,0.366045,0.938584,0.397764,-1.94156,109.992,6.30524
96953,-0.318235,0.271089,-1.084355,-0.560512,1.803329,-0.429551,113.156,1.527175
96954,2.648564,-0.343455,0.250986,-1.116228,-0.587836,1.796532,113.19,1.514028
96955,0.90028,1.884962,-0.390858,0.061291,-0.978653,-0.576742,113.448,1.990491
96956,0.482302,0.602378,1.582993,-0.683429,-0.233146,-1.268796,114.036,1.998746


<h2> Data Split </h2>

In [14]:
def create_sets(df, features, target, W):
    criteria = {'train': "time < @VAL_START", 
                'val': "time >= @VAL_START & time < @TEST_START",
                'trainval': "time <= @TEST_START", 
                'test': "time >= @TEST_START"}
    
    scaled_df = normalize(df, W)

    sets = {}

    for key in criteria:
        sets[key] = {}
        sets[key]['ori'] = df.query(criteria[key])
        sets[key]['scaled'] = scaled_df.query(criteria[key])
        sets[key]['X'] = sets[key]['scaled'][features]
        sets[key]['Y'] = sets[key]['scaled'][target]
        
    return sets

In [15]:
sets = create_sets(feat_df, features, TARGET, W=5)

In [16]:
sets['train']['X']

Unnamed: 0,market_cap_lag_1,market_cap_lag_2,market_cap_lag_3,market_cap_lag_4,market_cap_lag_5,price_lag_1,price_lag_2,price_lag_3,price_lag_4,price_lag_5,...,age_lag_1,age_lag_2,age_lag_3,age_lag_4,age_lag_5,roi_lag_1,roi_lag_2,roi_lag_3,roi_lag_4,roi_lag_5
28003,-0.247769,-0.954532,0.525119,-0.988519,1.665701,0.797886,0.078171,-1.091702,1.361258,-1.145613,...,1.414214,0.707107,0.000000,-0.707107,-1.414214,0.216879,-0.258136,-0.552856,1.781003,-1.186890
28004,0.802869,0.078363,-1.091136,1.357280,-1.147377,0.364099,0.522543,-0.340817,-1.744181,1.198356,...,1.414214,0.707107,0.000000,-0.707107,-1.414214,-0.549574,0.042012,-0.525442,-0.877515,1.910519
28005,-1.668713,0.764456,0.276874,-0.510184,1.137567,1.260158,0.328682,0.482360,-0.355027,-1.716174,...,1.748315,0.291386,-0.194257,-0.679900,-1.165543,1.024361,-0.465008,1.252886,-0.394932,-1.417307
28006,1.176286,-1.655283,0.737267,0.257824,-0.516094,1.069398,1.100365,-0.432508,-0.179609,-1.557646,...,1.382189,0.950255,-0.345547,-0.777482,-1.209416,0.221837,0.888334,-1.209553,1.210228,-1.110846
28007,0.865568,0.760809,-1.881328,0.351159,-0.096208,1.185977,0.576424,0.616178,-1.351616,-1.026963,...,1.209416,0.777482,0.345547,-0.950255,-1.382189,0.159952,-0.106964,0.691962,-1.822766,1.077816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670102,-1.082705,0.163753,-1.129996,1.486347,0.562601,-0.049648,-1.152935,0.384771,-1.208099,2.025911,...,1.414214,0.707107,0.000000,-0.707107,-1.414214,-0.225964,-0.868771,0.619101,-1.121922,1.597555
670103,-0.037826,-0.972064,0.326612,-1.021336,1.704613,0.045000,0.055000,-0.105000,0.118000,-0.113000,...,1.414214,0.707107,0.000000,-0.707107,-1.414214,-0.260025,0.350819,-0.708560,1.743533,-1.125766
670104,0.486666,0.598458,-1.134017,1.274281,-1.225387,0.085400,-0.004600,0.005400,-0.154600,0.068400,...,1.414214,0.707107,0.000000,-0.707107,-1.414214,-1.611244,0.014708,0.452538,-0.306784,1.450782
670105,1.014018,-0.056844,0.065078,-1.824397,0.802145,2.041581,-0.096834,-0.459961,-0.419614,-1.065173,...,1.414214,0.707107,0.000000,-0.707107,-1.414214,0.921454,-1.840242,0.256261,0.820799,-0.158271


<h2> Creating Feature Creation Module </h2>

In [17]:
!pygmentize source/create.py

[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36mmatplotlib[39;49;00m[04m[36m.[39;49;00m[04m[36mpyplot[39;49;00m [34mas[39;49;00m [04m[36mplt[39;49;00m

[34mfrom[39;49;00m [04m[36msource[39;49;00m[04m[36m.[39;49;00m[04m[36mload[39;49;00m [34mimport[39;49;00m *

PROPERTIES = [[33m'[39;49;00m[33mmarket_cap[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mprice[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mvolume[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mrank[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mmarket_share[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mage[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mroi[39;49;00m[33m'[39;49;00m]

[34mdef[39;49;00m [32mcreate_features[39;49;00m(data, W):
    features = []

    [34mfor[39;49;00m p [35min[39;4

In [18]:
from source import create
sets = create.load_sets(data, W=5, target='price')

In [27]:
sets['train']['Y']

28008      1.124231
28009     -2.891847
28010     15.692755
28011      4.369095
28012      1.038355
            ...    
670102    -0.114575
670103     3.530054
670104    63.619098
670105     1.889467
670106     0.176474
Name: price, Length: 25683, dtype: float64