In [None]:
!pip install sagemaker==1.72.0 

In [52]:
%matplotlib inline
import os
import ast
import json
import numpy as np
import pandas as pd
import source

import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import sklearn.model_selection

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.sklearn.estimator import SKLearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from IPython.display import Audio
sound_file = './sound/beep.wav'

import warnings
warnings.filterwarnings('ignore')

session = sagemaker.Session()
role = get_execution_role()

<h2> Data Processing </h2>

ROI is calculated using next price because we are trying to predict future ROI using the data from current time period.
Referred to https://github.com/NGYB/Stocks/blob/master/StockPricePrediction/StockPricePrediction_v1c_xgboost.ipynb

In [None]:
DATA_DIR = './data'

In [4]:
data = pd.read_csv(os.path.join(DATA_DIR, 'crypto-historical-data.csv'), 
                   parse_dates=['time'], 
                   index_col=0, 
                   keep_default_na=False,
                   header=0,
                   names=['market_cap', 'name', 'price', 'sym', 'time', 'volume'])

data['rank'] = data.groupby("time")["market_cap"] \
                    .rank("dense", ascending=False) \
                    .astype(int)

data['market_share'] = data.groupby('time')["market_cap"] \
                    .apply(lambda x: x/float(x.sum()))

data['age'] = data.groupby(['sym'])["time"] \
                    .apply(lambda x: x - min(x)) \
                    .dt.days

next_price = data.groupby(['sym'])['price'].shift(-1)
data['roi'] = next_price/data['price'] - 1

In [5]:
data.head(10)

Unnamed: 0,market_cap,name,price,sym,time,volume,rank,market_share,age,roi
0,20461600.0,Viberate,0.114889,VIB,2018-04-01,4702470.0,250,7.8e-05,176,0.086658
1,19204400.0,Viberate,0.124845,VIB,2018-04-02,3688650.0,260,7.6e-05,177,0.069118
2,20825800.0,Viberate,0.133474,VIB,2018-04-03,3681530.0,253,7.9e-05,178,-0.0878
3,22260000.0,Viberate,0.121755,VIB,2018-04-04,5583970.0,250,7.9e-05,179,-0.028278
4,20086900.0,Viberate,0.118312,VIB,2018-04-05,2824800.0,251,7.9e-05,180,-0.016727
5,19692900.0,Viberate,0.116333,VIB,2018-04-06,2941990.0,257,7.7e-05,181,0.062665
6,19452600.0,Viberate,0.123623,VIB,2018-04-07,2648150.0,256,7.8e-05,182,0.027988
7,20606400.0,Viberate,0.127083,VIB,2018-04-08,4403120.0,257,8e-05,183,-0.015588
8,21145500.0,Viberate,0.125102,VIB,2018-04-09,4402010.0,257,8e-05,184,0.065403
9,21131200.0,Viberate,0.133284,VIB,2018-04-10,4475740.0,257,8.2e-05,185,0.0816


<h2> Data Selection </h2>

In [6]:
filtered = data.query('volume > 100000')

In [7]:
filtered.head()

Unnamed: 0,market_cap,name,price,sym,time,volume,rank,market_share,age,roi
0,20461600.0,Viberate,0.114889,VIB,2018-04-01,4702470.0,250,7.8e-05,176,0.086658
1,19204400.0,Viberate,0.124845,VIB,2018-04-02,3688650.0,260,7.6e-05,177,0.069118
2,20825800.0,Viberate,0.133474,VIB,2018-04-03,3681530.0,253,7.9e-05,178,-0.0878
3,22260000.0,Viberate,0.121755,VIB,2018-04-04,5583970.0,250,7.9e-05,179,-0.028278
4,20086900.0,Viberate,0.118312,VIB,2018-04-05,2824800.0,251,7.9e-05,180,-0.016727


<h2> Feature Engineering </h2>

In [27]:
feat = filtered.copy()

In [46]:
W = 3
TARGET = 'roi'

In [77]:
properties = ['market_cap', 'price', 'volume', 'rank', 'market_share', 'age', 'roi']
feat_columns = []

for p in properties:
    if p != TARGET:
        feat_columns.append(p)
        
        for w in range(1, W+1):
            col_name = "{}_lag_{}".format(p, w)
            feat[col_name] = feat.groupby(['sym'])[p].shift(w)
            feat_columns.append(col_name)
    
    feat[p + '_mean'] = feat[p].shift(1).rolling(w, min_periods=1).mean()
    feat[p + '_std'] = feat[p].shift(1).rolling(w, min_periods=1).std()
    print(p, len(feat.columns))

feat.dropna(inplace=True)

market_cap 42
price 42
volume 42
rank 42
market_share 42
age 42
roi 42


In [78]:
feat.head()

Unnamed: 0,market_cap,name,price,sym,time,volume,rank,market_share,age,roi,...,market_share_lag_3,market_share_mean,market_share_std,age_lag_1,age_lag_2,age_lag_3,age_mean,age_std,roi_mean,roi_std
12,19637600.0,Viberate,0.131197,VIB,2017-12-01,2562970.0,191,6.6e-05,55,0.020496,...,8.2e-05,8.5e-05,3.343084e-06,187.0,186.0,185.0,186.0,1.0,0.000192,0.124536
13,20889700.0,Viberate,0.133886,VIB,2017-12-02,1933420.0,193,6.5e-05,56,0.086768,...,8.5e-05,8e-05,1.237917e-05,55.0,187.0,186.0,142.666667,75.923207,-0.020176,0.108532
14,21623400.0,Viberate,0.145503,VIB,2017-12-03,3292750.0,193,6.7e-05,57,0.130685,...,8.8e-05,7.3e-05,1.337001e-05,56.0,55.0,187.0,99.333333,75.923207,-0.011969,0.118357
15,23451300.0,Viberate,0.164518,VIB,2017-12-04,4313180.0,186,7.1e-05,58,0.051873,...,6.6e-05,6.6e-05,9.092321e-07,57.0,56.0,55.0,56.0,1.0,0.079316,0.055471
16,26792700.0,Viberate,0.173052,VIB,2017-12-05,7439160.0,182,7.8e-05,59,-0.132041,...,6.5e-05,6.8e-05,2.866954e-06,58.0,57.0,56.0,57.0,1.0,0.089775,0.039492


<h2> Data Split </h2>

In [79]:
VAL_START = pd.Timestamp('2017-04-25')
TEST_START = pd.Timestamp('2017-10-25')

In [80]:
train = feat.query("time < @VAL_START")
val = feat.query("time >= @VAL_START & time <= @TEST_START")
trainval = feat.query("time <= @TEST_START")
test = feat.query("time >= @TEST_START")

In [81]:
cols_to_scale = feat_columns + [TARGET]

In [82]:
scaler_train = StandardScaler()
train_cols_scaled = scaler_train.fit_transform(train[cols_to_scale])
train_scaled = train.copy()
train_scaled[cols_to_scale] = train_cols_scaled

In [83]:
train_scaled.describe()

Unnamed: 0,market_cap,price,volume,rank,market_share,age,roi,market_cap_lag_1,market_cap_lag_2,market_cap_lag_3,...,market_share_lag_3,market_share_mean,market_share_std,age_lag_1,age_lag_2,age_lag_3,age_mean,age_std,roi_mean,roi_std
count,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,...,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0,15349.0
mean,-8.812129e-16,-4.691595e-16,5.577516e-16,2.427807e-15,-2.989841e-16,5.365728e-16,1.694448e-16,-5.0283360000000006e-17,2.843278e-16,1.599983e-16,...,7.387225e-16,0.07054671,0.02406284,1.952312e-16,3.163148e-16,6.568319e-16,819.667275,363.425303,5.287715,9.588665
std,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,...,1.000033,0.1998919,0.06049671,1.000033,1.000033,1.000033,338.466594,226.503117,15.794139,22.564645
min,-0.2945563,-0.2811341,-0.2398116,-0.6821502,-0.3321423,-1.426204,-0.3672089,-0.2226048,-0.2152955,-0.1976042,...,-0.3355329,3.17266e-07,2.016534e-08,-1.720289,-1.54042,-1.816314,1.0,1.0,-0.999994,0.0
25%,-0.2908312,-0.2810321,-0.2370712,-0.5619743,-0.3293778,-0.9160326,-0.3480693,-0.221539,-0.2147139,-0.1967107,...,-0.3328704,0.0008110201,0.0002419117,-0.8278275,-0.8922167,-0.8256658,551.666667,210.444767,-0.191408,0.473801
50%,-0.2835729,-0.2781291,-0.2289777,-0.3616812,-0.3215536,-0.1190487,-0.3041624,-0.2157839,-0.2130941,-0.1916125,...,-0.3239079,0.002711993,0.001540087,-0.09613289,-0.001937021,-0.01146333,860.0,365.0,1.108811,2.65771
75%,-0.23642,-0.2565055,-0.1856525,0.1190223,-0.2734137,0.8479923,-0.04191462,-0.1798675,-0.1986475,-0.1681686,...,-0.2614209,0.02042991,0.007776341,0.8581618,0.8223219,0.8112425,1088.666667,557.54671,5.477742,10.112504
max,8.614784,12.76354,22.6979,14.21966,3.802569,2.271903,25.68387,14.63427,14.64107,15.4829,...,4.17561,0.8755433,0.32372,2.010323,2.080717,2.031483,1782.0,860.740573,716.927901,530.450552


<h2> Data Scaling </h2>

In [84]:
scaler_trainval = StandardScaler()
trainval_cols_scaled = scaler_trainval.fit_transform(trainval[cols_to_scale])
trainval_scaled = trainval.copy()
trainval_scaled[cols_to_scale] = trainval_cols_scaled

In [104]:
def scale_col(df, base, col):
    mean = df[base+'_mean']
    std = df[base+'_std']
    std = np.where(std == 0, 0.001, std)
    return (df[col] - mean)/std

In [107]:
val_scaled = val.copy()
for p in properties:
    val_scaled[p] = scale_col(val_scaled, p, p)
    
    if p != TARGET:
        for w in range(1, W+1):
            col_name = "{}_lag_{}".format(p, w)
            val_scaled[col_name] = scale_col(val_scaled, p, col_name)

In [109]:
test_scaled = test.copy()
for p in properties:
    test_scaled[p] = scale_col(test_scaled, p, p)
    
    if p != TARGET:
        for w in range(1, W+1):
            col_name = "{}_lag_{}".format(p, w)
            test_scaled[col_name] = scale_col(test_scaled, p, col_name)

In [110]:
test_scaled.describe()

Unnamed: 0,market_cap,price,volume,rank,market_share,age,roi,market_cap_lag_1,market_cap_lag_2,market_cap_lag_3,...,market_share_lag_3,market_share_mean,market_share_std,age_lag_1,age_lag_2,age_lag_3,age_mean,age_std,roi_mean,roi_std
count,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,...,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0,53362.0
mean,3.245548,2.875137,-613.9197,1320.924,0.246209,2.64662,5.715785,-0.036055,0.034318,0.001737,...,0.116395,0.003988125,0.0006738116,0.814818,0.035904,-0.850723,427.203428,40.268228,2.569417,2.299949
std,89.556781,66.715254,145282.0,115590.5,19.801757,26.763291,449.218315,0.851155,0.725022,0.863844,...,0.856494,0.0482542,0.005508658,0.517288,0.354627,0.466603,456.354837,120.356021,34.373764,15.201654
min,-355.026898,-640.0,-33560330.0,-870457.8,-438.74357,-149.00001,-275.900714,-1.154701,-1.1547,-1.154701,...,-1.154701,9.672461e-08,4.946122e-09,-1.1547,-1.1547,-1.1547,1.0,1.0,-0.999993,0.0
25%,-1.111662,-1.194636,-0.9480582,-0.5913398,-1.310811,2.0,-1.085014,-0.87189,-0.578623,-0.852553,...,-0.744055,4.850539e-05,2.444393e-06,1.0,0.0,-1.0,86.0,1.0,-0.643318,0.016159
50%,0.184176,0.055623,-0.2139113,0.5773503,-0.372537,2.0,-0.276396,-0.166162,0.008778,-0.073247,...,0.317694,0.0001290995,7.755005e-06,1.0,0.0,-1.0,205.0,1.0,-0.024742,0.073705
75%,1.253692,1.210502,1.171766,1.456986,0.92381,2.0,0.809293,0.86572,0.663962,0.918409,...,0.981662,0.0004173574,3.203377e-05,1.0,0.0,-1.0,699.583333,1.0,0.043912,0.167536
max,13829.715713,10403.841667,49047.85,18017130.0,3383.524787,1463.000088,59658.708485,1.154701,1.154701,1.154705,...,1.154701,0.8925229,0.1365649,1.1547,1.1547,1.1547,1790.0,875.555252,2023.532953,896.258199


In [111]:
test_scaled.head()

Unnamed: 0,market_cap,name,price,sym,time,volume,rank,market_share,age,roi,...,market_share_lag_3,market_share_mean,market_share_std,age_lag_1,age_lag_2,age_lag_3,age_mean,age_std,roi_mean,roi_std
12,-1.904294,Viberate,-1.240722,VIB,2017-12-01,-1.427527,-15.5,-5.854664,-131.0,0.163035,...,-1.001655,8.5e-05,3.343084e-06,1.0,0.0,-1.0,186.0,1.0,0.000192,0.124536
13,-0.514127,Viberate,-0.811061,VIB,2017-12-02,-1.114489,-1.095142,-1.183577,-1.141504,0.985361,...,0.43776,8e-05,1.237917e-05,-1.154675,0.583923,0.570752,142.666667,75.923207,-0.020176,0.108532
14,0.021833,Viberate,0.510388,VIB,2017-12-03,1.403376,-0.546711,-0.46377,-0.557581,1.205284,...,1.154509,7.3e-05,1.337001e-05,-0.570752,-0.583923,1.154675,99.333333,75.923207,-0.011969,0.118357
15,2.723196,Viberate,3.637441,VIB,2017-12-04,2.523665,-5.484828,5.34174,2.0,-0.494733,...,-0.288405,6.6e-05,9.092321e-07,1.0,0.0,-1.0,56.0,1.0,0.079316,0.055471
16,3.642101,Viberate,1.622007,VIB,2017-12-05,3.567631,-2.144444,3.500104,2.0,-5.616756,...,-0.856544,6.8e-05,2.866954e-06,1.0,0.0,-1.0,57.0,1.0,0.089775,0.039492


In [112]:
train_X = train_scaled[feat_columns]
train_Y = train_scaled[[TARGET]]

trainval_X = trainval_scaled[feat_columns]
trainval_Y = trainval_scaled[[TARGET]]

val_X = val_scaled[feat_columns]
val_Y = val_scaled[[TARGET]]

test_X = test_scaled[feat_columns]
test_Y = test_scaled[[TARGET]]

<h2> Training </h2>

In [None]:
prefix = 'xgboost-new'

pd.concat([train_Y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([val_Y, val_X], axis=1).to_csv(os.path.join(data_dir, 'val.csv'), header=False, index=False)

train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'val.csv'), key_prefix=prefix)

s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='text/csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='text/csv')

In [None]:
container = get_image_uri(session.boto_region_name, 'xgboost')