In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import sklearn.model_selection

import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

session = sagemaker.Session()
role = get_execution_role()

In [2]:
data_df = pd.read_csv("./data/processed_data.csv")

We will be experimenting with the following algorithms for the model:
1. XGBoost
2. Simple Custom Neural Net
3. Long Short Term Memory Networks

Before I work with these models, I will start with the benchmark model first which is based on Simple Moving Average model.

<h2> Simple Moving Average (SMA)</h2>

In Simple Moving Average, the predicted next price is equal to the average of the last $w$ data points. In addition to creating this model, I will be creating the function to implement the trading strategy in this section as well.

In [3]:
import source

In [4]:
w = 50

In [5]:
def predict_sma_prices(data_df, w):
    # Since predicted price is the average of the last w price, we can use create_features function
    features_df = source.create_features(data_df, w)
    features_df.index = pd.to_datetime(features_df['time'])
    
    sma_features_df = features_df[['sym', 'price_4_last', 'price_1_mean', 'target_price']]
    sma_features_df.columns = ['sym', 'previous_price', 'predicted_price', 'actual_price']
    sma_features_df['expected_roi'] = sma_features_df['predicted_price']/sma_features_df['previous_price'] - 1
    sma_features_df['actual_roi'] = sma_features_df['actual_price']/sma_features_df['previous_price'] - 1
    
    return sma_features_df

In [6]:
sma_prices_df = predict_sma_prices(data_df, w)

In [7]:
sma_prices_df.head()

Unnamed: 0_level_0,sym,previous_price,predicted_price,actual_price,expected_roi,actual_roi
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-30,$$$,2.7e-05,3.4e-05,2.6e-05,0.258519,-0.037037
2016-01-31,$$$,2.6e-05,3.4e-05,2.6e-05,0.306154,0.0
2016-02-01,$$$,2.6e-05,3.4e-05,2.6e-05,0.293077,0.0
2016-02-02,$$$,2.6e-05,3.4e-05,2.6e-05,0.296154,0.0
2016-02-03,$$$,2.6e-05,3.4e-05,2.6e-05,0.299231,0.0


Now that we have the predicted price for a given time period, we can build a portfolio at each time step so that we can have an idea how much profit can be generated by using the SMA prediction model. The strategy employed to build the portfolio will be standardized accross all the different prediction model we will explore later as well.

The strategy will be to invest in the top $n$ coins with the highest expected ROI and then sell it the following day. To simplify the calculation, we will be ignoring the transaction cost. As the algorithm continues to trade, it will re-evaluate the best value of $n$ for a given time period to maximize the Sharpe ratio.

We will be trading for 1 year from April 24, 2016 and April 24, 2018.

In [8]:
def calculate_sharpe_ratio(array):
    return np.mean(array)/np.std(array)

def calculate_next_sharpe_ratio(roi_history, new_value):
    new_hist = roi_history + [new_value]
    return np.mean(new_hist)/np.std(new_hist)

def update_mean(mean, t, new_value):
    if t == 0:
        return new_value
    else:
        return (mean * (t - 1) + new_value) / t

def update_std(std, mean, new_mean, t, new_value):
    if t == 0:
        return 0
    else:
        return np.sqrt((std ** 2 * (t - 1) + (new_value - new_mean) * (new_value - mean)) / t)

In [82]:
initial_value = 10000
total_value = initial_value
sharpe_ratio = None
mean_roi = 0
std_roi = 0
initial_n = 10
t = 1
percent_returns = 0

dates = list(set([time for time in sma_prices_df.index if time > pd.Timestamp('2016-04-24')]))
dates.sort()

for date in dates:
    
    coins_stats_df = sma_prices_df.loc[date,:]
    coins_stats_df = coins_stats_df.sort_values(by='expected_roi', ascending=False)
    
    if t != 1:
        sharpe_select_df = []
        
        coins_stats_df['avg_expected_roi'] = coins_stats_df['expected_roi'].expanding().mean()
        coins_stats_df['new_mean'] = coins_stats_df['avg_expected_roi'].apply(lambda x: update_mean(mean_roi, t, x))
        coins_stats_df['new_std'] = coins_stats_df.apply(lambda row: update_std(std_roi, mean_roi, row['new_mean'], t, row['avg_expected_roi']), axis=1)
        coins_stats_df['sharpe_ratio'] = coins_stats_df['new_mean']/coins_stats_df['new_std']
        coins_stats_df['n'] = np.arange(start=1, stop=(len(coins_stats_df)+1))
        
        n = coins_stats_df[coins_stats_df['sharpe_ratio']==coins_stats_df['sharpe_ratio'].max()]['n'].values[0]
        
    else:
        n = initial_n
        
    n_index = n - 1        
    day_return = sum(coins_stats_df.iloc[:n,:]['actual_roi'] * total_value / n)
    day_roi = day_return/total_value
    total_value += day_return
    percent_returns = (total_value/initial_value - 1 ) * 100
    
    prev_mean_roi = mean_roi
    mean_roi = update_mean(prev_mean_roi, t, day_roi)
    std_roi = update_std(std_roi, prev_mean_roi, mean_roi, t, day_roi)
    sharpe_ratio = mean_roi/std_roi

print('Date: {}; Total: {:.2f}; Day Change: {:.2f}; n: {}, Cum. Returns: {:.2f}%, Sharpe: {:.2f}'.format(date, total_value, day_return, n, percent_returns, sharpe_ratio))

Date: 2018-04-12 00:00:00; Total: 6437851911952914007319257872467999800316671803338861425377750284822773363405751094165316087615183187351916362027488226062559955919044608.00; Day Change: 947072580399698926753895054354188840028677523177118646270814884999241665832782055024301289775064822988532778762501977971920940667240448.00; n: 10, Cum. Returns: 64378519119529145318387643588713070741752536164506483985992394532587994071334328914843643097860508464986287470554852364803917628833792.00%, Sharpe: inf


In [None]:
X_bos_pd = pd.DataFrame(boston.data, columns=boston.feature_names)
Y_bos_pd = pd.DataFrame(boston.target)

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X_bos_pd, Y_bos_pd, test_size=0.33)
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(X_train, Y_train, test_size=0.33)

In [None]:
data_dir = '../data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [None]:
pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [None]:
prefix = 'boston-xgboost-deploy-hl'

val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [None]:
container = get_image_uri(session.boto_region_name, 'xgboost')

In [None]:
xgb = sagemaker.estimator.Estimator(container, # The name of the training container
                                    role,      # The IAM role to use (our current role in this case)
                                    train_instance_count=1, # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge', # The type of instance ot use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix), # Where to save the output (the model artifacts)
                                    sagemaker_session=session) # The current SageMaker session

In [None]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        num_round=200)

In [None]:
#Testing Performance
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
plt.scatter(Y_test, Y_pred)
plt.xlabel("Median Price")
plt.ylabel("Predicted Price")
plt.title("Median Price vs Predicted Price")

In [None]:
#Deployment
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

In [None]:
Y_pred = xgb_predictor.predict(X_test.values).decode('utf-8')
Y_pred = np.fromstring(Y_pred, sep=',')

xgb_predictor.delete_endpoint()