In [20]:
!pip install sagemaker==1.72.0

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_mxnet_p36/bin/python -m pip install --upgrade pip' command.[0m


In [21]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import sklearn.model_selection

import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

session = sagemaker.Session()
role = get_execution_role()

In [22]:
data_df = pd.read_csv("./data/processed_data.csv")

We will be experimenting with the following algorithms for the model:
1. XGBoost
2. Simple Custom Neural Net
3. Long Short Term Memory Networks

Before I work with these models, I will start with the benchmark model first which is based on Simple Moving Average model.

<h2> Simple Moving Average (SMA)</h2>

In Simple Moving Average, the predicted next price is equal to the average of the last $w$ data points. In addition to creating this model, I will be creating the function to implement the trading strategy in this section as well.

In [23]:
import source

In [24]:
w = 50

In [25]:
def predict_sma_prices(data_df, w):
    # Since predicted price is the average of the last w price, we can use create_features function
    features_df = source.create_features(data_df, w)
    features_df.index = pd.to_datetime(features_df['time'])
    
    sma_features_df = features_df[['sym', 'price_4_last', 'price_1_mean', 'target_price']]
    sma_features_df.columns = ['sym', 'previous_price', 'predicted_price', 'actual_price']
    sma_features_df['expected_roi'] = sma_features_df['predicted_price']/sma_features_df['previous_price'] - 1
    sma_features_df['actual_roi'] = sma_features_df['actual_price']/sma_features_df['previous_price'] - 1
    
    return sma_features_df

In [26]:
sma_prices_df = predict_sma_prices(data_df, w)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
sma_prices_df.head()

Unnamed: 0_level_0,sym,previous_price,predicted_price,actual_price,expected_roi,actual_roi
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-30,$$$,2.7e-05,3.4e-05,2.6e-05,0.258519,-0.037037
2016-01-31,$$$,2.6e-05,3.4e-05,2.6e-05,0.306154,0.0
2016-02-01,$$$,2.6e-05,3.4e-05,2.6e-05,0.293077,0.0
2016-02-02,$$$,2.6e-05,3.4e-05,2.6e-05,0.296154,0.0
2016-02-03,$$$,2.6e-05,3.4e-05,2.6e-05,0.299231,0.0


<h2>Trading Strategy</h2>

Now that we have the predicted price for a given time period, we can build a portfolio at each time step so that we can have an idea how much profit can be generated by using the SMA prediction model. The strategy employed to build the portfolio will be standardized accross all the different prediction model we will explore later as well.

The strategy will be to invest in the top $n$ coins with the highest expected ROI and then sell it the following day. To simplify the calculation, we will be ignoring the transaction cost. As the algorithm continues to trade, it will re-evaluate the best value of $n$ for a given time period such that it maximizes the Sharpe ratio based on the predicted prices on the next time step.

We will be trading for 1 year from April 12, 2017 and April 12, 2018.

In [28]:
def calculate_sharpe_ratio(array):
    return np.mean(array)/np.std(array)

def calculate_next_sharpe_ratio(roi_history, new_value):
    new_hist = roi_history + [new_value]
    return np.mean(new_hist)/np.std(new_hist)

def update_mean(mean, t, new_value):
    if t == 0:
        return new_value
    else:
        return (mean * (t - 1) + new_value) / t

def update_std(std, mean, new_mean, t, new_value):
    if t == 0:
        return 0
    else:
        return np.sqrt((std ** 2 * (t - 1) + (new_value - new_mean) * (new_value - mean)) / t)

In [29]:
def evaluate_strategy(predicted_prices_df):
    initial_value = 10000
    total_value = initial_value
    sharpe_ratio = None
    mean_roi = 0
    std_roi = 0
    initial_n = 10
    t = 1
    percent_returns = 0

    dates = list(set([time for time in sma_prices_df.index if time > pd.Timestamp('2017-04-12')]))
    dates.sort()

    for date in dates:

        coins_stats_df = sma_prices_df.loc[date,:]
        coins_stats_df = coins_stats_df.sort_values(by='expected_roi', ascending=False)

        if t != 1:
            sharpe_select_df = []

            coins_stats_df['avg_expected_roi'] = coins_stats_df['expected_roi'].expanding().mean()
            coins_stats_df['new_mean'] = coins_stats_df['avg_expected_roi'].apply(lambda x: update_mean(mean_roi, t, x))
            coins_stats_df['new_std'] = coins_stats_df.apply(lambda row: update_std(std_roi, mean_roi, row['new_mean'], t, row['avg_expected_roi']), axis=1)
            coins_stats_df['sharpe_ratio'] = coins_stats_df['new_mean']/coins_stats_df['new_std']
            coins_stats_df['n'] = np.arange(start=1, stop=(len(coins_stats_df)+1))

            n = coins_stats_df[coins_stats_df['sharpe_ratio']==coins_stats_df['sharpe_ratio'].max()]['n'].values[0]

        else:
            n = initial_n

        n_index = n - 1        
        day_return = sum(coins_stats_df.iloc[:n,:]['actual_roi'] * total_value / n)
        day_roi = day_return/total_value
        total_value += day_return
        percent_returns = (total_value/initial_value - 1 ) * 100

        prev_mean_roi = mean_roi
        mean_roi = update_mean(prev_mean_roi, t, day_roi)
        std_roi = update_std(std_roi, prev_mean_roi, mean_roi, t, day_roi)
        sharpe_ratio = mean_roi/std_roi

        t += 1
        
    print('Sharpe Ratio: {:.4f}, Cumulative Returns: {:.2e}%'.format(sharpe_ratio, percent_returns))    
     
    return sharpe_ratio, percent_returns

In [45]:
def calculate_rmse(sma_prices_df):
    sma_prices_df['error'] = (sma_prices_df['actual_price'] - sma_prices_df['predicted_price']) ** 2
    rmse = np.sqrt(sum(sma_prices_df['error'])/len(sma_prices_df))
    print('RMSE: {:.4f}'.format(rmse))
    return rmse

In [30]:
sharpe_ratio, percent_returns = evaluate_strategy(sma_prices_df)



Sharpe Ratio: 0.1221, Cumulative Returns: 1.28e+51%


In [46]:
rmse = calculate_rmse(sma_prices_df)

RMSE: 7066.5171


<h2> XGBoost </h2>

Testing on XGBoost algorithm. We will experiement using different values of $w$.

In [31]:
w = 50
features_df = source.create_features(data_df, w)
features_df['time'] = pd.to_datetime(features_df['time'])

In [32]:
xgb_X_pd = pd.DataFrame(features_df.iloc[:,2:-2])
xgb_Y_pd = pd.DataFrame(features_df['target_price'])

In [33]:
xgb_X_train = xgb_X_pd[features_df['time'] > pd.Timestamp('2017-04-12')]
xgb_Y_train = xgb_Y_pd[features_df['time'] > pd.Timestamp('2017-04-12')]

xgb_X_train, xgb_X_val, xgb_Y_train, xgb_Y_val = sklearn.model_selection.train_test_split(xgb_X_train, xgb_Y_train, test_size=0.33)

xgb_X_test = xgb_X_pd[features_df['time'] <= pd.Timestamp('2017-04-12')]
xgb_Y_test = xgb_Y_pd[features_df['time'] <= pd.Timestamp('2017-04-12')]

In [34]:
data_dir = './data'

In [48]:
pd.concat([xgb_Y_train, xgb_X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)
pd.concat([xgb_Y_val, xgb_X_val], axis=1).to_csv(os.path.join(data_dir, 'val.csv'), header=False, index=False)
pd.concat([xgb_X_test], axis=1).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [None]:
prefix = 'xgb-w1'

val_location = session.upload_data(os.path.join(data_dir, 'val.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)

In [37]:
container = get_image_uri(session.boto_region_name, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [39]:
xgb = sagemaker.estimator.Estimator(container, # The name of the training container
                                    role,      # The IAM role to use (our current role in this case)\
                                    train_instance_count=1, # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge', # The type of instance ot use for training
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix), # Where to save the output (the model artifacts)
                                    sagemaker_session=session) # The current SageMaker session

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [40]:
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=20,
                        num_round=500)

In [41]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-01-22 15:15:37 Starting - Starting the training job...
2021-01-22 15:15:39 Starting - Launching requested ML instances......
2021-01-22 15:16:42 Starting - Preparing the instances for training......
2021-01-22 15:17:40 Downloading - Downloading input data...
2021-01-22 15:18:34 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2021-01-22:15:18:35:INFO] Running standalone xgboost training.[0m
[34m[2021-01-22:15:18:35:INFO] File size need to be processed in the node: 119.22mb. Available memory size in the node: 8423.94mb[0m
[34m[2021-01-22:15:18:35:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:18:35] S3DistributionType set as FullyReplicated[0m
[34m[15:18:35] 187760x30 matrix with 5632800 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-01-22:15:18:35:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:18:35] S3DistributionType set as FullyReplicated[0m
[3

In [None]:
#Testing Performance
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: xgboost-2021-01-22-15-15-36-930


.......

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
# plt.scatter(Y_test, Y_pred)
# plt.xlabel("Median Price")
# plt.ylabel("Predicted Price")
# plt.title("Median Price vs Predicted Price")

In [None]:
#Deployment
# xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
# xgb_predictor.content_type = 'text/csv'
# xgb_predictor.serializer = csv_serializer

In [None]:
# Y_pred = xgb_predictor.predict(X_test.values).decode('utf-8')
# Y_pred = np.fromstring(Y_pred, sep=',')

# xgb_predictor.delete_endpoint()