# We will use the SageMaker Linear Learner model to forecast whether Matson's stock price will go up or down. We train the model on data from 2001 - 2019 and test it on data from 2020. This is a binary classification problem.

Data set: https://finance.yahoo.com/quote/MATX/history?p=MATX



In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime               
from sagemaker.predictor import csv_serializer   

from sklearn.preprocessing import MinMaxScaler

In [2]:
# IAM execution role; it is handed to the SageMaker estimator further down

role = get_execution_role()

# AWS region of the current boto3 session (the region this notebook instance is configured for)

my_region = boto3.session.Session().region_name

In [3]:
import s3fs

In [4]:
# Create an s3fs filesystem handle for browsing the project bucket
fs = s3fs.S3FileSystem()

In [5]:
# List the objects under the project prefix to confirm the train/test CSVs are present
fs.ls(path='sukesh-ml-sagemaker/Matson-Stock-Prediction')

['sukesh-ml-sagemaker/Matson-Stock-Prediction/MATX_Test.csv',
 'sukesh-ml-sagemaker/Matson-Stock-Prediction/MATX_Train.csv']

In [6]:
# Load data from CSV to Pandas dataframe, training has data from 2001 - 2019 and test has data of 2020

In [16]:
# Read the training CSV straight from S3 into a DataFrame
df_train = pd.read_csv('s3://sukesh-ml-sagemaker/Matson-Stock-Prediction/MATX_Train.csv')

In [17]:
# Read the held-out test CSV straight from S3 into a DataFrame
df_test = pd.read_csv('s3://sukesh-ml-sagemaker/Matson-Stock-Prediction/MATX_Test.csv')

In [18]:
# Preview the first rows of the raw training data
df_train.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2001-01-02,12.856594,13.396031,12.197281,12.257218,4.932734,306900
1,2001-01-03,12.437031,13.366062,12.257218,13.306125,5.354849,248300
2,2001-01-04,13.306125,13.306125,12.317156,12.916532,5.198065,216600
3,2001-01-05,12.129851,12.63932,11.994992,12.347125,4.968913,158400
4,2001-01-08,12.347125,13.845563,12.347125,13.66575,5.499575,919900


In [19]:
# Preview the first rows of the raw test data
df_test.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-01-02,41.18,41.18,40.07,40.310001,40.066036,130900
1,2020-01-03,39.490002,40.029999,39.490002,39.73,39.489544,194000
2,2020-01-06,39.32,39.389999,38.93,39.299999,39.062145,119000
3,2020-01-07,39.029999,39.209999,38.549999,38.93,38.694386,109600
4,2020-01-08,38.990002,39.360001,38.82,38.84,38.604931,85100


In [20]:
# Trend label: 1 when the day closes above its open (Close > Open), else 0.
# Days where Close == Open are labelled 0 because the comparison is strict.
# Use the builtin int here: the np.int alias was deprecated in NumPy 1.20 and
# removed in 1.24, where referencing it raises AttributeError.

df_train['y'] = (df_train.Open < df_train.Close).astype(int)
df_test['y'] = (df_test.Open < df_test.Close).astype(int)

In [21]:
# Check that the new 'y' label column was added as expected
df_train.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,y
0,2001-01-02,12.856594,13.396031,12.197281,12.257218,4.932734,306900,0
1,2001-01-03,12.437031,13.366062,12.257218,13.306125,5.354849,248300,1
2,2001-01-04,13.306125,13.306125,12.317156,12.916532,5.198065,216600,0
3,2001-01-05,12.129851,12.63932,11.994992,12.347125,4.968913,158400,1
4,2001-01-08,12.347125,13.845563,12.347125,13.66575,5.499575,919900,1


In [31]:
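# Add lagged copies (1 to 6 rows back) of Open/High/Low/Close so that every row also carries the previous six trading days' prices. Note: lagging alone does not make the series stationary.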

def add_lags(df, n_lags=6, lag_cols=('Open', 'High', 'Low', 'Close')):
    """Build a lagged feature frame and the matching label series.

    For every column in ``lag_cols`` and every lag ``i`` in ``1..n_lags`` a
    new column ``"<col>_<i>"`` holding that column shifted down by ``i`` rows
    is appended. The 'Date' column is dropped, and rows containing any NaN
    (which includes the first ``n_lags`` rows, whose lags are undefined) are
    removed.

    Unlike the earlier implementation, the input frame is NOT modified, so
    callers no longer need to pass a defensive copy (doing so is still fine).

    Args:
        df: DataFrame containing 'Date', 'y' and every column in ``lag_cols``.
        n_lags: Number of lagged copies to create per column (default 6).
        lag_cols: Names of the columns to lag.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is every column except
        'Date' and 'y' (original columns first, then the lag columns grouped
        by lag); ``labels`` is the 'y' column restricted to the same rows.

    Raises:
        KeyError: If 'Date', 'y' or any column in ``lag_cols`` is missing.

    NOTE(review): the un-lagged ``lag_cols`` stay in the feature set. When
    'y' is computed from the same row's Open/Close, those same-row columns
    reveal the label to the model - confirm this is intended.
    """
    # drop() returns a new frame, leaving the caller's object untouched.
    frame = df.drop(columns=['Date'])

    # Lag-major ordering (Open_1, High_1, ..., Close_1, Open_2, ...) matches
    # the column order the training pipeline was built around.
    lagged = {
        f"{col}_{i}": frame[col].shift(i)
        for i in range(1, n_lags + 1)
        for col in lag_cols
    }
    frame = frame.assign(**lagged).dropna()

    return frame.drop(columns=['y']), frame['y']

In [32]:
# Build the lagged feature frames and label series for train and test.
# Copies are passed so the raw df_train / df_test frames stay untouched.

df_tr_with_lags, labels = add_lags(df_train.copy())
df_te_with_lags, test_labels = add_lags(df_test.copy())

In [33]:
# Inspect the last ten rows of the lagged training features
df_tr_with_lags.tail(10)

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Open_1,High_1,Low_1,Close_1,...,Low_4,Close_4,Open_5,High_5,Low_5,Close_5,Open_6,High_6,Low_6,Close_6
4767,37.52,37.73,37.049999,37.279999,37.054371,135900,36.939999,38.139999,36.810001,37.41,...,37.080002,37.34,37.419998,37.900002,37.419998,37.57,37.169998,37.240002,36.630001,36.869999
4768,37.98,39.889999,37.98,39.52,39.280815,290300,37.52,37.73,37.049999,37.279999,...,36.950001,37.040001,37.389999,37.689999,37.080002,37.34,37.419998,37.900002,37.419998,37.57
4769,39.459999,39.599998,38.889999,39.259998,39.022388,162600,37.98,39.889999,37.98,39.52,...,36.799999,37.060001,37.299999,37.540001,36.950001,37.040001,37.389999,37.689999,37.080002,37.34
4770,39.400002,39.709999,38.849998,39.619999,39.380207,93800,39.459999,39.599998,38.889999,39.259998,...,36.810001,37.41,37.09,37.330002,36.799999,37.060001,37.299999,37.540001,36.950001,37.040001
4771,39.540001,40.48,39.540001,40.380001,40.135612,103400,39.400002,39.709999,38.849998,39.619999,...,37.049999,37.279999,36.939999,38.139999,36.810001,37.41,37.09,37.330002,36.799999,37.060001
4772,40.59,41.310001,40.59,40.860001,40.612705,1131600,39.540001,40.48,39.540001,40.380001,...,37.98,39.52,37.52,37.73,37.049999,37.279999,36.939999,38.139999,36.810001,37.41
4773,40.830002,41.150002,40.57,40.959999,40.712097,121500,40.59,41.310001,40.59,40.860001,...,38.889999,39.259998,37.98,39.889999,37.98,39.52,37.52,37.73,37.049999,37.279999
4774,40.919998,40.919998,40.43,40.740002,40.493431,46500,40.830002,41.150002,40.57,40.959999,...,38.849998,39.619999,39.459999,39.599998,38.889999,39.259998,37.98,39.889999,37.98,39.52
4775,40.75,41.040001,40.549999,40.970001,40.722042,84300,40.919998,40.919998,40.43,40.740002,...,39.540001,40.380001,39.400002,39.709999,38.849998,39.619999,39.459999,39.599998,38.889999,39.259998
4776,41.110001,41.290001,40.93,41.099998,40.85125,126300,40.75,41.040001,40.549999,40.970001,...,40.59,40.860001,39.540001,40.48,39.540001,40.380001,39.400002,39.709999,38.849998,39.619999


In [34]:
# Inspect the first rows of the lagged test features
df_te_with_lags.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Open_1,High_1,Low_1,Close_1,...,Low_4,Close_4,Open_5,High_5,Low_5,Close_5,Open_6,High_6,Low_6,Close_6
6,38.310001,38.77,38.18,38.389999,38.157654,84500,39.110001,39.110001,38.32,38.470001,...,38.93,39.299999,39.490002,40.029999,39.490002,39.73,41.18,41.18,40.07,40.310001
7,38.41,39.040001,38.360001,38.959999,38.724205,76800,38.310001,38.77,38.18,38.389999,...,38.549999,38.93,39.32,39.389999,38.93,39.299999,39.490002,40.029999,39.490002,39.73
8,38.799999,39.5,38.639999,39.139999,38.903114,85600,38.41,39.040001,38.360001,38.959999,...,38.82,38.84,39.029999,39.209999,38.549999,38.93,39.32,39.389999,38.93,39.299999
9,38.919998,39.59,38.919998,39.380001,39.141663,120000,38.799999,39.5,38.639999,39.139999,...,38.32,38.470001,38.990002,39.360001,38.82,38.84,39.029999,39.209999,38.549999,38.93
10,39.860001,40.5,39.459999,40.09,39.847366,177900,38.919998,39.59,38.919998,39.380001,...,38.18,38.389999,39.110001,39.110001,38.32,38.470001,38.990002,39.360001,38.82,38.84


In [35]:
# Fit a min-max scaler on the training features only, so each column is
# mapped onto the [0, 1] range using the training minimum and maximum.

scaler = MinMaxScaler(feature_range=(0, 1)).fit(df_tr_with_lags.values)

In [36]:
# Scale train and test with the scaler that was fitted on the training data.
# Plain arrays (.values) are passed to match how the scaler was fitted;
# handing it DataFrames makes newer scikit-learn releases warn that the input
# has feature names the scaler never saw. transform() already returns an
# ndarray, so no extra np.array() wrapper is needed.
x_tr = scaler.transform(df_tr_with_lags.values)
x_te = scaler.transform(df_te_with_lags.values)

In [37]:
# Inspect the scaled training matrix (values now lie in [0, 1])
x_tr

array([[0.12529312, 0.13120883, 0.12638398, ..., 0.11993886, 0.10465403,
        0.09955142],
       [0.12529312, 0.12457943, 0.12015134, ..., 0.11927592, 0.10600161,
        0.12283598],
       [0.11533449, 0.12126475, 0.1107182 , ..., 0.11795005, 0.10734922,
        0.11418745],
       ...,
       [0.73305127, 0.72879651, 0.7394186 , ..., 0.7060119 , 0.68433453,
        0.70475481],
       [0.72928524, 0.73145109, 0.74211658, ..., 0.69959679, 0.7047943 ,
        0.69898306],
       [0.73726049, 0.73698134, 0.75066025, ..., 0.70203012, 0.70389495,
        0.70697468]])

In [38]:
# Serialise the scaled training features and their labels into an in-memory
# binary buffer, the input form used for the Linear Learner training job.

import io

import sagemaker.amazon.common as smac

train_features = x_tr.astype('float32')
train_labels = np.array(labels).astype('float32')

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels)
# Rewind so that whoever reads the buffer next starts at the first byte.
buf.seek(0)

0

In [39]:
# S3 bucket and key prefix under which the training data and model artifacts are stored
bucket = 'sukesh-ml-sagemaker'
prefix='Matson-Stock-Prediction'

In [40]:
# Upload the serialised training buffer to <prefix>/train/<key> in the bucket.

import boto3

key = 'recordio-pb-data'
# S3 keys always use '/' as the separator. os.path.join would insert
# backslashes on Windows, so the key is built explicitly and then reused for
# the URI to guarantee both refer to the same object.
train_object_key = '/'.join([prefix, 'train', key])
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sukesh-ml-sagemaker/Matson-Stock-Prediction/train/recordio-pb-data


In [41]:
# S3 location where the training job writes its artifacts (the trained model files)
output_location = f's3://{bucket}/{prefix}/output'
print(f'training artifacts will be uploaded to: {output_location}')

training artifacts will be uploaded to: s3://sukesh-ml-sagemaker/Matson-Stock-Prediction/output


In [42]:
# Use the SageMaker SDK to look up the container image that runs the Linear Learner algorithm on our data

from sagemaker.amazon.amazon_estimator import get_image_uri

In [43]:
# Resolve the 'linear-learner' algorithm image for the current session's region.
# NOTE(review): get_image_uri is the SageMaker Python SDK v1 helper; SDK v2
# offers sagemaker.image_uris.retrieve instead - confirm the installed SDK
# version before upgrading.
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [44]:
# Create the Linear Learner estimator, set its hyperparameters and train it
# on the training data previously uploaded to S3.

import boto3
import sagemaker

sess = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)
# feature_dim is taken from the training matrix instead of a hard-coded 30,
# so changing the number of lags or feature columns cannot leave the
# hyperparameter out of sync with the data.
linear.set_hyperparameters(feature_dim=x_tr.shape[1],
                           predictor_type='binary_classifier',
                           mini_batch_size=100)

# Blocks until the training job finishes, streaming its log output.
linear.fit({'train': s3_train_data})

2020-03-03 20:42:17 Starting - Starting the training job...
2020-03-03 20:42:19 Starting - Launching requested ML instances...
2020-03-03 20:43:17 Starting - Preparing the instances for training.........
2020-03-03 20:44:26 Downloading - Downloading input data...
2020-03-03 20:45:19 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34m[03/03/2020 20:45:22 INFO 140461005748032] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma':

[34m[2020-03-03 20:45:28.705] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 10, "duration": 1268, "num_examples": 48, "num_bytes": 801528}[0m
[34m#metrics {"Metrics": {"train_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.6933382318374959, "sum": 0.6933382318374959, "min": 0.6933382318374959}}, "EndTime": 1583268328.705236, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 3}, "StartTime": 1583268328.705159}
[0m
[34m#metrics {"Metrics": {"train_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.6981811799394323, "sum": 0.6981811799394323, "min": 0.6981811799394323}}, "EndTime": 1583268328.705349, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 3}, "StartTime": 1583268328.705328}
[0m
[34m#metrics {"Metrics": {"train_binary_classification_cross_entropy_objective": {"count": 1, "max


2020-03-03 20:45:45 Uploading - Uploading generated training model[34m[2020-03-03 20:45:39.205] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 26, "duration": 1263, "num_examples": 48, "num_bytes": 801528}[0m
[34m#metrics {"Metrics": {"train_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.6845950999158494, "sum": 0.6845950999158494, "min": 0.6845950999158494}}, "EndTime": 1583268339.205772, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 11}, "StartTime": 1583268339.205684}
[0m
[34m#metrics {"Metrics": {"train_binary_classification_cross_entropy_objective": {"count": 1, "max": 0.6855924176155253, "sum": 0.6855924176155253, "min": 0.6855924176155253}}, "EndTime": 1583268339.205861, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 11}, "StartTime": 1583268339.205841}
[0m
[34m#metrics {"Metrics": {"tr


2020-03-03 20:45:52 Completed - Training job completed
Training seconds: 86
Billable seconds: 86


In [45]:
# Deploy the trained model to a hosted endpoint backed by one ml.m4.xlarge instance.
# NOTE(review): no delete_endpoint() call is visible in this notebook -
# confirm the endpoint is torn down once inference is finished.
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

-----------------!

In [46]:
# Configure the predictor to send requests as CSV and to parse the
# endpoint's responses as JSON
from sagemaker.predictor import csv_serializer, json_deserializer

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [47]:
# Rebuild the lagged test features and labels from the raw test frame.
# NOTE(review): this repeats the earlier add_lags(df_test.copy()) call, and
# df_test_with_lags is not referenced again in the visible notebook - confirm
# whether this cell is still needed.
df_test_with_lags, test_labels = add_lags(df_test.copy())

In [51]:
# Peek at the true labels for the first test rows
test_labels.head()

6     1
7     1
8     1
9     1
10    1
Name: y, dtype: int64

In [52]:
# Send the scaled test features to the endpoint and print the parsed response

result = linear_predictor.predict(x_te.astype('float32'))
print(result)

{'predictions': [{'score': 0.47903531789779663, 'predicted_label': 0.0}, {'score': 0.4943349361419678, 'predicted_label': 0.0}, {'score': 0.5024557113647461, 'predicted_label': 0.0}, {'score': 0.5066857933998108, 'predicted_label': 0.0}, {'score': 0.5215806365013123, 'predicted_label': 1.0}, {'score': 0.5115366578102112, 'predicted_label': 0.0}, {'score': 0.48280069231987, 'predicted_label': 0.0}, {'score': 0.48153215646743774, 'predicted_label': 0.0}, {'score': 0.4931122064590454, 'predicted_label': 0.0}, {'score': 0.4870142340660095, 'predicted_label': 0.0}, {'score': 0.48093336820602417, 'predicted_label': 0.0}, {'score': 0.4857831299304962, 'predicted_label': 0.0}, {'score': 0.48173296451568604, 'predicted_label': 0.0}, {'score': 0.45440298318862915, 'predicted_label': 0.0}, {'score': 0.4499068260192871, 'predicted_label': 0.0}, {'score': 0.47156769037246704, 'predicted_label': 0.0}, {'score': 0.484487920999527, 'predicted_label': 0.0}, {'score': 0.5081357359886169, 'predicted_labe

In [53]:
# Show just the list of per-row prediction dicts (score and predicted_label)
result['predictions']

[{'score': 0.47903531789779663, 'predicted_label': 0.0},
 {'score': 0.4943349361419678, 'predicted_label': 0.0},
 {'score': 0.5024557113647461, 'predicted_label': 0.0},
 {'score': 0.5066857933998108, 'predicted_label': 0.0},
 {'score': 0.5215806365013123, 'predicted_label': 1.0},
 {'score': 0.5115366578102112, 'predicted_label': 0.0},
 {'score': 0.48280069231987, 'predicted_label': 0.0},
 {'score': 0.48153215646743774, 'predicted_label': 0.0},
 {'score': 0.4931122064590454, 'predicted_label': 0.0},
 {'score': 0.4870142340660095, 'predicted_label': 0.0},
 {'score': 0.48093336820602417, 'predicted_label': 0.0},
 {'score': 0.4857831299304962, 'predicted_label': 0.0},
 {'score': 0.48173296451568604, 'predicted_label': 0.0},
 {'score': 0.45440298318862915, 'predicted_label': 0.0},
 {'score': 0.4499068260192871, 'predicted_label': 0.0},
 {'score': 0.47156769037246704, 'predicted_label': 0.0},
 {'score': 0.484487920999527, 'predicted_label': 0.0},
 {'score': 0.5081357359886169, 'predicted_lab

In [54]:

# Split the endpoint response into per-row score and label lists, then attach
# both as new columns on the lagged test feature frame.
predictions = result['predictions']
scores = [entry['score'] for entry in predictions]
predicted_labels = [entry['predicted_label'] for entry in predictions]

df_te_with_lags['scores'] = scores
df_te_with_lags['predicted_labels'] = predicted_labels

# Show the last rows, now including the two prediction columns.
df_te_with_lags.tail()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Open_1,High_1,Low_1,Close_1,...,Open_5,High_5,Low_5,Close_5,Open_6,High_6,Low_6,Close_6,scores,predicted_labels
36,37.110001,37.110001,35.869999,36.470001,36.470001,271500,36.669998,37.209999,36.459999,37.099998,...,36.389999,36.98,36.389999,36.98,36.450001,36.93,36.27,36.450001,0.482091,0.0
37,34.0,34.27,32.02,32.639999,32.639999,792500,37.110001,37.110001,35.869999,36.470001,...,37.099998,37.959999,36.970001,37.900002,36.389999,36.98,36.389999,36.98,0.431747,0.0
38,32.650002,33.669998,31.51,33.209999,33.209999,372800,34.0,34.27,32.02,32.639999,...,37.619999,38.299999,37.619999,37.990002,37.099998,37.959999,36.970001,37.900002,0.449476,0.0
39,32.279999,33.380001,32.0,33.209999,33.209999,364900,32.650002,33.669998,31.51,33.209999,...,38.0,38.279999,37.48,37.919998,37.619999,38.299999,37.619999,37.990002,0.486458,0.0
40,33.32,33.389999,32.400002,33.209999,33.209999,147800,32.279999,33.380001,32.0,33.209999,...,36.669998,37.209999,36.459999,37.099998,38.0,38.279999,37.48,37.919998,0.494332,0.0


In [56]:
# Write the test features, together with the score / predicted-label columns
# added above, to a CSV file in the notebook's working directory
file_name = "Inference_MatsonStock.csv" 
df_te_with_lags.to_csv(file_name)