In [1]:
import sys
import numpy as np
import pandas as pd
import math
import os

### PCA Cloud prediction 

#### Invoke the SageMaker prediction service

In [2]:
import boto3
import re
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializer and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# SageMaker Python SDK v2: bind a Predictor to the PCA endpoint by name.
# NOTE(review): assumes an endpoint with this name is already deployed and in service — confirm before running.
endpoint_name = 'pca-biketrain-v1'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [4]:
# Send request payloads as CSV and parse the endpoint's responses as JSON.
predictor.serializer=CSVSerializer()
predictor.deserializer=JSONDeserializer()

In [5]:
# We are not going to use numeric features: 'temp','atemp','humidity','windspeed'
# Instead, we are going to use new components (aka features) generated by PCA for model training and testing
# 'count' is listed first, so it becomes the first column of the train/validation CSV files.
columns = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

# Numeric features that are sent to the PCA endpoint and replaced by its projections
columns_for_pca = ['temp','atemp','humidity','windspeed']

In [6]:
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [7]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,0.118761,0.173736,0.977605,0.0,2011,1,1,5,0
1,3.713572,1,0,0,1,0.110467,0.166986,0.979751,0.0,2011,1,1,5,1


In [8]:
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,0.167404,0.178475,0.879419,0.408344,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,0.181869,0.232625,0.95541,0.0,2011,1,20,3,1


In [100]:
# Preview the numeric features that will be sent to the PCA endpoint.
# (Fixed typo: `colums_for_pca` is never defined, so the cell failed on a fresh kernel.)
df[columns_for_pca].head(2)

Unnamed: 0,temp,atemp,humidity,windspeed
0,0.118761,0.173736,0.977605,0.0
1,0.110467,0.166986,0.979751,0.0


In [101]:
df[columns_for_pca].head().values

array([[0.11876091, 0.17373611, 0.97760504, 0.        ],
       [0.11046696, 0.16698636, 0.9797513 , 0.        ],
       [0.11046696, 0.16698636, 0.9797513 , 0.        ],
       [0.12779176, 0.1869474 , 0.97402255, 0.        ],
       [0.12779176, 0.1869474 , 0.97402255, 0.        ]])

In [9]:
test = df[columns_for_pca].head().values

In [57]:
type(test)

numpy.ndarray

In [10]:
result = predictor.predict(test)

In [11]:
result

{'projections': [{'projection': [-0.3131008744239807, -0.9453434944152832]},
  {'projection': [-0.32269516587257385, -0.9424567222595215]},
  {'projection': [-0.32269516587257385, -0.9424567222595215]},
  {'projection': [-0.2985489070415497, -0.9494175910949707]},
  {'projection': [-0.2985489070415497, -0.9494175910949707]}]}

In [104]:
l = [values['projection'] for values in result['projections']]

In [105]:
l

[[-0.3131008744239807, -0.9453434944152832],
 [-0.32269516587257385, -0.9424567222595215],
 [-0.32269516587257385, -0.9424567222595215],
 [-0.2985489070415497, -0.9494175910949707],
 [-0.2985489070415497, -0.9494175910949707]]

In [12]:
# for a large number of predictions we can split the input data
def get_projection(arr_features, n_splits=100):
    """Project feature rows through the PCA endpoint, one chunk at a time.

    The rows are split into at most ``n_splits`` roughly equal chunks and each
    chunk is sent to the module-level ``predictor``; the per-row projections
    are collected in input order.

    Parameters
    ----------
    arr_features : array-like
        2-D collection of feature rows (one row per observation).
    n_splits : int, optional
        Maximum number of chunks (requests) to use. Defaults to 100, the value
        that was previously hard-coded.

    Returns
    -------
    list
        One projection (the ``'projection'`` entry of each item in the
        response's ``'projections'`` list) per input row. Empty input gives
        an empty list and makes no request.
    """
    n_rows = len(arr_features)
    if n_rows == 0:
        # Nothing to project; avoid sending an empty payload.
        return []

    # np.array_split pads with empty chunks when asked for more chunks than
    # rows; the endpoint client cannot serialize an empty array, so cap the
    # chunk count at the number of rows (and use at least one chunk).
    n_chunks = max(1, min(n_splits, n_rows))

    projections = []
    for arr in np.array_split(arr_features, n_chunks):
        result = predictor.predict(arr)
        projections.extend(values['projection'] for values in result['projections'])
    return projections

In [13]:
def replace_features(df,columns_for_pca):
    """Replace the PCA input columns of ``df`` with PCA component columns, in place.

    The values of ``columns_for_pca`` are passed to ``get_projection``; the
    returned projections are added to ``df`` as ``component_0``,
    ``component_1``, ... and the original ``columns_for_pca`` columns are
    dropped from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place. Its index is left untouched.
    columns_for_pca : list of str
        Names of the columns to project and then remove.

    Returns
    -------
    list of str
        Names of the component columns that were added.

    Notes
    -----
    Not idempotent: a second call on the same frame fails because the input
    columns have already been dropped.
    """
    arr_features = df[columns_for_pca].values

    projections = get_projection(arr_features)
    df_projection = pd.DataFrame(projections)

    tcols = ['component_'+str(i) for i in range(df_projection.shape[1])]
    df_projection.columns = tcols
    print('components:',tcols)

    for col in tcols:
        # Assign by position, not by index label: df_projection always has a
        # fresh 0..n-1 index, so label alignment would produce NaN or
        # misplaced values whenever df's index is shuffled, filtered or
        # otherwise non-default.
        df[col] = df_projection[col].values

    df.drop(columns_for_pca,inplace=True, axis=1)

    return tcols

In [91]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,0.118761,0.173736,0.977605,0.0,2011,1,1,5,0
1,3.713572,1,0,0,1,0.110467,0.166986,0.979751,0.0,2011,1,1,5,1


In [14]:
new_cols = replace_features(df,columns_for_pca)

components: ['component_0', 'component_1']


In [15]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1
0,2.833213,1,0,0,1,2011,1,1,5,0,-0.313101,-0.945343
1,3.713572,1,0,0,1,2011,1,1,5,1,-0.322695,-0.942457


In [16]:
replace_features(df_test,columns_for_pca)

components: ['component_0', 'component_1']


['component_0', 'component_1']

In [111]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour']

In [17]:
# Add the PCA component columns to the list of columns written to the CSV files.
# Names already present are skipped so that re-running this cell does not
# append duplicates (which would duplicate columns in the output files).
for col in new_cols:
    if col not in columns:
        columns.append(col)

In [18]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1']

In [114]:
df_test

Unnamed: 0,datetime,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1
0,2011-01-20 00:00:00,1,0,1,1,2011,1,20,3,0,-0.102386,-0.955772
1,2011-01-20 01:00:00,1,0,1,1,2011,1,20,3,1,-0.232725,-0.964794
2,2011-01-20 02:00:00,1,0,1,1,2011,1,20,3,2,-0.232725,-0.964794
3,2011-01-20 03:00:00,1,0,1,1,2011,1,20,3,3,-0.176658,-0.982781
4,2011-01-20 04:00:00,1,0,1,1,2011,1,20,3,4,-0.176658,-0.982781
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,2012,12,31,0,19,-0.197231,-0.979067
6489,2012-12-31 20:00:00,1,0,1,2,2012,12,31,0,20,-0.197231,-0.979067
6490,2012-12-31 21:00:00,1,0,1,1,2012,12,31,0,21,-0.197231,-0.979067
6491,2012-12-31 22:00:00,1,0,1,1,2012,12,31,0,22,-0.180584,-0.983338


In [19]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset (fixed seed so the shuffle is repeatable)
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
# NOTE(review): index labels are used as positions here; this assumes df still
# has its default 0..n-1 index — confirm if rows are ever filtered beforehand.
df = df.iloc[l]

In [20]:
# Split sizes: the first 70% of the rows go to training, the remainder to validation
rows = df.shape[0]
train = int(rows*0.7)
# NOTE: this rebinds `test`, which earlier held the sample feature array sent to the endpoint
test = rows-train

In [21]:
rows, train, test

(10886, 7620, 3266)

In [124]:
7620+3266

10886

In [22]:
# Write Training Set: the first `train` rows, without header or index column
df[:train].to_csv('bike_train_pca_cloud.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [23]:
# Write Validation Set: the rows after the first `train`, without header or index column
df[train:].to_csv('bike_validation_pca_cloud.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [128]:
# Test Data has only input features
# Written with a header row and every column of df_test (unlike the headerless train/validation files)
df_test.to_csv('bike_test_pca_cloud.csv',index=False)

In [129]:
# Write Column List: comma-separated names, in the same order as the train/validation CSV columns
with open('bike_train_column_list_pca_cloud.txt','w') as f:
    f.write(','.join(columns))

In [24]:
# Tear down the endpoint now that all projections have been computed.
# The predictor is already bound to its endpoint, so no name is passed:
# the first positional parameter of delete_endpoint() is the
# `delete_endpoint_config` flag (default True), not an endpoint name.
predictor.delete_endpoint()