In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# ## load the data
# from google.colab import files
# uploaded = files.upload()

In [3]:
## Load the train and test sets, indexed by their timestamp column
# parse_dates makes pandas turn the 'datetime' strings into real datetime types,
# and index_col=0 uses that first column as the index of each frame.
csv_options = dict(index_col=0, parse_dates=['datetime'])

df_train = pd.read_csv("train.csv", **csv_options)
display(df_train.head())

df_test = pd.read_csv("test.csv", **csv_options)
display(df_test.head())

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [4]:
## Convert datetime to numeric for training
# Extract year, month, day, dayofweek, and hour features from datetime
def add_features(df):
    """Add calendar columns taken from the frame's datetime index.

    The columns 'year', 'month', 'day', 'dayofweek' and 'hour' are written
    onto ``df`` itself (in that order); nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes the datetime attributes listed above
        (e.g. a DatetimeIndex). It is modified in place.
    """
    stamps = df.index
    for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
        # Each column is the same-named attribute of the index.
        df[part] = getattr(stamps, part)

## Add the calendar features to both data sets (add_features works in place)
for frame in (df_train, df_test):
    add_features(frame)

## Overwrite 'count' with its log1p transform
# NOTE: this replaces the raw counts, so re-running the cell would apply
# log1p a second time to the already-transformed values.
df_train['count'] = df_train['count'].map(np.log1p)

## check dataframes
for frame in (df_train, df_test):
    display(frame.head())

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,dayofweek,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,2.833213,2011,1,1,5,0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,3.713572,2011,1,1,5,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,3.496508,2011,1,1,5,2
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,2.639057,2011,1,1,5,3
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,0.693147,2011,1,1,5,4


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


In [5]:
## Inspect the training frame: index range, column dtypes and non-null counts
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10886 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 16 columns):
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null float64
year          10886 non-null int64
month         10886 non-null int64
day           10886 non-null int64
dayofweek     10886 non-null int64
hour          10886 non-null int64
dtypes: float64(4), int64(12)
memory usage: 1.4 MB


In [6]:
## Columns that get normalized below; PCA is later trained on these same four columns
cols_normalize = ['temp','atemp','humidity','windspeed']

In [7]:
# Preview the selected columns before they are normalized
df_train[cols_normalize].head()

Unnamed: 0_level_0,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-01 00:00:00,9.84,14.395,81,0.0
2011-01-01 01:00:00,9.02,13.635,80,0.0
2011-01-01 02:00:00,9.02,13.635,80,0.0
2011-01-01 03:00:00,9.84,14.395,75,0.0
2011-01-01 04:00:00,9.84,14.395,75,0.0


In [8]:
## Normalize numeric columns
# The scaler is fitted on the training data only; the same fitted scaler
# is applied to the test data in a later cell.
scaler = StandardScaler().fit(df_train[cols_normalize])

## Train data
train_scaled = scaler.transform(df_train[cols_normalize])
df_train[cols_normalize] = train_scaled  # overwrites the raw values in place
display(df_train.head())

  return self.partial_fit(X, y)


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,dayofweek,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,3,13,2.833213,2011,1,1,5,0
2011-01-01 01:00:00,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,8,32,3.713572,2011,1,1,5,1
2011-01-01 02:00:00,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,5,27,3.496508,2011,1,1,5,2
2011-01-01 03:00:00,1,0,0,1,-1.333661,-1.092737,0.68143,-1.567754,3,10,2.639057,2011,1,1,5,3
2011-01-01 04:00:00,1,0,0,1,-1.333661,-1.092737,0.68143,-1.567754,0,1,0.693147,2011,1,1,5,4


In [9]:
## Test data
# Reuse the scaler fitted on the training data; it is not re-fitted here.
test_scaled = scaler.transform(df_test[cols_normalize])
df_test[cols_normalize] = test_scaled  # overwrites the raw values in place
display(df_test.head())

  from ipykernel import kernelapp as app


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-20 00:00:00,1,0,1,1,-1.228414,-1.450292,-0.305883,1.617227,2011,1,20,3,0
2011-01-20 01:00:00,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,1
2011-01-20 02:00:00,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,2
2011-01-20 03:00:00,1,0,1,1,-1.228414,-1.271515,-0.305883,-0.22023,2011,1,20,3,3
2011-01-20 04:00:00,1,0,1,1,-1.228414,-1.271515,-0.305883,-0.22023,2011,1,20,3,4


In [10]:
## Save the normalized train and test data
# Training file: 'count' first, then the features. 'casual' and 'registered'
# are not in the list, so they are left out of the training CSV.
columns = ['count'] + [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'day', 'dayofweek', 'hour',
]

# index=False drops the datetime index from both files.
df_train.to_csv("rob-train_normalized.csv", columns=columns, index=False)
df_test.to_csv("rob-test_normalized.csv", index=False)

In [11]:
## check
# Read both saved files back and show their first rows.
check_tr = pd.read_csv("rob-train_normalized.csv")
check_te = pd.read_csv("rob-test_normalized.csv")

for reloaded in (check_tr, check_te):
    display(reloaded.head())

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,2011,1,1,5,0
1,3.713572,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,1
2,3.496508,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,2
3,2.639057,1,0,0,1,-1.333661,-1.092737,0.68143,-1.567754,2011,1,1,5,3
4,0.693147,1,0,0,1,-1.333661,-1.092737,0.68143,-1.567754,2011,1,1,5,4


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,1,0,1,1,-1.228414,-1.450292,-0.305883,1.617227,2011,1,20,3,0
1,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,1
2,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,2
3,1,0,1,1,-1.228414,-1.271515,-0.305883,-0.22023,2011,1,20,3,3
4,1,0,1,1,-1.228414,-1.271515,-0.305883,-0.22023,2011,1,20,3,4


### For PCA, we only need the 4 normalized continuous columns:
`'temp','atemp','humidity','windspeed'`


In [13]:
import sagemaker.amazon.common as smac

def write_recordio_file(filename, x, y=None):
    """Write ``x`` (and optionally ``y``) to ``filename`` via SageMaker's dense-tensor writer.

    Parameters
    ----------
    filename : str
        Path of the file to create; it is opened in binary write mode.
    x : array
        Data handed to ``smac.write_numpy_to_dense_tensor``.
    y : array, optional
        Passed through as the third argument of the writer (``None`` by default).
    """
    with open(filename, 'wb') as stream:
        smac.write_numpy_to_dense_tensor(stream, x, y)

# Store the normalized numeric columns as a RecordIO file for PCA training in SageMaker.
# Need to pass as an array to create the RecordIO file.
# `.values` replaces `DataFrame.as_matrix`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `cols_normalize` is the same four-column list
# ('temp', 'atemp', 'humidity', 'windspeed') that was normalized earlier.
X = df_train[cols_normalize].values
write_recordio_file('rob-bike_train_numeric_columns.recordio', X)

