In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import boto3
import sagemaker.amazon.common as smac

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/saravanan/.config/sagemaker/config.yaml


### Preparing Kaggle Bike Sharing dataset for PCA

Use PCA to find new components to replace 'temp','atemp','humidity','windspeed'

Input Features: ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek','hour', 'pca components']<br>
Target Feature: [log1p('count')]<br>
PCA Training: ['temp','atemp','humidity','windspeed']

In [2]:
# Columns written to the training/validation files: the target ('count')
# comes first, followed by the input features that are not fed to PCA.
columns = [
    'count',
    'season', 'holiday', 'workingday', 'weather',
    'year', 'month', 'day', 'dayofweek', 'hour',
]

# Columns used to train PCA; they are replaced by the fitted components.
columns_for_pca = ['temp', 'atemp', 'humidity', 'windspeed']

In [3]:
# Load the train and test frames from CSV files in the working directory.
# NOTE(review): the file names suggest the numeric features were normalized
# in an earlier step — confirm against the notebook that produced them.
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [4]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,0.118761,0.173736,0.977605,0.0,2011,1,1,5,0
1,3.713572,1,0,0,1,0.110467,0.166986,0.979751,0.0,2011,1,1,5,1


In [5]:
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,0.167404,0.178475,0.879419,0.408344,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,0.181869,0.232625,0.95541,0.0,2011,1,20,3,1


In [6]:
# Fit PCA on the training frame only (the test frame is transformed later with
# this same fitted object). A float n_components between 0 and 1 asks
# scikit-learn to keep just enough components to explain that fraction of the
# variance (here 90%).

pca = PCA(n_components=0.90)
pca.fit(df[columns_for_pca])

In [7]:
# Number of components
# n_components is the value passed to the constructor (the 0.90 variance
# target); n_components_ is the number of components kept after fitting.
print('Variance: ', pca.n_components)
print('No. of components to keep: ', pca.n_components_)

Variance:  0.9
No. of components to keep:  2


In [8]:
def transform_with_pca(pca,df,columns):
    """Replace `columns` of `df` with their projection onto a fitted PCA.

    The frame is modified in place: one 'component_<i>' column is added for
    each fitted component and the original `columns` are dropped.

    Parameters
    ----------
    pca : object
        Fitted transformer exposing `transform` and `n_components_`.
    df : pandas.DataFrame
        Frame containing `columns`; mutated in place.
    columns : list of str
        Names of the columns to project and then remove.

    Returns
    -------
    list of str
        Names of the component columns that were added, in order.
    """
    transformed_data = pca.transform(df[columns])

    tcols = ['component_' + str(i) for i in range(pca.n_components_)]
    print('components:', tcols)

    # Build the component frame on df's own index. Column assignment aligns on
    # index labels, so a default 0..n-1 index would silently produce NaN for
    # any frame that has been filtered, shuffled or otherwise re-indexed.
    df_transformed = pd.DataFrame(transformed_data, columns=tcols, index=df.index)

    for col in tcols:
        df[col] = df_transformed[col]

    df.drop(columns, inplace=True, axis=1)

    # Return the new column names so callers do not have to hard-code them.
    return tcols
        

In [9]:
# Swap the PCA input columns of the training frame for the fitted components;
# transform_with_pca modifies df in place.
new_cols = transform_with_pca(pca,df, columns_for_pca)

components: ['component_0', 'component_1']


In [10]:
# Apply the PCA fitted on the training data to the test frame as well, so both
# frames share the same component definition; df_test is modified in place.
transform_with_pca(pca,df_test,columns_for_pca)

components: ['component_0', 'component_1']


In [11]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1
0,2.833213,1,0,0,1,2011,1,1,5,0,-0.314289,-0.084417
1,3.713572,1,0,0,1,2011,1,1,5,1,-0.323481,-0.080062


In [12]:
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1
0,2011-01-20 00:00:00,1,0,1,1,2011,1,20,3,0,-0.103577,0.27537
1,2011-01-20 01:00:00,1,0,1,1,2011,1,20,3,1,-0.23676,-0.119577


In [13]:
# Derive the component column names from the fitted PCA rather than
# hard-coding them: the number of kept components is chosen by the fit, so a
# literal list goes stale if the data or the variance target changes.
# Uses the same 'component_<i>' naming as transform_with_pca.
new_cols = ['component_' + str(i) for i in range(pca.n_components_)]

In [14]:
# Add the PCA component columns to the output column list. The membership
# check keeps this cell idempotent: re-running it does not append duplicates,
# which would otherwise end up as repeated columns in the written CSV files.
for col in new_cols:
    if col not in columns:
        columns.append(col)

In [15]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1']

#### Training, Validation and Test datasets


In [16]:
# Training 70%
# Validation 30%
# Randomize the dataset

np.random.seed(5)  # fixed seed so the shuffle (and therefore the split) is reproducible

# Shuffle row *positions* rather than index labels: .iloc is positional, so
# feeding it labels is only correct while the index happens to be 0..n-1.
# For such an index this yields exactly the same row order as before.
shuffled_positions = list(range(len(df)))
np.random.shuffle(shuffled_positions)
df = df.iloc[shuffled_positions]

In [17]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1
3650,4.477337,3,0,0,2,2011,9,3,5,0,-0.024157,-0.117305
8909,5.517453,3,0,1,1,2012,8,13,0,14,0.421581,-0.206154


In [18]:
# Split sizes: the first 70% of the rows are used for training and the
# remainder (stored in `test`) is written out as the validation set.
rows = len(df)
train = int(0.7 * rows)
test = rows - train

rows, train, test

(10886, 7620, 3266)

In [19]:
# Sanity check: the two split sizes shown above add up to the total row count.
7620 + 3266

10886

In [20]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1']

In [21]:
# Write the training split: the first `train` rows, restricted to `columns`,
# with no header row and no index column.
train_rows = df.iloc[:train]
train_rows.to_csv('bike_train_pca.csv', columns=columns, header=False, index=False)

In [22]:
# Write the validation split: every row after the first `train` rows,
# restricted to `columns`, with no header row and no index column.
validation_rows = df.iloc[train:]
validation_rows.to_csv('bike_validation_pca.csv', columns=columns, header=False, index=False)

In [23]:
# Test Data has only input features
# NOTE(review): unlike the train/validation files, this one is written with a
# header row and without a `columns=` filter, so every column of df_test is
# kept — confirm that is what the downstream consumer expects.
df_test.to_csv('bike_test_pca.csv',index=False)

In [24]:
# Persist the ordered column names as a single comma-separated line
column_list_text = ','.join(columns)
with open('bike_train_column_list_pca.txt', 'w') as out_file:
    out_file.write(column_list_text)