<h2>Kaggle Bike Sharing Demand Dataset Preparation For PCA</h2>
<h4>Use PCA to find new components to replace 'temp','atemp','humidity','windspeed' in both training and test datasets</h4>
<h4>To download dataset, sign-in and download from this link: https://www.kaggle.com/c/bike-sharing-demand/data</h4>
<br>
Input Features: ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek','hour', <b>'pca components'</b>]<br>
Target Feature: [log1p('count')]<br>
PCA Training: ['temp','atemp','humidity','windspeed']<br><br>

Objective: <blockquote>You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)</blockquote>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas.plotting import register_matplotlib_converters
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer

register_matplotlib_converters()


In [None]:
# Fields written to the normalized training CSV: the target ('count') first,
# then the raw input features and the calendar fields derived from 'datetime'.
columns = ['count', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']

# Continuous weather measurements that get rescaled; the same four columns
# are the ones replaced by PCA components later in the notebook.
cols_normalize = ['temp','atemp','humidity','windspeed']

In [None]:
df = pd.read_csv('../input/bike-sharing-demand/train.csv',parse_dates=['datetime'])
df_test = pd.read_csv('../input/bike-sharing-demand/test.csv',parse_dates=['datetime'])

In [None]:
df.head()

In [None]:
# Models need numeric inputs, so the timestamp is broken out into
# separate numeric calendar columns.
def add_features(df):
    """Add 'year', 'month', 'day', 'dayofweek' and 'hour' columns to ``df`` in place.

    Each new column is read from the matching ``.dt`` attribute of
    ``df['datetime']``. Nothing is returned; ``df`` itself is modified.
    """
    stamp = df['datetime'].dt
    for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
        df[part] = getattr(stamp, part)

In [None]:
add_features(df)
add_features(df_test)

In [None]:
# Model the target on a log scale: count -> log1p(count).
# np.log1p is applied to the whole column at once instead of element by element.
# Run this cell once per load of df; re-running it applies log1p a second time.
df["count"] = np.log1p(df["count"])

In [None]:
df.head(2)

In [None]:
df_test.head(2)

In [None]:
# Normalize the dataset
# NOTE(review): sklearn's Normalizer rescales each *row* to unit norm rather than
# standardizing each column - confirm this is the intended preprocessing for PCA.
transformer = Normalizer()

In [None]:
# Fit on the training frame only, so the test set cannot influence preprocessing.
# NOTE(review): Normalizer appears to be stateless (fit learns no parameters), so
# this call likely only satisfies the fit/transform API - confirm.
transformer.fit(df[cols_normalize])

In [None]:
def transform_data(scaler, df, columns):
    """Overwrite ``columns`` of ``df`` in place with ``scaler.transform`` output.

    Parameters
    ----------
    scaler : object
        Anything exposing ``transform(X)`` that returns one output column per
        input column, in the same row order.
    df : pandas.DataFrame
        Frame to modify. Only ``columns`` are touched; the index is preserved.
    columns : list of str
        Names of the columns to transform and replace.

    Returns
    -------
    None
    """
    transformed_data = scaler.transform(df[columns])

    # Carry df's own index: column assignment aligns on index labels, so a
    # frame with a fresh 0..n-1 index would produce NaN whenever df is indexed
    # differently (e.g. after filtering or shuffling).
    df_transformed = pd.DataFrame(transformed_data, columns=columns, index=df.index)

    for col in columns:
        df[col] = df_transformed[col]

In [None]:
transform_data(transformer, df, cols_normalize)
transform_data(transformer, df_test, cols_normalize)

In [None]:
df.head(2)

In [None]:
df_test.head(2)

In [None]:
# Store Original train and test data in normalized form
# The training file keeps only the fields listed in `columns` (so no 'datetime');
# the test file keeps every column of df_test.
df.to_csv('train_normalized.csv',index=False, columns=columns)
df_test.to_csv('test_normalized.csv',index=False)

Store only the 4 numeric columns for PCA training and test.

The data needs to be normalized.

In [None]:
df = pd.read_csv('./train_normalized.csv')
df_test = pd.read_csv('./test_normalized.csv')

In [None]:
df.head(2)

In [None]:
df_test.head(2)

In [None]:
# We are not going to use numeric features: 'temp','atemp','humidity','windspeed'
# Instead, we are going to use new components (aka features) generated by PCA for model training and testing
columns = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

# PCA Training
colums_for_pca = ['temp','atemp','humidity','windspeed']

In [None]:
# Find PCA
pca = PCA(n_components=0.9) # Capture 90% total variation

In [None]:
# Find new components
pca.fit(df[colums_for_pca])

In [None]:
# No. of PCA Components
# `n_components` echoes the value passed to PCA(...) (the requested variance
# fraction); `n_components_` is the attribute read back after fitting.
print ('Variance: ', pca.n_components)
print ('No. of components to keep: ', pca.n_components_)

In [None]:
def transform_with_pca(pca, df, columns):
    """Replace ``columns`` of ``df`` in place with PCA component columns.

    Parameters
    ----------
    pca : object
        Fitted transformer exposing ``transform(X)`` and ``n_components_``.
    df : pandas.DataFrame
        Frame to modify. New columns ``component_0 .. component_{k-1}`` are
        appended and ``columns`` are dropped; the index is preserved.
    columns : list of str
        Names of the source columns passed to ``pca.transform``.

    Returns
    -------
    list of str
        Names of the component columns that were added.
    """
    transformed_data = pca.transform(df[columns])

    tcols = ['component_' + str(i) for i in range(pca.n_components_)]
    print('components:', tcols)

    # Carry df's own index: column assignment aligns on index labels, so a
    # frame with a fresh 0..n-1 index would produce NaN whenever df is indexed
    # differently.
    df_transformed = pd.DataFrame(transformed_data, columns=tcols, index=df.index)

    for col in tcols:
        df[col] = df_transformed[col]

    # The source columns are superseded by the components.
    df.drop(columns, inplace=True, axis=1)

    return tcols

In [None]:
new_cols = transform_with_pca(pca, df, colums_for_pca)

In [None]:
transform_with_pca(pca, df_test, colums_for_pca)

In [None]:
df.head(2)

In [None]:
df_test.head(2)

In [None]:
# Append the PCA component names to the feature list.
# Names already present are skipped, so re-running this cell adds no duplicates.
columns.extend(col for col in new_cols if col not in columns)

In [None]:
columns

## Training, Validation and Test Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [None]:
# Shuffle the rows reproducibly ahead of the split:
#   training   = first 70% of the shuffled rows
#   validation = remaining 30%
# The shuffled index labels are handed to .iloc as positions, which relies on
# df still carrying the default 0..n-1 index it was loaded with.
np.random.seed(5)
row_order = df.index.tolist()
np.random.shuffle(row_order)
df = df.iloc[row_order]

In [None]:
# Row counts for the 70% / 30% split.
rows = df.shape[0]
train = int(.7 * rows)
# Validation receives every remaining row, matching the df[train:] slice that is
# written out later; int(.3 * rows) can fall one short of that due to truncation.
test = rows - train

In [None]:
rows, train, test

In [None]:
columns

In [None]:
# Write Training Set
df[:train].to_csv('bike_train_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [None]:
# Write Validation Set
df[train:].to_csv('bike_validation_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [None]:
# Test Data has only input features
df_test.to_csv('bike_test_pca.csv',index=False)

In [None]:
# Write Column List
with open('bike_train_column_list_pca.txt','w') as f:
    f.write(','.join(columns))

## Train model using PCA components
### The model is trained with XGBoost installed in the notebook instance

In [None]:
# Install xgboost in notebook instance.
#### Command to install xgboost
!pip install xgboost==0.90

In [None]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

In [None]:
column_list_file = 'bike_train_column_list_pca.txt'
train_file = 'bike_train_pca.csv'
validation_file = 'bike_validation_pca.csv'
test_file = 'bike_test_pca.csv'

In [None]:
# Load the comma-separated column names (target first) saved during data preparation.
with open(column_list_file,'r') as f:
    # strip() keeps a trailing newline from ending up inside the last column name
    columns = f.read().strip().split(',')

In [None]:
columns

In [None]:
# Specify the column names as the file does not have column header
df_train = pd.read_csv(train_file,names=columns)
df_validation = pd.read_csv(validation_file,names=columns)

In [None]:
df_train.head(2)

In [None]:
df_validation.head(2)

In [None]:
df_train.iloc[:,1:-2].head(2)

In [None]:
# Column 0 is the target; every remaining column is an input feature.
# .to_numpy() yields the 1-D target array; Series.ravel() is deprecated in recent pandas.
X_train = df_train.iloc[:,1:]
y_train = df_train.iloc[:,0].to_numpy()

X_validation = df_validation.iloc[:,1:]
y_validation = df_validation.iloc[:,0].to_numpy()

In [None]:
# XGBoost Training Parameter Reference: 
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
# The scikit-learn wrapper takes the number of boosting rounds as n_estimators and
# the step size as learning_rate. The former num_round argument is not a wrapper
# parameter and only duplicated n_estimators, so it is dropped.
regressor = xgb.XGBRegressor(max_depth=5,learning_rate=0.1,subsample=0.7,n_estimators=150)

In [None]:
regressor

In [None]:
regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])

In [None]:
eval_result = regressor.evals_result()

In [None]:
training_rounds = range(len(eval_result['validation_0']['rmse']))

In [None]:
# Per-round RMSE on the training and validation sets, drawn on one explicit Axes.
fig, ax = plt.subplots()
for series_key, series_label in (('validation_0', 'Training Error'),
                                 ('validation_1', 'Validation Error')):
    ax.scatter(x=training_rounds, y=eval_result[series_key]['rmse'], label=series_label)
ax.grid(True)
ax.set_xlabel('Iteration')
ax.set_ylabel('RMSE')
ax.set_title('Training Vs Validation Error')
ax.legend()

In [None]:
xgb.plot_importance(regressor)

In [None]:
def adjust_count(x):
    """Return 0 for a negative value; return any other value unchanged."""
    return 0 if x < 0 else x

In [None]:
# Prepare Data for Submission to Kaggle
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

In [None]:
df_test.head(2)

In [None]:
# Select the model inputs by name, in the same order used for training
# (`columns` without its leading target entry), rather than by position.
# 'datetime' is not in that list, so it is excluded from prediction.
X_test = df_test[columns[1:]]

In [None]:
X_test.head(2)

In [None]:
result = regressor.predict(X_test)

In [None]:
result[:5]

In [None]:
# Convert result to actual count: the target was transformed with log1p before
# training, so expm1 maps the predictions back to the original scale.
df_test["count"] = np.expm1(result)

In [None]:
df_test.head()

In [None]:
df_test[df_test["count"] < 0]

In [None]:
df_test['count'] = df_test['count'].map(adjust_count)

In [None]:
df_test[df_test["count"] < 0]

In [None]:
df_test[['datetime','count']].to_csv('predicted_count_pca.csv',index=False)