# Welcome to my Tutorial Notebook
* I am super excited to share with you the results of some new insights into data preparation and into building two models:
*  Regression using SKLearn's Neural Network (NN)
* Train using Keras API with TensorFlow as Backend

# Kaggle Bike Sharing Demand Dataset
Modified 'count' to log1p(count) for training

Log can be used when target represents a count (that is non-negative values)

Model now predicts as log1p(count). We need to convert it back to actual count using expm1(predicted_target)



Input Features: ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'year', 'month', 'day', 'dayofweek','hour']
Target Feature: [log1p('count')]

Objective: We are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)

# In this Notebook, we will go through some steps and different insights such as:
# Contents
# Regression using SKLearn's Neural Network (NN)
* Data Understanding
* Data Visualization
* Data Preparation:
 1. One Hot Encode all the Categorical Features
 2. Standardize or Normalize all the Numeric Features
* Train using SKLearn's MLPRegressor (Multi-Layer Perceptron)/Regression using SKLearn's Neural Network (NN)
* Prediction
* Evaluating the Results
* Submission
# New Insight for Modeling
# Train using Keras API with TensorFlow as Backend
* Data Understanding
* Data Visualization
* Data Preparation:
 1. One Hot Encode all the Categorical Features
 2. Standardize or Normalize all the Numeric Features
* Train using Keras API with TensorFlow as Backend
* Prediction
* Evaluating the Results
* Submission



In [None]:
#Let's import the Necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
# Round-trip demo: log1p compresses a count and expm1 restores it.
# log1p(v) is log(v + 1); expm1(v) is exp(v) - 1.
print('Test log and exp')
test_count = 100
x = np.log1p(test_count)
for label, value in (('original value', test_count),
                     ('log1p', x),
                     ('expm1', np.expm1(x))):
    print(label, value)

In [None]:
# Column order used for the exported files: target first, then the inputs
columns = ['count']
columns += ['season', 'holiday', 'workingday', 'weather']
columns += ['temp', 'atemp', 'humidity', 'windspeed']
columns += ['year', 'month', 'day', 'dayofweek', 'hour']

In [None]:
# Load the train and test files: the 'datetime' column is parsed as dates
# and column 0 becomes the index
csv_options = {'parse_dates': ['datetime'], 'index_col': 0}
df = pd.read_csv('../input/bike-sharing-demand/train.csv', **csv_options)
df_test = pd.read_csv('../input/bike-sharing-demand/test.csv', **csv_options)

In [None]:
# Training needs numeric inputs, so the datetime index is broken out into
# separate numeric columns.
def add_features(df):
    """Add year, month, day, dayofweek and hour columns to ``df`` in place.

    Each new column is read from the attribute of the same name on
    ``df.index``, so the index must expose those datetime attributes.
    Nothing is returned; the frame passed in is modified.
    """
    for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
        df[part] = getattr(df.index, part)

In [None]:
# Add the datetime-derived columns to both the training and the test frame
for frame in (df, df_test):
    add_features(frame)

* Data Visualization

In [None]:
# Rental counts over time, one line per year.
# Rows are picked with .loc and a year string (partial string indexing on the
# datetime index); the df['2011'] row shorthand was deprecated in pandas 1.2
# and removed in pandas 2.0.
for year_label in ('2011', '2012'):
    plt.plot(df.loc[year_label, 'count'], label=year_label)
plt.xticks(fontsize=14, rotation=45)
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.title('2011 and 2012 Rentals (Year to Year)')
plt.legend()
plt.show()

In [None]:
# Same yearly plot on the log1p scale.
# Rows are picked with .loc and a year string (partial string indexing on the
# datetime index); the df['2011'] row shorthand was deprecated in pandas 1.2
# and removed in pandas 2.0.
for year_label in ('2011', '2012'):
    plt.plot(df.loc[year_label, 'count'].map(np.log1p), label=year_label)
plt.xticks(fontsize=14, rotation=45)
plt.xlabel('Date')
plt.ylabel('Log(Rental Count)')
plt.title('2011 and 2012 Rentals (Year to Year)')
plt.legend()
plt.show()

In [None]:
# Box plot of the untransformed target column
raw_counts = df['count']
plt.boxplot([raw_counts], labels=['count'])
plt.grid(True)
plt.ylabel('Target')
plt.title('Box Plot - Count')

In [None]:
# Box plot of the target after log1p, to compare its spread with the
# untransformed counts
log_counts = df['count'].map(np.log1p)
plt.boxplot([log_counts], labels=['log1p(count)'])
plt.grid(True)
plt.ylabel('Target')
plt.title('Box Plot - log1p(Count)')

In [None]:
# Replace the target with log1p(count).
# NOTE: running this cell again applies the transform a second time.
df["count"] = np.log1p(df["count"])

In [None]:
df.head()

In [None]:
df_test.head()

In [None]:
df.dtypes

In [None]:
# Write the full frame (selected columns only), keeping the index as a
# 'datetime' column
df.to_csv(
    'bike_all.csv',
    columns=columns,
    index=True,
    index_label='datetime',
)

# Training and Validation Set
* Target Variable as first column followed by input features
* Training, Validation files do not have a column header

In [None]:
# The data is split 70% training / 30% validation.
# Randomize the dataset first; the fixed seed keeps the shuffle repeatable.
np.random.seed(5)
shuffled_index = list(df.index)
np.random.shuffle(shuffled_index)
df = df.loc[shuffled_index]

In [None]:
# Row counts for the split: 70% (rounded down) for training, the rest for validation
rows = len(df.index)
train = int(rows * .7)
test = rows - train

In [None]:
rows, train, test

In [None]:
columns

In [None]:
# Training set: the first `train` rows, written without index or header
training_rows = df.iloc[:train]
training_rows.to_csv('bike_train.csv', columns=columns, header=False, index=False)

In [None]:
# Validation set: every row after the first `train`, written without index or header
validation_rows = df.iloc[train:]
validation_rows.to_csv('bike_validation.csv', columns=columns, header=False, index=False)

In [None]:
# Test Data has only input features
df_test.to_csv('bike_test.csv',index=True,index_label='datetime')

In [None]:
print(','.join(columns))

In [None]:
# Save the column order as one comma-separated line, since the train and
# validation CSV files are written without a header
column_line = ','.join(columns)
with open('bike_train_column_list.txt', 'w') as f:
    f.write(column_line)

# Regression using SKLearn's Neural Network (NN)
* One-Hot Encode categorical features, Standardize numeric features
* Objective:

* Train a bike rental prediction model
* NN requires one hot encoding of categorical data
* NN also requires features to be on similar scale
* Perform one-hot encoding of all categorical features: ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek', 'hour']
* Verify model performance

In [None]:
import sys
import numpy as np
# Set random seed
np.random.seed(0)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# NN
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, KBinsDiscretizer
# Column Transformer
from sklearn.compose import ColumnTransformer

In [None]:
# Names of the files used by the modelling cells
column_list_file, train_file, validation_file, test_file = (
    'bike_train_column_list.txt',
    'bike_train.csv',
    'bike_validation.csv',
    'bike_test.csv',
)

# One Hot Encoding

In [None]:
# Categorical features to one-hot encode ('day' is listed separately, later)
categorical_features = (
    ['season', 'holiday', 'workingday', 'weather']
    + ['year', 'month', 'dayofweek', 'hour']
)

# Numeric features to standardize
standardize_features = ['temp', 'atemp'] + ['humidity', 'windspeed']

In [None]:
# Load the comma-separated column names; strip() keeps a trailing newline
# (if the file has one) out of the last column name
with open(column_list_file, 'r') as f:
    columns = f.read().strip().split(',')

In [None]:
columns

In [None]:
# The CSV files carry no header row, so the column names are supplied explicitly
df_train, df_validation = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in (train_file, validation_file)
)

In [None]:
df_train.head()

In [None]:
df_validation.head()

In [None]:
# The target is column 0; every remaining column is an input feature.
# Series.to_numpy() yields the 1-D target array directly (Series.ravel() is
# deprecated in recent pandas releases).
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].to_numpy()

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [None]:
df_train.head()

In [None]:
# Features to one-hot encode
categorical_features+['day']

In [None]:
# Features to standardize
standardize_features

# Column Transformer/New Idea 
* Chain all data transformations
* Easy and straightforward

In [None]:
# All 31 days of the month are listed explicitly for the 'day' encoder rather
# than inferred from the fitted data
day_categories = [list(range(1, 32))]

# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` - confirm against the installed version.
colTransformer = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(categories='auto', sparse=False), categorical_features),
        ('onehotday', OneHotEncoder(categories=day_categories, sparse=False), ['day']),
        ('standardize', StandardScaler(), standardize_features),
    ],
    remainder="passthrough",
)

In [None]:
colTransformer.fit(X_train)

In [None]:
X_train_encoded = colTransformer.transform(X_train)
X_validation_encoded = colTransformer.transform(X_validation)

In [None]:
# Compare the frame shapes before and after the column transformer
for tag, raw, encoded in (('Training Data', X_train, X_train_encoded),
                          ('Val Data', X_validation, X_validation_encoded)):
    print(tag, raw.shape, 'OneHot Encoded', encoded.shape)

In [None]:
X_train_encoded[:1]

In [None]:
X_validation_encoded[:1]

# Train a neural network regressor

In [None]:
# Multi-layer perceptron regressor: one hidden layer of 100 ReLU units,
# at most 100 training iterations, fixed random seed
nn_regressor = MLPRegressor(
    hidden_layer_sizes=[100],
    activation='relu',
    max_iter=100,
    random_state=5,
)

In [None]:
nn_regressor

In [None]:
%%time
nn_regressor.fit(X_train_encoded,y_train)

In [None]:
# Compare actual vs predicted performance with dataset not seen by the model before
df = pd.read_csv(validation_file,names=columns)

In [None]:
df.head()

In [None]:
result = nn_regressor.predict(X_validation_encoded)

In [None]:
result[:5]

In [None]:
df['count_predicted'] = result

In [None]:
df.head()

In [None]:
df['count_predicted'].describe()

In [None]:
# Undo the log1p transform on both the actual and the predicted column
for column in ('count', 'count_predicted'):
    df[column] = df[column].map(np.expm1)

In [None]:
# Actual vs predicted counts, zoomed to samples 100-150
for column, label in (('count', 'Actual'), ('count_predicted', 'Predicted')):
    plt.plot(df[column], label=label)
plt.xlim([100,150])
plt.xlabel('Sample')
plt.ylabel('Count')
plt.title('Validation Dataset - Predicted Vs. Actual')
plt.legend()
plt.show()

In [None]:
# Over-prediction and under-prediction should be roughly balanced.
# Residuals are actual minus predicted, so positive means under-predicted.
residuals = df['count'] - df['count_predicted']

plt.hist(residuals)
plt.axvline(color='r')  # red reference line at the axvline default position
plt.grid(True)
plt.title('Residuals Distribution')
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.show()

In [None]:
# Share of samples predicted too low (residual > 0) versus the rest.
# .get(..., 0) avoids a KeyError when every residual falls on the same side.
value_counts = (residuals > 0).value_counts(sort=False)
print(' Under Estimation: {0:.2f}'.format(value_counts.get(True, 0)/len(residuals)))
print(' Over  Estimation: {0:.2f}'.format(value_counts.get(False, 0)/len(residuals)))

In [None]:
import sklearn.metrics as metrics

# Root mean squared error between the actual and predicted columns
mse = metrics.mean_squared_error(df['count'], df['count_predicted'])
print("RMSE: {0:.2f}".format(mse ** .5))

In [None]:
# Metric used by Kaggle
def compute_rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Actual values; anything ``np.asarray`` accepts (list, Series, ndarray).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy float
        The RMSLE of the predictions.
    """
    # np.asarray converts non-array inputs and passes ndarrays through
    # unchanged, replacing the explicit type() comparisons.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.average(log_error ** 2) ** .5

In [None]:
print("RMSLE: {0:.2f}".format(compute_rmsle(df['count'],df['count_predicted'])))

# Submission

In [None]:
# Optional Test Data
# Prepare Data for Submission to Kaggle
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

In [None]:
X_test =  df_test.iloc[:,1:] # Exclude datetime for prediction

In [None]:
X_test.head()

In [None]:
# Transform data first with column transformer
result = nn_regressor.predict(colTransformer.transform(X_test))

In [None]:
# Convert result to actual count
df_test["count"] = np.expm1(result)

In [None]:
df_test.head()

In [None]:
df_test[df_test["count"] < 0]

In [None]:
# A rental count cannot be negative, so negative predictions are clipped to
# zero in the exported copy (df_test itself is left unchanged)
submission = df_test[['datetime', 'count']].copy()
submission['count'] = submission['count'].clip(lower=0)
submission.to_csv('My_New_Insight_Predicted_Count.csv', index=False)

# NEW Insight
# Regression using TensorFlow
Build the Neural Network using Keras - Easy and Portable across different implementations
https://keras.io/




# Objective:

* Train a bike rental prediction model
* NN requires one hot encoding of categorical data
* NN also requires features to be on similar scale
* Perform one-hot encoding of all categorical features: ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek', 'hour']
* Verify model performance

In [None]:
# https://keras.io/
# https://github.com/keras-team/keras/issues/2743
import sys
import numpy as np
# Set random seed
np.random.seed(0)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, KBinsDiscretizer

# Keras Library
from keras.models import Sequential
from keras.layers import Dense, Activation

In [None]:
# Names of the files used by the modelling cells
column_list_file, train_file, validation_file, test_file = (
    'bike_train_column_list.txt',
    'bike_train.csv',
    'bike_validation.csv',
    'bike_test.csv',
)

In [None]:
# Categorical features to one-hot encode
categorical_features = (
    ['season', 'holiday', 'workingday', 'weather']
    + ['year', 'month', 'dayofweek', 'hour']
)

# 'day' is kept in its own list
categorical_day = ['day']

# Numeric features to standardize
standardize_features = ['temp', 'atemp'] + ['humidity', 'windspeed']

In [None]:
# Load the comma-separated column names; strip() keeps a trailing newline
# (if the file has one) out of the last column name
with open(column_list_file, 'r') as f:
    columns = f.read().strip().split(',')

In [None]:
# The CSV files carry no header row, so the column names are supplied explicitly
df_train, df_validation = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in (train_file, validation_file)
)

In [None]:
df_train.head()

In [None]:
df_validation.head()

In [None]:
# The target is column 0; every remaining column is an input feature.
# Series.to_numpy() yields the 1-D target array directly (Series.ravel() is
# deprecated in recent pandas releases).
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].to_numpy()

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [None]:
# All 31 days of the month are listed explicitly for the 'day' encoder rather
# than inferred from the fitted data
day_categories = [list(range(1, 32))]

# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` - confirm against the installed version.
colTransformer = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(categories='auto', sparse=False), categorical_features),
        ('onehotday', OneHotEncoder(categories=day_categories, sparse=False), categorical_day),
        ('standardize', StandardScaler(), standardize_features),
    ],
    remainder="passthrough",
)

In [None]:
colTransformer.fit(X_train)

In [None]:
X_train_encoded = colTransformer.transform(X_train)
X_validation_encoded = colTransformer.transform(X_validation)

In [None]:
# Compare the frame shapes before and after the column transformer
for tag, raw, encoded in (('Training Data', X_train, X_train_encoded),
                          ('Val Data', X_validation, X_validation_encoded)):
    print(tag, raw.shape, 'OneHot Encoded', encoded.shape)

In [None]:
X_train_encoded[:1]

In [None]:
X_validation_encoded[:1]

# Build Model using Keras
Reference: https://keras.io/getting-started/sequential-model-guide/

In [None]:
# Dimension of input data
# We need to specify number of features when configuring the first hidden layer
X_train_encoded.shape

In [None]:
# One hidden layer of 100 ReLU units, then a single output unit with no
# activation because this is a regression model.
# The first layer needs the number of input features.
n_features = X_train_encoded.shape[1]
model = Sequential([
    Dense(100, input_dim=n_features, activation='relu'),
    Dense(1, activation=None),
])

In [None]:
# The model has to be compiled with an optimizer and a loss function:
# Adam with mean squared error for this regression problem
model.compile(loss='mse', optimizer='adam')

* One way to avoid overfitting is to use the Early Stopping method

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Optional early stopping to limit overfitting: monitors the validation loss
# (lower is better) with a patience of 2 epochs
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    verbose=1,
)

In [None]:
# Train for up to 20 epochs, evaluating on the validation set and letting
# the early-stopping callback end training sooner
history = model.fit(
    X_train_encoded,
    y_train,
    validation_data=(X_validation_encoded, y_validation),
    epochs=20,
    batch_size=32,
    callbacks=[early_stopping],
)

In [None]:
# Training and validation loss per epoch
for history_key, label in (('loss', 'Training Error'),
                           ('val_loss', 'Validation Error')):
    plt.scatter(x=history.epoch, y=history.history[history_key], label=label)
plt.grid(True)
plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Compare actual vs predicted performance with dataset not seen by the model before
df = pd.read_csv(validation_file,names=columns)

In [None]:
df.head()

In [None]:
result = model.predict(X_validation_encoded)

In [None]:
result[:5]

In [None]:
df.head()

In [None]:
df['count_predicted'] = result

In [None]:
df.head()

In [None]:
df['count_predicted'].describe()

In [None]:
# Undo the log1p transform on both the actual and the predicted column
for column in ('count', 'count_predicted'):
    df[column] = df[column].map(np.expm1)

In [None]:
# Actual vs predicted counts, zoomed to samples 100-150
for column, label in (('count', 'Actual'), ('count_predicted', 'Predicted')):
    plt.plot(df[column], label=label)
plt.xlim([100,150])
plt.xlabel('Sample')
plt.ylabel('Count')
plt.title('Validation Dataset - Predicted Vs. Actual')
plt.legend()
plt.show()

In [None]:
# Over-prediction and under-prediction should be roughly balanced.
# Residuals are actual minus predicted, so positive means under-predicted.
residuals = df['count'] - df['count_predicted']

plt.hist(residuals)
plt.axvline(color='r')  # red reference line at the axvline default position
plt.grid(True)
plt.title('Residuals Distribution')
plt.xlabel('Actual - Predicted')
plt.ylabel('Count')
plt.show()

In [None]:
# Share of samples predicted too low (residual > 0) versus the rest.
# .get(..., 0) avoids a KeyError when every residual falls on the same side.
value_counts = (residuals > 0).value_counts(sort=False)
print(' Under Estimation: {0:.2f}'.format(value_counts.get(True, 0)/len(residuals)))
print(' Over  Estimation: {0:.2f}'.format(value_counts.get(False, 0)/len(residuals)))

In [None]:
import sklearn.metrics as metrics

# Root mean squared error between the actual and predicted columns
mse = metrics.mean_squared_error(df['count'], df['count_predicted'])
print("RMSE: {0:.2f}".format(mse ** .5))

In [None]:
# Metric used by Kaggle
def compute_rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Actual values; anything ``np.asarray`` accepts (list, Series, ndarray).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy float
        The RMSLE of the predictions.
    """
    # np.asarray converts non-array inputs and passes ndarrays through
    # unchanged, replacing the explicit type() comparisons.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.average(log_error ** 2) ** .5

In [None]:
print("RMSLE: {0:.2f}".format(compute_rmsle(df['count'],df['count_predicted'])))

# Submission

In [None]:
# Optional Test Data
# Prepare Data for Submission to Kaggle
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

In [None]:
X_test =  df_test.iloc[:,1:] # Exclude datetime for prediction

In [None]:
# Transform data first with column transformer
result = model.predict(colTransformer.transform(X_test))

In [None]:
result[:5]

In [None]:
# Convert result to actual count
df_test["count"] = np.expm1(result)

In [None]:
df_test.head()

In [None]:
def adjust_count(x):
    """Return 0 for a negative ``x``; otherwise return ``x`` unchanged."""
    return 0 if x < 0 else x

In [None]:
df_test[df_test["count"] < 0]

In [None]:
df_test['count'] = df_test['count'].map(adjust_count)

In [None]:
df_test[df_test["count"] < 0]

In [None]:
# Export only the datetime and predicted count columns, without the index
submission = df_test[['datetime', 'count']]
submission.to_csv('Mew_Insight_Keras_Predicted_Count.csv', index=False)