## Definition

### Problem Statement  

The goal is to forecast the demand for bikes as a function of weather conditions, such as the outside temperature, and calendar information, e.g. holidays. This information and the demand history are provided in a dataset with two years of daily historical data.  
The demand is given as the total daily demand and as a split between registered users and casual users. To increase the quality of the prediction, registered-user demand and casual-user demand will be predicted separately in step two.  
To make predictions, machine learning is used to train regressors. Scikit-Learn recommends a support vector regressor (SVR) for this kind of problem and amount of data. In addition, a deep neural network (DNN) regressor is trained for comparison. To find the hyper-parameters for these regressors, grid search and randomized search are used. Because the dataset is small, cross-validation is applied.    

> http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html  
> http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR  
> https://github.com/tensorflow/skflow/blob/master/g3doc/api_docs/python/estimators.md  
> http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html  
> http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar

from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from math import sqrt



## Analysis

In [None]:
# Fetching Dataset
# Reads day.csv from the working directory; header=0 takes the column names
# from the first row of the file.

bike_data = pd.read_csv("day.csv", header=0)

print("Data read successfully!")

In [None]:
# Preview the first rows of the raw frame.
bike_data.head()

### Data Exploration

In [None]:
# Extracting

# The slice [:-3] keeps every column except the last three.
# presumably the dropped columns are the casual / registered / total counts — TODO confirm against day.csv
feature_cols = bike_data.columns[:-3]  # all columns except the last three are features
target_col = bike_data.columns[-1]  # last column is the target

print ("Feature column(s):\n{}\n".format(feature_cols))
print ("Target column:\n{}".format(target_col))

In [None]:
#### Function to Calculate Profit

In [2]:
def profit(y,y_cap):
    """Return the element-wise value ``3 * min(y, y_cap) - 2 * y_cap``.

    Presumably ``y`` is the actual demand and ``y_cap`` the predicted
    (provisioned) quantity, with 3 earned per unit served and 2 paid per unit
    provisioned — confirm against the business rules.

    Parameters
    ----------
    y, y_cap : scalar, list, numpy array or pandas Series
        Values of matching length (or scalars).

    Returns
    -------
    One value per element, as a numpy array, pandas Series or numpy scalar
    depending on the input type.
    """
    # np.multiply is used instead of ``2 * y_cap`` because multiplying a plain
    # Python list by 2 repeats the list rather than doubling its elements.
    return 3 * np.minimum(y, y_cap) - np.multiply(2, y_cap)
    

#### Function to Convert from percentage to Actual Prediction

In [3]:
def convertToPrediction(data,percentage_predictions):
    """Apply predicted percentage changes to base values and round the result.

    Computes ``data + data * percentage_predictions / 100`` element-wise and
    rounds it with ``np.around``.
    """
    pct_delta = np.multiply(data, percentage_predictions) / 100
    return np.around(data + pct_delta)

### Base Model

#### For the base model, the demand predicted for a day is the demand observed two days earlier (the cells below shift the target series by two rows).

In [None]:
# Targets taken with the slice 365:731 and re-indexed from 0.
y_actual = bike_data[target_col][365:731].reset_index(drop=True)

In [None]:
# Independent copy of the targets, used below to build the shifted baseline series.
y_staged = y_actual.copy()

In [None]:
# Seed values from rows 363 and 364 (in that order), i.e. the two rows just
# before the evaluation slice.
data = [bike_data[target_col][row] for row in (363, 364)]

In [None]:
# Put the two seed values in front of the staged targets and renumber the rows
# from 0, which moves every target two positions down.
y_predicted_df = pd.concat([pd.DataFrame(data), y_staged], ignore_index=True)

In [None]:
# Remove the last two rows so the shifted series is as long as y_staged again.
# NOTE(review): the in-place drop makes this cell non-idempotent — re-running it
# removes two more rows.
y_predicted_df.drop(y_predicted_df.tail(2).index,inplace=True)

In [None]:
# Take column 0 of the shifted frame as the baseline forecast.
y_predicted = y_predicted_df[0]


##### Calculate Base Model Profit

In [None]:
# Total profit of the baseline forecast over the evaluation rows.
print(profit(y_actual,y_predicted).sum())

### Algorithms and Techniques

In [None]:
# Load the pre-split train/test CSVs; the first row of each file supplies the column names.
# NOTE(review): nothing in the visible notebook produces these files — confirm where they come from.
X_raw_train = pd.read_csv("train.csv", header=0)
X_raw_test  = pd.read_csv("test.csv", header=0)

In [None]:
# Feature list using "temp"; the one-hot column names are generated from their
# prefix and category count.
# NOTE(review): the following cell reassigns `cols` (with "atemp"), so this
# variant only takes effect if that cell is skipped.
cols = (
    ["temp", "hum", "windspeed", "cnt_normal", "week_moving_avg_normal"]
    + ["season_{}".format(k) for k in range(1, 5)]
    + ["mnth_{}".format(k) for k in range(1, 13)]
    + ["holiday_{}".format(k) for k in range(1, 3)]
    + ["weekday_{}".format(k) for k in range(1, 8)]
    + ["workingday_{}".format(k) for k in range(1, 3)]
    + ["weathersit_{}".format(k) for k in range(1, 4)]
)

In [None]:
# Feature list using "atemp"; the one-hot column names are generated from their
# prefix and category count.
cols = (
    ["atemp", "hum", "windspeed", "cnt_normal", "week_moving_avg_normal"]
    + ["season_{}".format(k) for k in range(1, 5)]
    + ["mnth_{}".format(k) for k in range(1, 13)]
    + ["holiday_{}".format(k) for k in range(1, 3)]
    + ["weekday_{}".format(k) for k in range(1, 8)]
    + ["workingday_{}".format(k) for k in range(1, 3)]
    + ["weathersit_{}".format(k) for k in range(1, 4)]
)

In [None]:
# Training target first, then the feature matrix; both end up as plain Python lists.
y_train_df = X_raw_train[["target"]]
y_train = y_train_df["target"].tolist()
X_train = X_raw_train[cols].values.tolist()

In [None]:
# Test target first, then the feature matrix; both end up as plain Python lists.
y_test_df = X_raw_test[["target"]]
y_test = y_test_df["target"].tolist()
X_test = X_raw_test[cols].values.tolist()

#### Alternate dataset with percentage change

In [95]:
# Percentage-change dataset; `instant` is reduced modulo 30.
# NOTE(review): `data` was a plain list in the base-model cells; this rebinds it to a DataFrame.
data = pd.read_csv("processed_Data.csv", header=0)
data["instant"] = data["instant"].mod(30)
# Row-order split: the first 359 rows train, the remainder tests.
X_raw_train = data.iloc[:359]
X_raw_test = data.iloc[359:]

In [96]:
# Feature columns of the percentage-change dataset: generated one-hot names
# followed by the weather, calendar and moving-average columns.
cols = (
    ["season__{}".format(k) for k in range(1, 6)]
    + ["weathersit__{}".format(k) for k in range(1, 6)]
    + ["atemp", "hum", "windspeed"]
    + ["mnth", "instant", "holiday", "weekday", "workingday"]
    + ["moving_avg_weekly_cnt"]
)

In [97]:
# Training target (percentage change in demand) first, then the feature matrix;
# both end up as plain Python lists.
y_train_df = X_raw_train[["demand_pc_inc"]]
y_train = y_train_df["demand_pc_inc"].tolist()
X_train = X_raw_train[cols].values.tolist()

In [98]:
# Test target (percentage change in demand) first, then the feature matrix;
# both end up as plain Python lists.
y_test_df = X_raw_test[["demand_pc_inc"]]
y_test = y_test_df["demand_pc_inc"].tolist()
X_test = X_raw_test[cols].values.tolist()

In [99]:
# The `cnt` column; the cells below slice it to score predictions and to turn
# percentage predictions back into counts.
data_cnt = data['cnt']

In [100]:
# Observed counts for the test rows (359 onward). Despite the name, this is the
# ground truth that is passed as `y` to profit(), not a model prediction.
actual_predictions = data_cnt[359:].values

In [101]:
# Base counts to which the predicted percentage change is applied: rows 357..722,
# i.e. two rows before each test row.
# assumes the test window has 366 rows so that the lengths match — TODO confirm
y_for_calculations = data_cnt[357:723].values

### Benchmark

In [102]:
# Training SVR
# Benchmark: an SVR constructed with no arguments (library defaults), fit on the
# training features and target.
svr = SVR()
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [103]:
# Validation SVR
# NOTE: y_test holds the demand_pc_inc target, so the R^2 and RMSE below are
# measured on percentage changes, not on bike counts.

svr_pred = svr.predict(X_test)
score_svr = r2_score(y_test, svr_pred)
rmse_svr = sqrt(mean_squared_error(y_test, svr_pred))

print("Score SVR: %f" % score_svr)
print("RMSE SVR: %f" % rmse_svr)

Score SVR: -0.003308
RMSE SVR: 1318.398984


In [104]:
# Inspect the raw benchmark SVR outputs.
svr_pred

array([0.394235  , 0.38637924, 0.31425885, 0.38640337, 0.38592272,
       0.38599109, 0.38592236, 0.53607766, 0.38592552, 0.38592202,
       0.38592201, 0.38723512, 0.38591896, 0.38592202, 0.38533663,
       0.38592202, 0.68234301, 0.38709951, 0.38592164, 0.38591182,
       0.38592205, 0.38581424, 0.38592224, 0.38467478, 0.38592309,
       0.35655385, 0.38584931, 0.38592202, 0.38602622, 0.46145605,
       0.38592202, 0.38591842, 0.38592205, 0.38364073, 0.38592207,
       0.38594366, 0.38583179, 0.38605335, 0.39788321, 0.38592852,
       0.38592202, 0.38538828, 0.38646848, 0.38605953, 0.38592201,
       0.38592262, 0.38593414, 0.38592201, 0.38592201, 0.38505089,
       0.38595021, 0.38035131, 0.38601654, 0.46149189, 0.38592202,
       0.38592202, 0.38584661, 0.38572556, 0.38592202, 0.38604506,
       0.43560603, 0.38592202, 0.38593152, 0.38593926, 0.38592198,
       0.39189013, 0.3859355 , 0.38592202, 0.38592318, 0.38631169,
       0.38919515, 0.76828721, 0.29207917, 0.38703043, 0.38592

In [105]:
# Apply the predicted percentage changes to the base counts to get predicted demand.
model_predictions = convertToPrediction(y_for_calculations,svr_pred)

In [106]:
# Inspect the predictions after conversion.
model_predictions

array([3011., 2495., 2301., 1959., 2245., 2377., 3285., 4120., 4538.,
       3438., 2385., 3612., 2185., 4113., 3226., 2503., 2327., 2307.,
       2946., 3389., 3305., 3175., 1306., 1985., 2441., 4354., 4286.,
       4091., 3469., 4042., 3256., 3638., 4526., 4597., 3776., 4167.,
       2843., 2958., 3799., 4392., 2813., 3845., 3846., 2177., 1535.,
       3435., 3937., 4185., 3017., 4170., 4335., 2699., 3141., 3794.,
       4791., 5082., 3500., 2743., 3402., 4339., 4382., 1841., 5009.,
       3206., 4082., 3436., 3346., 3971., 4935., 5403., 4587., 4150.,
       4925., 5319., 5870., 6336., 6216., 4395., 7866., 5915., 6177.,
       6117., 6254., 6898., 8394., 3385., 5015., 5579., 5122., 5721.,
       6157., 5480., 6259., 6064., 5959., 6798., 6461., 6482., 6485.,
       6883., 5189., 5607., 5941., 4881., 5430., 6423., 7489., 7160.,
       6395., 6717., 4384., 6590., 7318., 6650., 1031., 3227., 5655.,
       6219., 5053., 6257., 4236., 6328., 5594., 5762., 6193., 6446.,
       6320., 6910.,

In [107]:
# Total profit of the benchmark SVR over the test rows.
print(profit(actual_predictions,model_predictions).sum())

1438598.0


## Methodology

### Implementation

The regressors are trained using randomized search and cross-validation to identify the area of the best parameters. Then a grid search is used to tune parameter values of the regressor functions.

> http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html  
> http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html

In [108]:
# Tuning SVR with GridSearch
# Exhaustive search over 3 values of C x 2 kernels (6 candidates), scored by R^2.

tuned_parameters = [{'C': [1000, 3000, 10000], 
                     'kernel': ['linear', 'rbf']}
                   ]

# Earlier MSE-scored variant, kept for reference:
#svr_tuned = GridSearchCV(SVR (C=1), param_grid = tuned_parameters, scoring = 'mean_squared_error') #default 3-fold cross-validation, score method of the estimator
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
svr_tuned_GS = GridSearchCV(SVR (C=1), param_grid = tuned_parameters, scoring = 'r2', n_jobs=-1) #default 3-fold cross-validation; scoring='r2' is used instead of the estimator's own score method

svr_tuned_GS.fit(X_train, y_train)

print (svr_tuned_GS)
print ('\n' "Best parameter from grid search: " + str(svr_tuned_GS.best_params_) +'\n')

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'C': [1000, 3000, 10000], 'kernel': ['linear', 'rbf']}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

Best parameter from grid search: {'C': 1000, 'kernel': 'rbf'}



In [109]:
# Validation - SVR tuned 
# Scores the grid-searched SVR on the held-out test rows and prints it next to
# the untuned benchmark for comparison.

svr_tuned_pred_GS = svr_tuned_GS.predict(X_test)

score_svr_tuned_GS = r2_score(y_test, svr_tuned_pred_GS)
rmse_svr_tuned_GS = sqrt(mean_squared_error(y_test, svr_tuned_pred_GS))

print("SVR Results\n")

print("Score SVR: %f" % score_svr)
print("Score SVR tuned GS: %f" % score_svr_tuned_GS)

print("\nRMSE SVR: %f" % rmse_svr)
print("RMSE SVR tuned GS: %f" % rmse_svr_tuned_GS)

SVR Results

Score SVR: -0.003308
Score SVR tuned GS: -0.002516

RMSE SVR: 1318.398984
RMSE SVR tuned GS: 1317.878750


In [110]:
##Profit Calculation for pct approach
# Turn the grid-searched SVR's percentage predictions into counts, then total the profit.
model_predictions = convertToPrediction(
    y_for_calculations,
    svr_tuned_pred_GS,
)
print(profit(actual_predictions, model_predictions).sum())

#Profit is just 1.26million!!

1260233.0


In [111]:
# SVR tuned with RandomizedSearch
# may take a while!

# Parameters
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so C is drawn
# from [1000, 11000].
param_dist = {  'C': sp_uniform (1000, 10000), 
                'kernel': ['rbf']
             }

# Number of parameter settings sampled; with 1, only a single random C is evaluated.
n_iter_search = 1

# MSE optimized
#SVR_tuned_RS = RandomizedSearchCV(SVR (C=1), param_distributions = param_dist, scoring = 'mean_squared_error', n_iter=n_iter_search)

# R^2 optimized
# NOTE(review): no random_state is passed, so the sampled C differs between runs.
SVR_tuned_RS = RandomizedSearchCV(SVR (C=1), param_distributions = param_dist, scoring = 'r2', n_iter=n_iter_search)

# Fit
SVR_tuned_RS.fit(X_train, y_train)

# Best score and corresponding parameters.
# The label names the search that actually produced the score (it previously said "grid search").
print('best CV score from randomized search: {0:f}'.format(SVR_tuned_RS.best_score_))
print('corresponding parameters: {}'.format(SVR_tuned_RS.best_params_))

# Predict and score
predict = SVR_tuned_RS.predict(X_test)

score_svr_tuned_RS = r2_score(y_test, predict)
rmse_svr_tuned_RS = sqrt(mean_squared_error(y_test, predict))

best CV score from grid search: -0.037775
corresponding parameters: {'C': 9490.133679029495, 'kernel': 'rbf'}


In [112]:
# Side-by-side R^2 and RMSE for the untuned, grid-searched (GS) and
# randomized-search (RS) SVRs on the test rows.
print('SVR Results\n')

print("Score SVR: %f" % score_svr)
print("Score SVR tuned GS: %f" % score_svr_tuned_GS)
print("Score SVR tuned RS: %f" % score_svr_tuned_RS)

print("\nRMSE SVR: %f" % rmse_svr)
print("RMSE SVR tuned GS: %f" % rmse_svr_tuned_GS)
print("RMSE SVR tuned RS: %f" % rmse_svr_tuned_RS)

SVR Results

Score SVR: -0.003308
Score SVR tuned GS: -0.002516
Score SVR tuned RS: -0.002516

RMSE SVR: 1318.398984
RMSE SVR tuned GS: 1317.878750
RMSE SVR tuned RS: 1317.878750


In [113]:
##Profit Calculation for pct approach
# `predict` holds the randomized-search SVR predictions at this point.
model_predictions = convertToPrediction(y_for_calculations,predict)
print(profit(actual_predictions,model_predictions).sum())

1260233.0


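The tuning runs for the SVR, but it barely changes the test scores (R² stays close to zero), and the resulting profit (about 1.26 million) is lower than that of the untuned benchmark (about 1.44 million).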

### DNN Regressor

In [114]:
from sklearn.neural_network import MLPRegressor

In [115]:
import logging
from concurrent.futures import ThreadPoolExecutor, wait
from time import time
from typing import List

In [116]:
# One hidden layer of 5 ReLU units, trained with Adam for at most 1000 iterations.
# NOTE(review): no random_state is set, so the fitted weights (and the profit
# computed from them) can differ between runs.
# NOTE(review): scikit-learn documents learning_rate='adaptive' as only used with
# solver='sgd' — confirm that it has any effect with solver='adam'.
bike_model = MLPRegressor(hidden_layer_sizes=(5,),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01)

In [117]:
# Fit the network and measure the wall-clock duration in whole milliseconds.
start_time = int(time() * 1000)
bike_model.fit(X_train, y_train)
end_time = int(time() * 1000)
# NOTE(review): no logging configuration is visible in this notebook; with the
# default root level (WARNING) these debug messages are not shown — confirm.
logging.debug('Finished training universal model')
logging.debug('Training took {} ms'.format(end_time - start_time)) 

In [118]:
# NOTE(review): this rebinds `predict`, which earlier held the randomized-search SVR predictions.
predict = bike_model.predict(X_test)

In [119]:
# Inspect the network's raw outputs.
predict

array([ -0.77751408,  -2.28696885,  -1.83510422,  -1.5752995 ,
        -1.36235334,  -1.10147798,  -2.23428067,  -1.72150293,
        -1.27831887,  -2.86140981,  -2.15651825,  -2.86291191,
        -1.58651975,  -1.79501972,  -2.96574772,  -0.67574913,
        -1.04164939,  -1.92313006,  -0.90912715,  -1.56278038,
        -1.05996477,  -0.67197839,  -1.97623013,  -0.83450369,
        -1.41903685,  -2.07934445,  -1.75287998,  -2.79977593,
        -4.66926668,  -3.7859783 ,  -1.99700761,  -3.30337019,
        -4.17063264,  -3.83010069,  -3.95039498,  -4.92459413,
        -3.3122308 ,  -3.02200252,  -3.93327902,  -2.02363312,
        -2.39767548,  -4.55686566,  -3.35476197,  -2.46027319,
        -3.57687536,  -0.68392344,  -0.82844783,  -3.81701901,
        -2.15953806,  -2.99836439,  -3.31914802,  -1.69198986,
        -3.47223053,  -2.01923111,  -3.20797431,  -5.21304823,
        -4.65215547,  -4.8731797 ,  -3.72762329,  -3.01093175,
        -4.91218707,  -3.88674543,  -5.32911957,  -4.48

In [120]:
# Apply the network's predicted percentage changes to the base counts.
model_predictions = convertToPrediction(y_for_calculations,predict)

In [121]:
# Total profit of the network's predictions over the test rows.
print(profit(actual_predictions,model_predictions).sum())

1491292.0


#### BOOSTING

In [123]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [126]:
#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification
 

In [145]:
# Candidate regressors, all with default settings; the profits are printed in
# the same order as this list.
models = [
    RandomForestRegressor(),
    AdaBoostRegressor(),
    BaggingRegressor(),
    SVR(),
    KNeighborsRegressor(),
]
model_names = [
    'RandomForestRegressor',
    'AdaBoostRegressor',
    'BaggingRegressor',
    'SVR',
    'KNeighborsRegressor',
]
rmsle = []
d = {}
for model, clf in enumerate(models):
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    test_pred = clf.fit(X_train, y_train).predict(X_test)
    model_predictions = convertToPrediction(y_for_calculations, test_pred)
    print(profit(actual_predictions, model_predictions).sum())

1443003.0
1189343.0
1412028.0
1438598.0
1403696.0


In [150]:
#NOW LET'S Dig deeper into each of these ...
#for random forest regresion.
from sklearn.ensemble import RandomForestRegressor
from sklearn import pipeline

# Seeded forest (150 trees, depth capped at 20) wrapped in a one-step pipeline.
regressor = RandomForestRegressor(n_estimators=150, max_depth=20, random_state=0)
estimator = pipeline.Pipeline([('model_fitting', regressor)])

# fit() returns the pipeline itself, so fitting and predicting can be chained.
test_pred = estimator.fit(X_train, y_train).predict(X_test)
model_predictions = convertToPrediction(y_for_calculations, test_pred)
print(profit(actual_predictions, model_predictions).sum())

1462796.0
