In [37]:
#importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import VotingClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.svm import SVC
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Bike Sharing Demand Dataset

https://www.kaggle.com/competitions/bike-sharing-demand/data?select=train.csv

In [6]:
#loading the Bike Sharing Demand data-set from a CSV file (path is relative to the working directory)
x = pd.read_csv('bikeSharingDemand.csv')

#displaying the dataset (a bare last expression is rendered by the notebook as a table)
x

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [7]:
#dataset size - 10886 rows x 12 columns

In [8]:
#count the missing (NaN/None) entries in every column; all zeros means nothing to impute
x.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [9]:
#the dataset has NO null values, so don't need to handle them

In [10]:
#'casual' and 'registered' columns hold the number of non-registered and registered users; together they add up to the 'count' variable
#so we can't include them in 'features', as that would leak the target into the model (data leakage rather than overfitting)

#'datetime' column has already been decomposed into the 'season', 'holiday' and 'workingday' columns, so we won't include it either (note: the hour of day is not captured by those columns)

In [13]:
#name of the column to predict, followed by the columns used as model inputs
target = 'count'
features = [
    'season',
    'holiday',
    'workingday',
    'weather',
    'temp',
    'atemp',
    'humidity',
    'windspeed',
]

In [14]:
#separating the model inputs (X) from the column to predict (y); the train/test split itself is done further down
X = x.loc[:, features]
y = x.loc[:, target]

In [17]:
#standardizing the feature columns (StandardScaler: zero mean, unit variance per column)
#NOTE(review): X is still the full feature matrix here, not only the training rows -
#the scaler is fitted before train_test_split is called further down, so the test rows
#contribute to the fitted mean/std. Consider fitting the scaler on the training split only.
#X is rebound to the scaled values returned by fit_transform.
sc = StandardScaler()
X = sc.fit_transform(X)

In [23]:
#full feature matrix after standardization, wrapped in a dataframe for display
#(no column names are passed, so columns are numbered in the order of 'features')
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.349864,-0.17149,-1.460672,-0.660178,-1.333661,-1.092737,0.993213,-1.567754
1,-1.349864,-0.17149,-1.460672,-0.660178,-1.438907,-1.182421,0.941249,-1.567754
2,-1.349864,-0.17149,-1.460672,-0.660178,-1.438907,-1.182421,0.941249,-1.567754
3,-1.349864,-0.17149,-1.460672,-0.660178,-1.333661,-1.092737,0.681430,-1.567754
4,-1.349864,-0.17149,-1.460672,-0.660178,-1.333661,-1.092737,0.681430,-1.567754
...,...,...,...,...,...,...,...,...
10881,1.338012,-0.17149,0.684616,-0.660178,-0.596935,-0.467310,-0.617666,1.617227
10882,1.338012,-0.17149,0.684616,-0.660178,-0.702182,-0.735182,-0.253919,0.269704
10883,1.338012,-0.17149,0.684616,-0.660178,-0.807428,-0.913959,-0.046064,0.269704
10884,1.338012,-0.17149,0.684616,-0.660178,-0.807428,-0.735182,-0.046064,-0.832442


In [43]:
#splitting the data into train and test sets (80% / 20%)
#NOTE(review): this cell previously wrapped everything in `for i in range(0, 6)` (the old comment
#spoke of "4 combinations"), but nothing varied between iterations - same test_size, same
#random_state - so each pass produced the identical split and rebuilt the same dictionary.
#The work is now done once. If several split ratios should be compared, loop over an explicit
#list of test sizes and keep the results per ratio.
ts = 0.2
seed = 49
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=seed)

#defining the candidate models (unfitted estimators, default hyperparameters)
#the stochastic regressors reuse the split's seed so that re-running the notebook gives the same fits
models = {
    'lr': Pipeline([('lr', LinearRegression())]),
    'rf': Pipeline([('rf', RandomForestRegressor(random_state=seed))]),
    'gb': Pipeline([('gb', GradientBoostingRegressor(random_state=seed))]),
    'ada': Pipeline([('ada', AdaBoostRegressor(random_state=seed))]),
    #ensemble averaging the predictions of the four regressors above
    'vote': VotingRegressor([
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(random_state=seed)),
        ('gb', GradientBoostingRegressor(random_state=seed)),
        ('ada', AdaBoostRegressor(random_state=seed)),
    ]),
}
    


In [39]:
#inspecting the training part of the standardized features returned by train_test_split
X_train

array([[ 1.33801198, -0.17149048, -1.46067232, ..., -0.19884848,
         0.00589999,  0.5142603 ],
       [-0.45390515, -0.17149048,  0.68461625, ...,  0.78472387,
         0.57750229, -0.83244247],
       [ 1.33801198, -0.17149048, -1.46067232, ..., -1.18242083,
        -0.72159384,  0.5142603 ],
       ...,
       [ 0.44205341, -0.17149048,  0.68461625, ...,  1.32105696,
        -0.51373846,  0.5142603 ],
       [-1.34986372, -0.17149048,  0.68461625, ..., -1.27151467,
         0.16179153,  1.37184982],
       [-1.34986372, -0.17149048,  0.68461625, ..., -1.09273697,
        -0.25391923,  1.86178373]])