## Preprocessing `Bike Sharing` dataset

### Importing libraries

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
    )
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder


import pandas as pd
import numpy as np
import time

### Preprocessing

In [7]:
# Load the bike sharing dataset
bike_sharing = pd.read_csv('../datasets/original/bike_sharing.csv')

encoderOHE = OneHotEncoder()

# one-hot encoding for the categorical variables season, weathersit
season_encoded = encoderOHE.fit_transform(bike_sharing[['season']]).toarray()
weathersit_encoded = encoderOHE.fit_transform(bike_sharing[['weathersit']]).toarray()

# Add the encoded columns to the dataset
bike_sharing['spring'] = season_encoded[:, 0]
bike_sharing['summer'] = season_encoded[:, 1]
bike_sharing['fall'] = season_encoded[:, 2]
bike_sharing['winter'] = season_encoded[:, 3]

bike_sharing['weather_1'] = weathersit_encoded[:, 0]
bike_sharing['weather_2'] = weathersit_encoded[:, 1]
bike_sharing['weather_3'] = weathersit_encoded[:, 2]
bike_sharing['weather_4'] = weathersit_encoded[:, 3]

# Drop the original columns
bike_sharing = bike_sharing.drop(['season', 'weathersit'], axis=1)

# Drop columns that are not needed
bike_sharing = bike_sharing.drop(['instant', 'dteday'], axis=1)

## Splitting the dataset and normalization

In [43]:
# Split the dataset into features and target variable
X = bike_sharing.drop('cnt', axis=1)
y = bike_sharing['cnt']

# Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Normalize the data
scaler = MinMaxScaler()



variables_to_normalize = ['mnth', 'hr', 'weekday']

# Fit transform varirables mnth, temp, hum, windspeed
X_train[variables_to_normalize] = scaler.fit_transform(X_train[variables_to_normalize]) 
X_test[variables_to_normalize] = scaler.transform(X_test[variables_to_normalize])

X_train
 

Unnamed: 0,yr,mnth,hr,holiday,weekday,workingday,temp,atemp,hum,windspeed,casual,registered,spring,summer,fall,winter,weather_1,weather_2,weather_3,weather_4
1945,0,0.181818,0.869565,0,1.000000,0,0.28,0.2727,0.45,0.2537,18,67,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
13426,1,0.545455,0.652174,0,0.500000,1,0.92,0.8485,0.35,0.3582,42,152,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
12898,1,0.454545,0.652174,0,0.333333,1,0.72,0.6515,0.28,0.4627,88,229,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2570,0,0.272727,0.000000,0,0.833333,1,0.36,0.3485,0.62,0.1940,3,30,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
10764,1,0.181818,0.652174,0,0.666667,1,0.52,0.5000,0.39,0.3582,109,198,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,0.272727,0.391304,0,0.833333,1,0.46,0.4545,0.88,0.0896,30,329,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
11964,1,0.363636,0.739130,0,0.833333,1,0.66,0.6212,0.34,0.1343,124,688,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
5390,0,0.636364,0.521739,0,0.500000,1,0.80,0.7273,0.43,0.2836,26,163,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
860,0,0.090909,0.304348,0,0.333333,1,0.24,0.1970,0.65,0.4179,3,97,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Tests

## KNN - scikit-learn

In [65]:
knn = KNeighborsRegressor(n_neighbors=7, weights='distance')
knn.fit(X_train, y_train)

runtime_start = time.process_time()
runtime_end = time.process_time()

y_pred = knn.predict(X_test)
runtime = runtime_end - runtime_start

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2, runtime

(0.9264178859115567,
 5.042309950447381,
 0.9998432035233822,
 2.199999999419333e-05)

In [46]:
knn = KNeighborsRegressor(n_neighbors=5, weights='distance')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2

(0.828613503629994, 3.880649117618418, 0.9998793267144201)

## Gradient Boosting - scikit-learn