## Preprocessing `Bike Sharing` dataset

### Importing libraries

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
    )
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier


import pandas as pd
import numpy as np
import time

### Preprocessing

In [53]:
# Load the bike sharing dataset
bike_sharing = pd.read_csv('../datasets/original/bike_sharing.csv')

# Drop casual and registered since they are basically cnt split into two
bike_sharing = bike_sharing.drop(['casual', 'registered'], axis=1)

encoderOHE = OneHotEncoder()

# one-hot encoding for the categorical variables season, weathersit
season_encoded = encoderOHE.fit_transform(bike_sharing[['season']]).toarray()
weathersit_encoded = encoderOHE.fit_transform(bike_sharing[['weathersit']]).toarray()

# Add the encoded columns to the dataset
bike_sharing['spring'] = season_encoded[:, 0]
bike_sharing['summer'] = season_encoded[:, 1]
bike_sharing['fall'] = season_encoded[:, 2]
bike_sharing['winter'] = season_encoded[:, 3]

bike_sharing['weather_1'] = weathersit_encoded[:, 0]
bike_sharing['weather_2'] = weathersit_encoded[:, 1]
bike_sharing['weather_3'] = weathersit_encoded[:, 2]
bike_sharing['weather_4'] = weathersit_encoded[:, 3]

# Drop the original columns
bike_sharing = bike_sharing.drop(['season', 'weathersit'], axis=1)

# Drop columns that are not needed
bike_sharing = bike_sharing.drop(['instant', 'dteday'], axis=1)

### Splitting the dataset and normalization

In [54]:
# Split the dataset into features and target variable
X = bike_sharing.drop('cnt', axis=1)
y = bike_sharing['cnt']

# Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Normalize the data
scaler = MinMaxScaler()



variables_to_normalize = ['mnth', 'hr', 'weekday']

# Fit transform varirables mnth, temp, hum, windspeed
X_train[variables_to_normalize] = scaler.fit_transform(X_train[variables_to_normalize]) 
X_test[variables_to_normalize] = scaler.transform(X_test[variables_to_normalize])

X_train
 

Unnamed: 0,yr,mnth,hr,holiday,weekday,workingday,temp,atemp,hum,windspeed,spring,summer,fall,winter,weather_1,weather_2,weather_3,weather_4
1945,0,0.181818,0.869565,0,1.000000,0,0.28,0.2727,0.45,0.2537,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
13426,1,0.545455,0.652174,0,0.500000,1,0.92,0.8485,0.35,0.3582,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
12898,1,0.454545,0.652174,0,0.333333,1,0.72,0.6515,0.28,0.4627,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2570,0,0.272727,0.000000,0,0.833333,1,0.36,0.3485,0.62,0.1940,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
10764,1,0.181818,0.652174,0,0.666667,1,0.52,0.5000,0.39,0.3582,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,0.272727,0.391304,0,0.833333,1,0.46,0.4545,0.88,0.0896,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
11964,1,0.363636,0.739130,0,0.833333,1,0.66,0.6212,0.34,0.1343,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
5390,0,0.636364,0.521739,0,0.500000,1,0.80,0.7273,0.43,0.2836,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
860,0,0.090909,0.304348,0,0.333333,1,0.24,0.1970,0.65,0.4179,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Tests

## Fining optimal K

In [45]:
# Optimal K with cross-validation cross_val_score
k_values = list(range(1, 30))
cross_val_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k, weights='distance')
    cross_val_scores.append(cross_val_score(knn, X_train, y_train, cv=10).mean())
    
# optimal_k = k_values[np.argmax(cross_val_scores)]


# Get the index of the top 3 scores
top_3_scores = np.argsort(cross_val_scores)[::-1][:3]

# Get the k values for the top 3 scores
top_3_k_values = [k_values[i] for i in top_3_scores]

top_3_k_values

[7, 6, 8]

## Attributes importance

In [46]:
# Train a Random Forest model to get which Features are more important
model = RandomForestClassifier(max_leaf_nodes=2,
                        max_features=5,
                        max_depth=5,
                        random_state=42)

# Fit the model to the data
model.fit(X_train, y_train)

# Get the most important Features
most_important_attributes = pd.DataFrame(
                            model.feature_importances_,
                            index = X_train.columns,
                            columns=['importance']
                        ).sort_values('importance', ascending=False)


most_important_attributes


Unnamed: 0,importance
hr,0.39
temp,0.16
atemp,0.16
hum,0.08
mnth,0.07
spring,0.06
yr,0.05
weather_3,0.01
fall,0.01
windspeed,0.01


In [47]:
# Drop the least important features
# NOT WORKING FOR THIS DATASET FOR KNN, WE ARE GETTING WORSE RESULTS BY DROPPING THE LEAST IMPORTANT FEATURES

# least_important_features = [
# 'workingday',
# 'summer',
# 'weekday',
# 'winter',
# 'weather_1',
# 'weather_2',
# 'holiday',
# 'weather_4'
# ]

# X_train = X_train.drop(least_important_features, axis=1)
# X_test = X_test.drop(least_important_features, axis=1)

## KNN - scikit-learn

In [102]:
knn = KNeighborsRegressor(n_neighbors=6, weights='uniform')
knn.fit(X_train, y_train)

runtime_start = time.process_time()
runtime_end = time.process_time()

y_pred = knn.predict(X_test)
runtime = runtime_end - runtime_start

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2, runtime

(64.45581127733027,
 9488.986312492008,
 0.704928964086922,
 2.1000000003823516e-05)

## Gradient Boosting - scikit-learn