In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# dataset location: https://www.kaggle.com/itssuru/bike-sharing-system-washington-dc

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the Data

In [None]:
# Read the train and test CSVs with identical options; the 'datetime' column is
# parsed into timestamps at load time.
train, test = (
    pd.read_csv(csv_path, parse_dates=['datetime'])
    for csv_path in (
        '/kaggle/input/bike-sharing-system-washington-dc/train_bikes.csv',
        '/kaggle/input/bike-sharing-system-washington-dc/test_bikes.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
train.head()

# EDA
## Checking if there are null values
There are no null values in the training or test data

In [None]:
# Count missing values per column in the training data.
train.isnull().sum()

In [None]:
# Count missing values per column in the test data.
test.isnull().sum()

## Checking the Data Types
All of the columns are stored as numbers, so I won't need to do any label encoding or one-hot encoding. The `datetime` column (a regular column, not the index) can be transformed to reflect the time of year. Demand is most likely cyclical and dependent on the time of year.

In [None]:
# Show the dtype pandas assigned to each training column.
train.dtypes

## Inspecting the data
I will be predicting the 'count' column. I will most likely remove the 'registered' feature: it appears to contain information about the count that the model would not have access to when making predictions, so keeping it would be an example of target leakage. I'm not quite sure what the 'casual' column is, but it may also be a source of leakage.

Also, I can see that there is a season feature, which will be helpful. Having features on the season and the time of year could be useful. 

I'm going to normalize the data for preprocessing.

In [None]:
# Summary statistics for the training data, transposed so each column becomes a row.
train.describe().transpose()

# Feature Engineering

In [None]:
# Columns kept as model inputs; every other training column (including the
# 'count' target) is dropped from the feature frames.
useful_columns = ['datetime','season','holiday','workingday','weather','temp','atemp','humidity','windspeed']

# Keep the target before `train` is narrowed to the feature columns.
target = train['count']

# .copy() makes the feature frames independent objects. Later cells mutate them
# in place (pop, .loc column assignment); doing that on a column-subset slice
# raises SettingWithCopyWarning and leaves it unclear which object is modified.
train = train[useful_columns].copy()
test = test[useful_columns].copy()

In [None]:
# Remove the 'datetime' column from each feature frame (in place) and keep the
# removed columns for building time-based features.
date_time, date_time_test = (frame.pop('datetime') for frame in (train, test))

In [None]:
# Convert each timestamp to seconds since the Unix epoch.
# pd.Timestamp.timestamp is used instead of datetime.datetime.timestamp: for
# timezone-naive values the stdlib method interprets them in the machine's local
# timezone, so the derived features would change depending on where the notebook
# runs. The pandas method gives the same result on every machine.
timestamp_s = date_time.map(pd.Timestamp.timestamp)
timestamp_test = date_time_test.map(pd.Timestamp.timestamp)

In [None]:
# Display the training timestamps after conversion to epoch seconds.
timestamp_s

In [None]:
# Period lengths in seconds: one day, and one year taken as 365.2425 days.
day = 24*60*60
year = (365.2425)*day

# Encode each timestamp as a (sin, cos) pair on the daily and the yearly cycle.
# The same four columns are added, in the same order, to both feature frames.
cyclic_specs = (('Day sin', 'Day cos', day), ('Year sin', 'Year cos', year))
for frame, seconds in ((train, timestamp_s), (test, timestamp_test)):
    for sin_col, cos_col, period in cyclic_specs:
        angle = seconds * (2 * np.pi / period)
        frame.loc[:, sin_col] = np.sin(angle)
        frame.loc[:, cos_col] = np.cos(angle)

In [None]:
# Preview the training features, now including the sin/cos time columns.
train.head()

In [None]:
# Plot the first 25 samples of the daily sine and cosine signals on one figure.
for signal_name in ('Day sin', 'Day cos'):
    plt.plot(np.array(train[signal_name])[:25])
plt.xlabel('Time [h]')
plt.title('Time of day signal')

## Split the data

In [None]:
# Map each feature column name to its positional index.
column_indices = dict(zip(train.columns, range(len(train.columns))))

# The first 80% of rows (in their existing order, no shuffling) form the
# training split; the remaining rows are held out for validation.
n = len(train)
split_at = int(n*0.8)

train_df, val_df = train.iloc[:split_at], train.iloc[split_at:]
train_target, val_target = target.iloc[:split_at], target.iloc[split_at:]

# Number of input features after feature engineering.
num_features = train.shape[1]

## Normalize the data
It is important to scale features before training an AI algorithm. Normalization is a common way of doing this scaling. Subtract the mean and divide by the standard deviation of each feature.

The mean and standard deviation should only be computed using the training data so that the models have no access to the values in the validation and test sets.

In [None]:
# Standardize every feature to zero mean and unit variance. The statistics come
# from the training split only, so no information from the validation or test
# rows leaks into the scaling.
train_mean = train_df.mean()
train_std = train_df.std()

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
# The engineered test features live in `test`; `test_df` did not exist before
# this line, so the original right-hand side raised NameError on a fresh kernel.
test_df = (test - train_mean) / train_std

In [None]:
# Summary statistics of the standardized training features.
train_df.describe()

# Feed into AI algorithms

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

import sklearn.metrics as sm

In [None]:
# Fixed seed for the stochastic estimators so that the model comparison below
# gives the same numbers on every Restart & Run All.
SEED = 42

# Random Forest Regressor
random_forest = RandomForestRegressor(random_state=SEED)

# K Nearest Neighbors Regressor (takes no seed)
kneighbors = KNeighborsRegressor()

# SGD Regressor
sgd = SGDRegressor(random_state=SEED)

# xgb regressor
xgb = XGBRegressor(random_state=SEED)

In [None]:
# Registry of the candidate models, keyed by the name used in reports and plots.
regressors_dict = dict(
    random_forest=random_forest,
    kneighbors=kneighbors,
    sgd=sgd,
    xgb=xgb,
)

Without tuning the hyperparameters, the random_forest regressor seems to be performing the best

In [None]:
# Fit every candidate on the training split and score it on the validation
# split. Results are collected as plain records and turned into DataFrames once
# at the end: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row is quadratic.
mae_records = []
mse_records = []

for regressor_name, regressor in regressors_dict.items():
    regressor.fit(train_df, train_target)
    predictions = regressor.predict(val_df)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    mae = round(sm.mean_absolute_error(val_target, predictions), 2)
    mse = round(sm.mean_squared_error(val_target, predictions), 2)

    mae_records.append({'regressor': regressor_name, 'score': mae})
    mse_records.append({'regressor': regressor_name, 'score': mse})

    print("Mean absolute error " + regressor_name + " =", mae)
    print("Mean squared error " + regressor_name + " =", mse)
    print("-"*20)

# One row per regressor, with a clean 0..n-1 index.
mae_df = pd.DataFrame(mae_records, columns=["regressor", "score"])
mse_df = pd.DataFrame(mse_records, columns=["regressor", "score"])

In [None]:
# Horizontal bar chart of the MAE score for each model in mae_df.
# Title corrected: the models being compared are regressors, not classifiers.
sns.set_color_codes("muted")
fig, ax = plt.subplots()
sns.barplot(x='score', y='regressor', data=mae_df, color="b", ax=ax)
ax.set_title('MAE Regressor Comparisons')
ax.set_xlabel('MAE Score')
ax.set_ylabel('Regressor')
plt.show()

In [None]:
# Horizontal bar chart of the MSE score for each model in mse_df.
# Title corrected: the models being compared are regressors, not classifiers.
sns.set_color_codes("muted")
fig, ax = plt.subplots()
sns.barplot(x='score', y='regressor', data=mse_df, color="b", ax=ax)
ax.set_title('MSE Regressor Comparisons')
ax.set_xlabel('MSE Score')
ax.set_ylabel('Regressor')
plt.show()

I'm going to use random_forest since it performs the best

In [None]:
# Fit the selected random forest on the standardized training split.
random_forest.fit(train_df, train_target)

In [None]:
# Predict on the validation split with the fitted random forest.
predictions = random_forest.predict(val_df)

# Making Final Predictions

In [None]:
# Preview the first rows of the test features (`test` holds the unscaled values).
test.head()

In [None]:
# The forest was trained on standardized features, so the test features must be
# standardized with the same training-split mean/std before predicting.
# Feeding the raw `test` frame gives inputs on a different scale from training.
test_scaled = (test - train_mean) / train_std
test_predictions = random_forest.predict(test_scaled)

In [None]:
# Wrap the predictions in a DataFrame and display the first 20 rows.
test_predictions = pd.DataFrame(test_predictions)
test_predictions.head(20)