In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the hourly bike-sharing records (hour.csv) from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/bike-sharing-dataset/hour.csv")

In [None]:
# Peek at the first three rows of the raw data.
data.head(3)

In [None]:
# Summarise the columns: dtype and non-null count of each, plus memory usage.
data.info()

In [None]:
# Number of records per hour of day; hours run from 0 (midnight) to 23 (11 pm).
data["hr"].value_counts()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import plotly_express as px
import seaborn as sns
import math

In [None]:
# The date is already described by yr, mnth and hr, and only the total count
# (cnt) is wanted as the target, so its casual/registered breakdown is dropped.
# `instant` is also irrelevant for prediction.
pre_dropped = ["dteday", "casual", "registered", "instant"]
data_prep = data.drop(columns=pre_dropped)
data_prep.isna().sum()  # no missing data

In [None]:
# Columns that remain after the initial drop.
data_prep.columns

In [None]:
# Plot a histogram of every remaining column to see how each one is distributed.
# rwidth=0.9 leaves a small gap between neighbouring bars.
data_prep.hist(rwidth=0.9, figsize=(20, 20))
plt.tight_layout()
plt.show()

The `cnt` distribution is not normal, so it might need a change of variable.

## Data Visualization

In [None]:
# Preview the first ten rows; a few of the columns are continuous numerical features.
data_prep.head(10)

In [None]:
# Scatter the hourly demand (cnt) against each continuous weather feature,
# one panel per feature. Each entry is (column, panel title, marker colour).
scatter_panels = [
    ("temp", "Demand = f(Temperature)", "magenta"),
    ("atemp", "Demand = f(Feeled Temperature)", "blue"),
    ("hum", "Demand = f(Humidity)", "green"),
    ("windspeed", "Demand = f(Wind speed)", "red"),
]
scatter_fig, scatter_axes = plt.subplots(2, 2, figsize=(12, 12))
for panel, (feature, title, colour) in zip(scatter_axes.flat, scatter_panels):
    panel.scatter(x=data_prep[feature], y=data_prep["cnt"], s=2, c=colour)
    # Label both axes so each panel can be read on its own.
    panel.set(title=title, xlabel=feature, ylabel="cnt")

scatter_fig.tight_layout()

We can spot some dependency for all of these features, except maybe for humidity.

In [None]:
# Correlation of each numerical feature (and of cnt with itself) with the total bike count.
numeric_cols = ["temp", "atemp", "hum", "windspeed", "cnt"]
cnt_corr = data_prep[numeric_cols].corr()["cnt"]
cnt_corr.plot(kind="bar", title="Correlation of variable features wrt to total number of bikes")

Windspeed is perhaps less related to how many bikes are used. Let's keep that in mind.

In [None]:
# Mean demand (cnt) for each level of every categorical feature, one bar chart per feature.
# Bars are coloured by their height relative to the tallest bar of the same panel.
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap("rainbow")
categorical_features = ["season", "yr", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit"]
fig, ax = plt.subplots(3, 3, figsize=(15, 15))
panels = ax.ravel()
for panel, feature in zip(panels, categorical_features):
    # Aggregate once per feature and reuse the result for both bar heights and colours.
    mean_cnt = data_prep.groupby(feature)["cnt"].mean()
    mean_cnt.plot(ax=panel, kind="bar", color=cm(mean_cnt / mean_cnt.max()))
    panel.set_ylabel("mean cnt")
# The 3x3 grid has more panels than features: hide the unused ones.
for panel in panels[len(categorical_features):]:
    panel.set_visible(False)
plt.tight_layout()

Weekday seems irrelevant here, and so does workingday, so we should drop both features. The year is also a dangerous feature, since the data only spans 2 years (too few for it to be statistically meaningful). The other features seem relevant.

In [None]:
# Zoom in on the hourly profile: mean cnt per hour of day, bars coloured by relative height.
hourly_mean = data_prep.groupby("hr").mean()["cnt"]
hourly_colours = cm(hourly_mean / np.max(hourly_mean))
hourly_mean.plot(kind="bar", figsize=(16, 8), color=hourly_colours)

Bike demand has a peak in the morning (8 am), most certainly from people going to work. There's also a peak around 5-6 pm, when people are leaving work.

In [None]:
# Box plot of the total count to visualise its spread and its outliers.
sns.boxplot(data=data_prep, x="cnt")

Most of the data is contained between 0 to ~650.

In [None]:
# The same information as numbers: count, mean, std, min, quartiles and max of cnt.
data_prep.cnt.describe()

In [None]:
# Look at the box plot in more detail: quantiles of cnt from 5% to 95% in
# 5% steps, plus the 99% level, to see where the outliers start.
quantile_levels = np.append(np.arange(0.05, 0.96, 0.05), 0.99)
data_prep["cnt"].quantile(quantile_levels)

99% of the data is contained below cnt=782. If the model does not perform too well, we might eliminate the most extreme outliers.

In [None]:
# Pairwise correlations between the numerical features (and cnt), annotated with their values.
numeric_corr = data_prep[["temp", "atemp", "windspeed", "hum", "cnt"]].corr()
sns.heatmap(numeric_corr, annot=True)

As one would guess, temperature and feels-like temperature are correlated, so we will drop the feels-like temperature (`atemp`). Windspeed and `hum` seem correlated, and since windspeed is not well correlated with `cnt`, we will drop windspeed as well. Let's create our final dataframe.

In [None]:
# Remove the features judged redundant or irrelevant in the analysis above.
dropped = ["windspeed", "atemp", "workingday", "weekday", "yr"]
data_final = data_prep.drop(columns=dropped)

In [None]:
# Preview the reduced frame.
data_final.head()

Only 8 relevant columns remaining!

In [None]:
# Autocorrelation of the cnt series for lags -12..12.
# plt.acorr does not detrend its input by default, so the mean is subtracted
# first; otherwise the non-zero mean of the counts inflates the value at every lag.
cnt_series = data_final["cnt"].astype(float)
plt.acorr(cnt_series - cnt_series.mean(), maxlags=12)

There's a high auto-correlation for the closest cnt values, this may hurt the linear regression model.

Back to the cnt distribution, it looks like log-normal, so let's log it to check the normality of the distribution!

In [None]:
# Histogram of log(cnt) to check whether the log transform makes the target look normal.
# The log is taken from the raw counts in data_prep, so the plot is the same
# whether or not the cell that log-transforms data_final["cnt"] has already run.
# A dedicated name is used because `df` is bound to the modelling DataFrame later on.
log_cnt = np.log(data_prep["cnt"])
log_cnt.hist(rwidth=0.9, bins=20, color="blue")

and now it is normally distributed, so let's consider log(cnt) instead of cnt in our dataframe.

In [None]:
# Replace cnt with log(cnt). The log is always taken from the untouched raw
# counts in data_prep (same row index as data_final), so re-running this cell
# cannot apply the transform twice.
data_final["cnt"] = np.log(data_prep["cnt"])

In [None]:
# Confirm that cnt now holds the log-transformed values.
data_final.head()

Now on to the autocorrelation issue

In [None]:
# cnt is correlated with its own recent values, so the cnt column shifted down
# by one, two and three rows is exposed as three extra single-column frames
# that will serve as features.
t1 = data_final["cnt"].shift(1).to_frame(name="t-1")
t2 = data_final["cnt"].shift(2).to_frame(name="t-2")
t3 = data_final["cnt"].shift(3).to_frame(name="t-3")

In [None]:
# Attach the three lag columns to the right of the existing features.
lag_frames = [t1, t2, t3]
data_lag = pd.concat([data_final, *lag_frames], axis=1)
data_lag.head()

In [None]:
# Shifting leaves NaN in the first rows of the lag columns; discard those rows.
data_lag = data_lag.dropna()

The categorical data needs to be transformed into dummy variables. Let's do that.

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each column so the dummies of one feature are not perfectly collinear.
to_be_dummied = ["season", "mnth", "hr", "holiday", "weathersit"]
categorical_part = data_lag[to_be_dummied].astype("category")
dummy_df = pd.get_dummies(categorical_part, drop_first=True)
dummy_df.head()

In [None]:
# Build the fully pre-processed frame: the numerical columns of data_lag
# followed by the dummy variables that replace the raw categorical columns.
dropped = ["season", "mnth", "holiday", "weathersit", "hr"]
numeric_part = data_lag.drop(columns=dropped)
df = pd.concat([numeric_part, dummy_df], axis=1)
df.head()

We'll have 46 features to consider to predict the "cnt" value.

## Train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the (log-transformed) cnt column. Features: every other column.
y = df["cnt"]
X = df.drop(columns="cnt")

In [None]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so time-adjacent rows
# (whose t-1/t-2/t-3 features carry each other's cnt values) end up on both sides
# of the split -- consider a chronological split (shuffle=False) and re-check the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [None]:
# Create the linear regression model from scikit-learn with its default settings.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [None]:
# Fit the model on the training split only.
lr.fit(X_train, y_train)

In [None]:
# R² (coefficient of determination) of the fitted model on the training set.
lr.score(X_train, y_train)

Model has a 0.926 r2 score on the training set. Not bad! Let's now evaluate on the test set.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_error

In [None]:
# Model predictions for the held-out test features (on the log(cnt) scale used for training).
test_pred = lr.predict(X_test)

In [None]:
# Overlay the distributions of the predicted and the true log(cnt) on the test set.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot with
# kde=True and stat="density" draws the same kind of histogram + density curve.
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
bins = "auto"  # let seaborn pick the bin edges (histplot's default rule)
hist_style = dict(bins=bins, kde=True, stat="density", kde_kws={"cut": 3}, alpha=0.4, edgecolor=None)
sns.histplot(test_pred, ax=ax, color="blue", label="predictions", **hist_style)
sns.histplot(y_test, ax=ax, color="red", label="true", **hist_style)
ax.set(xlabel="log(cnt)", ylabel="density", title="Predicted vs true log(cnt) on the test set")
ax.legend()

The test results seem pretty on point. Let's check that with some scores.

In [None]:
# Both metrics are computed on the log(cnt) scale, since y_test and test_pred are log counts.
test_r2 = r2_score(y_test, test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
print(f"r2 score: {test_r2:.2f}")
print(f"RMSE: {test_rmse:.2f}")

## 0.93 r2 score