 This notebook is for an analysis of bike ride sharing data. The train dataset has 12 columns of variables and 10,886 rows of input. Variables are datetime, season, holiday, workingday, weather, temp, atemp, humidity, windspeed, casual, registered and count. Detailed explanation is here: https://www.kaggle.com/c/bike-sharing-demand/data. After the exploratory analysis, we used a Linear Mixed Effects model to predict the number of bikes rented in the given test dataset.

Author: Jiyun

Initial update: 9.3.21
Last update: 10.21.21

In [None]:
# Import of library

import pandas as pd
import math
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_log_error
import sklearn
import itertools

In [None]:
# Load the training data

df = pd.read_csv("../input/bike-sharing-demand/train.csv")
df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')
# Index the frame by timestamp while keeping 'datetime' as a regular column.
df.index = df['datetime']
# Break the timestamp into separate year / month / day / hour columns.
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(df['datetime'].dt, part)
# Rendered as the cell output: True if any value in the frame is missing.
df.isna().to_numpy().any()

In [None]:
# Load the test data

test = pd.read_csv("../input/bike-sharing-demand/test.csv")
test['datetime'] = pd.to_datetime(test['datetime'], format='%Y-%m-%d %H:%M:%S')
# Index the frame by timestamp while keeping 'datetime' as a regular column.
test.index = test['datetime']
# Break the timestamp into separate year / month / day / hour columns.
for part in ('year', 'month', 'day', 'hour'):
    test[part] = getattr(test['datetime'].dt, part)
# Rendered as the cell output: True if any value in the frame is missing.
test.isna().to_numpy().any()

In [None]:
# Description of each variable

# A notebook cell only renders its last expression, so a bare `df.columns`
# in the middle of the cell is silently discarded; print it explicitly.
print(df.columns.tolist())
# Summary statistics, rendered as the cell output.
df.describe()

In [None]:
# Convert the discrete predictors to pandas' categorical dtype

var_be_category = ['weather', 'season', 'holiday','workingday', 'year','month','day', 'hour']
# One astype call with a column -> dtype mapping instead of a per-column loop.
df = df.astype(dict.fromkeys(var_be_category, 'category'))

In [None]:
# Plots for data check

var_to_boxplot = ['season','holiday','workingday','weather']
var_by_overtime = ['temp','atemp','humidity','windspeed','casual','registered']

# Box plots of 'count' against each categorical predictor on a 2 x 2 grid.
# Iterating over axes.flat (row-major, same order as before) avoids a manual
# counter named `n`, which would overwrite the global `n` that later cells use
# as the number of humidity quantile bins.
fig, axes = plt.subplots(2, 2, figsize=(15,12))
for box_ax, var in zip(axes.flat, var_to_boxplot):
    sns.boxplot(x=var, y='count', data=df, ax=box_ax)

# Line plots of each continuous variable over the datetime index on a 3 x 2 grid.
# Colours come from matplotlib's default cycle ('C0', 'C1', ...) rather than
# unseeded random RGB triples, so the figure is identical on every run.
fig1, axes1 = plt.subplots(3, 2, figsize=(15,20))
for k, (line_ax, var) in enumerate(zip(axes1.flat, var_by_overtime)):
    df[var].plot(kind='line', ax=line_ax, ylabel=var, c=f'C{k}')

plt.show()

In [None]:
# Correlation among variables

# Restrict to numeric columns before computing correlations. Older pandas
# dropped the datetime and categorical columns silently; since pandas 2.0
# `DataFrame.corr()` no longer does, and fails on them instead.
var_corr = df.select_dtypes(include='number').corr()
cmap = sns.diverging_palette(500, 10, as_cmap=True)
fig, ax = plt.subplots(figsize = (12,8))
sns.heatmap(var_corr, cmap = cmap,
            xticklabels=var_corr.columns, yticklabels=var_corr.columns,
            ax = ax, annot = True)
ax.set_title("Correlation among variables")
plt.show()

In [None]:
# Barplot of total number of bikes rented by year

# Select 'count' BEFORE aggregating: summing the whole frame also tries to sum
# the datetime and categorical columns, which raises in pandas >= 2.0 and does
# needless work on older versions. observed=False pins the historical
# behaviour for the categorical grouper.
grby_y = df.groupby('year', observed=False)['count'].sum().to_frame()
fig, ax = plt.subplots(figsize = (7,5))
# Transposed so each year becomes its own bar (and legend entry) in one group.
grby_y.T.plot(kind='bar', rot = 0, ax = ax,
              xlabel = 'year' , xticks = [],
              ylabel = 'Total number of bikes rented',
              title = 'Total number of bikes rented over year')
plt.show()

In [None]:
# Distribution of the sum of the count by month or day

fig, (ax1, ax2) = plt.subplots(nrows=2,ncols=1,figsize=(10,10))
colors1 = ['cornflowerblue', 'royalblue']
colors2 = ['olivedrab', 'darkseagreen']

# Select 'count' before summing: aggregating the full frame fails on the
# datetime/categorical columns in pandas >= 2.0. observed=False keeps every
# category combination, matching the historical default for categorical groupers.
grby_m = (df.groupby(['year','month'], observed=False)['count']
            .sum().to_frame().reset_index())
# Rows = month, columns = year (despite the name, these are sums, not medians).
med_cnt_m = grby_m.set_index(['month','year'])['count'].unstack()
med_cnt_m.plot(kind="bar", rot = 0, ax = ax1, color = colors1,
               title = "Total number of bikes rented each month",
               xlabel = "Month", ylabel ="Total count of bikes")

grby_d = (df.groupby(['year','day'], observed=False)['count']
            .sum().to_frame().reset_index())
# Rows = day of month, columns = year.
med_cnt_d = grby_d.set_index(['day','year'])['count'].unstack()
med_cnt_d.plot(kind="bar", ax = ax2, rot = 0, color = colors2,
               title = "Total number of bikes rented each day",
               xlabel = "Day", ylabel ="Total count of bikes rented")
# Shrink the legend of the lower panel; addressed explicitly rather than
# through pyplot's implicit "current axes".
ax2.legend(fontsize = 8)

plt.show()

In [None]:
# Distribution of the sum of the count by hour and workingday

# Select 'count' before summing: aggregating the full frame fails on the
# datetime/categorical columns in pandas >= 2.0. observed=False keeps every
# category combination, matching the historical default for categorical groupers.
grby_wr_h = (df.groupby(['workingday','year','hour'], observed=False)['count']
               .sum().to_frame().reset_index())
# Rows = (workingday, hour), columns = year.
cnt_wr_h = grby_wr_h.set_index(['workingday','hour','year'])['count'].unstack()

fig, (ax1, ax2)= plt.subplots(nrows=2,ncols=1,figsize=(10,10))
# One panel per workingday flag; the loop replaces two copy-pasted plot calls.
for flag, panel in zip((0, 1), (ax1, ax2)):
    cnt_wr_h.loc[(flag,)].plot(
        rot = 0, kind = 'bar', ax = panel,
        title = f'Total number of bikes rented by hour (workingday = {flag})')
plt.setp([ax1,ax2], xlabel = 'Hour')
plt.setp([ax1,ax2], ylabel='Total number of bikes rented')
plt.show()

In [None]:
# Feels-like temperature and rental count over time

fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10,12))
# Copy the datetime index back into a column so scatter() can use it as x.
df['datetime'] = df.index

# Upper panel: feels-like temperature as a line over the datetime index.
df['atemp'].plot(kind='line', rot=0, ax=ax1,
                 xlabel='Datetime', ylabel='Temperature',
                 title='Feels like temperature(atemp) over time')

# Lower panel: one point per observation of the rental count.
df.plot.scatter(x='datetime', y='count', ax=ax2,
                c='green', s=10,
                xlabel='Datetime', ylabel='Count',
                title='Number of bikes rented over time')
plt.show()

In [None]:
# Distribution of the sum of count by "feels like temperature(atemp)" bin

# Only the bin edges are needed; the per-bin counts returned by np.histogram
# were previously stored in an unused variable.
atemp_bin_edges = np.histogram(df['atemp'], bins='auto')[1]
# pd.cut uses right-closed intervals, so an extra edge below the minimum is
# prepended to give the smallest atemp value a bin of its own (label 0).
atemp_bins = np.append([df['atemp'].min()-1], atemp_bin_edges)
atemp_bin_labels = np.arange(len(atemp_bin_edges))
df['atemp_bined'] = pd.cut(df['atemp'], bins = atemp_bins, labels = atemp_bin_labels)

# Select 'count' before summing: aggregating the full frame fails on the
# datetime/categorical columns in pandas >= 2.0. observed=False keeps empty
# bins in the result (sum 0), matching the historical default.
grby_at = (df.groupby(['year','atemp_bined'], observed=False)['count']
             .sum().to_frame().reset_index())
# Rows = atemp bin, columns = year (sums, despite the "med" in the name).
med_cnt_at = grby_at.set_index(['atemp_bined','year'])['count'].unstack()

fig, ax = plt.subplots(figsize = (13,5))
colors = ('darkorange', 'sienna')
med_cnt_at.plot(kind="bar", ax = ax, rot = 0, color= colors,
                title ="Total number of bikes rented each atemp bin",
                xlabel = "atemp_binned",ylabel = "Total number of bikes rented")
plt.show()

In [None]:
# Distribution of the sum of count by humidity bin

# Number of quantile bins; also reused when the test set is binned.
n = 30
df['humid_bined'], humid_bin_edges = pd.qcut(df['humidity'], q =n, labels=range(n), retbins=True)
# Edges with one extra value below the minimum (not used by any visible cell).
humid_bins = np.append([df['humidity'].min()-1], humid_bin_edges)

# Select 'count' before summing: aggregating the full frame fails on the
# datetime/categorical columns in pandas >= 2.0. observed=False keeps empty
# bins in the result (sum 0), matching the historical default.
grby_at = (df.groupby(['year','humid_bined'], observed=False)['count']
             .sum().to_frame().reset_index())
# Rows = humidity bin, columns = year.
sum_cnt_at = grby_at.set_index(['humid_bined','year'])['count'].unstack()

fig, ax = plt.subplots(figsize = (13,5))
# `colors` is the two-colour palette defined in the atemp-bin cell.
# The title previously said "atemp bin" (copy-paste slip); this plot is by humidity bin.
sum_cnt_at.plot(kind="bar", ax = ax, rot = 0,
                color=colors,
                title ="Total number of bikes rented each humidity bin",
                xlabel = "Humidity binned",ylabel = "Total number of bikes rented")
plt.show()

In [None]:
# Fit to mixed effects model

# Fixed effects: windspeed, atemp bin, humidity bin, year, and hour interacted
# with workingday and with season; grouping by year.
mixed = smf.mixedlm("count ~ windspeed + C(atemp_bined, Treatment(0)) \
                    + C(humid_bined, Treatment(0))+ C(year, Treatment(2011)) \
                    +C(hour, Treatment(4)):C(workingday, Treatment(0))\
                    +C(hour, Treatment(4)):C(season, Treatment(1))",
                    df, groups = 'year')
mixed_fit = mixed.fit()
print(mixed_fit.summary())
# predict() without arguments evaluates the model on the training design matrix.
df['predicted'] = np.array(mixed_fit.predict()).tolist()

# Residual-vs-fitted diagnostic. The x-axis must be the fitted values that the
# residuals were computed from. The earlier `predict() - resid` is not a fitted
# value, so the plot was distorted.
plt.scatter(mixed_fit.fittedvalues, mixed_fit.resid, alpha = 0.5)
plt.title("Residual vs. Fitted in Python")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")

plt.show()
# In-sample RMSE of the stored predictions, rendered as the cell output.
np.sqrt(((df['predicted'] - df['count']) ** 2).mean())


In [None]:
# Residuals of the mixed model, one box per group (year)

fig, ax = plt.subplots(figsize=(10, 7))
ax = sns.boxplot(x=mixed_fit.model.groups, y=mixed_fit.resid, ax=ax)

In [None]:
# Scatter plots of prediction and real count against the atemp bin

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(10,12))
ax11, ax12 = axs
# Counts cannot be negative: clamp negative predictions to zero (in place,
# so later cells see the clamped values as well).
df.loc[df['predicted'] < 0, 'predicted'] = 0
# The x variable is the atemp bin, not time. The titles and x-labels used to
# say "over time" / "Datetime" and have been corrected to match the data.
df.plot.scatter(x = 'atemp_bined', y = 'predicted', ax = ax11,
                title = 'Predicted count by atemp bin',
                ylabel = 'Count', xlabel = 'atemp bin')
df.plot.scatter(x = 'atemp_bined', y = 'count', ax = ax12,
                s = 10, c = 'green',
                title = 'Number of bikes rented by atemp bin',
                ylabel = 'Count', xlabel = 'atemp bin')
plt.show()

In [None]:
# MSLE

# mean_squared_log_error rejects negative values. Clip here so this cell no
# longer depends on the scatter-plot cell having already clamped
# df['predicted'] in place; the result is unchanged when it has.
mean_squared_log_error(df['count'], df['predicted'].clip(lower=0))

In [None]:
# Preview: predictions next to the observed count and atemp (first rows)
df.loc[:, ['predicted', 'count', 'atemp']].head()

In [None]:
# Reformat of test dataset

# Column order: month/day/hour first, then the remaining predictors, with the
# timestamp and the binned columns re-created below. Columns are selected by
# name rather than by position so that re-running this cell gives the same
# result; the positional slices dropped one more column on every run.
front = ['month', 'day', 'hour']
rebuilt = ['datetime', 'atemp_bined', 'humid_bined']
cols = front + [c for c in test.columns if c not in front + rebuilt]
# .copy() so the column assignments below write to an independent frame
# (avoids SettingWithCopyWarning).
test = test[cols].copy()
test['datetime'] = test.index

# Bin the test predictors with the edges learned from the training data.
# atemp values outside the training range fall outside every bin and become NaN.
test['atemp_bined'] = pd.cut(test['atemp'], bins = atemp_bins, labels = atemp_bin_labels)
# qcut on the training data put the minimum humidity in the first bin.
# include_lowest=True makes pd.cut do the same, so a test value equal to the
# lowest edge no longer becomes NaN.
test['humid_bined'] = pd.cut(test['humidity'], bins = humid_bin_edges,
                             labels = range(n), include_lowest = True)

In [None]:
# Predict on the test set and write the submission file

prediction = mixed_fit.predict(test)
submission = pd.DataFrame({'datetime': test.index,
                           'count': prediction})
# Negative predictions are replaced by zero.
is_negative = submission['count'] < 0
submission.loc[is_negative, 'count'] = 0
# Missing predictions become 0, then the values are cast to integers
# (the cast truncates the fractional part).
submission['count'] = (submission['count']
                       .fillna(0)
                       .astype(int)
                       .astype(np.int64))
submission.to_csv("./submission.csv", index=False)