In [None]:
pip install statsmodels

In [None]:
pip install scikit-learn

## Linear Regression only 

In [None]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the bike-sharing CSV from the delinai/schulich_ds1 GitHub repository (main branch).
# NOTE(review): this needs network access on every run, and a file on `main` can change
# over time - consider pinning a commit or keeping a local copy.
bikes = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/bikes_sharing.csv')

In [None]:
bikes.head()

In [None]:
bikes.describe()

In [None]:
# Feature engineering plan:
#   1. convert the `datetime` column into a datetime data type
#   2. create a daily version of the data set
# Start by inspecting the current column dtypes and non-null counts.
bikes.info()

In [None]:
# with the pd.to_datetime() function we can make the change
bikes['datetime'] = pd.to_datetime(bikes['datetime'])

In [None]:
bikes.info()

In [None]:
# Set the datetime as the index so the rows can be resampled by calendar day.
# set_index returns a new frame, so `bikes` itself keeps `datetime` as a regular column.

bikes_daily = bikes.set_index('datetime')


In [None]:
# Collapse the timestamped rows to one row per calendar day ('D' is for Daily):
# season/holiday/workingday/weather take the daily max, the weather measurements
# the daily mean, and the rental counts the daily total.
# The resampler is built from `bikes` directly, so this cell produces the same
# result no matter how often it is re-run.
daily_groups = bikes.set_index('datetime').resample('D')
bikes_daily = daily_groups.aggregate({
    'season': 'max', 'holiday': 'max', 'workingday': 'max', 'weather': 'max',
    'temp': 'mean', 'atemp': 'mean', 'humidity': 'mean', 'windspeed': 'mean',
    'casual': 'sum', 'registered': 'sum', 'count': 'sum',
})
# resample() emits a row for every calendar day in the covered range, including
# days with no observations at all. Those rows would carry NaN means and a
# misleading total of 0 rentals, so keep only days that actually have data.
bikes_daily = bikes_daily[daily_groups.size() > 0]
# one row per day
bikes_daily

In [None]:
# Isolate the hour of the day (0-23). The vectorized .dt accessor replaces a
# per-row Python lambda and requires `datetime` to already be a datetime dtype.
bikes['hour'] = bikes['datetime'].dt.hour
bikes.head()

In [None]:
def time_of_day(x):
    """Bucket an hour of the day into a coarse commute period.

    Parameters
    ----------
    x : number
        Hour of the day.

    Returns
    -------
    str
        'morning_commute' for 6 <= x <= 9, 'midday' for 9 < x <= 15,
        'evening_commute' for 15 < x <= 19, and 'afterhours' for any other value.
    """
    if 6 <= x <= 9:
        return 'morning_commute'
    if 9 < x <= 15:
        return 'midday'
    if 15 < x <= 19:
        return 'evening_commute'
    # Everything outside 6-19 falls through to here.
    return 'afterhours'

In [None]:
# Label every row with its commute period, derived element-wise from the hour.
bikes['time_of_day'] = bikes['hour'].map(time_of_day)
bikes.head()

In [None]:
# Isolate the day of the week (Monday=0 ... Sunday=6). The vectorized .dt
# accessor replaces a per-row Python lambda.
bikes['weekday'] = bikes['datetime'].dt.weekday
bikes.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Compare the distribution of casual, registered and total rentals across weekdays.
# The three panels share one y-axis so their scales are directly comparable.
fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize = (10,5), sharey=True)
sns.boxplot(x='weekday', y='casual', data=bikes, ax=ax1, palette = 'tab10')
sns.boxplot(x='weekday', y='registered', data=bikes, ax=ax2, palette = 'tab10')
sns.boxplot(x='weekday', y='count', data=bikes, ax=ax3, palette = 'tab10')

# Observation: weekday looks like a useful predictor for casual rentals,
# but not for registered or total (count) rentals.

In [None]:
# Compare casual, registered and total rentals across the time-of-day buckets,
# one panel per target on a shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(10, 5), sharey=True)
ax1, ax2, ax3 = axes
box_style = dict(x='time_of_day', data=bikes, palette='tab10')
sns.boxplot(y='casual', ax=ax1, **box_style)
sns.boxplot(y='registered', ax=ax2, **box_style)
sns.boxplot(y='count', ax=ax3, **box_style)

ax2.set_ylabel('registered')
ax3.set_ylabel('count')

# Observation: time of day separates the distributions well for casual,
# registered and count alike.

In [None]:
# Summary statistics of casual rentals during the evening commute.
evening_rows = bikes['time_of_day'] == 'evening_commute'
bikes.loc[evening_rows, 'casual'].describe()

In [None]:
# Weekday and time of day together vs. casual, registered and total rentals:
# one box per (weekday, time_of_day) pair, panels on a shared y-axis.
fig, axes = plt.subplots(1, 3, figsize=(25, 10), sharey=True)
ax1, ax2, ax3 = axes
grouped_box_style = dict(x='weekday', hue='time_of_day', data=bikes, palette='tab10')
sns.boxplot(y='casual', ax=ax1, **grouped_box_style)
sns.boxplot(y='registered', ax=ax2, **grouped_box_style)
sns.boxplot(y='count', ax=ax3, **grouped_box_style)

In [None]:
# Temperature vs. rental counts (casual, registered, total) on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(25, 10), sharey=True)
# No `palette` argument here: without a `hue` variable there is nothing for
# seaborn to map colours onto, so it would have no effect on the figure.
sns.scatterplot(x='temp', y='casual', data=bikes, ax=ax1)
sns.scatterplot(x='temp', y='registered', data=bikes, ax=ax2)
sns.scatterplot(x='temp', y='count', data=bikes, ax=ax3)

In [None]:
# Temperature vs. rental counts again, now coloured by weekday.
# The panels share one y-axis so the three targets are on the same scale.
fig, axes = plt.subplots(1, 3, figsize=(25, 10), sharey=True)
ax1, ax2, ax3 = axes
scatter_style = dict(x='temp', hue='weekday', data=bikes, palette='tab10')
sns.scatterplot(y='casual', ax=ax1, **scatter_style)
sns.scatterplot(y='registered', ax=ax2, **scatter_style)
sns.scatterplot(y='count', ax=ax3, **scatter_style)

In [None]:
# Temperature vs. rental counts coloured by weekday, with independent y-axes
# (sharey=False) so each panel uses its own scale.
fig, axes = plt.subplots(1, 3, figsize=(25, 10), sharey=False)
ax1, ax2, ax3 = axes
free_scatter_style = dict(x='temp', hue='weekday', data=bikes, palette='tab10')
sns.scatterplot(y='casual', ax=ax1, **free_scatter_style)
sns.scatterplot(y='registered', ax=ax2, **free_scatter_style)
sns.scatterplot(y='count', ax=ax3, **free_scatter_style)

In [None]:
# Modelling frame: the response (casual) plus the predictors temp, weekday and
# time_of_day. The two categorical predictors are expanded into 0/1 dummy
# columns with pd.get_dummies(); temp stays numeric.

model_features = bikes[['casual', 'temp', 'weekday', 'time_of_day']]
model_data = pd.get_dummies(model_features, columns=['weekday', 'time_of_day'], dtype=int)

In [None]:
model_data.columns

In [None]:
# Use n-1 dummies per categorical variable: with 7 weekday categories only 6 enter the model.
# The omitted levels - weekday_0 (Monday) and time_of_day_afterhours - form the baseline
# that the remaining dummy coefficients are measured against.

model = ols("casual ~ temp +  weekday_1 + weekday_2 + weekday_3 + \
       weekday_4 + weekday_5 + weekday_6  + \
            time_of_day_evening_commute + time_of_day_midday + time_of_day_morning_commute", data=model_data).fit()

In [None]:
print(model.summary())
# R-squared: goodness of fit - the share of the variability in the output that the chosen variables explain.
# Adjusted R-squared penalises extra predictors, so it never exceeds R-squared; if it drops after adding a variable, that variable is not helping the model.

# The fitted model has the form y = b + m1*x1 + m2*x2 + ...
# ---- y = -41.3638 + 2.48*temp - 7.22*weekday_1 - 6.5839*weekday_2 .....
# e.g. Tuesday (weekday_1) is predicted to have about 7.22 fewer casual rentals than the Monday baseline, all else equal, and so on

# p-value (t-test on each coefficient): below .05 the coefficient is conventionally treated as statistically significant.
# A high p-value means the effect cannot be distinguished from zero; it is not a test of whether the data are normally distributed.
# Open question from the original notes: should any variable be removed? Nothing is removed here.
# Adding more variables does not automatically help - weak predictors dilute the model.


In [None]:
# make predictions

model_data.head()

In [None]:
# Predict casual rentals for every row. The response column is dropped by name
# rather than by position, so this keeps working if the column order changes.
predictions = model.predict(model_data.drop(columns='casual'))
predictions

In [None]:
model.params

In [None]:
#residual analysis
model_data.head()

In [None]:
# Attach the predictions to the modelling frame as one extra column.
# The unnamed prediction Series ends up under the column label 0.
frames_to_join = [model_data, predictions]
final_data = pd.concat(frames_to_join, axis='columns')

In [None]:
# Give the prediction column (labelled 0 after the concat) a readable name.
# Plain reassignment instead of inplace=True keeps the step an explicit assignment.
final_data = final_data.rename(columns={0: 'predictions'})

In [None]:
# Residual = observed - predicted, the usual convention (same sign as
# statsmodels' `model.resid`): a positive residual means the model under-predicted.
final_data['residuals'] = final_data['casual'] - final_data['predictions']

In [None]:
# predicted values vs. residuals
sns.scatterplot(x='predictions', y='residuals', data=final_data)

# For a well-behaved model we want a patternless band of roughly constant width around zero (no trend, no funnel shape).

# Observation from the original notes: where the model predicted about 100, the actual value was sometimes around 350, i.e. some errors are very large.

In [None]:
# check whether the residuals are approximately normally distributed
sns.histplot(final_data['residuals'])

# TODO: the interpretation of this histogram is still an open question in the original notes.

# Note: linear regression is a high-bias model - it is not flexible and has no hyperparameters to tune.

In [None]:
# influence plots
import statsmodels.api as sm

In [None]:
# Influence plot of the fitted model, using Cook's distance as the influence criterion.
sm.graphics.influence_plot(model, criterion = 'cooks')

# Warning: this cell takes about 20 minutes to run.

In [None]:
# Mean absolute error (MAE) of the predictions: the average residual magnitude.
# Note this is MAE, not RMSE - RMSE would take the mean before the square root.
np.mean(np.abs(final_data['residuals']))

In [None]:
np.mean(final_data.casual)

In [None]:
# NOTE(review): hard-coded ratio - presumably the mean absolute error (~24) divided by the
# mean number of casual rentals (~36) from the two cells above; confirm, and compute it
# from the data instead of typing the numbers in.
24/36