In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')

# Data Preparation

## Dataset Aggregation

In [2]:
trips = pd.read_csv('trips_raw_data.csv')

In [3]:
trips.head()

Unnamed: 0.1,Unnamed: 0,Trip Id,Subscription Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,...,Temp (°C),Dew Point Temp (°C),Rel Hum (%),Wind Dir (10s deg),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Hmdx,Wind Chill,Weather
0,58,712441,,274,7006.0,2017-01-01 00:03:00-05:00,Bay St / College St (East Side),7021.0,2017-01-01 00:08:00-05:00,Bay St / Albert St,...,1.5,-3.6,69.0,26.0,39.0,16.1,99.81,,,
1,59,712442,,538,7046.0,2017-01-01 00:03:00-05:00,Niagara St / Richmond St W,7147.0,2017-01-01 00:12:00-05:00,King St W / Fraser Ave,...,1.5,-3.6,69.0,26.0,39.0,16.1,99.81,,,
2,60,712443,,992,7048.0,2017-01-01 00:05:00-05:00,Front St W / Yonge St (Hockey Hall of Fame),7089.0,2017-01-01 00:22:00-05:00,Church St / Wood St,...,1.5,-3.6,69.0,26.0,39.0,16.1,99.81,,,
3,61,712444,,1005,7177.0,2017-01-01 00:09:00-05:00,East Liberty St / Pirandello St,7202.0,2017-01-01 00:26:00-05:00,Queen St W / York St (City Hall),...,1.5,-3.6,69.0,26.0,39.0,16.1,99.81,,,
4,62,712445,,645,7203.0,2017-01-01 00:14:00-05:00,Bathurst St/Queens Quay(Billy Bishop Airport),7010.0,2017-01-01 00:25:00-05:00,King St W / Spadina Ave,...,1.5,-3.6,69.0,26.0,39.0,16.1,99.81,,,


In [4]:
trips['Start Time'] = pd.DatetimeIndex(trips['Start Time'])

Our model will be focused on predicting BikeShare trips for a "normal" year. Whether that future years represent a normal year is unknown, but we also can't assume any of the trends in 2020 will continue.

We will limit the model to look only at 2019 data, from January 1st to December 31st.

In [5]:
trips = trips[trips['Start Time'].dt.year == 2019]

# Feature Engineering

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
train, test = train_test_split(trips, train_size=0.7, 
                               test_size=0.3, random_state=0)
val, test = train_test_split(test, train_size=0.5,
                             test_size=0.5, random_state=0)

## Chosen Parameters

* Wind Speed
* Hour of Day
* Month
* Weekend vs Weekday - if weekend or statutory holiday, then 1, otherwise 0
* Temperature - binned with this distribution [-30,0,10,20,30,40]
* Precipitation - if precipitation present, then 1, otherwise 0

In [8]:
def aggregate_trips(df):
    df['Start Time'] = df['Start Time'].dt.floor('H')
    
    df_hourly = df.groupby('Start Time').agg({'Trip Id':'count', 'Weather':'first',
                                              'Wind Spd (km/h)': 'first', 'Temp (°C)': 'first'})
    df_hourly = df_hourly.reset_index()
    df_hourly = df_hourly.rename(columns = {'Trip Id': 'Trips', 'Wind Spd (km/h)':'Wind', 'Temp (°C)': 'Temp'})
    
    return df_hourly

In [9]:
def weather_features(df):
    
    df['Temp'] = np.digitize(df.loc[:, 'Temp'], [-30,0,10,20,30,40])
    df['Weather'] = np.where(df['Weather'].isna(), 0, 1)
    
    return df

In [10]:
!pip install holidays



In [11]:
from datetime import date
import holidays

In [20]:
on_hol = holidays.CA(prov = 'ON', years = 2019)
for i in hol:
    print(i)

2019-01-01
2019-02-18
2019-04-19
2019-05-20
2019-07-01
2019-08-05
2019-09-02
2019-10-14
2019-12-25
2019-12-26


In [31]:
def temporal_features(df):
    
    df['Month'] = df['Start Time'].dt.month
    df['Hour'] = df['Start Time'].dt.hour
    df['dow'] = np.where(df['Start Time'].dt.dayofweek > 4, 1, 0)
    df['dow'] = np.where(df['Start Time'].dt.date.isin(on_hol), 1, 0)
    
    return df

In [32]:
train_hourly = aggregate_trips(train)

In [37]:
train_hourly.head()

Unnamed: 0,Start Time,Trips,Weather,Wind,Temp,Month,Hour,dow
0,2019-01-01 00:00:00-05:00,15,1,4.0,2,1,0,1
1,2019-01-01 01:00:00-05:00,21,1,28.0,2,1,1,1
2,2019-01-01 02:00:00-05:00,16,1,34.0,2,1,2,1
3,2019-01-01 03:00:00-05:00,7,0,28.0,2,1,3,1
4,2019-01-01 04:00:00-05:00,6,0,28.0,2,1,4,1


In [33]:
train_weather = weather_features(train_hourly)

In [38]:
train_weather.head()

Unnamed: 0,Start Time,Trips,Weather,Wind,Temp,Month,Hour,dow
0,2019-01-01 00:00:00-05:00,15,1,4.0,2,1,0,1
1,2019-01-01 01:00:00-05:00,21,1,28.0,2,1,1,1
2,2019-01-01 02:00:00-05:00,16,1,34.0,2,1,2,1
3,2019-01-01 03:00:00-05:00,7,0,28.0,2,1,3,1
4,2019-01-01 04:00:00-05:00,6,0,28.0,2,1,4,1


In [39]:
train_dt = temporal_features(train_weather)
train_dt.head()

Unnamed: 0,Start Time,Trips,Weather,Wind,Temp,Month,Hour,dow
0,2019-01-01 00:00:00-05:00,15,1,4.0,2,1,0,1
1,2019-01-01 01:00:00-05:00,21,1,28.0,2,1,1,1
2,2019-01-01 02:00:00-05:00,16,1,34.0,2,1,2,1
3,2019-01-01 03:00:00-05:00,7,0,28.0,2,1,3,1
4,2019-01-01 04:00:00-05:00,6,0,28.0,2,1,4,1


In [60]:
from sklearn.preprocessing import MinMaxScaler 

def process_data(df):
    
    df = aggregate_trips(df)
    df = weather_features(df)
    df = temporal_features(df)
    
    df = df.drop(columns = ['Start Time'])
    
    y = df['Trips']
    
    cat_features = ['Temp', 'Month', 'Hour']
    
    categoricals = [pd.get_dummies(df[s], prefix=s, drop_first=True) for s in cat_features]
    
    scaler = MinMaxScaler()
    scaler.fit(df[['Wind']])
    
    scaled = df[['Wind']]
    
    scaled.iloc[:, :] = scaler.transform(scaled) 
    
    x = pd.concat([scaled] + categoricals, axis=1)
    
    return x, y

In [62]:
x_train, y_train = process_data(train)

In [64]:
x_train.head()

Unnamed: 0,Wind,Temp_2,Temp_3,Temp_4,Temp_5,Temp_6,Month_2,Month_3,Month_4,Month_5,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
0,0.058824,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.411765,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.411765,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.411765,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
y_train.head()

0    15
1    21
2    16
3     7
4     6
Name: Trips, dtype: int64

# Model Fitting

In [None]:
'''

Code using regular train/test/split and validation


'''

## Time Series Cross Validation

In [70]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.base import clone

In [None]:
# this code is from assignment 6, except for TimeSeriesSplit

def cross_validate_rmse(model, x, y):
    
    # Setup
    model = clone(model)
    time_split = TimeSeriesSplit(n_splits=12) 
    rmse_values = []
    
    # Iterature thought cv-folds
    for train_index, val_index in five_fold.split(X):
        
        # Write your code here.
        
        x_train, x_val = x.iloc[train_index], x.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        '''
        
        add code to fit model
        
        '''
        
    return rmse_values