In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Render floats with two decimals in pandas output and switch matplotlib
# to seaborn's default theme for every figure drawn below.
pd.options.display.float_format = '{:.2f}'.format
sns.set_theme()

In [None]:
# Load the two input tables from Feather files in the working directory
# (presumably the 2023 station list and the 2023 ride records, going by the file names).
# NOTE(review): `stations` is not referenced again in the cells visible here.
stations = pd.read_feather('stations_2023.feather')
df = pd.read_feather('bixi_usage_2023.feather')

In [None]:
# Preview the first rows of the ride table to see which columns are available.
df.head()

In [None]:
# Sanity check: display every ride whose recorded start is later than its end.
# No rows in the output means no ride has its timestamps in the wrong order.
starts_after_end = df['start_timestamp'].gt(df['end_timestamp'])
df.loc[starts_after_end]

In [None]:
# Report the earliest and latest pickup / return timestamps in the raw data.
pickup_times = df['start_timestamp']
return_times = df['end_timestamp']

print('First pickup : ', pickup_times.min())
print('Last pickup : ', pickup_times.max())
print('First return : ', return_times.min())
print('Last return : ', return_times.max())

In [None]:
# The time span printed above looks off -- the timestamps may be stored in UTC
# ("Zulu" time) rather than local time. Plot the number of pickups per hour of
# day to check whether the daily pattern is plausible.

# Hour of day (0-23) of each pickup, kept as a column on the ride table.
df['hour'] = df['start_timestamp'].dt.hour

# One row per hour, holding the number of non-null start_station_id values.
hourly = (
    df.groupby('hour')[['start_station_id']]
    .count()
    .rename(columns={'start_station_id': 'count'})
)

ax = sns.barplot(data=hourly, x='hour', y='count')
ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation
ax.set_title('Total bixi pickups by hour')
ax.set_xlabel('Hour of the day')
ax.set_ylabel('Number of pick-up')

plt.show()

In [None]:
# Too many pickups at night and not enough during the morning rush hour, so the
# raw timestamps are treated as UTC (an assumption based on the chart above) and
# converted to Montreal local time.
#
# tz_localize('UTC') raises TypeError on a column that is already tz-aware, so
# localize only while the column is still tz-naive. That keeps this cell safe to
# re-run: on a second execution only the (no-op) tz_convert is applied.
for ts_col in ('start_timestamp', 'end_timestamp'):
    timestamps = df[ts_col]
    if timestamps.dt.tz is None:
        timestamps = timestamps.dt.tz_localize('UTC')
    df[ts_col] = timestamps.dt.tz_convert('America/Montreal')

# Recompute the hour of day from the converted pickup times and redraw the chart.
df['hour'] = df['start_timestamp'].dt.hour
hourly = df[['hour','start_station_id']].groupby('hour').count()
hourly.columns = ['count']

sns.barplot(data=hourly, y='count', x='hour')
plt.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation
plt.title('Total bixi pickups by hour')
plt.xlabel('Hour of the day')
plt.ylabel('Number of pick-up')

plt.show()

In [None]:
# Report the earliest and latest pickup / return timestamps again, using the
# current contents of the timestamp columns.
pickup_times = df['start_timestamp']
return_times = df['end_timestamp']

print('First pickup : ', pickup_times.min())
print('Last pickup : ', pickup_times.max())
print('First return : ', return_times.min())
print('Last return : ', return_times.max())

In [None]:
# The last pickup now falls in 2023, and the hourly distribution makes more sense.

In [None]:
# Summary statistics of the ride duration column, followed by a 25-bin histogram.
print(df['duration_s'].describe())

ax = plt.gca()
ax.hist(df['duration_s'], bins=25)
ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation
ax.set_title('Duration of bixi rides')
ax.set_xlabel('Duration')
ax.set_ylabel('Number of pick-up')
plt.show()

In [None]:
# 2.5th and 97.5th percentiles of ride duration: the limits of the central 95%.
# The values are divided by 60 for display, i.e. printed in minutes.
low_bound, high_bound = (df['duration_s'].quantile(q) for q in (0.025, 0.975))

message = '95% of duration is between {:.2f}min and {:.2f}min'
print(message.format(low_bound / 60, high_bound / 60))

In [None]:
# Keep only the central 95% of rides by duration, which fits the use of the
# service for commuting. Very short rides (under about a minute) could be
# errors in handling the bike system rather than real trips.
# Both comparisons are strict, so rides exactly at a bound are dropped.

within_bounds = df['duration_s'].gt(low_bound) & df['duration_s'].lt(high_bound)
new_df = df.loc[within_bounds]

In [None]:
# 25-bin histogram of ride duration for the filtered rides.
ax = plt.gca()
ax.hist(new_df['duration_s'], bins=25)
ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no scientific notation
ax.set_title('Duration of bixi rides')
ax.set_xlabel('Duration')
ax.set_ylabel('Number of pick-up')
plt.show()

In [None]:
# Summary statistics of ride duration for the filtered rides in `new_df`.
print(new_df['duration_s'].describe())

In [None]:
# Persist the filtered rides for downstream use.
# The boolean filter that built `new_df` leaves gaps in the row index, and older
# pandas releases refuse to write a non-default index to Feather (ValueError).
# Resetting to a fresh 0..n-1 index makes the write work regardless of version;
# the original row labels are intentionally not kept in the file.
new_df.reset_index(drop=True).to_feather('bixi_usage_2023_cleaned.feather')