# Station Balance

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
# sns.set()
# Use seaborn's white background with grid lines for every chart in this notebook.
sns.set_style('whitegrid')
# "poster" is seaborn's largest built-in scaling context (bigger fonts and lines).
sns.set_context("poster")

In [None]:
# `display` and `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject CSS that widens the notebook container to 95% of the browser window
# so the 24-inch-wide figures below are easier to read.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Default matplotlib font size; individual charts override it with the
# constants below where they pass an explicit `size=`.
font = {'size'   : 50}
matplotlib.rc('font', **font)

# Font sizes shared by the plotting helpers in this notebook.
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 20
# NOTE(review): TICK_FONT_SIZE is not referenced anywhere else in this notebook.
TICK_FONT_SIZE  = 15

In [None]:
# Weekday names in pandas `dayofweek` order (0 = Monday ... 6 = Sunday).
day_labels_full = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
# Three-letter abbreviations derived from the full names.
day_labels = [full_name[:3] for full_name in day_labels_full]
# Month abbreviations, January first.
month_labels = 'JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC'.split()

# presumably one zip code per service region, in display order — TODO confirm
ordered_zipcodes = [94107, 95113, 94041, 94301, 94063]

# Load Data

## Trip Data

In [None]:
print('[%s] Loading Trip Data...' % datetime.datetime.now().time())

trip_data_file = '../clean_data/bayareabikeshare/trip_data_cleaned.csv'

# Chunk Settings
chunks = []
chunksize = 10000

# Count the data rows up front; the total is used only for progress messages.
# The file is opened in a `with` block so the handle is closed again, and the
# header line is excluded so a row count that is an exact multiple of
# `chunksize` does not report one chunk too many.
with open(trip_data_file, 'r') as trip_file:
    num_rows = max(sum(1 for _ in trip_file) - 1, 0)
num_chunks = math.ceil(num_rows / chunksize)

# print progress roughly every tenth chunk; the step must never be zero
progress_step = max(1, math.ceil(num_chunks / 10))

# import file in chunks
trip_reader = pd.read_csv(trip_data_file, chunksize=chunksize, iterator=True, index_col=0, parse_dates=['start_date', 'end_date'])
for chunk_counter, chunk in enumerate(trip_reader, start=1):

    # append chunk to chunks list
    chunks.append(chunk)

    if chunk_counter == 1 or chunk_counter % progress_step == 0 or chunk_counter == num_chunks:
        print('\t\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

trips_df = pd.concat(chunks)
# user_type is compared against 'Subscriber' / 'Customer' later; store it as a categorical
trips_df.user_type = trips_df.user_type.astype('category')

# remove exact duplicate rows, then order by trip id with a fresh 0..n-1 index
trips_df.drop_duplicates(inplace=True)
trips_df.sort_values('trip_id', inplace=True)
trips_df.reset_index(inplace=True, drop=True)

print('[%s] Complete!' % datetime.datetime.now().time())

In [None]:
# Work on a copy so the frame loaded from disk (trips_df) is left untouched.
trips = trips_df.copy()

In [None]:
# Print column names, dtypes and non-null counts of the trip data.
trips.info()

## Station Data

In [None]:
# The original message said "Loading Trip Data..." although this cell loads
# the station file.
print('[%s] Loading Station Data...' % datetime.datetime.now().time())

stations_data_file = '../clean_data/bayareabikeshare/station_data_cleaned.csv'

# first CSV column becomes the index; the two service-date columns are parsed as datetimes
stations_df = pd.read_csv(stations_data_file, index_col=0, parse_dates=['first_service_date', 'last_service_date'])

print('[%s] Complete!' % datetime.datetime.now().time())

In [None]:
# Preview the first rows of the station data.
stations_df.head()

In [None]:
# Split the trips by rider type; each subset is an independent copy.
is_subscriber = trips['user_type'] == 'Subscriber'
is_customer = trips['user_type'] == 'Customer'

subscriber_trips = trips.loc[is_subscriber].copy()
customer_trips = trips.loc[is_customer].copy()

# Plot Hourly Traffic Patterns at Each Station

In [None]:
def plot_hourly_traffic(df=None, terminal=70, suffix='', alpha=0.35):
    """Save a bar chart of inbound vs. outbound trips per hour of day for one station.

    Args:
        df: trips frame; must provide the `start_terminal`, `end_terminal`,
            `start_date`, `end_date` and `trip_id` columns.
        terminal: station id to chart.
        suffix: text inserted into the chart title (and therefore the file name).
        alpha: opacity of the two overlaid bar series.

    The figure is written to ``../charts/balancing/hourly/`` under the chart
    title with spaces replaced by underscores; nothing is returned.
    """
    chart_title = ('Station %s Hourly %s Traffic' % (terminal, suffix)).strip()

    # trips ending at this terminal, counted by the hour in which they ended
    arrivals = df.loc[df.end_terminal == terminal]
    arrivals_by_hour = arrivals.groupby(arrivals.end_date.dt.hour).count()['trip_id'].to_frame('inbound_hourly')

    # trips starting at this terminal, counted by the hour in which they started
    departures = df.loc[df.start_terminal == terminal]
    departures_by_hour = departures.groupby(departures.start_date.dt.hour).count()['trip_id'].to_frame('outbound_hourly')

    # align both counts on all 24 hours; hours without any trips become 0
    all_hours = pd.Index(range(0, 24, 1), name='ID')
    hourly_traffic = arrivals_by_hour.merge(departures_by_hour, left_index=True, right_index=True, how='outer')
    hourly_traffic = hourly_traffic.reindex(all_hours).fillna(0)

    # overlay inbound (blue) and outbound (red) bars on the same axes
    fig, ax = plt.subplots(figsize=(24,6))
    sns.barplot(x=hourly_traffic.index, y=hourly_traffic['inbound_hourly'], ax=ax, color='b', alpha=alpha, label='inbound')
    sns.barplot(x=hourly_traffic.index, y=hourly_traffic['outbound_hourly'], ax=ax, color='r', alpha=alpha, label='outbound')

    ax.set_title(chart_title, size=TITLE_FONT_SIZE)
    ax.legend(loc=1, frameon=True)
    ax.set_xlabel('Hour', size=LABEL_FONT_SIZE)
    ax.set_ylabel('Total Trips', size=LABEL_FONT_SIZE)

    fig.savefig('../charts/balancing/hourly/%s' % chart_title.replace(' ', '_'))

    # be memory efficient, close the figure once we are done with it
    plt.close('all')

In [None]:
# The weekday/weekend subsets do not depend on the terminal, so build them
# once instead of re-filtering every trip for each station.
trip_dayofweek = trips.start_date.dt.dayofweek
weekday_trips = trips[trip_dayofweek < 5]     # Monday (0) .. Friday (4)
weekend_trips = trips[trip_dayofweek >= 5]    # Saturday (5), Sunday (6)

# one chart per station and rider segment
for terminal in sorted(trips.start_terminal.unique()):
    plot_hourly_traffic(terminal=terminal, df=subscriber_trips, suffix='Subscribers')
    plot_hourly_traffic(terminal=terminal, df=weekday_trips, suffix='All Riders Weekday')
    plot_hourly_traffic(terminal=terminal, df=weekend_trips, suffix='All Riders Weekend')
    plot_hourly_traffic(terminal=terminal, df=customer_trips, suffix='Customers')

# Plot Weekly Traffic Patterns at Each Station

In [None]:
def plot_weekly_traffic(df=None, terminals=None, suffix='', subtitle='', title='', alpha=0.35):
    """Save a bar chart of inbound vs. outbound trips for every hour of the week.

    Trips are counted per (day of week, hour) across all stations listed in
    `terminals`, so several stations can be charted as if they were one.

    Args:
        df: trips frame; must provide the `start_terminal`, `end_terminal`,
            `start_date`, `end_date` and `trip_id` columns.
        terminals: iterable of station ids to combine. Defaults to ``[70]``.
        suffix: text inserted into the auto-generated subtitle.
        subtitle: chart subtitle, also used as the file name. When empty it is
            generated from `terminals` and `suffix`.
        title: figure-level title drawn above the subtitle.
        alpha: opacity of the two overlaid bar series.

    The figure is written to ``../charts/balancing/weekly/<subtitle>.png``
    (spaces replaced by underscores); nothing is returned.
    """
    # A list literal as default would be one object shared by every call;
    # use None and fall back to the historical default of station 70.
    terminals = [70] if terminals is None else list(terminals)

    if subtitle == '':
        terminal_titles = ' '.join(str(x) for x in terminals)
        subtitle = 'Station %s Weekly %s Traffic' % (terminal_titles, suffix)
        # an empty suffix leaves a double space behind; collapse it
        subtitle = subtitle.replace('  ', ' ').replace("'", '').strip()

    #--------------------------------------------------------------------------------
    #   count weekly and hourly trips at specified terminals
    #--------------------------------------------------------------------------------
    # select trip_id before counting so only that one column is aggregated
    inbound = df[df.end_terminal.isin(terminals)]
    inbound_hourly = inbound.groupby([inbound.end_date.dt.dayofweek, inbound.end_date.dt.hour])['trip_id'].count().to_frame('inbound_hourly')
    inbound_hourly.index.names = ['dayofweek', 'hour']

    outbound = df[df.start_terminal.isin(terminals)]
    outbound_hourly = outbound.groupby([outbound.start_date.dt.dayofweek, outbound.start_date.dt.hour])['trip_id'].count().to_frame('outbound_hourly')
    outbound_hourly.index.names = ['dayofweek', 'hour']

    weekly_traffic = inbound_hourly.merge(outbound_hourly, left_index=True, right_index=True, how='outer')
    weekly_traffic = weekly_traffic.reset_index()

    # hour-of-week id: 0 = Monday 00:00 ... 167 = Sunday 23:00
    weekly_traffic['ID'] = (weekly_traffic.dayofweek * 24) + weekly_traffic.hour

    # fill in missing index so every hour of the week has a bar (0 when no trips)
    new_index = pd.Index(range(0, 24*7, 1), name='ID')
    weekly_traffic = weekly_traffic.set_index('ID').reindex(new_index).fillna(0)

    #--------------------------------------------------------------------------------
    #   plotting
    #--------------------------------------------------------------------------------
    plt.subplots(figsize=(24,6))
    ax = sns.barplot(x=weekly_traffic.index, y=weekly_traffic.inbound_hourly, color='b', alpha=alpha, label='inbound')
    sns.barplot(x=weekly_traffic.index, y=weekly_traffic.outbound_hourly, ax=ax, color='r', alpha=alpha, label='outbound')

    plt.suptitle(title, y=1, size=TITLE_FONT_SIZE)
    plt.title(subtitle, size=TITLE_FONT_SIZE * 0.8)
    plt.legend(loc=1, frameon=True)

    # one tick label per day, and a dotted separator at each midnight
    ax.set_xticks([x*24+13 for x in range(0, 7, 1)])
    ax.set_xticklabels(day_labels_full)
    for vline in [x*24 for x in range(1, 7, 1)]:
        ax.axvline(x=vline, linestyle=':', color='k', alpha=0.5)
    ax.set_xlabel('Hour', size=LABEL_FONT_SIZE)
    ax.set_ylabel('Total Trips', size=LABEL_FONT_SIZE)

    file_name = '../charts/balancing/weekly/%s.png' % subtitle.replace(' ', '_')
    plt.savefig(file_name)

    # be memory efficient, close the figure once we are done with it
    plt.close('all')
    print('Finished %s - %s' % (subtitle, title))

In [None]:
# One weekly chart per station, titled with the station's name.
for terminal in sorted(trips.start_terminal.unique()):

    # first name recorded for this station id
    station_name = stations_df.loc[stations_df['station_id'] == terminal, 'name'].iloc[0]

    plot_weekly_traffic(df=trips, terminals=[terminal], title=station_name, subtitle='', suffix='')

## Plot Hourly Traffic at Caltrain 'Super Station'
<p>Stations 69 and 70 are across the street from each other and are the most popular stations in the program.</p>
<p>Plot weekly traffic as if they were a single station.</p>

In [None]:
# Chart stations 69 and 70 together as one combined station.
plot_weekly_traffic(
    df=trips,
    terminals=[69, 70],
    suffix='',
    subtitle='San Francisco Caltrain Stations 69 & 70',
    title="Caltrain 'Super Station' Weekly Traffic",
)

# Load Status Data
<p>Only load minute-by-minute data for stations 69 and 70.</p>

In [None]:
print('[%s] Loading Status Data...' % datetime.datetime.now().time())

status_data_file = '../clean_data/bayareabikeshare/status_data_cleaned.csv'

# Chunk Settings
chunks = []
chunksize = 10000

# import file in chunks
status_reader = pd.read_csv(status_data_file, chunksize=chunksize, iterator=True, index_col=0, parse_dates=['time'])
for i, chunk in enumerate(status_reader):

    # prune each chunk to stations 69 and 70 before storing it, so only
    # the rows that are needed are kept
    chunk = chunk[chunk.station_id.isin([69, 70])]

    # append chunk to chunks list
    chunks.append(chunk)

    # progress message on every 100th chunk; i == 0 already satisfies this,
    # so no separate first-chunk test is needed
    if i % 100 == 0:
        print('\t\t[%s] finished chunk %s' % (datetime.datetime.now().time(), i))

status_df = pd.concat(chunks)

# remove exact duplicate rows and renumber the index 0..n-1
status_df.drop_duplicates(inplace=True)
status_df.reset_index(inplace=True, drop=True)

print('[%s] Complete!' % datetime.datetime.now().time())

In [None]:
# Print column names, dtypes and non-null counts of the filtered status data.
status_df.info()

In [None]:
# Preview the first rows of the filtered status data.
status_df.head()

In [None]:
def plot_category_weekly(df=None, category='', subtitle='', title='', color='g', alpha=0.35, method='min', file_path_adj='', ylabel=''):
    """Aggregate one column per hour of the week, save a bar chart, return the data.

    Args:
        df: frame whose index exposes `.dayofweek` and `.hour` (i.e. a
            datetime index) and which contains the `category` column.
        category: name of the column to aggregate and plot.
        subtitle: chart subtitle. When non-empty, the title-cased method name
            is appended to it (e.g. ``'Weekly Utilization Median'``).
        title: chart title.
        color: bar color.
        alpha: bar opacity.
        method: one of ``'min'``, ``'max'``, ``'mean'``, ``'median'``, ``'count'``.
        file_path_adj: sub-folder of ``../charts/balancing/`` to save into
            (lower-cased); ``'weekly'`` is used when empty.
        ylabel: y-axis label (title-cased); ``'Total Trips'`` when empty.

    Returns:
        DataFrame indexed by hour-of-week ``ID`` (0..167) with the columns
        `dayofweek`, `hour` and `category`; hours with no data are filled with 0.

    Raises:
        ValueError: if `method` is not one of the supported aggregations.
    """
    supported_methods = ('min', 'max', 'mean', 'median', 'count')
    if method not in supported_methods:
        # fail early with a clear message instead of tripping over the
        # un-aggregated frame a few lines further down
        raise ValueError('method must be one of %s, got %r' % (supported_methods, method))

    # select the column before aggregating so the other columns are not
    # aggregated only to be thrown away
    df = df.groupby([df.index.dayofweek, df.index.hour])[category].agg(method).to_frame()
    if subtitle != '':
        subtitle = '%s %s' % (subtitle, method.title())

    df.index.names = ['dayofweek', 'hour']
    df.reset_index(inplace=True)
    # hour-of-week id: 0 = Monday 00:00 ... 167 = Sunday 23:00
    df['ID'] = (df.dayofweek * 24) + df.hour

    # fill in index to include every hour of day in the week, fills in missing values
    new_index = pd.Index(range(0, 24*7, 1), name='ID')
    df = df.set_index('ID').reindex(new_index)
    df.fillna(0, inplace=True)

    #--------------------------------------------------------------------------
    #   Plot seaborn barplot of data
    #--------------------------------------------------------------------------
    plt.subplots(figsize=(24,6))
    ax = sns.barplot(x=df.index, y=df[category], color=color, alpha=alpha, label=category)

    if subtitle != '':
        plt.suptitle(title, y=1, x=0.51, size=TITLE_FONT_SIZE)
        plt.title(subtitle, size=TITLE_FONT_SIZE * 0.8)
    else:
        plt.title(title, size=TITLE_FONT_SIZE)
    plt.legend([method.title()], loc=1, frameon=True)

    # one tick label per day, and a dotted separator at each midnight
    ax.set_xticks([x*24+13 for x in range(0, 7, 1)])
    ax.set_xticklabels(day_labels_full)
    for vline in [x*24 for x in range(1, 7, 1)]:
        ax.axvline(x=vline, linestyle=':', color='k', alpha=0.5)
    ax.set_xlabel('Hour', size=LABEL_FONT_SIZE)

    if ylabel == '':
        ax.set_ylabel('Total Trips', size=LABEL_FONT_SIZE)
    else:
        ax.set_ylabel(ylabel.title(), size=LABEL_FONT_SIZE)

    # Name the file after the subtitle, or after method + title when there is
    # no subtitle. The fallback now applies to the default 'weekly' folder too,
    # which previously produced a file called just '.png'.
    file_stem = subtitle if subtitle != '' else '%s_%s' % (method, title)
    folder = file_path_adj.lower() if file_path_adj != '' else 'weekly'
    file_name = '../charts/balancing/%s/%s.png' % (folder, file_stem.replace(' ', '_'))
    print(file_name)
    plt.savefig(file_name)

    # be memory efficient, close the figure once we are done with it
    plt.close('all')

    return df


In [None]:
# NOTE(review): `status` is never defined in this notebook, so these calls raise
# NameError on a fresh kernel. plot_category_weekly groups on df.index.dayofweek,
# so it presumably should be a time-indexed status frame — confirm whether that
# means all stations (as the 'Bay Area Bike Share' title suggests) or only the
# stations 69/70 rows held in status_df.
# plot_category_weekly(df=status, category='utilization', title='Caltrain Super Station', subtitle='Weekly Utilization', method='min', color='r')
# The result is assigned to `j` only so the notebook does not echo the returned frame.
j = plot_category_weekly(df=status, category='utilization', title='Bay Area Bike Share', subtitle='Weekly Utilization', method='median', color='c', file_path_adj='global')
j = plot_category_weekly(df=status, category='docks_available', title='Bay Area Bike Share', subtitle='Weekly Dock Availability', method='max', color='b', file_path_adj='global')
j = plot_category_weekly(df=status, category='bikes_available', title='Bay Area Bike Share', subtitle='Weekly Bike Availability', method='max', color='b', file_path_adj='global')

## Calculate Status at Stations 69 and 70 as if they were a single 'Super Station'

In [None]:
# For each station, collapse any repeated timestamps to their mean so every
# station contributes exactly one row per timestamp to the sum below.
s69_status = status_df[status_df.station_id == 69].copy()
s69_status = s69_status.groupby(['time']).mean()
s69_status.reset_index(inplace=True)

s70_status = status_df[status_df.station_id == 70].copy()
s70_status = s70_status.groupby(['time']).mean()
s70_status.reset_index(inplace=True)

# Stack both stations and add them up per timestamp.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
superstation_status = pd.concat([s69_status, s70_status], ignore_index=True)
superstation_status = superstation_status.groupby(['time']).sum()

# summed ids and zip codes are meaningless for the combined station
superstation_status.drop(['station_id', 'zip_code'], axis=1, inplace=True)

# Recompute utilization from the combined totals (the summed per-station
# ratios would be wrong). Item assignment is used so the column is written
# even if it does not exist yet.
superstation_status['utilization'] = superstation_status.docks_available / superstation_status.dock_count

superstation_status.info()

In [None]:
# Preview the combined stations 69 + 70 status.
superstation_status.head()

# How Often is the Super Station Not at an Ideal Utilization?
<ul><b>Not Ideal Conditions are when Utilization is below 25% or above 75%</b>
    <li><ul><b>Below 25%</b>
        <li>Too many bikes are available, we run the risk of not having enough docks being available for inbound traffic</li>
        <li>Too many unused bikes that could be better served at other stations</li>
        </ul>
    </li>
    <li><ul><b>Above 75%</b>
        <li>Not enough bikes are available, we run the risk of not being able to provide rides to outbound traffic</li>
        <li>Too many empty docks; bikes sitting unused at other stations could be better served here</li>
        </ul>
    </li>
</ul>


In [None]:
# 0/1 flags for readings outside the ideal 25%-75% utilization band
utilization = superstation_status['utilization']
superstation_status['is_over_utilized'] = (utilization > 0.75).astype(int)
superstation_status['is_under_utilized'] = (utilization < 0.25).astype(int)

# 0/1 flags for the extremes: no free docks at all, or every dock free
free_docks = superstation_status['docks_available']
superstation_status['is_full'] = (free_docks == 0).astype(int)
superstation_status['is_empty'] = (free_docks == superstation_status['dock_count']).astype(int)

superstation_status.head()

In [None]:
# Chart, for each hour of the week, the mean of every 0/1 flag, i.e. the
# share of readings in which the condition held.
utilization_charts = [
    ('is_full', 'Super Station Full'),
    ('is_empty', 'Super Station Empty'),
    ('is_under_utilized', 'Super Station Under Utilized'),
    ('is_over_utilized', 'Super Station Over Utilized'),
]

for flag_column, chart_title in utilization_charts:
    # ylabel=' ' is non-empty on purpose: it suppresses the default 'Total Trips' label
    x = plot_category_weekly(df=superstation_status, category=flag_column, method='mean', title=chart_title, file_path_adj='utilization', ylabel=' ')



In [None]:
# t = superstation_status.groupby([superstation_status.index.date, superstation_status.index.hour]).mean()['is_empty'].to_frame()
# Mean of the 0/1 is_empty / is_full flags in each 5-minute window, i.e. the
# share of readings in that window where the super station was empty / full.
t = superstation_status.resample('5Min').mean()[['is_empty', 'is_full']]
t.info()

In [None]:
# Summary statistics of the 5-minute empty/full shares.
t.describe()

In [None]:
# Summary statistics of every column of the combined station status.
superstation_status.describe()

In [None]:
# Largest dock count recorded for each station (a station id may have several rows) ...
dock_counts_df = stations_df.groupby(['station_id', 'landmark'])['dock_count'].max().reset_index()

# ... then total docks per landmark (region)
dock_counts_df.groupby('landmark')['dock_count'].sum()

In [None]:
# Number of distinct bikes that started at least one trip from a San Francisco station.
sf_station_ids = stations_df.loc[stations_df.landmark == 'San Francisco', 'station_id'].unique()
sf_trips = trips[trips.start_terminal.isin(sf_station_ids)]

# dropna=False counts a missing bike_id as one value, as len(unique()) would
num_sf_bikes = sf_trips.bike_id.nunique(dropna=False)
num_sf_bikes

In [None]:
# Print column names, dtypes and non-null counts of the trip data.
trips.info()

# Concurrent Trips in Each Region

In [None]:
def plot_weekly_concurrent_trips(df=None, method='max', color='m', title='', file_path_adj=''):
    """Chart how many trips are in progress at once, per hour of the week.

    Each trip contributes +1 at its `start_date` and -1 at its `end_date`;
    the changes are summed per minute and accumulated, giving the number of
    trips under way in every minute. That series is then aggregated and
    plotted by `plot_category_weekly`.

    Args:
        df: trips frame with `start_date` and `end_date` columns.
        method: aggregation passed on to `plot_category_weekly`.
        color: bar color passed on to `plot_category_weekly`.
        title: prefix for the chart title ("<title> Weekly Concurrent Trips").
        file_path_adj: output sub-folder passed on to `plot_category_weekly`.

    Returns:
        The frame returned by `plot_category_weekly`.
    """
    trip_starts = pd.Series(1, df.start_date)    # a trip begins: one more in progress
    trip_ends = pd.Series(-1, df.end_date)       # a trip finishes: one fewer

    net_change_per_minute = pd.concat([trip_starts, trip_ends]).resample('1Min').sum()
    in_progress = net_change_per_minute.cumsum().fillna(0)
    concurrent_trips = in_progress.to_frame('concurrent_trips')

    chart_title = ('%s Weekly Concurrent Trips' % title).strip()
    return plot_category_weekly(
        df=concurrent_trips,
        category='concurrent_trips',
        title=chart_title,
        method=method,
        color=color,
        file_path_adj=file_path_adj,
    )
    

In [None]:
# Per-region results of the concurrent-trip charts, keyed by landmark name.
regional_concurrent_max = dict()
regional_concurrent_mean = dict()

for region in stations_df.landmark.unique():
    region_stations = sorted(stations_df.loc[stations_df.landmark == region, 'station_id'].unique())

    # keep only trips that both start and end inside this region
    starts_in_region = trips.start_terminal.isin(region_stations)
    ends_in_region = trips.end_terminal.isin(region_stations)
    trips_in_region = trips[starts_in_region & ends_in_region].copy()

    regional_concurrent_max[region] = plot_weekly_concurrent_trips(df=trips_in_region, title=region, method='max', color='r', file_path_adj='concurrent')
    regional_concurrent_mean[region] = plot_weekly_concurrent_trips(df=trips_in_region, title=region, method='mean', color='g', file_path_adj='concurrent')

# Concurrent Trips To and From the Super Station

In [None]:
# Trips ending at / starting from either Caltrain station (69 or 70).
inbound_trips = trips.loc[trips.end_terminal.isin([69, 70])].copy()
outbound_trips = trips.loc[trips.start_terminal.isin([69, 70])].copy()

inbound_title = 'Caltrain Super Station Inbound'
outbound_title = 'Caltrain Super Station Outbound'

# max (red) and mean (green) concurrent trips heading to the super station
super_station_inbound_max = plot_weekly_concurrent_trips(df=inbound_trips, title=inbound_title, method='max', color='r', file_path_adj='superstation')
super_station_inbound_mean = plot_weekly_concurrent_trips(df=inbound_trips, title=inbound_title, method='mean', color='g', file_path_adj='superstation')

# the same two charts for trips leaving the super station
super_station_outbound_max = plot_weekly_concurrent_trips(df=outbound_trips, title=outbound_title, method='max', color='r', file_path_adj='superstation')
super_station_outbound_mean = plot_weekly_concurrent_trips(df=outbound_trips, title=outbound_title, method='mean', color='g', file_path_adj='superstation')



In [None]:
# Count arrivals per end timestamp and departures per start timestamp.
bikes_arriving = inbound_trips.groupby('end_date')['trip_id'].count().rename('bikes_arriving')
bikes_departing = outbound_trips.groupby('start_date')['trip_id'].count().rename('bikes_departing')

# Combine on the UNION of both sets of timestamps. Assigning the second
# series as a column of a frame already indexed by the arrival timestamps
# (as was done before) aligns to that index and silently drops every
# departure whose timestamp has no matching arrival.
superstation_balance = pd.concat([bikes_arriving, bikes_departing], axis=1)

# a timestamp present in only one of the two series means zero events in the other
superstation_balance = superstation_balance.fillna(0)

In [None]:
# Re-bin the event counts onto a regular one-minute grid.
superstation_balance = superstation_balance.resample('1Min').sum().fillna(0)

# running totals of arrivals and departures
arrivals = superstation_balance['bikes_arriving']
departures = superstation_balance['bikes_departing']
superstation_balance['cumm_bikes_arriving'] = arrivals.cumsum()
superstation_balance['cumm_bikes_departing'] = departures.cumsum()

# net bikes gained in each minute, and net bikes gained since the first minute
superstation_balance['bikes_balance'] = arrivals - departures
superstation_balance['cumm_bikes_balance'] = (
    superstation_balance['cumm_bikes_arriving'] - superstation_balance['cumm_bikes_departing']
)

superstation_balance.head(10)

In [None]:
# The original referenced the misspelled name `supserstation_status`, which raises NameError.
superstation_status.info()

In [None]:
# Print column names, dtypes and non-null counts of the per-minute balance.
superstation_balance.info()

In [None]:
# Attach the reported station status to every minute of the trip-based
# balance (right join keeps all balance rows). The original referenced the
# misspelled name `supserstation_status`, which raises NameError.
balance_diff = superstation_status.merge(superstation_balance, left_index=True, right_index=True, how='right')

# keep only the reported counts and the per-minute trip counts, in their existing column order
keep_columns = ['bikes_available', 'docks_available', 'bikes_arriving', 'bikes_departing']
balance_diff = balance_diff[[c for c in balance_diff.columns if c in keep_columns]].copy()

# minutes without a status reading take the next available reading;
# .bfill() replaces the deprecated fillna(method='bfill')
balance_diff = balance_diff.bfill()
balance_diff.info()

In [None]:
# Preview the merged status / trip balance frame.
balance_diff.head()

In [None]:
# number of bikes relocated by system managers
balance_diff['reported_bike_gain'] = balance_diff.bikes_available - balance_diff.bikes_available.shift(1)
balance_diff['trip_bike_gain'] = balance_diff.bikes_arriving - balance_diff.bikes_departing

# this nube
balance_diff['investigate'] = balance_diff.reported_bike_gain - balance_diff.trip_bike_gain

# bikes_arriving
# relocated_bikes
balance_diff[11300:11500]

In [None]:
# balance_diff.rolling('15min', min_periods=4).mean()[['reported_bike_gain', 'trip_bike_gain']].plot(figsize=(24,6), alpha=0.75)
# Line chart of the `investigate` column over the frame's index.
balance_diff[['investigate']].plot(figsize=(24,6), alpha=0.75)
plt.show()