# Investigation of Trip Data from Bay Area Bike Share Published Data
<ul>
    <li>First overview of user types and hourly/weekly/yearly trends</li>
</ul>

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import scipy
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime

import seaborn as sns
# sns.set()
sns.set_style('whitegrid')
sns.set_context("poster")

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

font = {'size'   : 50}
matplotlib.rc('font', **font)

TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 10
TICK_FONT_SIZE  = 15

# Load Trip Data

In [3]:
print('[%s] Loading Trip Data Data...' % datetime.datetime.now().time())

trips_df = pd.DataFrame()
trip_data_file = '../clean_data/bayareabikeshare/trip_data_extended_cleaned.csv'

# Chunk Settings
chunks = []
chunk_counter = 1
chunksize = 10000
num_chunks = math.ceil(sum(1 for row in open(trip_data_file, 'r'))/chunksize)

# import file in chunks
for chunk in pd.read_csv(trip_data_file, chunksize=chunksize, iterator=True, index_col=0, parse_dates=['start_date', 'end_date', 'forecast_time']):
    
    # append chunk to chunks list
    chunks.append(chunk)

    if chunk_counter == 1 or chunk_counter % math.ceil(num_chunks/10) == 0 or chunk_counter == num_chunks:
        print('\t\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
    chunk_counter += 1

trips_df = pd.concat(chunks)
trips_df.user_type = trips_df.user_type.astype('category')
print('[%s] Complete!' % datetime.datetime.now().time())

[22:47:37.884075] Loading Trip Data Data...
		[22:47:39.073625] finished chunk 1 of 99
		[22:47:40.689219] finished chunk 10 of 99
		[22:47:42.548617] finished chunk 20 of 99
		[22:47:44.420123] finished chunk 30 of 99
		[22:47:46.331755] finished chunk 40 of 99
		[22:47:48.106466] finished chunk 50 of 99
		[22:47:49.889150] finished chunk 60 of 99
		[22:47:51.690893] finished chunk 70 of 99
		[22:47:53.483385] finished chunk 80 of 99
		[22:47:55.255473] finished chunk 90 of 99
		[22:47:56.933194] finished chunk 99 of 99
[22:47:58.573224] Complete!


In [4]:
trips_df.head(5)

Unnamed: 0,trip_id,duration,start_date,start_station_id,end_date,end_station_id,bike_id,user_type,user_zip,duration_minutes,...,humidity,precipIntensity,precipProbability,precipType,pressure,temperature,forecast_time,visibility,windBearing,windSpeed
0,4069,174,2013-08-29 09:08:00,64,2013-08-29 09:11:00,64,288,Subscriber,94114,2.9,...,0.92,0.0,0.0,0.0,1017.06,62.63,2013-08-29 09:00:00,10.0,183.0,0.7
1,4073,1067,2013-08-29 09:24:00,66,2013-08-29 09:42:00,69,321,Subscriber,94703,17.783333,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0
2,4074,1131,2013-08-29 09:24:00,66,2013-08-29 09:43:00,69,317,Subscriber,94115,18.85,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0
3,4075,1117,2013-08-29 09:24:00,66,2013-08-29 09:43:00,69,316,Subscriber,94122,18.616667,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0
4,4076,1118,2013-08-29 09:25:00,66,2013-08-29 09:43:00,69,322,Subscriber,94597,18.633333,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0


In [5]:
trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 983350 entries, 0 to 1010724
Data columns (total 37 columns):
trip_id                           983350 non-null int64
duration                          983350 non-null int64
start_date                        983350 non-null datetime64[ns]
start_station_id                  983350 non-null int64
end_date                          983350 non-null datetime64[ns]
end_station_id                    983350 non-null int64
bike_id                           983350 non-null int64
user_type                         983350 non-null category
user_zip                          983350 non-null int64
duration_minutes                  983350 non-null float64
start_station_name                983350 non-null object
start_station_region              983350 non-null object
start_station_elevation_meters    983350 non-null float64
start_station_elevation_feet      983350 non-null float64
end_station_name                  983350 non-null object
end_station_region

## Utility Functions

In [6]:
def zip_to_landmark(zip_code):
    ''' Return zipcode for given landmark'''
    if zip_code == 94107:
        return 'San Francisco'
    if zip_code == 94063:
        return 'Redwood City'
    if zip_code == 94301:
        return 'Palo Alto'
    if zip_code == 94041:
        return 'Mountain View'
    if zip_code == 95113:
        return 'San Jose'
    return False

In [7]:
day_labels = ['MON','TUE','WED','THU','FRI','SAT','SUN']
day_labels_full = ['MONDAY','TUESDAY','WEDNESDAY','THURSDAY','FRIDAY','SATURDAY','SUNDAY']
month_labels = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']

In [8]:
sub_color = 'b'
sub_color_alt = 'm'
cust_color='r'
cust_color_alt='y'

commuter_color='g'
commuter_color_alt='#1daf1d'

commuter_am = '#ea54d9'
commuter_am_alt = '#9b8460'

commuter_pm = '#b97ccc'
commuter_pm_alt = '#f4ad3a'

figsize = (15,6)

# Initial Analysis - Usage by Region

<ul>
    <li>How many Trips are taken within each region?</li>
    <li>How many Trips are taken within each region by user type?</li>
    <li>How many Trips are taken that cross from one rgion to another?</li>
</ul>

In [9]:
print('\t\t\t\t\t\t\t\tSubscriber\tSubscriber\tCustomer\tCustomer')
print('\t\tRegion\t\tTrip Count\tTrip Share\tTrip Count\tRegion Share\tTrip Count\tRegion Share')

# data for all trip
babs_label='Full Service Area'
babs_trip_count            = trips_df.trip_id.count()
babs_trip_count_subscriber = trips_df[trips_df.user_type == 'Subscriber'].trip_id.count()
babs_trip_share_subscriber = babs_trip_count_subscriber / trips_df.trip_id.count() * 100.
babs_trip_share_subscriber = '{:2.4f}'.format(babs_trip_share_subscriber).rjust(6)

babs_trip_count_customer   = trips_df[trips_df.user_type == 'Customer'].trip_id.count()
babs_trip_share_customer   = babs_trip_count_customer / trips_df.trip_id.count() * 100.
babs_trip_share_customer = '{:2.4f}'.format(babs_trip_share_customer).rjust(6)

print('%s\t\t%s\t%s\t%s\t%s\t%s\t%s' % (babs_label.rjust(20), 
                                        str(babs_trip_count).rjust(10), 
                                        str('100.0000').rjust(10), 
                                        str(babs_trip_count_subscriber).rjust(10), 
                                        str(babs_trip_share_subscriber).rjust(10), 
                                        str(babs_trip_count_customer).rjust(10), 
                                        str(babs_trip_share_customer).rjust(10)))

regions = ['San Francisco', 'San Jose', 'Mountain View', 'Palo Alto', 'Redwood City']
for region in regions:

        
    # subset All Days
    trips_in_region = pd.DataFrame()
    trips_in_region            = trips_df[(trips_df.start_station_region == region) & (trips_df.start_station_region == region)].copy()
    subscriber_trips_in_region = trips_in_region[trips_in_region.user_type == 'subscriber'.title()]
    customer_trips_in_region   = trips_in_region[trips_in_region.user_type == 'customer'.title()]
    
    # subset Weekend Only
    weekend_trips_in_region            =     trips_in_region[trips_in_region.start_date.dt.dayofweek < 5]
    weekend_subscriber_trips_in_region =     subscriber_trips_in_region[subscriber_trips_in_region.start_date.dt.dayofweek < 5]
    weekend_customer_trips_in_region   =     customer_trips_in_region[customer_trips_in_region.start_date.dt.dayofweek < 5]
    
    # subset Weekdays Only
    weekday_trips_in_region            =     trips_in_region[trips_in_region.start_date.dt.dayofweek >= 5]
    weekday_subscriber_trips_in_region =     subscriber_trips_in_region[subscriber_trips_in_region.start_date.dt.dayofweek >= 5]
    weekday_customer_trips_in_region   =     customer_trips_in_region[customer_trips_in_region.start_date.dt.dayofweek >= 5]
    
    # format All Days
    trips_in_region_count = trips_in_region.shape[0]

    trips_in_region_share = trips_in_region.shape[0] / trips_df.shape[0] * 100.
    trips_in_region_share = '{:2.4f}'.format(trips_in_region_share)
    
    subscriber_trips_in_region_count = subscriber_trips_in_region.shape[0]

    subscriber_trips_in_region_share = subscriber_trips_in_region.shape[0] / trips_in_region.shape[0] * 100.
    subscriber_trips_in_region_share = '{:2.4f}'.format(subscriber_trips_in_region_share)
    
    customer_trips_in_region_count = customer_trips_in_region.shape[0]

    customer_trips_in_region_share = customer_trips_in_region.shape[0] / trips_in_region.shape[0] * 100.
    customer_trips_in_region_share = '{:2.4f}'.format(customer_trips_in_region_share)
    
    
    print('%s\t\t%s\t%s\t%s\t%s\t%s\t%s' % (region.title().rjust(20), 
                                            str(trips_in_region_count).rjust(10), 
                                            str(trips_in_region_share).rjust(10), 
                                            str(subscriber_trips_in_region_count).rjust(10), 
                                            str(subscriber_trips_in_region_share).rjust(10), 
                                            str(customer_trips_in_region_count).rjust(10), 
                                            str(customer_trips_in_region_share).rjust(10)))
    
# cross region trips
cross_region_trips = trips_df[trips_df.start_station_region != trips_df.end_station_region]

cross_region_label='Cross Region'
cross_region_trip_count = cross_region_trips.trip_id.count()

cross_region_trip_share = cross_region_trips.shape[0] / trips_df.shape[0] * 100.
cross_region_trip_share = '{:2.4f}'.format(cross_region_trip_share)

cross_region_trip_count_subscriber = cross_region_trips[cross_region_trips.user_type == 'Subscriber'].trip_id.count()
cross_region_trip_share_subscriber = cross_region_trip_count_subscriber / cross_region_trips.trip_id.count() * 100.
cross_region_trip_share_subscriber = '{:2.4f}'.format(cross_region_trip_share_subscriber).rjust(6)

cross_region_trip_count_customer   = cross_region_trips[cross_region_trips.user_type == 'Customer'].trip_id.count()
cross_region_trip_share_customer   = cross_region_trip_count_customer / cross_region_trips.trip_id.count() * 100.
cross_region_trip_share_customer = '{:2.4f}'.format(cross_region_trip_share_customer).rjust(6)

print('%s\t\t%s\t%s\t%s\t%s\t%s\t%s' % (cross_region_label.rjust(20), 
                                        str(cross_region_trip_count).rjust(10), 
                                        str(cross_region_trip_share).rjust(10), 
                                        str(cross_region_trip_count_subscriber).rjust(10), 
                                        str(cross_region_trip_share_subscriber).rjust(10), 
                                        str(cross_region_trip_count_customer).rjust(10), 
                                        str(cross_region_trip_share_customer).rjust(10)))

								Subscriber	Subscriber	Customer	Customer
		Region		Trip Count	Trip Share	Trip Count	Region Share	Trip Count	Region Share
   Full Service Area		    983350	  100.0000	    846790	   86.1128	    136560	   13.8872
       San Francisco		    891068	   90.6155	    771572	   86.5896	    119496	   13.4104
            San Jose		     52781	    5.3675	     44117	   83.5850	      8664	   16.4150
       Mountain View		     24646	    2.5063	     20933	   84.9347	      3713	   15.0653
           Palo Alto		      9852	    1.0019	      5940	   60.2923	      3912	   39.7077
        Redwood City		      5003	    0.5088	      4228	   84.5093	       775	   15.4907
        Cross Region		      1420	    0.1444	       715	   50.3521	       705	   49.6479


# A1.1 - Usage by User Type within San Francisco

<ul>
    <li>How Many Trips are taken by each User Type</li>
    <li>Identify Hourly Trends</li>
    <li>Identify Weekly Trends</li>
    <li>Identify Yearly Trends</li>
</ul>

In [10]:
sf_trips = trips_df[(trips_df.start_station_region == 'San Francisco') & (trips_df.end_station_region == 'San Francisco')].copy()
sf_trips.reset_index(inplace=True, drop=True)
sf_trips.head()

Unnamed: 0,trip_id,duration,start_date,start_station_id,end_date,end_station_id,bike_id,user_type,user_zip,duration_minutes,...,humidity,precipIntensity,precipProbability,precipType,pressure,temperature,forecast_time,visibility,windBearing,windSpeed
0,4069,174,2013-08-29 09:08:00,64,2013-08-29 09:11:00,64,288,Subscriber,94114,2.9,...,0.92,0.0,0.0,0.0,1017.06,62.63,2013-08-29 09:00:00,10.0,183.0,0.7
1,4073,1067,2013-08-29 09:24:00,66,2013-08-29 09:42:00,69,321,Subscriber,94703,17.783333,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0
2,4074,1131,2013-08-29 09:24:00,66,2013-08-29 09:43:00,69,317,Subscriber,94115,18.85,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0
3,4075,1117,2013-08-29 09:24:00,66,2013-08-29 09:43:00,69,316,Subscriber,94122,18.616667,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0
4,4076,1118,2013-08-29 09:25:00,66,2013-08-29 09:43:00,69,322,Subscriber,94597,18.633333,...,0.91,0.0,0.0,0.0,1017.26,62.6,2013-08-29 09:00:00,10.0,241.0,1.0


In [11]:
subscriber_trips = sf_trips[sf_trips.user_type == 'Subscriber'].copy()
subscriber_trips.reset_index(inplace=True, drop=True)
print('Subscriber Trips {:d}'.format(subscriber_trips.shape[0]))

customer_trips = sf_trips[sf_trips.user_type == 'Customer'].copy()
customer_trips.reset_index(inplace=True, drop=True)
print('Customer Trips   {:d}'.format(customer_trips.shape[0]))

Subscriber Trips 771561
Customer Trips   119484


# A1.2 - Subscriber and Customer Hourly Trips

In [12]:
# SUBSCRIBERS
hourly_starting = subscriber_trips.groupby([subscriber_trips.start_date.dt.hour]).count()
hourly_ending = subscriber_trips.groupby([subscriber_trips.end_date.dt.hour]).count()
    
starting_x_ticks = sorted(hourly_starting.index.unique())
ending_x_ticks = sorted(hourly_ending.index.unique())

plt.subplots(figsize=figsize)
ax = sns.barplot(x = starting_x_ticks , y = 'trip_id', data=hourly_starting, color=sub_color, alpha = 0.35, label='Trips Starting')
sns.barplot(ax=ax, x = ending_x_ticks , y = 'trip_id', data=hourly_ending, color=sub_color_alt, alpha = 0.35, label='Trips Ending')

ax.set_title('San Francisco Subscriber Hourly Trip Count', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=LABEL_FONT_SIZE*2)
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)
ax.legend(loc=1, frameon=True)
# plt.show()
plt.savefig('../charts/user_trends/subscriber_trips_hourly.png')
plt.close()

In [13]:
# CUSTOMERS
hourly_starting = customer_trips.groupby([customer_trips.start_date.dt.hour]).count()
hourly_ending = customer_trips.groupby([customer_trips.end_date.dt.hour]).count()
    
starting_x_ticks = sorted(hourly_starting.index.unique())
ending_x_ticks = sorted(hourly_ending.index.unique())

plt.subplots(figsize=figsize)
ax = sns.barplot(x = starting_x_ticks , y = 'trip_id', data=hourly_starting, color=cust_color, alpha = 0.35, label='Trips Starting')
sns.barplot(ax=ax, x = ending_x_ticks , y = 'trip_id', data=hourly_ending, color=cust_color_alt, alpha = 0.35, label='Trips Ending')

ax.set_title('San Francisco Customer Hourly Trip Count', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=LABEL_FONT_SIZE*1.5)
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)
ax.legend(loc=1, frameon=True)
# plt.show()
plt.savefig('../charts/user_trends/customer_trips_hourly.png')
plt.close()

# A1.3 -  Subscriber and Customer Weekly Trips

In [14]:
dummy_week = pd.DataFrame(index=[x for x in range(0, 24*7, 1)], columns=['dummy'])
dummy_week['dummy'] = dummy_week.index.to_series()

In [15]:
def week_fill(df=None, dummy_df=dummy_week):
    tmp = df.groupby([df['start_date'].dt.dayofweek, df['start_date'].dt.hour]).count()['trip_id'].to_frame()
    tmp.index.names = ['dayofweek', 'hour']
    tmp.reset_index(inplace=True, drop=False)
    tmp['dummy'] = tmp.dayofweek * 24 + tmp.hour
    tmp.drop(['dayofweek', 'hour'], axis=1, inplace=True)
    tmp.set_index('dummy', inplace=True, drop=True)
    tmp = tmp.merge(dummy_week, left_index=True, right_index=True, how='right')
    tmp.set_index('dummy', inplace=True, drop=True)
    tmp.index.names = ['hourofweek']
    return tmp

In [16]:
# SUBSCRIBERS

weekly_trips = week_fill(df=subscriber_trips)

x_tick_labels = day_labels_full
x_ticks = [x*24+12 for x in range(0, len(x_tick_labels))]

x_markers = [x*24 for x in range(1, len(x_tick_labels))]

plt.subplots(figsize=figsize)
ax = sns.barplot(x = weekly_trips.index , y = 'trip_id', data=weekly_trips, color=sub_color, alpha = 0.35, label='Trips Starting')

ax.set_title('San Francisco Subscriber Weekly Trip Counts', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=TICK_FONT_SIZE)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_tick_labels, size=TICK_FONT_SIZE)
for x in x_markers:
    ax.axvline(x=x, linestyle=':', alpha=0.25, color='k')

ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)

plt.savefig('../charts/user_trends/subscriber_trips_weekly.png')
# plt.show()
plt.close()

In [17]:
# CUSTOMERS
weekly_trips = week_fill(df=customer_trips)


x_tick_labels = day_labels_full
x_ticks = [x*24+12 for x in range(0, len(x_tick_labels))]

x_markers = [x*24+1 for x in range(1, len(x_tick_labels))]

plt.subplots(figsize=figsize)
ax = sns.barplot(x = weekly_trips.index , y = 'trip_id', data=weekly_trips, color=cust_color, alpha = 0.35, label='Trips Starting')

ax.set_title('San Francisco Customer Weekly Trip Counts', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=TICK_FONT_SIZE)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_tick_labels, size=TICK_FONT_SIZE)
for x in x_markers:
    ax.axvline(x=x, linestyle=':', alpha=0.25, color='k')

ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)

plt.savefig('../charts/user_trends/customer_trips_weekly.png')
# plt.show()
plt.close()

# A1.4 -  Subscriber and Customer Yearly Trips

In [18]:
def yearly_mean(df=None):
    
    years = sorted(df.start_date.dt.year.unique())
    interval = sorted(df.start_date.dt.week.unique())

    yearly_df = pd.DataFrame(index=interval, columns=['dummy'])
    yearly_df.index.names = ['start_date']
    
    for year in years:        
        year_df = df[df.start_date.dt.year == year].copy()
        year_df = year_df.groupby([year_df.start_date.dt.week]).count()['trip_id'].to_frame()
        year_df.columns = [str(year)]
        
        # merge year to main dataframe
        yearly_df = yearly_df.merge(year_df, left_index=True, right_index=True, how='left')

    yearly_df.drop(['dummy'], axis=1, inplace=True)
    
    yearly_df['mean_skipped'] = yearly_df.mean(axis=1, skipna=True)
    
    yearly_df.fillna(0, inplace=True)
    yearly_df['mean'] = yearly_df.mean(axis=1)
    
    return yearly_df

## A1.4.1 -  Some Important Yearly Dates
<p> Holidays might play a role in the number of trips taken, this is the pseudo average day of the year each holiday takes place</p>

In [19]:
significant_dates = {'4th of July' : 186,
                     'Labor Day' : 247,
                     'Christmas' : 359,
                     'Thanksgiving' : 330,
                     'Memorial Day' : 152}
fade_alpha=0.075

In [20]:
# SUBSCRIBERS
yearly_trips = yearly_mean(df=subscriber_trips)

x_tick_labels = month_labels

x_ticks = [x*(53/12)+(53/24) for x in range(0, len(x_tick_labels))]
x_markers = [x*(53/12) for x in range(1, len(x_tick_labels))]

plt.subplots(figsize=figsize)
ax = sns.barplot(x = yearly_trips.index , y = 'mean_skipped', data=yearly_trips, color=sub_color, alpha = 0.25, label='Trips Starting')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2013', data=yearly_trips, color=sub_color, alpha = fade_alpha, label='2013')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2014', data=yearly_trips, color=sub_color, alpha = fade_alpha, label='2014')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2015', data=yearly_trips, color=sub_color, alpha = fade_alpha, label='2015')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2016', data=yearly_trips, color=sub_color, alpha = fade_alpha, label='2016')

ax.set_title('San Francisco Subscriber Yearly Trip Counts', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=TICK_FONT_SIZE)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_tick_labels, size=TICK_FONT_SIZE)
for x in x_markers:
    ax.axvline(x=x, linestyle=':', alpha=0.25, color='k')

for k, v in significant_dates.items():
    # draw line on date
    ax.axvline(x=(v-365/53/2)/365*53, linestyle='-', alpha=1.0, color=cust_color, linewidth=3)
    
    ax.text((v-365/53/2)/365, 0.20, k,
        horizontalalignment='right',
        verticalalignment='baseline',
        rotation=-30,
        transform=ax.transAxes,
        size=TICK_FONT_SIZE, color='w', weight='bold', alpha=1.0, backgroundcolor=(0.0, 0.0, 0.0, 0.5))
    
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)

plt.savefig('../charts/user_trends/subscriber_trips_yearly.png')
# plt.show()
plt.close()

In [21]:
# CUSTOMERS
yearly_trips = yearly_mean(df=customer_trips)

x_tick_labels = month_labels

x_ticks = [x*(53/12)+(53/24) for x in range(0, len(x_tick_labels))]
x_markers = [x*(53/12) for x in range(1, len(x_tick_labels))]

plt.subplots(figsize=figsize)
ax = sns.barplot(x = yearly_trips.index , y = 'mean_skipped', data=yearly_trips, color=cust_color, alpha = 0.25, label='Trips Starting')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2013', data=yearly_trips, color=cust_color, alpha = fade_alpha, label='2013')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2014', data=yearly_trips, color=cust_color, alpha = fade_alpha, label='2014')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2015', data=yearly_trips, color=cust_color, alpha = fade_alpha, label='2015')
sns.barplot(ax=ax, x = yearly_trips.index , y = '2016', data=yearly_trips, color=cust_color, alpha = fade_alpha, label='2016')

ax.set_title('San Francisco Customer Yearly Trip Counts', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=TICK_FONT_SIZE)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_tick_labels, size=TICK_FONT_SIZE)
for x in x_markers:
    ax.axvline(x=x, linestyle=':', alpha=0.25, color='k')

for k, v in significant_dates.items():
    # draw line on date
    ax.axvline(x=(v-365/53/2)/365*53, linestyle='-', alpha=1.0, color=sub_color, linewidth=3)
    
    ax.text((v-365/53/2)/365, 0.9, k,
        horizontalalignment='right',
        verticalalignment='baseline',
        rotation=30,
        transform=ax.transAxes,
        size=TICK_FONT_SIZE, color='w', weight='bold', alpha=1.0, backgroundcolor=(0.0, 0.0, 0.0, 0.5))
    
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)

plt.savefig('../charts/user_trends/customer_trips_yearly.png')
# plt.show()
plt.close()

# B1.1 - Drilling in on Subscriber Usage, Commuters

In [22]:
weekday_subscriber_trips = subscriber_trips[(subscriber_trips.start_date.dt.dayofweek < 5) | (subscriber_trips.end_date.dt.dayofweek < 5)].copy()
weekday_subscriber_trips.reset_index(inplace=True, drop=True)

am_commute_hour_begin  = 7
am_commute_hour_finish = 10
morning_commuter_trips = weekday_subscriber_trips[((weekday_subscriber_trips.start_date.dt.hour >= am_commute_hour_begin) & (weekday_subscriber_trips.start_date.dt.hour < am_commute_hour_finish)) |
                                                          ((weekday_subscriber_trips.end_date.dt.hour >= am_commute_hour_begin) & (weekday_subscriber_trips.end_date.dt.hour < am_commute_hour_finish))].copy()

# morning_commuter_trips = morning_commuter_trips[morning_commuter_trips.duration_minutes <= 30].copy()
morning_commuter_trips.reset_index(inplace=True, drop=True)

pm_commute_hour_begin  = 16
pm_commute_hour_finish = 19
evening_commuter_trips = weekday_subscriber_trips[((weekday_subscriber_trips.start_date.dt.hour >= pm_commute_hour_begin) & (weekday_subscriber_trips.start_date.dt.hour < pm_commute_hour_finish)) |
                                                          ((weekday_subscriber_trips.end_date.dt.hour >= pm_commute_hour_begin) & (weekday_subscriber_trips.end_date.dt.hour < pm_commute_hour_finish))].copy()

# evening_commuter_trips = evening_commuter_trips[evening_commuter_trips.duration_minutes <= 30].copy()
evening_commuter_trips.reset_index(inplace=True, drop=True)

print('{:d} Subscriber Trips               '.format(subscriber_trips.shape[0]))
print('{:d} Weekday Subscriber Trips       '.format(weekday_subscriber_trips.shape[0]))
print('{:d} Weekday Commuter Trips         '.format(morning_commuter_trips.shape[0] + evening_commuter_trips.shape[0]))
print('{:d} Weekday Morning Commuter Trips '.format(morning_commuter_trips.shape[0]))
print('{:d} Weekday Evening Commuter Trips '.format(evening_commuter_trips.shape[0]))

771561 Subscriber Trips               
721171 Weekday Subscriber Trips       
492944 Weekday Commuter Trips         
255117 Weekday Morning Commuter Trips 
237827 Weekday Evening Commuter Trips 


In [23]:
def commute_hours_fill(df=None, dummy_df=None, group_date='start_date'):
    tmp = df.groupby([df[group_date].dt.hour, df[group_date].dt.minute]).count()['trip_id'].to_frame()
    tmp.index.names = ['hour', 'minute']
    tmp.reset_index(inplace=True, drop=False)
    tmp['dummy'] = tmp.hour * 60 + tmp.minute
    tmp.drop(['hour', 'minute'], axis=1, inplace=True)
    tmp.set_index('dummy', inplace=True, drop=True)
    tmp = tmp.merge(dummy_df, left_index=True, right_index=True, how='right')
    tmp.set_index('dummy', inplace=True, drop=True)
    tmp.index.names = ['minuteofcommute']
    tmp.fillna(0, inplace=True)
    
    return tmp

# B1.2 - Morning Commuters

In [24]:
dummy_morning_commute = pd.DataFrame(index=[x for x in range(am_commute_hour_begin*60, am_commute_hour_finish*60, 1)], columns=['dummy'])
dummy_morning_commute['dummy'] = dummy_morning_commute.index.to_series()

In [25]:
# MORNING COMMUTERS
trips_starting = commute_hours_fill(df=morning_commuter_trips, dummy_df=dummy_morning_commute, group_date='start_date')
trips_ending = commute_hours_fill(df=morning_commuter_trips, dummy_df=dummy_morning_commute, group_date='end_date')

starting_x_ticks = sorted(trips_starting.index.unique())
ending_x_ticks = sorted(trips_ending.index.unique())

x_ticks = [x*60 for x in range(0, am_commute_hour_finish-am_commute_hour_begin, 1)]
x_markers = [x*60 for x in range(1, len(x_ticks))]

x_tick_labels = [str(x)+':00' for x in range(am_commute_hour_begin, am_commute_hour_finish+1, 1)]

plt.subplots(figsize=figsize)
ax = sns.barplot(x = starting_x_ticks , y = 'trip_id', data=trips_starting, color=commuter_am, alpha = 0.5, label='Trips Starting')
sns.barplot(ax=ax, x = ending_x_ticks , y = 'trip_id', data=trips_ending, color=commuter_am_alt, alpha = 0.5, label='Trips Ending')

ax.set_title('San Francisco Morning Commuters Hourly Trips Count', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=TICK_FONT_SIZE)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_tick_labels, size=TICK_FONT_SIZE)
for x in x_markers:
    ax.axvline(x=x, linestyle=':', alpha=0.25, color='k')
        
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)

ax.legend(loc=1, frameon=True)
# plt.show()
plt.savefig('../charts/user_trends/morning_subscriber_trips_hourly.png')
plt.close()


# B1.3 - Evening Commuters

In [26]:
dummy_evening_commute = pd.DataFrame(index=[x for x in range(pm_commute_hour_begin*60, pm_commute_hour_finish*60, 1)], columns=['dummy'])
dummy_evening_commute['dummy'] = dummy_evening_commute.index.to_series()

In [27]:
# MORNING COMMUTERS
trips_starting = commute_hours_fill(df=evening_commuter_trips, dummy_df=dummy_evening_commute, group_date='start_date')
trips_ending = commute_hours_fill(df=evening_commuter_trips, dummy_df=dummy_evening_commute, group_date='end_date')

starting_x_ticks = sorted(trips_starting.index.unique())
ending_x_ticks = sorted(trips_ending.index.unique())

x_ticks = [x*60 for x in range(0, pm_commute_hour_finish-pm_commute_hour_begin, 1)]
x_markers = [x*60 for x in range(1, len(x_ticks))]

x_tick_labels = [str(x)+':00' for x in range(pm_commute_hour_begin, pm_commute_hour_finish+1, 1)]

plt.subplots(figsize=figsize)
ax = sns.barplot(x = starting_x_ticks , y = 'trip_id', data=trips_starting, color=commuter_pm, alpha = 0.5, label='Trips Starting')
sns.barplot(ax=ax, x = ending_x_ticks , y = 'trip_id', data=trips_ending, color=commuter_pm_alt, alpha = 0.5, label='Trips Ending')

ax.set_title('San Francisco Evening Commuters Hourly Trips Count', size=TITLE_FONT_SIZE, weight='bold')

ax.set_xlabel('', size=TICK_FONT_SIZE)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_tick_labels, size=TICK_FONT_SIZE)
for x in x_markers:
    ax.axvline(x=x, linestyle=':', alpha=0.25, color='k')
    
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*1.5)

ax.legend(loc=1, frameon=True)
# plt.show()
plt.savefig('../charts/user_trends/evening_subscriber_trips_hourly.png')
plt.close()



# Write Commuter Groupings to csv files

In [28]:
morning_commuter_trips.to_csv('../clean_data/bayareabikeshare/trip_data_morning_commutes.csv', encoding='utf-8')
evening_commuter_trips.to_csv('../clean_data/bayareabikeshare/trip_data_evening_commutes.csv', encoding='utf-8')

# Yearly Trip Count Comparison by User Group

In [29]:
def monthly_resample_sum(df=None):
    resampled_count = df[-((df.start_date.dt.year == 2013) & (df.start_date.dt.month == 8))]
    resampled_count = resampled_count.groupby([resampled_count.start_date.dt.date]).count()['trip_id'].to_frame()
    resampled_count.reset_index(inplace=True)
    resampled_count.start_date = pd.to_datetime(resampled_count.start_date)
    resampled_count.set_index(['start_date'], inplace=True)
    resampled_count = resampled_count.resample('1M').sum()
    
    return resampled_count.trip_id

## Split into User Groups

In [30]:
subscriber_trips = trips_df[trips_df.user_type == 'Subscriber'].copy()
customer_trips   = trips_df[trips_df.user_type == 'Customer'].copy()

commuter_trips = subscriber_trips[subscriber_trips.start_date.dt.dayofweek < 5].copy()
commuter_trips = commuter_trips[((commuter_trips.start_date.dt.hour >= 7) & (commuter_trips.start_date.dt.hour < 10)) | 
                                ((commuter_trips.start_date.dt.hour >= 16) & (commuter_trips.start_date.dt.hour < 19))].copy()

## Get Monthly Sums

In [31]:
trip_counts = pd.DataFrame()
trip_counts['customer_trips'] = monthly_resample_sum(df=customer_trips)
trip_counts['subscriber_trips'] = monthly_resample_sum(df=subscriber_trips)
trip_counts['commuter_trips'] = monthly_resample_sum(df=commuter_trips)
trip_counts['non_commuter_trips'] = trip_counts.subscriber_trips - trip_counts.commuter_trips

## Plot Monthly Counts 

In [32]:
x_ticks = []
for idx in trip_counts.index:
    x_ticks.append('%s %d' % (month_labels[idx.month-1], idx.year))
trip_counts.reset_index(inplace=True, drop=True)


In [33]:
ax = trip_counts.subscriber_trips.plot(color=sub_color, linestyle='-.', marker='o', label='Subscribers', figsize=(18,6))
trip_counts.customer_trips.plot(ax=ax, color=cust_color, linestyle='-.', marker='o', label='Customers')
trip_counts.commuter_trips.plot(ax=ax, color=commuter_color, linestyle='-.', marker='o', label='Commuters')
trip_counts.non_commuter_trips.plot(ax=ax, color=commuter_color_alt, linestyle='-.', marker='o', label='Non Commuters')

ax.set_title('Trips By Month', size=TITLE_FONT_SIZE, weight='bold')
ax.set_ylabel('Number of Trips', size=LABEL_FONT_SIZE, weight='bold')

y_min = 0
y_max = 35000
y_interval = 2500
ax.set_ylim([y_min, y_max])
y_ticks = [x*y_interval for x in range(y_min, int(y_max/y_interval)+1)]
ax.set_yticks(y_ticks)

ax.set_xticks([x for x in range(0, len(x_ticks))])
ax.set_xticklabels(x_ticks, rotation=90, size=TICK_FONT_SIZE)
ax.set_xlabel('')


year_markers = [3, 15, 27]
for x in year_markers:
    ax.axvline(x=x, color='k', alpha=0.35)

ax.legend(['Subscribers', 'Customers', 'Commuters', 'Non Commuters'], loc=2, frameon=True)
# plt.show()
plt.savefig('../charts/user_trends/month_to_month.png')
plt.close()

## Plot Trip Duration

In [34]:
# SUBSCRIBERS and COMMUTERS

duration_cutoff = 30.0
interval = 3

plt.subplots(figsize=figsize)
ax = subscriber_trips[subscriber_trips.duration_minutes < duration_cutoff].duration_minutes.plot(kind='hist', bins=100, alpha=0.35, color=sub_color)
commuter_trips[commuter_trips.duration_minutes < duration_cutoff].duration_minutes.plot(kind='hist', bins=100, alpha=0.35, color=commuter_color, ax=ax)

subscriber_median_duration = subscriber_trips[subscriber_trips.duration_minutes < duration_cutoff].duration_minutes.median()
commuter_median_duration = commuter_trips[commuter_trips.duration_minutes < duration_cutoff].duration_minutes.median()
plt.legend(['Subscriber Trips', 'Commuter Trips'], loc=1, frameon=True)
lines = {'subscriber_mean' : subscriber_median_duration,
        'commuter_mean' : commuter_median_duration}
    
label_alpha = 0.5
# draw line on date
ax.axvline(x=subscriber_median_duration, linestyle='-', alpha=1.0, color=cust_color, linewidth=3)
ax.text((subscriber_median_duration-0.5)/duration_cutoff, 0.15, 'Subscriber Median {:2.4f}'.format(subscriber_median_duration),
    horizontalalignment='right',
    verticalalignment='baseline',
    rotation=0,
    transform=ax.transAxes,
    size=TICK_FONT_SIZE, color='w', weight='bold', alpha=label_alpha, backgroundcolor=(0.0, 0.0, 0.0, 0.5))

ax.axvline(x=commuter_median_duration, linestyle='-', alpha=1.0, color=cust_color_alt, linewidth=3)
ax.text((commuter_median_duration+0.5)/duration_cutoff, 0.15, 'Commuter Median {:2.4f}'.format(commuter_median_duration),
    horizontalalignment='left',
    verticalalignment='baseline',
    rotation=0,
    transform=ax.transAxes,
    size=TICK_FONT_SIZE, color='w', weight='bold', alpha=label_alpha, backgroundcolor=(0.0, 0.0, 0.0, 0.5))
ax.set_xticks([x*interval for x in np.arange(0, (duration_cutoff/interval)+1, 1)])
ax.set_xlabel('Trip Duration (minutes)', size=LABEL_FONT_SIZE*2)
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*2)
plt.title('30 Minutes or less shown, median from all trips', size=TITLE_FONT_SIZE*0.5)
plt.suptitle('Distribution of Subscriber and Commuter Trip Duration', y=0.97, size=TITLE_FONT_SIZE, weight='bold')

# plt.show()
plt.savefig('../charts/user_trends/subscriber_and_commuter_trip_duration_histogram.png')
plt.close()


# CUSTOMERS
duration_cutoff = 60.0
interval = 3
plt.subplots(figsize=figsize)
ax = customer_trips[customer_trips.duration_minutes < duration_cutoff].duration_minutes.plot(kind='hist', bins=100, alpha=0.35, color=cust_color)
plt.legend(['Customer Trips'], loc=1, frameon=True)
customer_median_duration = customer_trips[customer_trips.duration_minutes < duration_cutoff].duration_minutes.median()
ax.axvline(x=customer_median_duration, linestyle='-', alpha=1.0, color=sub_color, linewidth=3)
ax.text((customer_median_duration+2)/duration_cutoff, 0.15, 'Customer Median {:2.4f}'.format(customer_median_duration),
    horizontalalignment='left',
    verticalalignment='baseline',
    rotation=0,
    transform=ax.transAxes,
    size=TICK_FONT_SIZE, color='w', weight='bold', alpha=label_alpha, backgroundcolor=(0.0, 0.0, 0.0, 0.5))


if duration_cutoff > 30.0:
    ax.axvline(x=30, linestyle=':', alpha=0.75, color=sub_color, linewidth=3)
    ax.text((30+1)/duration_cutoff, (27.5/35), 'These Trips Incure Additional Fee',
        horizontalalignment='left',
        verticalalignment='baseline',
        rotation=0,
        transform=ax.transAxes,
        size=TICK_FONT_SIZE, color='w', weight='bold', alpha=label_alpha, backgroundcolor=(0.0, 0.0, 0.0, 0.5))




ax.set_xticks([x*interval for x in np.arange(0, (duration_cutoff/interval)+1, 1)])
ax.set_xlabel('Trip Duration (minutes)', size=LABEL_FONT_SIZE*2)
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*2)
plt.title('60 Minutes or less shown, median from all trips', size=TITLE_FONT_SIZE*0.5)
plt.suptitle('Distribution of Customer Trip Duration', y=0.975, size=TITLE_FONT_SIZE, weight='bold')

# plt.show()
plt.savefig('../charts/user_trends/customer_trip_duration_histogram.png')
plt.close()

## Plot Trip Duration - 3 Hour Customers

In [35]:
# CUSTOMERS
duration_cutoff_min = 30.0
duration_cutoff_max = 180.0
duration_cutoff_diff = duration_cutoff_max - duration_cutoff_min

three_hour_customer_trips = customer_trips[(customer_trips.duration_minutes > duration_cutoff_min) &
                                           (customer_trips.duration_minutes <= duration_cutoff_max)].copy()

interval = 15
plt.subplots(figsize=figsize)
ax = three_hour_customer_trips.duration_minutes.plot(kind='hist', bins=100, alpha=0.35, color=cust_color_alt)

plt.legend(['Fee Incuring Customer Trips'], loc=1, frameon=True)
three_hour_customer_trips_median_duration = three_hour_customer_trips.duration_minutes.median()

ax.axvline(x=three_hour_customer_trips_median_duration, linestyle='-', alpha=1.0, color=sub_color, linewidth=3)
ax.text((three_hour_customer_trips_median_duration-15)/duration_cutoff_max, (875/2000), 'Fee Incuring Customer Median {:2.4f}'.format(three_hour_customer_trips_median_duration),
    horizontalalignment='left',
    verticalalignment='baseline',
    rotation=0,
    transform=ax.transAxes,
    size=TICK_FONT_SIZE, color='w', weight='bold', alpha=label_alpha, backgroundcolor=(0.0, 0.0, 0.0, 0.5))




ax.set_xticks([x*interval for x in np.arange(duration_cutoff_min/interval, duration_cutoff_max/interval+1, 1)])
ax.set_xlabel('Trip Duration (minutes)', size=LABEL_FONT_SIZE*2)
ax.set_ylabel('Trip Count', size=LABEL_FONT_SIZE*2)
plt.suptitle('Distribution of Fee Incuring Customer Trip Duration', y=0.975, size=TITLE_FONT_SIZE, weight='bold')

# plt.show()
plt.savefig('../charts/user_trends/three_hour_customer_trip_duration_histogram.png')
plt.close()