In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime

import seaborn as sns
sns.set()

## Load Trip Data

In [2]:
print('Loading Trip Data...')

try:
    file_path_slug = '../../datasets/bayareabikeshare/*_trip_data.csv'

    # glob all files
    file_list = glob(file_path_slug)

    trip_import = pd.DataFrame()

    counter = 1
    chunks = []

    # load data from each file
    for file in file_list:

        # import file in chunks
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):

            # set chunk index column to 'Trip ID'
            chunk = chunk.set_index('Trip ID')

            # define Columns
            chunk.columns = ['Duration', 'Start Date', 'Start Station', 'Start Terminal', 'End Date', 
                             'End Station', 'End Terminal', 'Bike #', 'Subscriber Type', 'Zip Code']

            # append chunk to chunks list
            chunks.append(chunk)

        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    # concat chunks
    trip_import = pd.concat(chunks)

    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')

Loading Trip Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [3]:
# zipcodes are all over the place, only keep corrected 5 digit zipcodes, and replace all others with NaNs
def clean_zipcode(item):
    if len(item) != 5:

        # split on '-'
        try:
            result = item.split('-')[0]
        except:
            result = item

        # split on '.'
        try:
            result = item.split('.')[0]
        except:
            result = item
        
        # if len of item is less than 5, return 'NaN'
        if len(result) < 5:
            result = 'NaN'
        else:
            # if len result is greater than 5, take at most, first 5 digits
            result = result[:5]
    else:
        result = item
    
    # make sure result is all digits
    if result.isdigit():
        return result
    else:
        return 'NaN'

In [1]:
print('Trip Data Cleanup Started...')
trip_data = trip_import.copy()

# cleanup column names
print('\tcleaning column names')
new_cols = []
for col in trip_data.columns:
    new_cols.append(col.replace(' ', '_').lower())
trip_data.columns = new_cols

# extract columns we want to keep
print('\tsubsetting to useful columns')
important_cols = ['duration', 'start_date', 'start_terminal', 'end_date', 'end_terminal', 'bike_#', 'subscriber_type', 'zip_code']
trip_data = trip_data[important_cols]

# we are only looking at stations in San Francisco
# sf_trips_data = trip_data[trip_data['start_terminal'].isin(sf_stations)]
# sf_trips_data = sf_trips_data[sf_trips_data['end_terminal'].isin(sf_stations)]

# trip_data = sf_trips_data.copy()

# create duration minutes column
print('\tcreating a duration_minutes column')
trip_data['duration_minutes'] = trip_data['duration'] / 60.0

# convert end and start dates to datetime objects
print('\tconverting end and start dates to datetime objects')
trip_data['start_date'] = pd.to_datetime(trip_data['start_date'], format="%m/%d/%Y %H:%M")
trip_data['end_date']   = pd.to_datetime(trip_data['end_date'],   format="%m/%d/%Y %H:%M")

# convert and clean zipcodes
print('\tcleaning zipcodes')
trip_data['zip_code'] = trip_data['zip_code'].astype(str)
trip_data.zip_code = trip_data.zip_code.apply(clean_zipcode)
trip_data['zip_code'] = pd.to_numeric(trip_data['zip_code'], errors='coerce')

# clean up data types
print('cleaning up data types')

trip_data['duration']         = trip_data['duration'].astype('float')
trip_data['start_terminal']   = trip_data['start_terminal'].astype('category')
trip_data['end_terminal']     = trip_data['end_terminal'].astype('category')
trip_data['bike_#']           = trip_data['bike_#'].astype('int')
trip_data['subscriber_type']  = trip_data['subscriber_type'].astype('category')
trip_data['zip_code']         = trip_data['zip_code'].astype('str')
trip_data['duration_minutes'] = trip_data['duration_minutes'].astype('float')


print('Trip Data Cleanup complete')

Trip Data Cleanup Started...


NameError: name 'trip_import' is not defined

In [6]:
trip_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 983648 entries, 913465 to 198776
Data columns (total 9 columns):
duration            983648 non-null float64
start_date          983648 non-null datetime64[ns]
start_terminal      983648 non-null category
end_date            983648 non-null datetime64[ns]
end_terminal        983648 non-null category
bike_#              983648 non-null int64
subscriber_type     983648 non-null category
zip_code            983648 non-null object
duration_minutes    983648 non-null float64
dtypes: category(3), datetime64[ns](2), float64(2), int64(1), object(1)
memory usage: 55.4+ MB


In [7]:
trip_data.head()

Unnamed: 0_level_0,duration,start_date,start_terminal,end_date,end_terminal,bike_#,subscriber_type,zip_code,duration_minutes
Trip ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
913465,746.0,2015-09-01 00:10:00,69,2015-09-01 00:23:00,58,238,Subscriber,94107.0,12.433333
913466,969.0,2015-09-01 00:15:00,41,2015-09-01 00:31:00,46,16,Subscriber,94133.0,16.15
913467,233.0,2015-09-01 00:15:00,42,2015-09-01 00:19:00,45,534,Subscriber,94111.0,3.883333
913468,213.0,2015-09-01 01:29:00,41,2015-09-01 01:32:00,74,312,Subscriber,94107.0,3.55
913469,574.0,2015-09-01 01:33:00,74,2015-09-01 01:42:00,69,279,Subscriber,94107.0,9.566667


In [None]:
trip_clean = trip_data.copy()
# get list of all time stamps

if not pre_cleaned:
    trip_start_times = pd.unique(trip_clean.start_date)
    trip_start_times = np.sort(trip_start_times)
    print('trip start times\t', len(trip_clean.start_date))
    print('unique trip start times\t', len(trip_start_times))

    print()
    trip_end_times = pd.unique(trip_clean.end_date)
    trip_end_times = np.sort(trip_end_times)
    print('trip end times\t\t', len(trip_clean.end_date))
    print('unique trip end times\t', len(trip_end_times))
    
    # create numpy array of only unique timestamps from all trips
    unique_times = np.concatenate([trip_start_times, trip_end_times])
    unique_times = np.unique(unique_times)
    unique_times = np.sort(unique_times)

    print()
    print('unique trip times\t', len(unique_times))

## Load Status Data

In [None]:

if not pre_cleaned:
    print('Loading Status Data...')

    # manual load files
    file01 = '../../datasets/bayareabikeshare/201402_status_data.csv'
    file02 = '../../datasets/bayareabikeshare/201408_status_data.csv'
    file03 = '../../datasets/bayareabikeshare/201508_status_data.csv'
    file04 = '../../datasets/bayareabikeshare/201608_status_data.csv'

    print('\nstarted reading ', file01)
    status_01 = pd.DateFrame()
    status_01 = pd.read_csv(file01, parse_dates=['time'])
    print('\tcleaning time')
    # status_01['time']   = pd.to_datetime(status_01['time'],   format="%Y/%m/%d %H:%M:%S")
    status_01['time'] = status_01['time'].apply(lambda t: t.replace(second=0))
    print('\tdone!')

    print('\nstarted reading ', file02)
    status_02 = pd.DateFrame()
    status_02 = pd.read_csv(file02, parse_dates=['time'])
    print('\tcleaning time')
    # status_02['time']   = pd.to_datetime(status_02['time'],   format="%Y/%m/%d %H:%M:%S")
    status_02['time'] = status_02['time'].apply(lambda t: t.replace(second=0))
    print('\tdone!')

    print('\nstarted reading ', file03)
    status_03 = pd.DateFrame()
    status_03 = pd.read_csv(file03, parse_dates=['time'])
    print('\tcleaning time')
    # status_03['time']   = pd.to_datetime(status_03['time'],   format="%Y/%m/%d %H:%M:%S")
    status_03['time'] = status_03['time'].apply(lambda t: t.replace(second=0))
    print('\tdone!')

    print('\nstarted reading ', file04)
    status_04 = pd.DateFrame()
    status_04 = pd.read_csv(file04)
    print('\tcleaning time')
    status_04['time']   = pd.to_datetime(status_04['time'],   format="%m/%d/%Y %H:%M:%S")
    print('\tsetting all time seconds to zero')
    status_04['time'] = status_04['time'].apply(lambda t: t.replace(second=0))
    print('\tdone!')
else:
    print('data was already cleaened and used')


> Status data is ultra dense, with records from every station, at ever minutes.
>
> Previously, we collected unique_trip_times to be an array of all the unique time stamps from all trip start and end dates
>
> we will now prune the status data down to only include records that are in these unique time stamps.
>
> this does not completely removed unneessary times, such as a status record for a station that was not used for a given timestamp, but it reduces the number or records to search to less than half

In [None]:
if not pre_cleaned:
    # status_01 - prune down to only times from unique_trip_times
    status_01_pruned = status_01[status_01['time'].isin(unique_times)]
    print('pruned status_01 from %s to %s' % (len(status_01), len(status_01_pruned)))

    # status_02 - prune down to only times from unique_trip_times
    status_02_pruned = status_02[status_02['time'].isin(unique_times)]
    print('pruned status_02 from %s to %s' % (len(status_02), len(status_02_pruned)))

    # status_03 - prune down to only times from unique_trip_times
    status_03_pruned = status_03[status_03['time'].isin(unique_times)]
    print('pruned status_03 from %s to %s' % (len(status_03), len(status_03_pruned)))

    # status_04 - prune down to only times from unique_trip_times
    status_04_pruned = status_04[status_04['time'].isin(unique_times)]
    print('pruned status_04 from %s to %s' % (len(status_04), len(status_04_pruned)))

In [None]:
if not pre_cleaned:
    status_selects = pd.concat([status_01_pruned, status_02_pruned, status_03_pruned, status_04_pruned])
    status_selects.info()

In [None]:
if not pre_cleaned:
    all_records = len(status_01) + len(status_02) + len(status_03) + len(status_04)
    print('pruned status_data from %s to %s' % (all_records, len(status_selects)))
    print('\tratio %s' % (len(status_selects) / all_records * 100.))

## Join Trip Data with Status Data


> for each trip, we want to append a 'bikes_available' column with the number of bikes that were available at that time, at that station
>
> for each trip, we want to append a 'docks_available' column with the number of docks that were available at that time, at that station
>


In [None]:
def docks_available_at_end(row):
    # lookup bikes that were available at that station
    test = status_selects.loc[status_selects['station_id'] == row['end_terminal']]
    test = test.loc[test['time'] == row['end_date']]
    print('[bikes_available_at_start] - Last updated at: ', datetime.datetime.now())
    try:
        result = int(test.docks_available)
    except:
        result = 'NaN'
        
    return result
        
    
def bikes_available_at_start(row):
    # lookup bikes that were available at that station
    test = status_selects.loc[status_selects['station_id'] == row['start_terminal']]
    test = test.loc[test['time'] == row['start_date']]
    print('[bikes_available_at_start] - Last updated at: ', datetime.datetime.now())
    try:
        result = int(test.bikes_available)
    except:
        result = 'NaN'
        
    return result

def start_terminal_zip(row):
    test = status_selects.loc['status_selects['station_id]' == row[start_terminal']]
    return test.zip


def end_terminal_zip(row):
    test = status_selects.loc['status_selects['station_id]' == row[start_terminal']]
    return test.zip
        

>For each trip records, append a the following columns
- 'docks_available_at_end' : The number of docks available for the rider to choose from when they ended their trip
- 'bikes_available_at_start' : The number of bikes available for the rider to choose from when they started their trip

In [None]:
if not pre_cleaned:
    trip_terminal_utilization = trip_clean[['start_date', 'start_terminal', 'end_date', 'end_terminal']].copy()

In [None]:
trip_clean.head()

In [None]:
if not pre_cleaned:
    print('Total Trips\t\t\t', len(trip_clean), '\t', '100.0')
    # only subsciber trips
    subscriber_trips = trip_clean[trip_clean.subscriber_type == 'Subscriber']
    print('Subscriber Trips\t\t', len(subscriber_trips), '\t ', (100.* len(subscriber_trips)/len(trip_clean)))

    # am_commute_start = datetime.datetime.strptime('07:00', '%H:%M').time()
    # print(subscriber_trips.start_date.dt.time > am_commute_start)

    # subscriber trips in commute hours
    am_commute_start = datetime.datetime.strptime('07:00', '%H:%M').time()
    am_commute_end = datetime.datetime.strptime('11:00', '%H:%M').time()
    am_subscriber_commute_trips = subscriber_trips[subscriber_trips.start_date.dt.time >= am_commute_start]
    am_subscriber_commute_trips = am_subscriber_commute_trips[am_subscriber_commute_trips.start_date.dt.time <= am_commute_end]
    print('AM Commute Subscriber Trips\t', len(am_subscriber_commute_trips), '\t ', (100.* len(am_subscriber_commute_trips)/len(trip_clean)))

    pm_commute_start = datetime.datetime.strptime('16:00', '%H:%M').time()
    pm_commute_end = datetime.datetime.strptime('20:00', '%H:%M').time()
    pm_subscriber_commute_trips = subscriber_trips[subscriber_trips.start_date.dt.time >= pm_commute_start]
    pm_subscriber_commute_trips = pm_subscriber_commute_trips[pm_subscriber_commute_trips.start_date.dt.time <= pm_commute_end]
    print('PM Commute Subscriber Trips\t', len(pm_subscriber_commute_trips), '\t ', (100.* len(pm_subscriber_commute_trips)/len(trip_clean)))



    # from previous analysis, these are the top ten start and end terminals used by subscribers during commute hours
    am_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]
    am_start_terms = [69, 70, 73, 74, 77]
    am_end_terms   = [51, 55, 60, 61, 63, 65, 69, 70, 74, 77]
    am_end_terms   = [65, 69, 70, 74, 77]
    pm_start_terms = [55, 60, 61, 64, 65, 67, 69, 70, 74, 77]
    pm_start_terms = [67, 69, 70, 74, 77]
    pm_end_terms   = [39, 50, 55, 60, 61, 65, 69, 70, 74, 77]
    pm_end_terms   = [65, 69, 70, 74, 77]


    am_sub_start_terms = am_subscriber_commute_trips[am_subscriber_commute_trips.start_terminal.isin(am_start_terms)].copy()
    print('am_sub_start_terms: \t\t', len(am_sub_start_terms), '\t ', (100.* len(am_sub_start_terms)/len(trip_clean)))
    am_sub_end_terms   = am_subscriber_commute_trips[am_subscriber_commute_trips.end_terminal.isin(am_end_terms)].copy()
    print('am_sub_end_terms:\t\t ', len(am_sub_end_terms), '\t ', (100.* len(am_sub_end_terms)/len(trip_clean)))

    pm_sub_start_terms = pm_subscriber_commute_trips[pm_subscriber_commute_trips.start_terminal.isin(pm_start_terms)].copy()
    print('pm_sub_start_terms: \t\t ', len(pm_sub_start_terms), '\t ', (100.* len(pm_sub_start_terms)/len(trip_clean)))
    pm_sub_end_terms   = pm_subscriber_commute_trips[pm_subscriber_commute_trips.end_terminal.isin(pm_end_terms)].copy()
    print('pm_sub_end_terms: \t\t', len(pm_sub_end_terms), '\t ', (100.* len(pm_sub_end_terms)/len(trip_clean)))





In [None]:
if not pre_cleaned:
    print('am_sub_start_terms: ', len(am_sub_start_terms))
    print('pm_sub_start_terms:  ', len(pm_sub_start_terms))

## Appending Dock Informtion to Trips

> We are going to narrow our search to subscribers during morning and evening commute hours
>
> We will also be looking only at trips that started during these commute hours
>

In [None]:
if not pre_cleaned:
    # pm_sub_start_terms.info()

    # print('Appending \'docks_available_at_end\'')
    # pm_sub_start_terms['docks_available_at_end'] = pm_sub_start_terms.apply(lambda row: docks_available_at_end (row),axis=1)
    print('Appending \'bikes_available_at_start\'')
    pm_sub_start_terms['bikes_available_at_start'] = pm_sub_start_terms.apply(lambda row: bikes_available_at_start(row),axis=1)
    print('Done!')

    print('Writing to file')
    pm_sub_start_terms.to_csv('pm_sub_start_terms.csv', encoding='utf-8')
    print('Done!')

    pm_sub_start_terms.head()

In [None]:
if not pre_cleaned:
    # am_sub_start_terms.info()

    # print('Appending \'docks_available_at_end\'')
    am_sub_start_terms['docks_available_at_end'] = am_sub_start_terms.apply(lambda row: docks_available_at_end (row),axis=1)
    print('Appending \'bikes_available_at_start\'')
    am_sub_start_terms['bikes_available_at_start'] = am_sub_start_terms.apply(lambda row: bikes_available_at_start(row),axis=1)
    print('Done!')

    print('Writing to file')
    am_sub_start_terms.to_csv('am_sub_start_terms.csv', encoding='utf-8')
    print('Done!')

    am_sub_start_terms.head()

In [None]:
trip_clean.head()

In [None]:
plt.subplots(figsize=(12,6))
trip_clean.groupby('start_terminal')['bikes_available_at_start'].mean().plot(kind='bar')
plt.show()

In [None]:
sns.lmplot(x='start_terminal', y='bikes_available_at_start', data=trip_clean, x_estimator=np.mean)
plt.show()

In [None]:
sns.jointplot(x='end_terminal', y='docks_available_at_end', data=trip_clean, kind='reg')
plt.show()

In [None]:
plt.subplots(figsize=(12,6))
ax = sns.swarmplot(x="start_terminal", y="bikes_available_at_start", data=trip_clean)
plt.show()

In [None]:
plt.subplots(figsize=(12,6))
ax = sns.swarmplot(x="end_terminal", y="docks_available_at_start", data=trip_clean)
plt.show()

In [None]:
# bike number vs number of tirps
bike_first_trip = trip_clean.groupby('bike_#')['start_date'].min()
bike_last_trip = trip_clean.groupby('bike_#')['start_date'].max()

In [None]:
bike_first = bike_first_trip.to_frame()
bike_last  = bike_last_trip.to_frame()

In [None]:
junk = trip_clean.groupby(['start_terminal', 'end_terminal'])['bike_#'].count()
junk.tail(10)

In [None]:
bike_data.head()

In [None]:
bike_data = pd.DataFrame()

bike_first_trip = trip_clean.groupby('bike_#')['start_date'].min()
bike_last_trip = trip_clean.groupby('bike_#')['start_date'].max()



bike_first = bike_first_trip.to_frame()
bike_last  = bike_last_trip.to_frame()

bike_data = pd.concat([bike_first, bike_last], axis=1)

bike_data.columns = ['first_trip', 'last_trip']

bike_data['days_in_service'] = (bike_data['last_trip'] - bike_data['first_trip']).dt.days

bike_data.reset_index(inplace=True)
bike_data.columns = ['bike_id', 'first_trip', 'last_trip', 'days_in_service']

In [None]:
bike_data.info()

In [None]:
bike_data.head()

In [None]:
bike_data['days_in_service'] = (bike_data['last_trip'] - bike_data['first_trip']).dt.days

In [None]:
bike_data.reset_index(inplace=True)
bike_data.columns = ['bike_id', 'first_trip', 'last_trip', 'days_in_service']

In [None]:
bike_data.head()

In [None]:
bike_data.info()

In [None]:
plt.subplots(figsize=(12,6))
sns.distplot(bike_data.days_in_service, color='b')
# plt.legend()
plt.show()

In [None]:
sns.jointplot(x=bike_data.bike_id, y=bike_data.days_in_service, data=bike_data)
plt.show()

In [None]:
bike_data.head()

In [None]:
last_recorded_trip = bike_data.last_trip.max()
bike_data['days_since_last_trip'] = (last_recorded_trip - bike_data.last_trip).dt.days


In [None]:
bike_data.head(5)

In [None]:
sns.jointplot(x='days_in_service', y='days_since_last_trip', data=bike_data)

In [None]:


plt.subplots(figsize=(12,6))
sns.distplot(bike_data[bike_data.days_since_last_trip > 0]['days_since_last_trip'], color='b')
# plt.legend()
plt.show()