# Statistical Analysis of Bay Area Bike Share Data

> From our initial Visual Exploratory Data Analysis of the Bay Area Bike Share dataset, we inferred that the vast majority of trips are taken by commuters, who are subscribers.
>
> We also retain from the previous analysis that we only need to concern ourselves with trips no more than 60 minutes in duration.
>
> Predicting ridership appears pretty easy: commuters need to commute, and customers seem to be mostly starting or ending their trips at popular tourist destinations.
>
> <b>1</b> Are Customer or Subscriber trip counts affected by Rain?
> 
> <b>2</b> Are Customer or Subscriber trip counts affected by Hot or Cold Temperatures?
>
> <b>3</b> Is the number of trips taken by subscribers in morning commute hours correlated with the number of trips taken by subscribers in evening commute hours?

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob
from geopy.distance import vincenty


import seaborn as sns
sns.set()

## Load Data

### Trip Data

In [None]:
# Read every "*_trip_data.csv" file in 10,000-row chunks and stitch the
# chunks into a single frame called trip_import.
print('Loading Trip Data...')

# Fallback: stays an empty frame when nothing could be loaded.
trip_import = pd.DataFrame()

try:
    file_path_slug = '../../datasets/bayareabikeshare/*_trip_data.csv'
    file_list = glob(file_path_slug)

    chunks = []

    for counter, file in enumerate(file_list, start=1):
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
            chunk = chunk.set_index('Trip ID')
            # column names are assigned by position so every file ends up
            # with the same header
            chunk.columns = ['Duration', 'Start Date', 'Start Station', 'Start Terminal', 'End Date',
                             'End Station', 'End Terminal', 'Bike #', 'Subscriber Type', 'Zip Code']
            chunks.append(chunk)
        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))

    # pd.concat raises when no file matched the pattern (empty list)
    trip_import = pd.concat(chunks)
    print('Data Loaded Successfully!')

except Exception as err:
    # Report the cause instead of hiding it; Exception (not a bare except)
    # lets KeyboardInterrupt through.
    print('oops... something went wrong importing the data :(')
    print('\t%s: %s' % (type(err).__name__, err))

In [None]:
# Work on a copy so trip_import keeps the rows exactly as they were loaded.
trip_data = trip_import.copy()

### Weather Data

In [None]:
# Read every "*_weather_data.csv" file in 10,000-row chunks and stitch the
# chunks into a single frame called weather_import.
print('Loading Weather Data...')

# Fallback: stays an empty frame when nothing could be loaded.
weather_import = pd.DataFrame()

try:
    file_path_slug = '../../datasets/bayareabikeshare/*_weather_data.csv'
    file_list = glob(file_path_slug)

    chunks = []

    for counter, file in enumerate(file_list, start=1):
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
            # column names are assigned by position so every file ends up
            # with the same header
            chunk.columns = ['Date', 'Max_Temperature_F', 'Mean_Temperature_F', 'Min_TemperatureF', 'Max_Dew_Point_F',
                             'MeanDew_Point_F', 'Min_Dewpoint_F', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
                             'Max_Sea_Level_Pressure_In', 'Mean_Sea_Level_Pressure_In', 'Min_Sea_Level_Pressure_In',
                             'Max_Visibility_Miles', 'Mean_Visibility_Miles', 'Min_Visibility_Miles',
                             'Max_Wind_Speed_MPH', 'Mean_Wind_Speed_MPH', 'Max_Gust_Speed_MPH', 'Precipitation_In',
                             'Cloud_Cover', 'Events', 'Wind_Dir_Degrees', 'zip']
            chunks.append(chunk)
        print('\tfinished file! (%d of %d)'% (counter, len(file_list)))

    # pd.concat raises when no file matched the pattern (empty list)
    weather_import = pd.concat(chunks)
    print('Data Loaded Successfully!')
except Exception as err:
    # Report the cause instead of hiding it; Exception (not a bare except)
    # lets KeyboardInterrupt through.
    print('oops... something went wrong loading the data :()')
    print('\t%s: %s' % (type(err).__name__, err))

In [None]:
# Work on a copy so weather_import keeps the rows exactly as they were loaded.
weather_data = weather_import.copy()

### Station Data

In [None]:
# Read every "*_station_data.csv" file in 10,000-row chunks and stitch the
# chunks into a single frame called station_import.
print('Loading Station Data...')

# Fallback: stays an empty frame when nothing could be loaded.
station_import = pd.DataFrame()

try:
    file_path_slug = '../../datasets/bayareabikeshare/*_station_data.csv'
    file_list = glob(file_path_slug)

    chunks = []

    for counter, file in enumerate(file_list, start=1):
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
            # column names are assigned by position so every file ends up
            # with the same header
            chunk.columns = ['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']
            chunks.append(chunk)
        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))

    # pd.concat raises when no file matched the pattern (empty list)
    station_import = pd.concat(chunks)
    print('Data Loaded Successfully!')
except Exception as err:
    # Report the cause instead of hiding it; Exception (not a bare except)
    # lets KeyboardInterrupt through.
    print('oops... something went wrong importing the data :(')
    print('\t%s: %s' % (type(err).__name__, err))

In [None]:
# Work on a copy so station_import keeps the rows exactly as they were loaded.
station_data = station_import.copy()

## Cleaning Data

### Trip Data

In [None]:
# Trip durations in our data set are given in seconds; these constants
# express one minute and one hour in that same unit.
second = 1
minute = 60 * second
hour = 60 * minute

# zipcodes are all over the place: only keep clean 5 digit zipcodes and
# replace everything else with the string 'NaN'
def clean_zipcode(item):
    """Normalise a raw zip code string to exactly five digits.

    Everything from the first '-' or '.' onwards is dropped (for example a
    ZIP+4 suffix or a trailing ".0"), and what remains is cut down to its
    first five characters.

    Args:
        item: Raw zip code as a string.

    Returns:
        The five digit zip code string, or the string 'NaN' when fewer than
        five characters remain or they are not all digits.
    """
    # Chain the two splits so the '-' split is actually used, then truncate.
    prefix = item.split('-')[0].split('.')[0][:5]

    if len(prefix) == 5 and prefix.isdigit():
        return prefix
    return 'NaN'

In [None]:
# Clean the raw trip frame: tidy column names, derive date/time helper
# columns, normalise zip codes, fix dtypes and drop trips over 60 minutes.
print('Trip Data Cleanup Started...')

# cleanup column names: lower case, underscores instead of spaces
print('\tcleaning column names')
trip_data.columns = [col.replace(' ', '_').lower() for col in trip_data.columns]

# extract columns we want to keep; .copy() so the column assignments below
# write to an independent frame instead of a slice
print('\tsubsetting to useful columns')
important_cols = ['duration', 'start_date', 'start_terminal', 'end_date', 'end_terminal', 'bike_#', 'subscriber_type', 'zip_code']
trip_data = trip_data[important_cols].copy()

# create duration minutes column (duration is recorded in seconds)
print('\tcreating a duration_minutes column')
trip_data['duration_minutes'] = trip_data['duration'] / 60.0

# convert end and start dates to datetime objects
print('\tconverting end and start dates to datetime objects')
trip_data['start_date'] = pd.to_datetime(trip_data['start_date'], format="%m/%d/%Y %H:%M")
trip_data['end_date']   = pd.to_datetime(trip_data['end_date'],   format="%m/%d/%Y %H:%M")

# calendar date, weekday number and weekday name of the trip start
print('\tcreating trip_date and trip_dow columns')
trip_data['trip_date'] = trip_data['start_date'].dt.date
trip_data['trip_dow']  = trip_data['start_date'].dt.weekday
# .dt.weekday_name no longer exists in current pandas; day_name() is the
# replacement and yields the same weekday names
trip_data['trip_day']  = trip_data['start_date'].dt.day_name()

print('\tcreating start_hour and end_hour columns')
trip_data['start_hour'] = trip_data['start_date'].dt.hour
trip_data['end_hour']   = trip_data['end_date'].dt.hour

# convert and clean zipcodes
# clean_zipcode yields a 5 digit string or the string 'NaN'. The values are
# kept as text: a round trip through a numeric dtype would turn '94107' into
# '94107.0' and strip leading zeros.
print('\tcleaning zipcodes')
trip_data['zip_code'] = trip_data['zip_code'].astype(str).apply(clean_zipcode)

# clean up data types
print('cleaning up data types')

trip_data['duration']         = trip_data['duration'].astype('float')
trip_data['start_terminal']   = trip_data['start_terminal'].astype('category')
trip_data['end_terminal']     = trip_data['end_terminal'].astype('category')
trip_data['bike_#']           = trip_data['bike_#'].astype('int')
trip_data['subscriber_type']  = trip_data['subscriber_type'].astype('category')
trip_data['duration_minutes'] = trip_data['duration_minutes'].astype('float')
trip_data['trip_dow']         = trip_data['trip_dow'].astype('category')
trip_data['trip_day']         = trip_data['trip_day'].astype('category')

# prune data to exclude trips longer than 60 minutes, then order by index
print('pruning data to trips no more than 60 minutes long...')
trip_data = trip_data[trip_data['duration_minutes'] <= 60].sort_index()

print('\tpruned data set \'trip_data\' consists of %i entries' % len(trip_data.index))

print('Trip Data Cleanup complete')
trip_clean = trip_data.copy()

### Weather Data

In [None]:
# Clean the raw weather frame: tidy column names, parse dates, keep the
# columns of interest, index by date and keep only zip code 94107.
print('Weather Data Cleanup Started...')

# cleanup column names: lower case, underscores instead of spaces
print('\tcleaning column names')
weather_data.columns = [col.replace(' ', '_').lower() for col in weather_data.columns]

# convert dates to datetime objects
print('\tconverting dates to datetime objects')
weather_data['date'] = pd.to_datetime(weather_data['date'], format="%m/%d/%Y")

# extract columns we want to keep; .copy() so the assignments below write to
# an independent frame instead of a slice
print('\tsubsetting to useful columns')
important_cols = ['date', 'max_temperature_f', 'mean_temperature_f', 'min_temperaturef',
                  'max_wind_speed_mph', 'mean_wind_speed_mph', 'max_gust_speed_mph',
                  'precipitation_in', 'cloud_cover', 'events', 'zip']
weather_data = weather_data[important_cols].copy()

# correct min_temperaturef column name to min_temperature_f
weather_data = weather_data.rename(columns={'min_temperaturef': 'min_temperature_f'})

# cleanup and set date as index
weather_data = weather_data.set_index('date').sort_index()

# cleanup precipitation data to be all float values; entries that cannot be
# parsed as numbers become NaN (errors='coerce')
weather_data['precipitation_in'] = pd.to_numeric(weather_data['precipitation_in'], errors='coerce')

# we only want San Francisco Weather information, zipcode 94107
weather_data = weather_data[weather_data['zip'] == 94107]

print('Weather Data Cleanup complete')
weather_clean = weather_data.copy()

### Station Data

In [None]:
def label_zip(row):
    """Return the zip code string that goes with a row's landmark.

    Args:
        row: Mapping-like record with a 'landmark' entry.

    Returns:
        The zip code string for one of the five known landmarks, or
        '99999' for any other landmark value.
    """
    zip_by_landmark = {
        'San Francisco': '94107',
        'Redwood City': '94063',
        'Palo Alto': '94301',
        'Mountain View': '94041',
        'San Jose': '95113',
    }
    return zip_by_landmark.get(row['landmark'], '99999')

def make_lat_long(row):
    """Pair up a row's 'lat' and 'long' entries as a (lat, long) tuple."""
    return tuple(row[key] for key in ('lat', 'long'))

In [None]:
# Clean the raw station frame: de-duplicate, fix dtypes, add zip code and
# coordinate columns, then index the result by station_id.
station_data = station_import.copy()

# remove dulplicates and rows that are completely empty
print('remove dulplicates')
station_data = station_data.drop_duplicates(keep='first').dropna(how='all')

# set datatype for each column
print('set datatype for each column')
column_dtypes = (
    ('station_id', 'int'),
    ('name', 'str'),
    ('lat', 'float'),
    ('long', 'float'),
    ('landmark', 'category'),
)
for column, dtype in column_dtypes:
    station_data[column] = station_data[column].astype(dtype)

# add a zipcode column for later comparison with weather data
station_data['zip_code'] = station_data.apply(label_zip, axis=1)

# create lat,lon tuple column
station_data['lat_long'] = station_data.apply(make_lat_long, axis=1)

# keep the first row per (station_id, installation) pair and use station_id
# as the sorted index
print('correcting index')
station_data = (
    station_data
    .reset_index(drop=True)
    .drop_duplicates(['station_id', 'installation'], keep='first')
    .set_index('station_id')
    .sort_index()
)

station_clean = station_data.copy()
print('Cleaning complete!')
station_clean.info()

## Appending Distance Data to Trips

In [None]:
def route_distance(row):
    """Return the distance in km between a trip's start and end stations.

    Station coordinates are read from the 'lat_long' column of the
    module-level station_clean frame, looked up by terminal id.

    Args:
        row: Trip record with 'start_terminal' and 'end_terminal' entries.

    Returns:
        float: 0.0 when the trip starts and ends at the same terminal,
        otherwise the distance in kilometres between the two stations, or
        NaN when the distance cannot be computed. A float NaN (not the
        string 'NaN') keeps a column built from this function numeric.

    Raises:
        KeyError: when a terminal id is not present in station_clean.
    """
    # round trips are defaulting to zero km
    if row['start_terminal'] == row['end_terminal']:
        return 0.0

    # lookup start and end station coords
    start_gps = station_clean.loc[row['start_terminal']]['lat_long']
    end_gps = station_clean.loc[row['end_terminal']]['lat_long']

    # A station id that occurs several times yields a Series; take its last
    # entry. Sloppy lookup: per the original author this means the most recent
    # coordinates, so stations relocated over time are not handled correctly.
    if isinstance(start_gps, pd.Series):
        start_gps = start_gps.iloc[-1]
    if isinstance(end_gps, pd.Series):
        end_gps = end_gps.iloc[-1]

    try:
        # read the kilometre value directly instead of parsing "<x> km"
        return float(vincenty(start_gps, end_gps).kilometers)
    except Exception:
        # best effort: a failed distance calculation yields NaN
        return float('nan')
    

In [None]:
# trip_clean['distance_km'] = trip_clean.apply(lambda row: route_distance (row),axis=1)

## Splitting up Rainy and Dry Days

In [None]:
# split up rainy days and dry days
# A day is rainy when its precipitation is above zero; every other day,
# including days whose precipitation is NaN, counts as dry.
# `~` is the boolean negation: `-series > 0.0` is parsed as `(-series) > 0.0`
# and therefore never matched a dry day.
is_rainy_day = weather_clean['precipitation_in'] > 0.0
rainy_days = weather_clean[is_rainy_day].reset_index()
dry_days = weather_clean[~is_rainy_day].reset_index()

# All trips, split by whether they started on a rainy date
started_on_rainy_day = trip_clean['start_date'].dt.date.isin(rainy_days['date'].dt.date)
rainy_trips = trip_clean[started_on_rainy_day]
dry_trips = trip_clean[~started_on_rainy_day]

# Customer Trips
customer_rainy_trips = rainy_trips[rainy_trips.subscriber_type == 'Customer']
customer_dry_trips = dry_trips[dry_trips.subscriber_type == 'Customer']

# Subscriber Trips
subscriber_rainy_trips = rainy_trips[rainy_trips.subscriber_type == 'Subscriber']
subscriber_dry_trips = dry_trips[dry_trips.subscriber_type == 'Subscriber']

## Splitting up Hot and Cold Days

- Hot days are days with a mean temperature over 75°F
- Cold days are days with a mean temperature below 45°F
- Normal days are all days with a mean temperature from 45°F to 75°F inclusive

In [None]:
# split up hot, cold and normal days by mean temperature
mean_temperature = weather_clean['mean_temperature_f']
hot_days = weather_clean[mean_temperature > 75.0].reset_index()
cold_days = weather_clean[mean_temperature < 45.0].reset_index()

# normal days are days not in either hot or cold days (45.0 to 75.0 inclusive)
norm_days = weather_clean[mean_temperature.between(45.0, 75.0)].reset_index()


# All trips, matched to each group of days by their start date
trip_start_dates = trip_clean['start_date'].dt.date
hot_trips  = trip_clean[trip_start_dates.isin(hot_days['date'].dt.date)]
cold_trips = trip_clean[trip_start_dates.isin(cold_days['date'].dt.date)]
norm_trips = trip_clean[trip_start_dates.isin(norm_days['date'].dt.date)]


# Customer Trips
customer_hot_trips  = hot_trips[hot_trips.subscriber_type == 'Customer']
customer_cold_trips = cold_trips[cold_trips.subscriber_type == 'Customer']
customer_norm_trips = norm_trips[norm_trips.subscriber_type == 'Customer']

# Subscriber Trips
subscriber_hot_trips  = hot_trips[hot_trips.subscriber_type == 'Subscriber']
subscriber_cold_trips = cold_trips[cold_trips.subscriber_type == 'Subscriber']
subscriber_norm_trips = norm_trips[norm_trips.subscriber_type == 'Subscriber']

In [None]:
# Sanity check: print the size of each temperature group, then their sum
# next to the size of the full trip set for comparison.
temperature_group_sizes = [len(group) for group in (hot_trips, cold_trips, norm_trips)]
for group_size in temperature_group_sizes:
    print(group_size)
print('------')
print(sum(temperature_group_sizes))
print(len(trip_clean))

In [None]:
def calculate_stats(data1, data2):
    """Print a statistical comparison of two samples.

    Prints the difference of the sample means (data1 minus data2), then the
    statistic and p value of scipy's independent two-sample t test, then the
    statistic and p value of scipy's Mann-Whitney U test.

    Args:
        data1: First sample; must offer .mean() (e.g. a pandas Series).
        data2: Second sample; must offer .mean().

    Returns:
        None. All results are printed.
    """
    # difference of means
    print('Diff of means:\t\t', data1.mean() - data2.mean())

    # t statistic and p value with scipy
    t_test = stats.ttest_ind(data1, data2)
    print('T Test')
    print('\tt statistic:\t\t', t_test[0])
    print('\tp value:\t\t', t_test[1])
    print('')

    # u statistic and p value with scipy
    u_test = stats.mannwhitneyu(data1, data2)
    print('MannWhitneyU Test')
    print('\tu statistic:\t\t', u_test[0])
    print('\tp value:\t\t', u_test[1])

## Analysis

### 1. Does Rain Affect the Number of Trips Taken by Customers or by Subscribers?

> A <b>Two Sample T Test</b> is appropriate for this problem as we are trying to see a difference between two sample means
- Mean number of trips on rainy days vs mean number of trips on dry days
>
> ##### Customer Trips
- $H1C$o : Customer Mean Number of trips on Rainy Days = Customer Mean Number of trips on Dry Days
- $H1C$a : Customer Mean Number of trips on Rainy Days ≠ Customer Mean Number of trips on Dry Days
>
> ##### Subscriber Trips
- $H2S$o : Subscriber Mean Number of trips on Rainy Days = Subscriber Mean Number of trips on Dry Days
- $H2S$a : Subscriber Mean Number of trips on Rainy Days ≠ Subscriber Mean Number of trips on Dry Days

### 1. Results

> #### Customer Trips
> The mean number of customer trips on rainy days is not significantly different from the mean number of customer trips on dry days
- T Statistic <b>-1.4050</b> 
- P Value <b>0.1667</b> which is above the 0.05 threshold, thus we <b>can not reject</b> the $H1C$o


> #### Subscriber Trips
> The mean number of subscriber trips on rainy days is not significantly different from the mean number of subscriber trips on dry days
- T Statistic <b>0.1287</b> 
- P Value <b>0.8981</b> which is above the 0.05 threshold, thus we <b>can not reject</b> the $H2S$o



### 1. Calculations

In [None]:
# Trips per start hour for each rider type, as a share of all trips taken
# under the same weather condition.
rainy_trip_total = len(rainy_trips)
dry_trip_total = len(dry_trips)

# Customer Trips Only
customer_rainy_data = customer_rainy_trips.groupby('start_hour')['duration'].count() / rainy_trip_total
customer_dry_data = customer_dry_trips.groupby('start_hour')['duration'].count() / dry_trip_total

# Subscriber Trips Only
subscriber_rainy_data = subscriber_rainy_trips.groupby('start_hour')['duration'].count() / rainy_trip_total
subscriber_dry_data = subscriber_dry_trips.groupby('start_hour')['duration'].count() / dry_trip_total

# one report per rider type, separated by divider lines
rain_comparisons = (
    ('Customer Trips', customer_rainy_data, customer_dry_data),
    ('Subscriber Trips', subscriber_rainy_data, subscriber_dry_data),
)
for report_title, rainy_sample, dry_sample in rain_comparisons:
    print('-' * 40)
    print(report_title)
    calculate_stats(rainy_sample, dry_sample)
    print()
print('-' * 40)

In [None]:
# NOTE(review): scratch cell that only displays two intermediate results.
# Both names are first assigned in the "2. Calculations" cells further down,
# so on a fresh top-to-bottom run this cell raises NameError.
customer_hot_data
customer_cold_data
# customer_rainy_data

### 2. Does Temperature Affect the average number of trips of Customers or of Subscribers?

> A <b>Two Sample T Test</b> is appropriate for this problem as we are trying to see a difference between two sample means
- Mean number of trips on hot (or cold) days vs mean number of trips on normal days
>
>
> ##### Customer Trips - Hot Days
- $H1C$o : Customer Mean Number of trips on Hot Days = Customer Mean Number of trips on Normal Days
- $H1C$a : Customer Mean Number of trips on Hot Days ≠ Customer Mean Number of trips on Normal Days
>
> ##### Subscriber Trips - Hot Days
- $H1S$o : Subscriber Mean Number of trips on Hot Days = Subscriber Mean Number of trips on Normal Days
- $H1S$a : Subscriber Mean Number of trips on Hot Days ≠ Subscriber Mean Number of trips on Normal Days


> ##### Customer Trips - Cold Days
- $H2C$o : Customer Mean Number of trips on Cold Days = Customer Mean Number of trips on Normal Days
- $H2C$a : Customer Mean Number of trips on Cold Days ≠ Customer Mean Number of trips on Normal Days
>
> ##### Subscriber Trips - Cold Days
- $H2S$o : Subscriber Mean Number of trips on Cold Days = Subscriber Mean Number of trips on Normal Days
- $H2S$a : Subscriber Mean Number of trips on Cold Days ≠ Subscriber Mean Number of trips on Normal Days

### 2. Results - Hot Days

> #### Customer Trips
> The mean number of customer trips on hot days is not significantly different from the mean number of customer trips on normal days
- T Statistic <b>-0.2014</b> 
- P Value <b>0.8413</b> which is above the 0.05 threshold, thus we <b>can not reject</b> the $H1C$o

> #### Subscriber Trips
> The mean number of subscriber trips on hot days is not significantly different from the mean number of subscriber trips on normal days
- T Statistic <b>0.08339</b> 
- P Value <b>0.9339</b> which is above the 0.05 threshold, thus we <b>can not reject</b> the $H1S$o




### 2. Results - Cold Days

> #### Customer Trips
> The mean number of customer trips on cold days is not significantly different from the mean number of customer trips on normal days
- T Statistic <b>0.5714</b> 
- P Value <b>0.5707</b> which is above the 0.05 threshold, thus we <b>can not reject</b> the $H2C$o

> #### Subscriber Trips
> The mean number of subscriber trips on cold days is not significantly different from the mean number of subscriber trips on normal days
- T Statistic <b>-0.0089</b> 
- P Value <b>0.9929</b> which is above the 0.05 threshold, thus we <b>can not reject</b> the $H2S$o



### 2. Calculations

In [None]:
print('\n Normal vs Hot Days \n')

# Trips per start hour for each rider type, as a share of all trips taken
# in the same temperature group.
hot_trip_total = len(hot_trips)
norm_trip_total = len(norm_trips)

# Customer Trips Only
customer_hot_data = customer_hot_trips.groupby('start_hour')['duration'].count() / hot_trip_total
customer_norm_data = customer_norm_trips.groupby('start_hour')['duration'].count() / norm_trip_total

# Subscriber Trips Only
subscriber_hot_data = subscriber_hot_trips.groupby('start_hour')['duration'].count() / hot_trip_total
subscriber_norm_data = subscriber_norm_trips.groupby('start_hour')['duration'].count() / norm_trip_total

# one report per rider type, separated by divider lines
hot_comparisons = (
    ('Customer Trips', customer_hot_data, customer_norm_data),
    ('Subscriber Trips', subscriber_hot_data, subscriber_norm_data),
)
for report_title, hot_sample, norm_sample in hot_comparisons:
    print('-' * 40)
    print(report_title)
    calculate_stats(hot_sample, norm_sample)
    print()
print('-' * 40)

In [None]:
print('\n Normal vs Cold Days \n')

# Trips per start hour for each rider type, as a share of all trips taken
# in the same temperature group.
cold_trip_total = len(cold_trips)
norm_trip_total = len(norm_trips)

# Customer Trips Only
customer_cold_data = customer_cold_trips.groupby('start_hour')['duration'].count() / cold_trip_total
customer_norm_data = customer_norm_trips.groupby('start_hour')['duration'].count() / norm_trip_total

# Subscriber Trips Only
subscriber_cold_data = subscriber_cold_trips.groupby('start_hour')['duration'].count() / cold_trip_total
subscriber_norm_data = subscriber_norm_trips.groupby('start_hour')['duration'].count() / norm_trip_total

# one report per rider type, separated by divider lines
cold_comparisons = (
    ('Customer Trips', customer_cold_data, customer_norm_data),
    ('Subscriber Trips', subscriber_cold_data, subscriber_norm_data),
)
for report_title, cold_sample, norm_sample in cold_comparisons:
    print('-' * 40)
    print(report_title)
    calculate_stats(cold_sample, norm_sample)
    print()
print('-' * 40)

### 3. Compare morning and evening commute hours

In [None]:
def commute_timer(row, start_time):
    """Return how far a trip's start time of day lies after start_time.

    Args:
        row: Record whose start_date attribute offers to_pydatetime().
        start_time: Reference time of day with hour, minute and second
            attributes (e.g. a datetime.time).

    Returns:
        datetime.timedelta: the trip's start time of day minus start_time.
        Only hours, minutes and seconds are considered; the result is
        negative when the trip starts before start_time.
    """
    def since_midnight(clock):
        # express a time of day as an offset from 00:00:00
        return datetime.timedelta(hours=clock.hour, minutes=clock.minute, seconds=clock.second)

    trip_clock = row.start_date.to_pydatetime().time()
    return since_midnight(trip_clock) - since_midnight(start_time)


In [None]:
# Compare subscriber trips started in the morning commute window with those
# started in the evening commute window, minute by minute.
subscriber_trips = trip_clean[trip_clean.subscriber_type == 'Subscriber']

subscriber_start_times = subscriber_trips.start_date.dt.time

# prune only morning commute hours from subscribers [07:00 - 11:00)
# .copy() so the time_adj column below is written to an independent frame
am_commute_start = datetime.datetime.strptime('07:00', '%H:%M').time()
am_commute_end = datetime.datetime.strptime('11:00', '%H:%M').time()
morning_commutes = subscriber_trips[(subscriber_start_times >= am_commute_start)
                                    & (subscriber_start_times < am_commute_end)].copy()

# prune only evening commute hours from subscribers [16:00 - 20:00)
pm_commute_start = datetime.datetime.strptime('16:00', '%H:%M').time()
pm_commute_end = datetime.datetime.strptime('20:00', '%H:%M').time()
evening_commutes = subscriber_trips[(subscriber_start_times >= pm_commute_start)
                                    & (subscriber_start_times < pm_commute_end)].copy()

# offset of every trip start from the beginning of its commute window
morning_commutes['time_adj'] = morning_commutes.apply(lambda row: commute_timer(row, am_commute_start), axis=1)
evening_commutes['time_adj'] = evening_commutes.apply(lambda row: commute_timer(row, pm_commute_start), axis=1)

# fix time_adj type: whole minutes as floats. Computed from total_seconds()
# because current pandas does not support astype('timedelta64[m]').
morning_commutes['time_adj'] = morning_commutes['time_adj'].dt.total_seconds() // 60
evening_commutes['time_adj'] = evening_commutes['time_adj'].dt.total_seconds() // 60

print('morning_commutes:\t', len(morning_commutes))
print('evening_commutes:\t', len(evening_commutes))

# plot the data, both distributions on the same axes
fig, ax = plt.subplots(figsize=(12,6))

sns.distplot(morning_commutes.time_adj, color='b', label='morning', ax=ax)
sns.distplot(evening_commutes.time_adj, color='y', label='evening', ax=ax)

# the x axis shows time_adj, i.e. minutes, not the start hour
ax.set(xlabel='minutes after start of commute window')
plt.legend()
plt.show()

# calculate some statistics: trips per minute offset in each window
am_commuters = morning_commutes.groupby('time_adj')['start_terminal'].count()
pm_commuters = evening_commutes.groupby('time_adj')['start_terminal'].count()

print('-' * 40)
print('Commuter Trips')
calculate_stats(am_commuters, pm_commuters)
print()
print('-' * 40)

In [None]:
# Regression joint plots of hot vs normal and cold vs normal hourly trip
# shares, first for subscribers and then for customers.

# Columns are added one at a time, so the first column ('cold') sets the
# frame's index and the later ones are aligned to it.
junk_sub = pd.DataFrame()
for column_name, hourly_share in (('cold', subscriber_cold_data),
                                  ('hot', subscriber_hot_data),
                                  ('norm', subscriber_norm_data)):
    junk_sub[column_name] = hourly_share

for x_column in ('hot', 'cold'):
    sns.jointplot(x=x_column, y='norm', data=junk_sub, kind='reg')
    plt.show()


junk_cust = pd.DataFrame()
for column_name, hourly_share in (('cold', customer_cold_data),
                                  ('hot', customer_hot_data),
                                  ('norm', customer_norm_data)):
    junk_cust[column_name] = hourly_share

for x_column in ('hot', 'cold'):
    sns.jointplot(x=x_column, y='norm', data=junk_cust, kind='reg')
    plt.show()

In [None]:
# Regression joint plot of morning vs evening commute trip counts.
# 'morning' is added first, so it sets the frame's index and 'evening' is
# aligned to it.
commute_hours = pd.DataFrame()
for column_name, trip_counts in (('morning', am_commuters), ('evening', pm_commuters)):
    commute_hours[column_name] = trip_counts

sns.jointplot(x='morning', y='evening', data=commute_hours, kind='reg')
plt.show()

### Split morning and evening - Subscribers

In [None]:
def trip_time_adj(row):
    """Return a trip's start time relative to its half of the day.

    Args:
        row: Record whose start_date attribute offers to_pydatetime().

    Returns:
        datetime.timedelta: time elapsed since noon for trips starting at
        12:00:00 or later, otherwise time elapsed since midnight. Only
        hours, minutes and seconds are considered.
    """
    start_clock = row.start_date.to_pydatetime().time()
    since_midnight = datetime.timedelta(hours=start_clock.hour,
                                        minutes=start_clock.minute,
                                        seconds=start_clock.second)
    noon = datetime.timedelta(hours=12)

    # `>=` so a trip starting at exactly 12:00:00 maps to 0 within the
    # afternoon half instead of 12 hours
    if since_midnight >= noon:
        return since_midnight - noon
    return since_midnight

In [None]:
# Compare subscriber trips started before noon with those started from noon
# onwards, minute by minute within each half of the day.
noon = datetime.datetime.strptime('12:00', '%H:%M').time()

# split subscriber trips into morning and afternoon trip times [00:00 - 11:59], [12:00, 23:59]
morning_trips = subscriber_trips[subscriber_trips.start_date.dt.time < noon].copy()
afternoon_trips = subscriber_trips[subscriber_trips.start_date.dt.time >= noon].copy()

# create adjusted time for comparison, reduced to minutes after midnight and minutes after noon
morning_trips['time_adj'] = morning_trips.apply(trip_time_adj, axis=1)
afternoon_trips['time_adj'] = afternoon_trips.apply(trip_time_adj, axis=1)

# fix time_adj type: whole minutes as floats. Computed from total_seconds()
# because current pandas does not support astype('timedelta64[m]').
morning_trips['time_adj'] = morning_trips['time_adj'].dt.total_seconds() // 60
afternoon_trips['time_adj'] = afternoon_trips['time_adj'].dt.total_seconds() // 60

print('morning_trips:\t\t', len(morning_trips))
print('afternoon_trips:\t', len(afternoon_trips))

# plot the data; keep the new axes so the label lands on this figure rather
# than on an axes object left over from an earlier cell
fig, ax = plt.subplots(figsize=(12,6))

sns.distplot(morning_trips.time_adj, color='b', label='morning', ax=ax)
sns.distplot(afternoon_trips.time_adj, color='y', label='evening', ax=ax)

# the x axis shows time_adj, i.e. minutes, not the start hour
ax.set(xlabel='minutes after midnight / noon')
plt.legend()
plt.show()

# calculate some statistics: trips per minute offset in each half of the day
am_trips = morning_trips.groupby('time_adj')['start_terminal'].count()
pm_trips = afternoon_trips.groupby('time_adj')['start_terminal'].count()

print('-' * 40)
print('Commuter Trips')
calculate_stats(am_trips, pm_trips)
print()
print('-' * 40)

In [None]:
# Distribution of customer trips over the start hour of the day.
# Keep the new axes so the label lands on this figure rather than on an
# axes object left over from an earlier cell.
fig, ax = plt.subplots(figsize=(12,6))
sns.distplot(trip_clean[trip_clean.subscriber_type == 'Customer'].start_date.dt.hour,
             color='r', label='Customers', ax=ax)
ax.set(xlabel='start hour')
plt.legend()
plt.show()

### Split morning and evening - Customers

In [None]:
# Compare customer trips started in a morning window with those started in
# an afternoon window, minute by minute from the start of each window.
customer_trips = trip_clean[trip_clean.subscriber_type == 'Customer']

noon = datetime.datetime.strptime('12:00', '%H:%M').time()

customer_start_times = customer_trips.start_date.dt.time

# morning window for customers [06:00 - 12:00)
# The filter uses the same am_trip_* bounds that time_adj is measured from;
# it previously used the subscriber commute window variables.
am_trip_start = datetime.datetime.strptime('06:00', '%H:%M').time()
am_trip_end   = datetime.datetime.strptime('12:00', '%H:%M').time()

morning_trips = customer_trips[(customer_start_times >= am_trip_start)
                               & (customer_start_times < am_trip_end)].copy()

# afternoon window for customers [12:00 - 18:00)
pm_trip_start = datetime.datetime.strptime('12:00', '%H:%M').time()
pm_trip_end   = datetime.datetime.strptime('18:00', '%H:%M').time()

evening_trips = customer_trips[(customer_start_times >= pm_trip_start)
                               & (customer_start_times < pm_trip_end)].copy()

# create adjusted time for comparison, reduced to minutes after the start of each window
morning_trips['time_adj'] = morning_trips.apply(lambda row: commute_timer(row, am_trip_start), axis=1)
evening_trips['time_adj'] = evening_trips.apply(lambda row: commute_timer(row, pm_trip_start), axis=1)

# fix time_adj type: whole minutes as floats. Computed from total_seconds()
# because current pandas does not support astype('timedelta64[m]').
morning_trips['time_adj'] = morning_trips['time_adj'].dt.total_seconds() // 60
evening_trips['time_adj'] = evening_trips['time_adj'].dt.total_seconds() // 60

print('morning_trips:\t', len(morning_trips))
print('evening_trips:\t', len(evening_trips))

# plot the data; keep the new axes so the label lands on this figure rather
# than on an axes object left over from an earlier cell
fig, ax = plt.subplots(figsize=(12,6))

sns.distplot(morning_trips.time_adj, color='b', label='morning', ax=ax)
sns.distplot(evening_trips.time_adj, color='y', label='evening', ax=ax)

# the x axis shows time_adj, i.e. minutes, not the start hour
ax.set(xlabel='minutes after start of window')
plt.legend()
plt.show()

# calculate some statistics: trips per minute offset in each window
am_trips = morning_trips.groupby('time_adj')['start_terminal'].count()
pm_trips = evening_trips.groupby('time_adj')['start_terminal'].count()

print('-' * 40)
print('Commuter Trips')
calculate_stats(am_trips, pm_trips)
print()
print('-' * 40)