# Wrangling Data From Bay Area Bike Share Published Data - Trips

### Import Packages

In [None]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math
import numpy as np

import seaborn as sns
sns.set()

<p> Set some notebook variables, makes the notebook 95% width of the screen for easier viewing</p>

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

<p> Set some global font sizes for plots </p>

In [None]:
font = {'size'   : 50}
matplotlib.rc('font', **font)

LABEL_FONT_SIZE = 15
TITLE_FONT_SIZE = 25

## Import Data

In [None]:
# Load every trip CSV in fixed-size chunks (so progress can be reported while
# reading) and combine all chunks into a single DataFrame, trips_df.
print('Started Loading Trip Data...')
file_path_slug = '../source_data/bayareasbikeshare/*_trip_data.csv'
file_list = glob(file_path_slug)

# number of CSV rows read per chunk
CHUNK_SIZE = 10000

# column names applied to every chunk, replacing the headers found in the files
TRIP_COLUMNS = ['trip_id', 'duration', 'start_date', 'start_station_name', 'start_terminal', 'end_date',
                'end_station_name', 'end_terminal', 'bike_id', 'subscriber_type', 'zip_code']

counter = 1
chunks = []

for file in file_list:

    # Count the data rows up front so progress can be shown as "chunk x of y".
    # The handle is closed as soon as counting is done, and the header line is
    # excluded so the chunk total matches what read_csv will actually yield.
    with open(file, 'r') as handle:
        num_rows = sum(1 for _ in handle) - 1
    num_chunks = max(1, math.ceil(num_rows / CHUNK_SIZE))

    # report roughly every tenth of the file; never let the step reach zero
    report_every = max(1, math.ceil(num_chunks / 10))

    chunk_counter = 1
    for chunk in pd.read_csv(file, chunksize=CHUNK_SIZE, iterator=True):
        # define Columns
        chunk.columns = TRIP_COLUMNS

        # append chunk to chunks list
        chunks.append(chunk)

        if chunk_counter == 1 or chunk_counter % report_every == 0 or chunk_counter == num_chunks:
            print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
        chunk_counter += 1

    print('Finished file! (%d of %d)' % (counter, len(file_list)))
    counter += 1

# one concat at the end is cheaper than growing a DataFrame chunk by chunk
trips_df = pd.concat(chunks)
print('Data Loaded Successfully!')

## Cleaning Support Functions
<p>General Cleaning functions</p>

In [None]:
def clean_zipcode(item):
    ''' Reduce a self reported rider zipcode to a 5 digit integer.

        Zipcodes in this data set are self reported rider zipcodes
        and the method of entry for these zipcodes is reported to
        not be very clear for users, we are using a heavy hand
        when cleaning up zip codes.

        item    - raw zipcode value of any type; it is converted with str()

        Returns the zipcode as an int, or the sentinel 99999 when the value
        cannot be reduced to digits. Values of 5 characters or fewer are not
        truncated or padded: an all-digit value such as '9410' is returned
        as 9410.'''

    # sentinel returned for anything that cannot be read as a zipcode
    invalid_zip = 99999

    zip_string = str(item)

    # if zipcode is longer than 5 characters
    if len(zip_string) > 5:
        # keep what comes before a '-' (e.g. '94107-1234') or before a '.'
        # (e.g. the text form of the float 94107.0), then only the first
        # 5 characters of that
        result = zip_string.split('-')[0].split('.')[0][:5]

        # fewer than 5 characters left over: not a usable zipcode
        if len(result) < 5:
            return invalid_zip

    # 5 characters or fewer, use the text as it is
    else:
        result = zip_string

    # make sure result is all digits
    if not result.isdigit():
        return invalid_zip

    # str.isdigit() also accepts characters such as superscript digits that
    # int() cannot parse, treat those as invalid instead of raising
    try:
        return int(result)
    except ValueError:
        return invalid_zip


def clean_trip_frame(df):
    '''Set the column types of a raw trips frame, add 'duration_minutes'
        (converted from the given 'duration', which is in seconds), clean
        the zip codes and rename 'zip_code' / 'subscriber_type' to
        'user_zip' / 'user_type'.

        The frame is changed in place and is also returned.'''
    # timestamps
    for date_column in ('start_date', 'end_date'):
        df[date_column] = pd.to_datetime(df[date_column], format="%m/%d/%Y %H:%M")

    # integer identifiers and the duration in seconds
    for int_column in ('trip_id', 'duration', 'start_terminal', 'end_terminal', 'bike_id'):
        df[int_column] = df[int_column].astype('int')

    df['subscriber_type'] = df['subscriber_type'].astype('category')

    # seconds -> minutes
    minutes = df['duration'] / 60.
    df['duration_minutes'] = minutes.astype('float')

    # heavy handed zip code clean up, one value at a time
    df['zip_code'] = df['zip_code'].apply(clean_zipcode)

    # clearer names for the rider related columns
    clearer_names = {'zip_code': 'user_zip', 'subscriber_type': 'user_type'}
    df.rename(columns=clearer_names, inplace=True)

    return df

## Cleaning Data
<p>Format zip codes to keep only the first 5 digits. This data is self-reported and, per the Bay Area Bike Share notes, often entered incorrectly.</p>

In [None]:
# fix column types first, then remove duplicate rows, order the trips by
# trip_id and renumber the rows from zero
print('[%s] Trip Data Cleanup Started' % datetime.datetime.now().time())
trips_df = clean_trip_frame(trips_df)

print('[%s] Cleaning Duplicates and Reindexing Trip Data' % datetime.datetime.now().time())
trips_df = (trips_df.drop_duplicates()
                    .sort_values('trip_id')
                    .reset_index(drop=True))

print('[%s] Cleanup Complete!' % datetime.datetime.now().time())

In [None]:
trips_df.shape

### Prune by Duration
<p>The distribution of trip durations is heavily right skewed (a long tail of very long trips). Pruning trips longer than 60 minutes in duration helps get a better picture of the important data points.</p>

In [None]:
trips_df.duration_minutes.plot(kind='hist', bins=60, figsize=(12,6))
plt.xlabel('Duration (Minutes)')
plt.ylabel('Frequency')
plt.title('Trip Duration Histogram')
plt.show()

In [None]:
# drop every trip that lasted more than 60 minutes, then renumber the rows
print('[%s] - Removing trips longer than 60 minutes' % (datetime.datetime.now().time()))

# row labels of the trips to remove
drop_list = trips_df.index[trips_df['duration_minutes'] > 60.0]
print('\t\tremoving %s items' % len(drop_list))

trips_df = trips_df.drop(drop_list).reset_index(drop=True)
print('[%s] - Complete' % (datetime.datetime.now().time()))

In [None]:
trips_df.duration_minutes.plot(kind='hist', bins=60, figsize=(12,6))
plt.xlabel('Duration (Minutes)')
plt.ylabel('Frequency')
plt.title('Trip Duration Histogram - Pruned')
plt.show()

## Correct Relocated Stations

> There was a delay in station_id updates when stations 23, 24, 25, and 26 were relocated. Correct the trip information by changing the station_id for these stations on dates after they were relocated.

In [None]:
def plot_terminal_trips(df, id, date_range = (pd.Timestamp('2013-08-01'), pd.Timestamp('2016-10-01')), x_label = 'Date', y_label = 'Trips', title_suffix='', draw_dates=None):
    '''Plot, for one terminal, the number of trips per day that start at it
        and the number of trips per day that end at it.

        df           - trips frame with 'start_terminal', 'end_terminal',
                       'start_date' and 'trip_id' columns
        id           - terminal id to plot (the name shadows the builtin but
                       is kept so existing callers keep working)
        date_range   - (min, max) limits applied to the x axis
        x_label      - x axis label
        y_label      - y axis label, also used in the title when no
                       title_suffix is given
        title_suffix - text appended to 'Station <id> - ' in the title
        draw_dates   - optional iterable of x positions at which a dotted
                       vertical line is drawn; None draws no lines

        The figure is shown with plt.show(); nothing is returned.'''
    def group_terminal(df, start=True):
        ''' count trips per day for trips that start (start=True) or end
            (start=False) at the terminal
        '''
        terminal_column = 'start_terminal' if start else 'end_terminal'

        # set_index returns a new frame here; doing it in place on a filtered
        # slice of df raises pandas' SettingWithCopyWarning
        # NOTE(review): trips ending at the terminal are also bucketed by
        # their start_date - confirm this is intended rather than end_date
        term = df[df[terminal_column] == id].set_index('start_date')

        return term.groupby(term.index.date)['trip_id'].count()

    start_term = group_terminal(df, start=True)
    end_term   = group_terminal(df, start=False)

    # both series share one axis: starts in cyan, ends in green
    ax = start_term.plot(kind='line', color='c', alpha=0.75, figsize=(24,3))
    end_term.plot(kind='line', color='g', alpha=0.75, ax=ax)

    ax.set_xlim(date_range)

    # title preference: explicit suffix, then the y label, then the id alone
    if title_suffix != '':
        title = 'Station %s - %s' % (id, title_suffix)
    elif y_label != '':
        title = 'Station %s - %s' % (id, y_label)
    else:
        title = 'Station %s' % id
    ax.set_title(title, size=TITLE_FONT_SIZE, weight='bold')
    ax.set_xlabel(x_label, size=LABEL_FONT_SIZE, weight='bold')
    ax.set_ylabel(y_label, size=LABEL_FONT_SIZE, weight='bold')

    ax.legend(['Start Terminal', 'End Terminal'], loc=1)

    # None replaces the former mutable [] default; an empty iterable simply
    # draws nothing
    if draw_dates is not None:
        for xc in draw_dates:
            ax.axvline(x=xc, color='k', linestyle=':', alpha=0.5)
    plt.show()

def date_fixes(df, old_terminal, new_terminal, change_date):
    '''Correct terminal ids around a station relocation.

        Trips that start (or end) on or after change_date and still carry
        old_terminal are set to new_terminal; trips that start (or end)
        before change_date and carry new_terminal are set back to
        old_terminal. Start terminals are checked against 'start_date' and
        end terminals against 'end_date'.

        Outputs graphs of both terminals before and after the correction
        for quick validation.

        df           - trips frame, modified in place
        old_terminal - terminal id in use before the relocation
        new_terminal - terminal id in use from the relocation onwards
        change_date  - first date on which new_terminal applies

        Returns the same (modified) df.'''
    print('[%s]\tUpdating %s to %s for dates after %s' % (datetime.datetime.now().time(), old_terminal, new_terminal, change_date))

    for station in [old_terminal, new_terminal]:
        plot_terminal_trips(df, station, title_suffix='PRE DATE FIX', draw_dates=[change_date])

    # Fix A to B - trips on or after the change date must use the new id
    # (the message now states the window that the filter actually selects)
    print('[%s]\tSetting %s to %s occurring on or after %s' % (datetime.datetime.now().time(), old_terminal, new_terminal, change_date))
    late_old_starts = (df['start_terminal'] == old_terminal) & (df['start_date'] >= change_date)
    df.loc[late_old_starts, 'start_terminal'] = new_terminal

    late_old_ends = (df['end_terminal'] == old_terminal) & (df['end_date'] >= change_date)
    df.loc[late_old_ends, 'end_terminal'] = new_terminal
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())

    # Fix B to A - trips before the change date must use the old id
    print('[%s]\tSetting %s to %s occurring before %s' % (datetime.datetime.now().time(), new_terminal, old_terminal, change_date))
    early_new_starts = (df['start_terminal'] == new_terminal) & (df['start_date'] < change_date)
    df.loc[early_new_starts, 'start_terminal'] = old_terminal

    early_new_ends = (df['end_terminal'] == new_terminal) & (df['end_date'] < change_date)
    df.loc[early_new_ends, 'end_terminal'] = old_terminal
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())

    for station in [old_terminal, new_terminal]:
        plot_terminal_trips(df, station, draw_dates=[change_date])

    return df

In [None]:
change_date = datetime.datetime.strptime('2016-07-05', '%Y-%m-%d')
trips_df = date_fixes(trips_df, 23, 88, change_date)
trips_df = date_fixes(trips_df, 24, 89, change_date)

In [None]:
change_date = datetime.datetime.strptime('2016-08-04', '%Y-%m-%d')
trips_df = date_fixes(trips_df, 25, 91, change_date)
trips_df = date_fixes(trips_df, 26, 90, change_date)

# Write Data to File - Basic Trip Data

In [None]:
trips_df.to_csv('../clean_data/bayareabikeshare/trip_data_cleaned.csv', encoding='utf-8')

# Append Station and Weather Data to Trips

### Load Station and Weather Data

In [None]:
# load Station Data
stations_df = pd.DataFrame()
stations_df = pd.read_csv('./clean_data/station_data_cleaned.csv', index_col=0, parse_dates=['first_service_date', 'last_service_date'])
stations_df.head()

In [None]:
# load weather data
# 'date' is parsed as a timestamp and kept as an ordinary column: the weather
# merge further down reads weather_df.date. The earlier
# set_index('date', drop=True) call discarded its result and so changed
# nothing; it is removed rather than "fixed".
weather_df = pd.read_csv('./clean_data/weather_data_cleaned.csv', parse_dates=['date'], index_col=0)
weather_df.head()

### Append Data to Trips

In [None]:
print('trips_df.shape\t\t%s\t%s' % (trips_df.shape[0], trips_df.shape[1]))
print('stations_df.shape\t%s\t%s' % (stations_df.shape[0], stations_df.shape[1]))
print('weather_df.shape\t%s\t%s' % (weather_df.shape[0], weather_df.shape[1]))

In [None]:
# append Station Data to Trips
# One lookup table keyed by station_id replaces filtering stations_df once per
# trip row. keep='first' picks the same row the per-row .iloc[0] lookup used
# when a station_id appears more than once.
station_lookup = stations_df.drop_duplicates(subset='station_id', keep='first').set_index('station_id')

# a terminal without station data raised IndexError before; keep failing
# loudly instead of silently writing NaN
unknown_terminals = (set(trips_df['start_terminal']) | set(trips_df['end_terminal'])) - set(station_lookup.index)
if unknown_terminals:
    raise IndexError('No station data for terminal(s): %s' % sorted(unknown_terminals))

print('[%s]\tAppending Start Area & Zip Lookup...' % datetime.datetime.now().time())
trips_df['start_area'] = trips_df['start_terminal'].map(station_lookup['landmark'])
trips_df['start_zip'] = trips_df['start_terminal'].map(station_lookup['zip_code'])
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

print('[%s]\tAppending End Area & Zip Lookup...' % datetime.datetime.now().time())
trips_df['end_area'] = trips_df['end_terminal'].map(station_lookup['landmark'])
trips_df['end_zip'] = trips_df['end_terminal'].map(station_lookup['zip_code'])
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

print('[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
trips_df.columns

In [None]:
# weather columns that must not already be present on trips_df
dl = ['date', 'max_temp', 'mean_temp',
       'min_temp', 'max_wind', 'mean_wind', 'max_gust', 'precipitation',
       'cloud_cover', 'events', 'zip_code']

# errors='ignore' skips the labels that are absent, without hiding any other
# failure the way a bare "except: pass" around each drop did
trips_df.drop(dl, axis=1, inplace=True, errors='ignore')

In [None]:
trips_df.head(5)

In [None]:
weather_df.head(5)

In [None]:
# Left-join the weather rows onto the trips: every trip is kept, and it is
# matched to the weather row whose calendar date equals the date part of the
# trip's start_date and whose zip_code equals the trip's start_zip.
print('[%s]\tAppending Weather Data...' % datetime.datetime.now().time())

# NOTE(review): the date keys are passed as array-likes rather than column
# names; pandas may add a synthetic key column (e.g. 'key_0') to the result
# for such keys - check trips_complete_df.info() and confirm it is wanted.
# NOTE(review): if weather_df holds more than one row for a (date, zip_code)
# pair the left join repeats the matching trips - confirm the pairs are unique.
trips_complete_df = pd.merge(trips_df, 
                          weather_df, 
                          how='left', 
                          left_on=[trips_df['start_date'].dt.date, 'start_zip'], 
                          right_on=[weather_df.date.dt.date, 'zip_code'])
# the weather zip_code duplicates start_zip after the join, so it is removed
trips_complete_df.drop(['zip_code'], axis=1, inplace=True)

print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
trips_complete_df.info()

# Write Data to File - Complete Trip Data

In [None]:
trips_complete_df.to_csv('../clean_data/bayareabikeshare/trip_data_extended_cleaned.csv', encoding='utf-8')

***
# Preview EDA of Stations Data

In [None]:
# Hours of the day (taken from start_date) that occur in the data, ascending
x_ticks = sorted(pd.unique(trips_complete_df.start_date.dt.hour))

# Trip counts per start hour, one series per user type
# NOTE(review): assumes each user type has at least one trip in every hour
# listed in x_ticks; otherwise x and y have different lengths - TODO confirm
subscriber_y = trips_complete_df[trips_complete_df.user_type == 'Subscriber'].groupby(trips_complete_df.start_date.dt.hour)['trip_id'].count()
customer_y   = trips_complete_df[trips_complete_df.user_type == 'Customer'].groupby(trips_complete_df.start_date.dt.hour)['trip_id'].count()

# Scale each series by its own range (max - min) so the two user types can be
# drawn on a comparable scale.
# NOTE(review): the minimum is not subtracted first, so this is not a classic
# min-max normalisation, and a series with max == min would divide by zero -
# confirm the intended scaling
sub_norm  = subscriber_y.to_frame().apply(lambda x: x / (np.max(x) - np.min(x)))
cust_norm =   customer_y.to_frame().apply(lambda x: x / (np.max(x) - np.min(x)))

# Trips by Hour by Customer Type
# both bar series are drawn semi-transparent on the same axis
plt.subplots(figsize=(24,6))
ax = sns.barplot(x = x_ticks , y = subscriber_y, color='b', alpha = 0.5, label='Subscribers')
sns.barplot(x = x_ticks , y = customer_y, color='r', alpha = 0.5, label='Customers', ax=ax)

ax.set_title('Distribution of Trips by Hour', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Start Hour', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Total Trips', size=LABEL_FONT_SIZE, weight='bold')
ax.set_xticks(x_ticks)

plt.legend()
plt.show()

# Normalized Trips by Hour by Customer Type
# same layout as the plot above, using the range-scaled counts
plt.subplots(figsize=(24,6))
ax = sns.barplot(x = x_ticks , y = sub_norm['trip_id'], color='b', alpha = 0.5, label='Subscribers')
sns.barplot(x = x_ticks , y = cust_norm['trip_id'], color='r', alpha = 0.5, label='Customers', ax=ax)

ax.set_title('Distribution of Trips by Hour (Normalized)', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Start Hour', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Distribution', size=LABEL_FONT_SIZE, weight='bold')
ax.set_xticks(x_ticks)

plt.legend()
plt.show()

# Quick Reference to some 'Special Stations'
<p>Some stations were relocated, closed, or expanded during the period covered by the dataset used in this analysis. This is a quick view of these notable stations.</p>
<ul>
    <li> Station 21 - Relocated on September 16, 2015
    <li> Station 30 - Relocated on September 28, 2015
    <li> Station 33 - Relocated on September 16, 2015
    <li> Station 73 - Relocated on March 14, 2016 and May 19, 2016 had 4 extra docks added
</ul>

In [None]:
# Dates marked with a dotted vertical line on each station's review plot
s21_dates = ['2013-08-29', '2015-09-16', '2016-08-04']
s30_dates = ['2013-08-29', '2015-09-28', '2016-08-31']
s33_dates = ['2013-08-29', '2015-09-16', '2016-08-31']
s73_dates = ['2013-08-29', '2015-05-19', '2016-08-31']

# Review plots for stations 21, 30, 33 and 73, in that order
for station_id, marker_dates in ((21, s21_dates), (30, s30_dates), (33, s33_dates), (73, s73_dates)):
    plot_terminal_trips(trips_df, station_id, x_label = 'Date', y_label = 'Trips', title_suffix='Review', draw_dates=marker_dates)