# Data Wrangling - Trips

### Import Data

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
font = {'size'   : 50}
matplotlib.rc('font', **font)

LABEL_FONT_SIZE = 15
TITLE_FONT_SIZE = 25

## Import Data

In [7]:
print('Loading Trip Data...')

try:
    file_path_slug = '../../../datasets/cogobikeshare/*_trip_data.csv'

    # glob all files
    file_list = glob(file_path_slug)

    counter = 1

    # load data from each file
    for file in file_list:

        chunks = []
        chunk_counter = 1
        chunksize = 10000
        num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)

        # import file in chunks
        for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True):

            # define Columns
            chunk.columns = ['trip_id', 'duration', 'start_date', 'start_station_name', 'start_terminal', 'end_date', 
                             'end_station_name', 'end_terminal', 'bike_id', 'subscriber_type', 'zip_code']

            # append chunk to chunks list
            chunks.append(chunk)
            
            if chunk_counter == 1 or chunk_counter % math.ceil(num_chunks/10) == 0 or chunk_counter == num_chunks:
                print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
            chunk_counter += 1
            

        if counter == 1:
            trip_import_01 = pd.DataFrame()
            trip_import_01 = pd.concat(chunks)
        elif counter == 2:
            trip_import_02 = pd.DataFrame()
            trip_import_02 = pd.concat(chunks)
        elif counter == 3:
            trip_import_03 = pd.DataFrame()
            trip_import_03 = pd.concat(chunks)
        elif counter == 4:
            trip_import_04 = pd.DataFrame()
            trip_import_04 = pd.concat(chunks)
        else:
            pass


        print('Finished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')

Loading Trip Data...
Data Loaded Successfully!


In [6]:
trip_import_01

NameError: name 'trip_import_01' is not defined

In [None]:
trip_import_01.head(3)

## Utility Functions

In [None]:
# zipcodes are all over the place, only keep corrected 5 digit zipcodes, and replace all others with NaNs
def clean_zipcode(item):
    
    z = str(item)
    
    if len(z) != 5:

        # split on '-'
        try:
            result = z.split('-')[0]
        except:
            result = z

        # split on '.'
        try:
            result = z.split('.')[0]
        except:
            result = z
        
        # if len of item is less than 5, return 'NaN'
        if len(result) < 5:
            result = 'NaN'
        else:
            # if len result is greater than 5, take at most, first 5 digits
            result = result[:5]
    else:
        result = z
    
    # make sure result is all digits
    if result.isdigit():
        result = int(result)
        return result
    else:
        return 99999
    
def clean_trip_frame(df):
    
    # set column types
    df['start_date']       = pd.to_datetime(df['start_date'], format="%m/%d/%Y %H:%M")
    df['end_date']         = pd.to_datetime(df['end_date'],   format="%m/%d/%Y %H:%M")
    df['trip_id']          = df['trip_id'].astype('int')
    df['duration']         = df['duration'].astype('int')    
    df['start_terminal']   = df['start_terminal'].astype('int')
    df['end_terminal']     = df['end_terminal'].astype('int')
    df['bike_id']          = df['bike_id'].astype('int')
    df['subscriber_type']  = df['subscriber_type'].astype('category')
    
    # add 'duration_minutes' column
    df['duration_minutes'] = df['duration'] / 60.
    df['duration_minutes'] = df['duration_minutes'].astype('float')
    
    # Clean Zipcode
    df['zip_code'] = df.zip_code.apply(clean_zipcode)
    
    # rename columns for clarity
    df.rename(columns={'zip_code': 'user_zip', 'subscriber_type': 'user_type'}, inplace=True)

    return df

def plot_terminal_trips(df, id, date_range = (pd.Timestamp('2013-08-01'), pd.Timestamp('2016-10-01')), x_label = 'Date', y_label = 'Trips', title_suffix='', draw_dates=[]):
    
    def group_terminal(df, start=True):
        ''' group by start or end terminal trips per day
        '''
        
        term = pd.DataFrame()
        if start:
            term = df[df['start_terminal'] == id]
        else:
            term = df[df['end_terminal'] == id]
        term.set_index('start_date', inplace=True)
        g_term = term.groupby(term.index.date)['trip_id'].count()
        
        return g_term
    
        
    start_term = group_terminal(df, start=True)
    end_term   = group_terminal(df, start=False)
    
    ax = start_term.plot(kind='line', color='c', alpha=0.75, figsize=(24,3))
    end_term.plot(kind='line', color='g', alpha=0.75, ax=ax)
    
    ax.set_xlim(date_range)
    if title_suffix != '':
        title = 'Station %s - %s' % (id, title_suffix)
    elif y_label != '':
        title = 'Station %s - %s' % (id, y_label)
    else:
        title = 'Station %s' % id
    ax.set_title(title, size=TITLE_FONT_SIZE, weight='bold')
    ax.set_xlabel(x_label, size=LABEL_FONT_SIZE, weight='bold')
    ax.set_ylabel(y_label, size=LABEL_FONT_SIZE, weight='bold')
    
    ax.legend(['Start Terminal', 'End Terminal'], loc=1)
    
    if len(draw_dates) > 0:
        for xc in draw_dates:
            ax.axvline(x=xc, color='k', linestyle=':', alpha=0.5)
    plt.show()
    
    
def date_fixes(df, old_terminal, new_terminal, change_date):
    
    print('[%s]\tUpdating %s to %s for dates after %s' % (datetime.datetime.now().time(), old_terminal, new_terminal, change_date))

#     t_min = pd.Timestamp('2013-05-01')
#     t_max = pd.Timestamp('2016-10-01')
    
    for station in [old_terminal, new_terminal]:
        plot_terminal_trips(df, station, title_suffix='PRE DATE FIX', draw_dates=[change_date])
        
    # Fix A to B - Start Terminal
    print('[%s]\tStarted indexing...' % datetime.datetime.now().time())
    index_to_update_start = df[(df.start_terminal == old_terminal) & (df.start_date >= change_date)].index
    df.loc[index_to_update_start, 'start_terminal'] = new_terminal
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())    

    # Fix A to B - End Terminal
    print('[%s]\tStarted Update...' % datetime.datetime.now().time())
    index_to_update_end   = df[(df.end_terminal == old_terminal) & (df.end_date >= change_date)].index
    df.loc[index_to_update_end, 'end_terminal'] = new_terminal
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())
    
    
    # Fix B to A - Start Terminal
    print('[%s]\tStarted indexing...' % datetime.datetime.now().time())
    index_to_update_start = df[(df.start_terminal == new_terminal) & (df.start_date < change_date)].index
    df.loc[index_to_update_start, 'start_terminal'] = old_terminal
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())    

    # Fix B to A - End Terminal
    print('[%s]\tStarted Update...' % datetime.datetime.now().time())
    index_to_update_end   = df[(df.end_terminal == new_terminal) & (df.end_date < change_date)].index
    df.loc[index_to_update_end, 'end_terminal'] = old_terminal
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())
    

    for station in [old_terminal, new_terminal]:
        plot_terminal_trips(df, station, draw_dates=[change_date])
    
    return df

# Clean Data

- Format zip codes to take only first 5 digits, this data is self reported and often wrongly input per Bay Area Bike Share Notes
- Prune out Trips greater than 60 minutes long
- Adjust records for station relocations and renaming

In [None]:
# clean data frames
print('[%s] Trip Data Cleanup Started' % datetime.datetime.now().time())
trip_01_clean = clean_trip_frame(trip_import_01.copy())
trip_02_clean = clean_trip_frame(trip_import_02.copy())
trip_03_clean = clean_trip_frame(trip_import_03.copy())
trip_04_clean = clean_trip_frame(trip_import_04.copy())

print('[%s] Merging Trip Data' % datetime.datetime.now().time())
trip_data = pd.DataFrame()
trip_data = pd.concat([trip_01_clean, trip_02_clean, trip_03_clean, trip_04_clean])
trip_data.drop_duplicates(inplace=True)
trip_data.sort_values('trip_id', inplace=True)
trip_data.reset_index(inplace=True, drop=True)

print('[%s] Cleanup Complete!' % datetime.datetime.now().time())

In [None]:
trip_data.info()

In [None]:
trip_data.head(3)

In [None]:
trip_data.tail(3)

## Prune By Trip Duration

In [None]:
# prune data to exclude trips longer than 60 minutes
print('[%s] - Removing trips longer than 60 minutes' % (datetime.datetime.now().time()))
drop_list = trip_data[trip_data.duration_minutes > 60.0].index
print('\t\tremoving %s items' % len(drop_list))
trip_data.drop(drop_list, inplace=True)
trip_data.reset_index(inplace=True, drop=True)
print('[%s] - Complete' % (datetime.datetime.now().time()))

trip_data.info()

## Correct Relocated Stations

> There was a delay in station_id updates when stations 23, 24, 25, and 26 were relocated, update status information by changing the station_id for these stations in dates after they were relocated

In [None]:
change_date = datetime.datetime.strptime('2016-07-05', '%Y-%m-%d')
trip_data = date_fixes(trip_data, 23, 88, change_date)
trip_data = date_fixes(trip_data, 24, 89, change_date)

In [None]:
change_date = datetime.datetime.strptime('2016-08-04', '%Y-%m-%d')
trip_data = date_fixes(trip_data, 25, 91, change_date)
trip_data = date_fixes(trip_data, 26, 90, change_date)

## Preview Graphical EDA by User Type

In [None]:
x_ticks = sorted(pd.unique(trip_data.start_date.dt.hour))
subscriber_y = trip_data[trip_data.user_type == 'Subscriber'].groupby(trip_data.start_date.dt.hour)['trip_id'].count()
customer_y   = trip_data[trip_data.user_type == 'Customer'].groupby(trip_data.start_date.dt.hour)['trip_id'].count()
sub_norm  = subscriber_y.to_frame().apply(lambda x: x / (np.max(x) - np.min(x)))
cust_norm =   customer_y.to_frame().apply(lambda x: x / (np.max(x) - np.min(x)))


plt.subplots(figsize=(24,6))
ax = sns.barplot(x = x_ticks , y = subscriber_y, color='b', alpha = 0.5, label='Subscribers')
sns.barplot(x = x_ticks , y = customer_y, color='r', alpha = 0.5, label='Customers', ax=ax)

ax.set_title('Distribution of Trips by Hour', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Start Hour', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Total Trips', size=LABEL_FONT_SIZE, weight='bold')
ax.set_xticks(x_ticks)

plt.legend()
plt.show()



plt.subplots(figsize=(24,6))
ax = sns.barplot(x = x_ticks , y = sub_norm['trip_id'], color='b', alpha = 0.5, label='Subscribers')
sns.barplot(x = x_ticks , y = cust_norm['trip_id'], color='r', alpha = 0.5, label='Customers', ax=ax)

ax.set_title('Distribution of Trips by Hour (Normalized)', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Start Hour', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Distribution', size=LABEL_FONT_SIZE, weight='bold')
ax.set_xticks(x_ticks)

plt.legend()
plt.show()

In [None]:
print('[%s]\tWriting File...' % datetime.datetime.now().time())
trip_data.to_csv('../clean_data/trip_data_cleaned.csv', encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

# Load Station Data

In [None]:
station_data = pd.DataFrame()
station_data = pd.read_csv('../clean_data/station_data_cleaned_final.csv', parse_dates=['first_service_date', 'last_service_date'], index_col=0)

station_data_basic = pd.DataFrame()
# drop duplicated station_id ros, keep first
station_data_basic = station_data.copy()
station_data_basic.drop_duplicates(subset=['station_id'], keep='first', inplace=True)

station_data_basic.drop(['lat', 'long', 'landmark', 'zip_code'], axis=1, inplace=True)

station_data_basic.info()

### Append Service area start and end columns

In [None]:
trip_data.head(3)

In [None]:
station_data.head(3)

In [None]:
print('[%s]\tStarting Area Lookup...' % datetime.datetime.now().time())
trip_data['start_area'] = trip_data['start_terminal'].apply(lambda x: station_data[station_data.station_id == x]['landmark'].iloc[0])
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

print('[%s]\tStarting Area Lookup...' % datetime.datetime.now().time())
trip_data['end_area'] = trip_data['end_terminal'].apply(lambda x: station_data[station_data.station_id == x]['landmark'].iloc[0])
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
print('[%s]\tStarting Area Lookup...' % datetime.datetime.now().time())
trip_data['start_zip'] = trip_data['start_terminal'].apply(lambda x: station_data[station_data.station_id == x]['zip_code'].iloc[0])
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

print('[%s]\tStarting Area Lookup...' % datetime.datetime.now().time())
trip_data['end_zip'] = trip_data['end_terminal'].apply(lambda x: station_data[station_data.station_id == x]['zip_code'].iloc[0])
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

## Review Special Circumstance Stations

In [None]:

# Station 26 Review
s26_dates = ['2013-08-29', '2016-08-04']
plot_terminal_trips(trip_data, 26, x_label = 'Date', y_label = 'Trips', title_suffix='Review', draw_dates=s26_dates)

# Station 30 Review
s30_dates = ['2013-08-29', '2015-09-28', '2016-08-31']
plot_terminal_trips(trip_data, 30, x_label = 'Date', y_label = 'Trips', title_suffix='Review', draw_dates=s30_dates)

# Station 33 Review
s33_dates = ['2013-08-29', '2015-09-16', '2016-08-31']
plot_terminal_trips(trip_data, 33, x_label = 'Date', y_label = 'Trips', title_suffix='Review', draw_dates=s33_dates)

# Station 73 Review
s73_dates = ['2013-08-29', '2015-05-19', '2016-08-31']
plot_terminal_trips(trip_data, 73,x_label = 'Date', y_label = 'Trips', title_suffix='Review', draw_dates=s73_dates)

In [None]:
trip_data.head(5)

In [None]:
trip_data.info()

In [None]:
print('[%s]\tWriting File...' % datetime.datetime.now().time())
trip_data.to_csv('../clean_data/trip_data_cleaned_zips.csv', encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

# Load Weather Data

In [None]:
weather_data = pd.DataFrame()
weather_data = pd.read_csv('../clean_data/weather_cleaned_all.csv', parse_dates=['date'], index_col=['date'])

In [None]:
weather_data.head()

In [None]:
pd.unique(weather_data.events)

## Append Weather Data to Trips

In [None]:
trip_data.head(3)

In [None]:
print('[%s]\tAppending Weather Data...' % datetime.datetime.now().time())


trip_data = pd.merge(trip_data, 
                      weather_data, 
                      how='left', 
                      left_on=[trip_data['start_date'].dt.date, 'start_zip'], 
                      right_on=[weather_data.index.date, 'zip_code'])
trip_data.drop(['zip_code'], axis=1, inplace=True)
# test.set_index('Trip ID', inplace=True)




print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
trip_data.head(3)

In [None]:
trip_data.info()

In [None]:
trip_data.columns

In [None]:
trip_data.to_csv('../clean_data/trip_data_cleaned_master.csv', encoding='utf-8')