In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Locations of the trip-data CSV files, one variable per file
ride_data_orig = "Resources/Metro_Bike_Share_Trip_Data.csv"
ride_data_17_Q2 = "Resources/la_metro_gbfs_trips_Q2_2017.csv"
ride_data_17_Q3 = "Resources/metro-bike-share-trips-2017-q3.csv"
ride_data_17_Q4 = "Resources/metro-bike-share-trips-2017-q4-v2.csv"
ride_data_18_Q1 = "Resources/metro-bike-share-trips-2018-q1.csv"

# Load every file into its own dataframe, in the order listed above.
# low_memory=False makes pandas read each file in one pass rather than in chunks.
(
    ride_orig_df,
    ride_17_Q2_df,
    ride_17_Q3_df,
    ride_17_Q4_df,
    ride_18_Q1_df,
) = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        ride_data_orig,
        ride_data_17_Q2,
        ride_data_17_Q3,
        ride_data_17_Q4,
        ride_data_18_Q1,
    )
)


In [2]:
# Datetime data after Q1 2017 is not zero padded and it has to be zero padded
def _pad_single_digit(field):
    """Return ``field`` with a leading zero if it is exactly one character long."""
    return "0" + field if len(field) == 1 else field


def datetime_reformat(dt_str):
    """Zero-pad the month, day and hour of a ``M/D/Y H:MM`` date-time string.

    Each field is padded on its own, so the result does not depend on the
    overall length of the date or time part: 2- and 4-digit years, and times
    with or without a seconds field, are all handled.

    Parameters
    ----------
    dt_str : str
        Date and time separated by whitespace, e.g. ``"7/4/17 9:05"``.
        Any tokens after the time part are discarded.

    Returns
    -------
    str
        The padded string, e.g. ``"07/04/17 09:05"``. A date part that does
        not consist of three ``/``-separated fields is passed through as is.

    Raises
    ------
    IndexError
        If ``dt_str`` has no whitespace-separated time part.
    """
    # separate date and time
    dt_strings = dt_str.split()
    date_str = dt_strings[0]
    time_str = dt_strings[1]

    # Pad month and day individually rather than guessing which one is short
    # from the total string length (that guess breaks for 4-digit years).
    date_fields = date_str.split("/")
    if len(date_fields) == 3:
        month, day, year = date_fields
        new_date = _pad_single_digit(month) + "/" + _pad_single_digit(day) + "/" + year
    else:
        new_date = date_str

    # Pad only the hour; minutes (and seconds, if present) are kept verbatim.
    hour, colon, remainder = time_str.partition(":")
    new_time = _pad_single_digit(hour) + colon + remainder

    return new_date + " " + new_time


In [3]:
# Rename the start-time column so it is consistent between data files.
# For this initial analysis, the start time is used for every plot (both date and time).
ride_orig_df.rename(columns={'Start Time':'start_time'}, inplace=True)

# Drop columns that aren't needed (leaving some for future analysis).
# Note: 'Starting Lat-Long' and 'Ending Lat-Long' do not exist in the newer data.
ride_orig_df.drop(['Starting Station Latitude', 'Starting Station Longitude', 'Ending Station Latitude',
                   'Ending Station Longitude','Starting Lat-Long', 'Ending Lat-Long'], axis=1, inplace=True)
for quarter_df in (ride_17_Q2_df, ride_17_Q3_df, ride_17_Q4_df, ride_18_Q1_df):
    quarter_df.drop(['start_lat', 'start_lon', 'end_lat', 'end_lon'], axis=1, inplace=True)

# Drop rows that have any NaN values.
# These stay as explicit assignments because dropna() returns a new dataframe
# and each name has to be rebound to it.
ride_orig_df = ride_orig_df.dropna()
ride_17_Q2_df = ride_17_Q2_df.dropna()
ride_17_Q3_df = ride_17_Q3_df.dropna()
ride_17_Q4_df = ride_17_Q4_df.dropna()
ride_18_Q1_df = ride_18_Q1_df.dropna()

# Convert the start time to a datetime object in the original data
ride_orig_df['start_time'] = pd.to_datetime(ride_orig_df['start_time'], format='%m/%d/%Y %I:%M:%S %p')

# The format of the start date and time changed significantly from the original data file
# to the newer ones (17Q2-18Q1). Basically, the original datetime was zero-padded and the new
# data isn't. Converting the original was simple; the later data is zero-padded first.
#
# From https://stackoverflow.com/questions/41191365/python-datetime-strptime-error-is-a-bad-directive-in-format-m-d-y-h
# The use of %-m (for a non-zero-padded month value) will not work on a platform that doesn't have the
# proper GNU strftime C library function installed. Or, from the Python datetime module documentation, the format
# codes that the C standard (1989 version) supports do not include %-m (and the others with a - indicating no
# zero padding). The 1999 version of the C standard added additional format codes. Interpreting these codes is not
# part of Python.

# Vectorized wrapper around datetime_reformat; no longer used in this cell but
# left defined in case a later cell still calls it.
v_format = np.vectorize(datetime_reformat)

# Zero-pad the date/time strings of the newer data, then convert the start time
# to a datetime object. Series.map is used for the padding because np.vectorize
# without otypes raises on an empty column. The end time stays a padded string.
# The tuple is rebuilt here because dropna() above rebound every name.
for quarter_df in (ride_17_Q2_df, ride_17_Q3_df, ride_17_Q4_df, ride_18_Q1_df):
    quarter_df['end_time'] = quarter_df['end_time'].map(datetime_reformat)
    quarter_df['start_time'] = pd.to_datetime(
        quarter_df['start_time'].map(datetime_reformat), format='%m/%d/%y %H:%M'
    )


In [4]:
# Split the original file into 2016 Q3, 2016 Q4 and 2017 Q1

# Tag every ride in the original data with the year and month of its start time
ride_starts = ride_orig_df['start_time']
ride_orig_df['Year'] = ride_starts.dt.year
ride_orig_df['Month'] = ride_starts.dt.month

# Rides that started in 2016 (selected after the new columns exist, so it has them)
started_in_2016 = ride_orig_df['Year'] == 2016
ride_orig_2016_df = ride_orig_df.loc[started_in_2016]

# One dataframe per month, July through December 2016
month_in_2016 = ride_orig_2016_df['Month']
ride_16_Q3_7, ride_16_Q3_8, ride_16_Q3_9 = (
    ride_orig_2016_df.loc[month_in_2016 == month] for month in (7, 8, 9)
)
ride_16_Q4_10, ride_16_Q4_11, ride_16_Q4_12 = (
    ride_orig_2016_df.loc[month_in_2016 == month] for month in (10, 11, 12)
)

# Stack the monthly dataframes into quarters; rows are renumbered from zero
frames = [ride_16_Q3_7, ride_16_Q3_8, ride_16_Q3_9]
ride_16_Q3_df = pd.concat(frames, ignore_index=True, sort=False)

frames = [ride_16_Q4_10, ride_16_Q4_11, ride_16_Q4_12]
ride_16_Q4_df = pd.concat(frames, ignore_index=True, sort=False)

# 2017 Q1: every 2017 row is taken, as an independent copy
# (per the original note, the file's 2017 data only runs from 1/1 to 3/31)
started_in_2017 = ride_orig_df['Year'] == 2017
ride_17_Q1_df = ride_orig_df.loc[started_in_2017].copy()
