# Load Station Status Stream Files

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

In [2]:
# Widen the notebook container so the very wide figures below are not clipped.
# `IPython.display` is the public home of `display` and `HTML`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Global matplotlib font size, followed by the explicit sizes the plotting
# helpers below pass for axis labels and titles.
font = dict(size=50)
matplotlib.rc('font', **font)

LABEL_FONT_SIZE, TITLE_FONT_SIZE = 15, 25

## Import Data

In [4]:
def load_status_data(file_path_slug='../streamed_data/*/*station_status*.csv'):
    """Load every streamed station-status CSV into a single DataFrame.

    Parameters
    ----------
    file_path_slug : str
        Glob pattern of the CSV files to read. The name of the directory that
        directly contains each file is stored in a new ``program_id`` column.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files that could be read (original
        per-file indexes are kept, exactly as ``pd.concat`` produces them).

    Raises
    ------
    ValueError
        If no file matched the pattern or none of the matches could be read.
    """
    import os  # local import: only this loader needs it

    print('Loading Status Data...')

    # sorted so the row order of the result is reproducible between runs
    file_list = sorted(glob(file_path_slug))
    total = len(file_list)

    print('Started Import (%s files to load)...' % total)

    # report progress on the first file, roughly every 10%, and on the last file
    report_every = max(1, math.ceil(total / 10))

    chunks = []
    counter = 0

    for file in file_list:
        try:
            chunk = pd.read_csv(file)
        except Exception as exc:
            # best effort: a broken/empty file is reported and skipped, but
            # KeyboardInterrupt still stops the (long) import
            print('\tSkipped %s (%s: %s)' % (file, type(exc).__name__, exc))
            continue

        # the program id is the folder the file lives in; using os.path keeps
        # this independent of the path separator and of the pattern depth
        chunk['program_id'] = os.path.basename(os.path.dirname(file))
        chunks.append(chunk)

        counter += 1
        if counter == 1 or counter % report_every == 0 or counter == total:
            print('\tFinished file! (%d of %d)' % (counter, total))

    if not chunks:
        raise ValueError('No station status files could be loaded from %r' % file_path_slug)

    status_import = pd.concat(chunks)
    print('Data Loaded Successfully!\n')
    return status_import
    
def clean_data(df):
    """Normalise raw station-status rows and derive per-station trip counts.

    Steps, in order:
      1. rename the raw feed columns (``last_reported`` -> ``time``,
         ``num_bikes_available`` -> ``bikes_available``, ...),
      2. drop the status-flag / disabled-count columns that are not used,
      3. parse ``time`` and discard rows dated before 2000-01-01,
      4. sort by program, station and time and drop duplicate readings,
      5. per (program, station), turn positive changes between consecutive
         readings into ``trips_ending`` (increase of ``bikes_available``) and
         ``trips_starting`` (increase of ``docks_available``), and compute
         ``net_traffic = trips_starting - trips_ending``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw import; needs ``program_id``, ``station_id``, ``last_reported``
        (or ``time``), and the bike/dock availability columns. The frame
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows ordered by program, station and time, with a fresh
        0..n-1 index and the three derived columns appended.
    """
    program_list = sorted(pd.unique(df.program_id))
    print('Started cleaning (%s programs to clean)...' % len(program_list))

    drop_cols = ['eightd_active_station_services', 'has_available_keys', 'is_installed', 'is_renting', 'is_returning', 'bikes_disabled', 'docks_disabled', 'renting', 'returning', 'installed']

    # rename/drop return new frames, so the caller's DataFrame stays untouched;
    # errors='ignore' skips columns a given feed does not provide
    df = (df.rename(columns={'last_reported' : 'time',
                             'eightd_has_available_keys' : 'has_available_keys',
                             'num_bikes_available' : 'bikes_available',
                             'num_bikes_disabled' : 'bikes_disabled',
                             'num_docks_available' : 'docks_available',
                             'num_docks_disabled' : 'docks_disabled'})
            .drop(columns=drop_cols, errors='ignore'))

    # convert time to datetime, and prune entries pre 2000, some epoch dates are being reported
    # (parse the argument's own column, not a notebook-level global)
    df = df.assign(time=pd.to_datetime(df['time'], format="%Y-%m-%d %H:%M:%S"))
    change_date = pd.Timestamp('2000-01-01 00:00')
    df = df[df.time >= change_date]

    df = df.sort_values(['program_id', 'station_id', 'time'])

    rows_before = len(df)
    df = df.drop_duplicates(subset=['time', 'program_id', 'station_id'])
    print('\t\tDropped %s duplicates!' % (rows_before - len(df)))

    # by program and station, append dock change and bike change
    cleaned_list = []
    for counter, pid in enumerate(program_list, start=1):
        df_program = df[df.program_id == pid]

        print('%s\t%s - %s Stations to Clean...' % (str(counter).rjust(3), pid.ljust(25), str(df_program.station_id.nunique()).rjust(4)))

        # One pass over the program's rows instead of a boolean-mask scan per
        # station. Rows are already time-ordered and unique per station after
        # the sort/dedupe above, so no per-station sort or dedupe is needed.
        for sid, df_station in df_program.groupby('station_id', sort=False):
            # only increases count; the first reading (NaN diff) counts as 0
            trips_ending = df_station['bikes_available'].diff().clip(lower=0).fillna(0)
            trips_starting = df_station['docks_available'].diff().clip(lower=0).fillna(0)

            cleaned_list.append(df_station.assign(trips_ending=trips_ending,
                                                  trips_starting=trips_starting,
                                                  net_traffic=trips_starting - trips_ending))

    print('Merging DataFrame...')
    if cleaned_list:
        cleaned_df = pd.concat(cleaned_list)
    else:
        # nothing survived the filters: return an empty frame with the same schema
        cleaned_df = df.iloc[0:0].assign(trips_ending=0.0, trips_starting=0.0, net_traffic=0.0)
    cleaned_df = cleaned_df.reset_index(drop=True)

    print('Cleaning Complete!')

    return cleaned_df


In [None]:
# Read every streamed station-status CSV into one raw DataFrame
# (8696 files in the recorded run below).
status_import = load_status_data()

Loading Status Data...
Started Import (8696 files to load)...
	Finished file! (1 of 8696)
	Finished file! (870 of 8696)
	Finished file! (1740 of 8696)
	Finished file! (2610 of 8696)
	Finished file! (3480 of 8696)
	Finished file! (4350 of 8696)
	Finished file! (5220 of 8696)
	Finished file! (6090 of 8696)
	Finished file! (6960 of 8696)
	Finished file! (7830 of 8696)
	Finished file! (8696 of 8696)


In [None]:
# Column dtypes, non-null counts and memory footprint of the raw import.
status_import.info()

In [None]:
# Rename/drop columns, prune bad timestamps and duplicates, and derive the
# trips_starting / trips_ending / net_traffic columns per station.
status_data = clean_data(status_import)

In [None]:
# Same overview for the cleaned frame, to confirm the columns that remain.
status_data.info()

In [None]:
# Peek at the first 20 cleaned rows.
status_data.head(20)

In [None]:
def show_station(df, id, x_label = '', y_label = '', title_suffix='', color='b', col='bikes_available'):
    """Plot the running total of one column over time for a single station.

    Rows of ``df`` whose ``station_id`` equals ``id`` are indexed by ``time``
    and the cumulative sum of ``col`` is drawn as a 24x3 line chart.

    The title is ``'Station <id> - <title_suffix>'`` when a suffix is given,
    otherwise ``'Station <id> - <y_label>'`` when a y label is given, otherwise
    just ``'Station <id>'``.
    """
    station_rows = df[df.station_id == id].set_index('time')
    ax = station_rows[col].cumsum().plot(color=color, figsize=(24,3))

    # first non-empty candidate wins: explicit suffix, then the y-axis label
    title = 'Station %s' % id
    for subtitle in (title_suffix, y_label):
        if subtitle != '':
            title = 'Station %s - %s' % (id, subtitle)
            break

    bold = dict(weight='bold')
    ax.set_title(title, size=TITLE_FONT_SIZE, **bold)
    ax.set_xlabel(x_label, size=LABEL_FONT_SIZE, **bold)
    ax.set_ylabel(y_label, size=LABEL_FONT_SIZE, **bold)
    plt.show()

In [None]:
def plot_hourly_usage(df, title='CoGo', color='r', change_col='bike_change'):
    """Bar chart of how many status rows with a non-zero change fall in each hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``time`` column, ``station_id`` and ``change_col``.
        The frame is not modified.
    title : str
        Legend label for the bars; also used in the failure message.
    color : str
        Bar colour.
    change_col : str
        Column whose non-zero rows are counted. The default keeps the
        historical name ``bike_change``; frames produced by ``clean_data`` in
        this notebook do not have that column, so pass e.g.
        ``change_col='trips_starting'`` for them.

    Plotting stays best-effort: any failure is reported on stdout together
    with its reason instead of being raised.
    """
    try:
        if change_col not in df.columns:
            raise KeyError('column %r not found; pass change_col=<existing column>' % change_col)

        # every hour present in the data gets a bar, even when its count is 0,
        # so bar positions and tick labels always line up
        hours = sorted(pd.unique(df.time.dt.hour))
        df_data = df[df[change_col] != 0]
        df_status_change = (df_data.groupby(df_data.time.dt.hour)['station_id']
                                   .count()
                                   .reindex(hours, fill_value=0))

        fig, ax = plt.subplots(figsize=(24,3))
        sns.barplot(x = df_status_change.index , y = df_status_change, color=color, alpha = 0.25, label=title, ax=ax)
        ax.set_title('Distribution of Trips by Hour', size=TITLE_FONT_SIZE, weight='bold')
        ax.set_xlabel('Start Hour', size=LABEL_FONT_SIZE, weight='bold')
        ax.set_ylabel('Total Trips', size=LABEL_FONT_SIZE, weight='bold')
        # bars sit at categorical positions 0..k-1, not at the hour values
        ax.set_xticks(range(len(hours)))
        ax.set_xticklabels(hours)
        ax.legend()
        plt.show()
    except Exception as exc:
        print('Unable to plot %s (%s: %s)' % (title, type(exc).__name__, exc))

In [None]:
def resample_status(df, interval='5min', verbose=False):
    """Average the numeric status columns per station over fixed time bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``program_id``, ``station_id`` and a datetime ``time`` column.
    interval : str
        Pandas frequency string for the bin width. The default ``'5min'`` is
        the same 5-minute frequency as the older ``'5T'`` spelling, which
        newer pandas versions deprecate.
    verbose : bool
        Print a timestamp before and after the resampling.

    Returns
    -------
    pandas.DataFrame
        One row per (program, station, time bin) that has data, holding the
        mean of every numeric column, with a fresh 0..n-1 index. An input
        without rows yields an empty frame with the input's columns.
    """
    if verbose:
        print('[%s] Resampling on interval' % datetime.datetime.now().time())

    resampled_list = []

    for pid in pd.unique(df.program_id):
        df_temp = df[df.program_id == pid].set_index(['station_id', 'time'])

        # mean per station and time bin; numeric_only=True leaves out string
        # columns such as program_id, which otherwise make .mean() raise on
        # pandas >= 2.0
        df_temp = (df_temp
                   .groupby([pd.Grouper(level=0), pd.Grouper(freq=interval, level=-1)])
                   .mean(numeric_only=True)
                   .reset_index())

        # program_id was excluded from the mean; add it back as a constant
        df_temp['program_id'] = pid

        resampled_list.append(df_temp)

    if resampled_list:
        resampled_df = pd.concat(resampled_list).reset_index(drop=True)
    else:
        # pd.concat([]) raises; an empty input simply gives an empty result
        resampled_df = df.iloc[0:0].copy()

    if verbose:
        print('[%s] Complete' % datetime.datetime.now().time())
    return resampled_df

In [None]:
# Trips started per hour of day, one semi-transparent bar series per program,
# all drawn on the same axes.
fig, ax = plt.subplots(figsize=(24,6))
hour_of_day = status_data.time.dt.hour
for pid in pd.unique(status_data.program_id):
    program_rows = status_data[status_data.program_id == pid]
    hourly_starts = program_rows.groupby([hour_of_day]).trips_starting.sum()
    hourly_starts.plot(kind='bar', color='b', alpha= 0.5, ax=ax)

plt.show()

In [None]:
# Trips started (blue circles) and ended (red squares) for every
# (day of week, hour) combination, summed over all programs.
weekday_hour = [status_data.time.dt.dayofweek, status_data.time.dt.hour]
d0 = status_data.groupby(weekday_hour).trips_starting.sum()
d1 = status_data.groupby(weekday_hour).trips_ending.sum()

ax = d0.plot(figsize=(36,6), color='b', alpha=0.5, marker='o')
d1.plot(ax=ax, color='r', alpha=0.5, marker='s')

# round the larger of the two peaks up to the next tick so the axis ends on a tick
y_tick_interval = 500
y_max = math.ceil(max(d0.max(), d1.max())/y_tick_interval)*y_tick_interval

# +1 so the tick at y_max itself is included; the previous `* 1.1` scaling
# dropped the top tick whenever fewer than ten intervals were needed
y_tick_max = int(y_max/y_tick_interval)+1

ax.set_ylim(0, y_max)
ax.set_yticks([y*y_tick_interval for y in range(0, y_tick_max)])

plt.show()

In [None]:
# Hourly trips started (blue circles) and ended (red squares), one pair of
# lines per program on shared axes.
fig, ax = plt.subplots(figsize=(36,6))
y_max = 0
hour_of_day = status_data.time.dt.hour
for pid in pd.unique(status_data.program_id):
    # group once per program and reuse the sums for plotting and for the axis limit
    hourly = status_data[status_data.program_id == pid].groupby([hour_of_day])
    hourly_starts = hourly.trips_starting.sum()
    hourly_ends = hourly.trips_ending.sum()

    hourly_starts.plot(color='b', alpha= 0.5, ax=ax, marker='o')
    hourly_ends.plot(color='r', alpha= 0.5, ax=ax, marker='s')

    # track the highest hourly total seen across all programs
    y_max = max(y_max, hourly_starts.max(), hourly_ends.max())

y_tick_interval = 1000
y_max = math.ceil(y_max/y_tick_interval)*y_tick_interval
y_tick_max = int(y_max/y_tick_interval)+1
# hours come from `status_data`; the name `df` does not exist at notebook level
x_ticks = sorted(pd.unique(status_data.time.dt.hour))
ax.set_xticks(x_ticks)
ax.set_ylim(0, y_max)
ax.set_yticks([y*y_tick_interval for y in range(0, y_tick_max)])

plt.show()

In [None]:
# Text table: total trips started, total trips ended and their difference per program.
print('Program ID\t\t\tTrips Starting\tTrips Ending\tNet')
for pid in pd.unique(status_data.program_id):
    program_rows = status_data[status_data.program_id == pid]
    S = program_rows.trips_starting.sum()
    E = program_rows.trips_ending.sum()
    row = (pid.ljust(25), str(S), str(E), S-E)
    print('%s\t%s\t\t%s\t\t%s' % row)

In [None]:
# `status_data_resample` is used from this cell onwards but was never assigned
# in the notebook, so a fresh kernel failed here. Build it from the cleaned
# data with the resampling helper (default interval).
status_data_resample = resample_status(status_data)

# One figure per program: resampled trips started (blue circles) and ended
# (red squares) summed by hour of day.
for pid in pd.unique(status_data_resample.program_id):
    fig, ax = plt.subplots(figsize=(48,6))
    data = status_data_resample[status_data_resample.program_id == pid].groupby([status_data_resample.time.dt.hour])
    data.trips_starting.sum().plot(kind ='line', color='b', alpha= 0.5, ax=ax, marker='o')
    data.trips_ending.sum().plot(kind ='line', color='r', alpha= 0.5, ax=ax, marker='s')
    title = 'Program - %s' % pid
    ax.set_title(title)
    x_ticks = range(0, 24)
    ax.set_xticks(x_ticks)
    plt.show()

In [None]:
# For every program with more than 1000 trips started: print its totals and
# plot the resampled trips started/ended by (day of week, hour).
print('Program ID\t\t\tTrips Starting\tTrips Ending\tNet')
for pid in pd.unique(status_data.program_id):
    S = status_data[status_data.program_id == pid].trips_starting.sum()
    if not S > 1000:
        continue  # too little traffic to be worth a figure

    E = status_data[status_data.program_id == pid].trips_ending.sum()
    print('%s\t%s\t\t%s\t\t%s' % (pid.ljust(25), str(S), str(E), S-E))

    fig, ax = plt.subplots(figsize=(48,6))
    resampled_time = status_data_resample.time.dt
    data = status_data_resample[status_data_resample.program_id == pid].groupby([resampled_time.dayofweek, resampled_time.hour])
    for column, line_color, line_marker in (('trips_starting', 'b', 'o'), ('trips_ending', 'r', 's')):
        data[column].sum().plot(kind ='line', color=line_color, alpha= 0.5, ax=ax, marker=line_marker)
    title = 'Program - %s' % pid
    ax.set_title(title)
    plt.show()

In [None]:
# Same selection as the day-of-week view (programs with more than 1000 trips
# started), but the resampled trips are summed by hour of day only.
print('Program ID\t\t\tTrips Starting\tTrips Ending\tNet')
for pid in pd.unique(status_data.program_id):
    S = status_data[status_data.program_id == pid].trips_starting.sum()
    if not S > 1000:
        continue  # too little traffic to be worth a figure

    E = status_data[status_data.program_id == pid].trips_ending.sum()
    print('%s\t%s\t\t%s\t\t%s' % (pid.ljust(25), str(S), str(E), S-E))

    fig, ax = plt.subplots(figsize=(48,6))
    data = status_data_resample[status_data_resample.program_id == pid].groupby([status_data_resample.time.dt.hour])
    for column, line_color, line_marker in (('trips_starting', 'b', 'o'), ('trips_ending', 'r', 's')):
        data[column].sum().plot(kind ='line', color=line_color, alpha= 0.5, ax=ax, marker=line_marker)
    title = 'Program - %s' % pid
    ax.set_title(title)
    # one tick per hour of the day
    x_ticks = range(0, 24)
    ax.set_xticks(x_ticks)
    plt.show()