# Data Investigation - Status

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

In [None]:
# Widen the notebook container to 95% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# the IPython.core.display import path is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

## Import Data

In [None]:
print('Loading Status Data...')

file_path_slug = '../../../datasets/bayareabikeshare/*_status_data.csv'
chunksize = 10000

try:
    # glob() returns matches in arbitrary, filesystem-dependent order. Sort so
    # the file -> status_0N mapping is reproducible: the cleaning cell parses
    # each frame with a hard-coded, per-frame timestamp format.
    file_list = sorted(glob(file_path_slug))
    if not file_list:
        raise FileNotFoundError('no files match %s' % file_path_slug)

    status_import = pd.DataFrame()

    # load data from each file
    for counter, file in enumerate(file_list, start=1):

        # Count data rows (total lines minus the header line that read_csv
        # consumes) to size the progress report. The context manager closes
        # the handle, which the bare open() call used to leak.
        with open(file, 'r') as handle:
            num_rows = max(sum(1 for _ in handle) - 1, 0)
        num_chunks = math.ceil(num_rows / chunksize)
        # never 0, so the modulo below cannot divide by zero on tiny files
        report_every = max(1, math.ceil(num_chunks / 10))

        # import file in chunks
        chunks = []
        reader = pd.read_csv(file, chunksize=chunksize, iterator=True)
        for chunk_counter, chunk in enumerate(reader, start=1):
            chunks.append(chunk)

            if chunk_counter == 1 or chunk_counter % report_every == 0 or chunk_counter == num_chunks:
                print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

        # only the first four files are kept; any further match is read but ignored
        if counter == 1:
            status_01 = pd.concat(chunks)
        elif counter == 2:
            status_02 = pd.concat(chunks)
        elif counter == 3:
            status_03 = pd.concat(chunks)
        elif counter == 4:
            status_04 = pd.concat(chunks)

        print('Finished file! (%d of %d)' % (counter, len(file_list)))

    print('Data Loaded Successfully!')

except Exception as err:
    # Report what failed and re-raise: every later cell needs these frames, so
    # hiding the error only moves the failure to a confusing NameError later.
    # `Exception` (not a bare except) lets KeyboardInterrupt stop a long load.
    print('oops... something went wrong importing the data :( (%s: %s)' % (type(err).__name__, err))
    raise
    

## Clean Data

In [None]:
# Parse each frame's 'time' strings with the format that frame uses, then
# cast to datetime64[m] so every timestamp carries minute resolution.
time_formats = [
    ('status_01', status_01, "%Y/%m/%d %H:%M:%S"),
    ('status_02', status_02, "%Y/%m/%d %H:%M:%S"),
    ('status_03', status_03, "%m/%d/%Y %H:%M:%S"),
    ('status_04', status_04, "%Y/%m/%d %H:%M:%S"),
]

for frame_name, frame, time_format in time_formats:
    print("[%s] Started %s['time'] Cleaning" % (datetime.datetime.now().time(), frame_name))
    parsed = pd.to_datetime(frame['time'], format=time_format)
    frame['time'] = parsed.values.astype('<M8[m]')
    print('\t[%s] Complete!\n' % datetime.datetime.now().time())


# Stack the four cleaned frames into a single frame
print('[%s] Starting concat' % datetime.datetime.now().time())
status_data_import = pd.concat([frame for _, frame, _ in time_formats])
print('\t[%s]Complete!\n' % datetime.datetime.now().time())
status_data_import.head()

In [None]:
# Work on a separate frame with a fresh 0..n-1 index; the imported frame is left untouched
status_data = status_data_import.reset_index(drop=True)
status_data.head(3)

In [None]:
def show_station(df, id, time_min, time_max, color='b'):
    """Plot `bikes_available` over time for a single station.

    Rows of `df` whose `station_id` equals `id` are indexed by their `time`
    column and drawn as a 24x3 line plot in `color`. The x axis is limited to
    [time_min, time_max] and the plot is titled 'Station <id>'.
    """
    station_rows = df[df.station_id == id].set_index('time')
    axes = station_rows['bikes_available'].plot(color=color, figsize=(24,3))
    axes.set_xlim(time_min, time_max)
    axes.set_title('Station %s' % id)
    plt.show()

## Preview Relocated Stations pre cleaning

In [None]:
# stations to preview before any cleaning is applied
closed_stations = [21, 22, 23, 24, 25, 26]

# fixed x-axis window so every plot covers the same period
t_min, t_max = pd.Timestamp('2013-07-01'), pd.Timestamp('2016-10-01')

for station in closed_stations:
    show_station(status_data, station, t_min, t_max)

In [None]:
# station ids that the relocation fix below assigns rows to
relocated_stations = [88, 89, 90, 91]

# fixed x-axis window so every plot covers the same period
t_min, t_max = pd.Timestamp('2013-07-01'), pd.Timestamp('2016-10-01')

for station in relocated_stations:
    show_station(status_data, station, t_min, t_max)

## Correct Relocated Stations

> There was a delay in station_id updates when stations 23, 24, 25, and 26 were relocated. Correct the status records by changing the station_id for these stations on every date from the relocation date onward.

In [None]:
def date_fixes(old_station, new_station, change_date):
    """Re-label status rows of `old_station` as `new_station` from `change_date` on.

    Works in place on the notebook-level `status_data` frame: every row with
    `station_id == old_station` and `time >= change_date` gets
    `station_id = new_station`. Both stations are plotted with `show_station`
    before and after the update so the effect can be checked visually.
    """
    window_start = pd.Timestamp('2013-07-01')
    window_end = pd.Timestamp('2016-10-01')

    def plot_both():
        # old station first, then new, matching the order they are reported in
        for station_id in (old_station, new_station):
            show_station(status_data, station_id, window_start, window_end)

    print('[%s]\tStarted indexing...' % datetime.datetime.now().time())
    is_old_station = status_data.station_id == old_station
    on_or_after_change = status_data.time >= change_date
    index_to_update = status_data[is_old_station & on_or_after_change].index
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())

    plot_both()

    print('[%s]\tStarted Update...' % datetime.datetime.now().time())
    status_data.loc[index_to_update, 'station_id'] = new_station
    print('\t[%s]\tComplete!' % datetime.datetime.now().time())

    plot_both()

In [None]:
# (old station_id, new station_id, first date on which the new id applies)
relocations = [
    (23, 88, datetime.datetime(2016, 7, 5)),
    (24, 89, datetime.datetime(2016, 7, 5)),
    (25, 91, datetime.datetime(2016, 8, 4)),
    (26, 90, datetime.datetime(2016, 8, 4)),
]

for old_id, new_id, change_date in relocations:
    date_fixes(old_id, new_id, change_date)


## Drop Station 87 (there is no station 87)

In [None]:
# station 87 does not exist, so discard every status row that claims it
print('[%s] - Removing Station 87 status, there is no station 87' % (datetime.datetime.now().time()))
station_87_index = status_data[status_data.station_id == 87].index
print('\t\tremoving %s items' % len(station_87_index))
status_data.drop(station_87_index, inplace=True)
print('[%s] - Complete' % (datetime.datetime.now().time()))

In [None]:
# print the frame summary (columns, dtypes, non-null counts, memory) after the station 87 rows were dropped
status_data.info()

In [None]:
# dropping rows left gaps in the index; renumber it 0..n-1 and discard the old labels
status_data = status_data.reset_index(drop=True)
status_data.info()

## Graphical EDA of Station Bike Availability

In [None]:
t_min, t_max = pd.Timestamp('2013-07-01'), pd.Timestamp('2016-10-01')

# one availability plot per station, in ascending station_id order
station_ids = sorted(pd.unique(status_data.station_id))
for station in station_ids:
    show_station(status_data, station, t_min, t_max)

## Station Status Summary

In [None]:
# For every station, list the distinct values seen in each column except the
# first and the last one of the frame.
for i in pd.unique(status_data.station_id):
    station_rows = status_data[status_data.station_id == i]
    print('Station ', i)
    for column in station_rows.columns[1:-1]:
        distinct_values = sorted(pd.unique(station_rows[column]))
        print('%s - %s' % (column, distinct_values))

    print('------------------------------')

## Downsample Cleaned Data to 5 Minute Intervals - Mean

In [None]:
print('[%s] Resampling on interval' % datetime.datetime.now().time())

# Per-station mean over 5 minute bins. The bin width is spelled '5min'
# because the 'T' alias for minutes is deprecated in recent pandas releases;
# 'min' is accepted by old and new versions alike.
status_data_5m = (
    status_data
    .set_index(['station_id', 'time'])
    .groupby([pd.Grouper(level=0), pd.Grouper(freq='5min', level=-1)])
    .mean()
    .reset_index()
)

print('[%s] Complete' % datetime.datetime.now().time())

## Downsample Cleaned Data to Hour Intervals - Mean

In [None]:
print('[%s] Resampling on interval' % datetime.datetime.now().time())

# Per-station mean over 1 hour bins. The lowercase 'h' alias is used because
# the uppercase 'H' spelling is deprecated in recent pandas releases.
status_data_1h = (
    status_data
    .set_index(['station_id', 'time'])
    .groupby([pd.Grouper(level=0), pd.Grouper(freq='h', level=-1)])
    .mean()
    .reset_index()
)

print('[%s] Complete' % datetime.datetime.now().time())

## Downsample Cleaned Data to Day Intervals - Mean

In [None]:
print('[%s] Resampling on interval' % datetime.datetime.now().time())

# per-station mean over calendar-day bins
status_data_1d = (
    status_data
    .set_index(['station_id', 'time'])
    .groupby([pd.Grouper(level=0), pd.Grouper(freq='D', level=-1)])
    .mean()
    .reset_index()
)

print('[%s] Complete' % datetime.datetime.now().time())

## Downsample Review

In [None]:
# row counts for station 2 at each resolution, to show how much each downsample shrinks the data
resolutions = [
    ('Original Data Set\t', status_data),
    ('Resampled on 5m mean\t', status_data_5m),
    ('Resampled on 1h mean\t', status_data_1h),
    ('Resampled on 1d mean\t', status_data_1d),
]

for label, frame in resolutions:
    print(label, len(frame[frame.station_id == 2]))

In [None]:
# frame summary of the full-resolution cleaned data
status_data.info()

In [None]:
# frame summary of the 5 minute downsample
status_data_5m.info()

In [None]:
# frame summary of the 1 hour downsample
status_data_1h.info()

In [None]:
# frame summary of the 1 day downsample
status_data_1d.info()

## Downsample Graphical EDA on Select Stations

In [None]:
t_min, t_max = pd.Timestamp('2013-07-01'), pd.Timestamp('2016-10-01')

station_test_list = [2, 3, 25, 90]

# full-resolution data first, then each downsample, one colour per resolution
frames_and_colors = [
    (status_data, 'b'),
    (status_data_5m, 'g'),
    (status_data_1h, 'r'),
    (status_data_1d, 'c'),
]

for station in station_test_list:
    for frame, color in frames_and_colors:
        show_station(frame, station, t_min, t_max, color=color)

## Write Downsampled data sets to files

In [None]:
# save the full-resolution cleaned status data
out_path = '../clean_data/status_data_cleaned_1m.csv'
print('[%s]\tWriting File...' % datetime.datetime.now().time())
status_data.to_csv(out_path, encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
# save the 5 minute downsample
out_path = '../clean_data/status_data_cleaned_5m.csv'
print('[%s]\tWriting File...' % datetime.datetime.now().time())
status_data_5m.to_csv(out_path, encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
# save the 1 hour downsample
out_path = '../clean_data/status_data_cleaned_1h.csv'
print('[%s]\tWriting File...' % datetime.datetime.now().time())
status_data_1h.to_csv(out_path, encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
# save the 1 day downsample
out_path = '../clean_data/status_data_cleaned_1d.csv'
print('[%s]\tWriting File...' % datetime.datetime.now().time())
status_data_1d.to_csv(out_path, encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

# Merge Station Data into Status Data

In [None]:
# station metadata, with both service-date columns parsed as datetimes
station_data = pd.read_csv(
    '../clean_data/station_data_cleaned_final.csv',
    parse_dates=['first_service_date', 'last_service_date'],
)

# keep one row per station_id (the first occurrence) and drop the columns
# that are not carried into the status merge
unwanted_columns = ['Unnamed: 0', 'lat', 'long', 'landmark', 'zip_code']
station_data_basic = (
    station_data
    .drop_duplicates(subset=['station_id'], keep='first')
    .drop(unwanted_columns, axis=1)
)

station_data_basic.info()

In [None]:
# preview the de-duplicated, trimmed station metadata
station_data_basic.head()

In [None]:
# merge status data with station data,  we will have to correct the dock count for station 73 later

# `status_15min_mean` was referenced here but never built anywhere in this
# notebook, so the merge raised NameError on a fresh kernel. Build it the same
# way as the 5m / 1h / 1d downsamples: per-station mean over 15 minute bins.
# NOTE(review): the 15 minute mean is inferred from the variable name only;
# confirm this is the resolution the analysis below is meant to use.
print('[%s] Resampling on interval' % datetime.datetime.now().time())
status_15min_mean = (
    status_data
    .set_index(['station_id', 'time'])
    .groupby([pd.Grouper(level=0), pd.Grouper(freq='15min', level=-1)])
    .mean()
    .reset_index()
)
print('[%s] Complete' % datetime.datetime.now().time())

print('[%s] - Starting Merge' % (datetime.datetime.now().time()))
status_merged = pd.merge(status_15min_mean, station_data_basic, on='station_id')
print('[%s] - Merge Complete' % (datetime.datetime.now().time()))

# the service dates came along with the station columns and are dropped again
status_merged.drop(['first_service_date', 'last_service_date'], axis=1, inplace=True)

In [None]:
# station 73 grew from 15 to 19 docks; rows dated 2015-05-19 or later get the new count
change_date = datetime.datetime(2015, 5, 19)

print('[%s]\tSubsetting dates...' % datetime.datetime.now().time())
is_station_73 = status_merged.station_id == 73
on_or_after_change = status_merged.time >= change_date
index_to_update = status_merged[is_station_73 & on_or_after_change].index
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

print('[%s]\tStarted Update...' % datetime.datetime.now().time())
status_merged.loc[index_to_update, 'dock_count'] = 19
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

In [None]:
# station-level columns whose distinct values are listed per station
cols = ['station_id', 'name', 'dock_count', 'days_in_service']

for i in pd.unique(status_merged.station_id):
    station_rows = status_merged[status_merged.station_id == i]
    print('\nStation ', i)
    for column in cols:
        distinct_values = sorted(pd.unique(station_rows[column]))
        print('%s - %s' % (column, distinct_values))

    print('------------------------------')

## Calculate Station Utilization at each interval

> number of docks available / number of total docks

In [None]:
print('[%s] Calculate Utilization' % datetime.datetime.now().time())
# utilization = docks available / total docks, capped at 1.0 because some
# station data reports more available docks than dock_count after resampling.
# Series.clip does the capping in one vectorised pass instead of calling a
# Python lambda per row, and leaves NaN untouched exactly as the lambda did.
status_merged['utilization'] = (
    status_merged['docks_available'] / status_merged['dock_count']
).clip(upper=1.0)

print('[%s] Complete!' % datetime.datetime.now().time())
status_merged.head(5)

In [None]:
# frame summary of the merged data, now including the utilization column
status_merged.info()

In [None]:
# save the merged status + station data; the "Load Cleaned File" cell reads this path back
out_path = '../clean_data/status_data_cleaned.csv'
print('[%s]\tWriting File...' % datetime.datetime.now().time())
status_merged.to_csv(out_path, encoding='utf-8')
print('\t[%s]\tComplete!' % datetime.datetime.now().time())

# Load Cleaned File

In [None]:
try:
    # Reuse the in-memory frame when the cleaning cells above already ran in
    # this session.
    status_data = status_merged.copy()
    print('already working, keep going')

except NameError:
    # `status_merged` does not exist in a fresh kernel: reload the saved file.
    # Only NameError is caught, so any other failure of the copy is not
    # mistaken for "not loaded yet".
    print('Loading Cleaned Status Data...')

    status_data = pd.DataFrame()
    file = '../clean_data/status_data_cleaned.csv'
    chunksize = 10000

    try:
        # Count data rows (total lines minus the header line) to size the
        # progress report; the context manager closes the handle.
        with open(file, 'r') as handle:
            num_rows = max(sum(1 for _ in handle) - 1, 0)
        num_chunks = math.ceil(num_rows / chunksize)
        # never 0, so the modulo below cannot divide by zero on tiny files
        report_every = max(1, math.ceil(num_chunks / 10))

        # import file in chunks; column 0 is the index written by to_csv
        chunks = []
        reader = pd.read_csv(file, chunksize=chunksize, iterator=True, parse_dates=['time'], index_col=0)
        for chunk_counter, chunk in enumerate(reader, start=1):
            chunks.append(chunk)

            if chunk_counter == 1 or chunk_counter % report_every == 0 or chunk_counter == num_chunks:
                print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

        print('Concatenating Chunks')
        status_data = pd.concat(chunks)
        print('Data Loaded Successfully!')

    except Exception as err:
        # Report what failed and re-raise: the analysis cells below cannot run
        # on an empty frame, so the failure must not be hidden.
        print('oops... something went wrong importing the data :( (%s: %s)' % (type(err).__name__, err))
        raise

In [None]:
# order rows by station, then chronologically within each station
status_data = status_data.sort_values(['station_id', 'time'])
status_data.head(10)

### Analysis

In [None]:
# frame summary of the data the analysis below works on
status_data.info()

In [None]:
# preview the first rows of the analysis frame
status_data.head()

In [None]:
# Index by timestamp: the cells below rely on it for resample() and for
# .index.hour / .index.dayofweek. Running this cell a second time raises
# KeyError because 'time' is then no longer a column.
status_data.set_index('time', inplace=True)

In [None]:
# One plot per station: monthly mean utilization (months without data count
# as 0) smoothed with a 3-month rolling mean.
for i in sorted(pd.unique(status_data.station_id)):
    station_rows = status_data[status_data.station_id == i]
    monthly_mean = station_rows.utilization.resample('M').mean().fillna(0)
    smoothed = monthly_mean.rolling(window=3, min_periods=1).mean()

    ax = smoothed.plot(figsize=(12,3))
    ax.set_title('Station %s' % i)
    ax.set_ylim(0,1)
    plt.show()
    


In [None]:
# status rows for station 91 only
s91 = status_data[status_data.station_id == 91]

In [None]:
# frame summary of the station 91 subset
s91.info()

In [None]:
# preview the first rows of the station 91 subset
s91.head()

In [None]:
# NOTE: these three names are not read by the loop below, which plots every
# station; they are kept so the notebook namespace stays as it was.
closed_stations = [73]
redwood_city_station_utilization = pd.DataFrame()
chunks = []

# dock_count over time, one small figure per station
for i in sorted(pd.unique(status_data.station_id)):
    dock_counts = status_data[status_data.station_id == i]['dock_count']
    ax = dock_counts.plot(figsize=(12,2))
    ax.set_title('Station %s' % i)
    plt.show()
    

In [None]:
# presumably the station_ids with the most morning-commute trip starts — TODO confirm against the trip analysis
top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]
# status rows for just those stations
temp = status_data[status_data.station_id.isin(top_am_commute_start_terms)]

In [None]:
# utilization spread per day of week (0 = Monday), one box per station
plt.subplots(figsize=(15,5))
day_of_week = temp.index.dayofweek
ax = sns.boxplot(x=day_of_week, y="utilization", hue="station_id", data=temp)

ax.set_xlabel('Day of Week')
plt.show()



In [None]:
# one figure per station: utilization spread for each hour of the day
for i in top_am_commute_start_terms:
    temp = status_data[status_data.station_id == i]
    hour_of_day = temp.index.hour

    plt.subplots(figsize=(15,5))
    ax = sns.boxplot(x=hour_of_day, y="utilization", data=temp)
    ax.set_xlabel('Hour of Day')
    ax.set_title('Station %s' % i)
    plt.show()




In [None]:

top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]
temp = status_data[status_data.station_id.isin(top_am_commute_start_terms)]

# mean utilization per hour of day, one coloured series per station
plt.subplots(figsize=(15,5))
ax = sns.pointplot(x=temp.index.hour, y="utilization", hue='station_id', data=temp)
ax.set(xlabel='Hour of Day')
# This figure overlays every station in the list. The title used to be built
# from `i`, a loop variable left over from another cell, which labelled the
# plot with a single station (or raised NameError in a fresh kernel).
ax.set(title='Top AM Commute Start Terminals')
plt.legend()
plt.show()

In [None]:
# re-select the commute stations so this cell does not depend on `temp` left over from another cell
top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]
temp = status_data[status_data.station_id.isin(top_am_commute_start_terms)]

# joint distribution of hour of day vs utilization, with seaborn's default plot kind
sns.jointplot(x=temp.index.hour, y="utilization", data=temp)

In [None]:

top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]
temp = status_data[status_data.station_id.isin(top_am_commute_start_terms)]

# jointplot creates its own figure and returns a JointGrid, not an Axes, so:
#  - no plt.subplots() beforehand (it only left an empty extra figure)
#  - axis labels go through JointGrid.set_axis_labels, not Axes-style .set()
#  - the title is set on the grid's figure and names the station group rather
#    than the stale loop variable `i`
#  - no plt.legend(): the plot has no labelled artists to list
grid = sns.jointplot(x=temp.index.hour, y="utilization", data=temp, kind='reg')
grid.set_axis_labels('Hour of Day', 'utilization')
grid.fig.suptitle('Top AM Commute Start Terminals')
plt.show()

In [None]:
# mean utilization per (station_id, day of month, hour of day), drawn as one faint line over the whole grouped index
status_data.groupby(['station_id', status_data.index.day, status_data.index.hour])['utilization'].mean().plot(figsize=(12,6), alpha=0.2)
plt.show()

In [None]:
# station 73: bikes available by (hour, minute) of the day, median as bars with the mean drawn on top as a green line
test = status_data[status_data.station_id == 73]
by_time_of_day = test.groupby([test.index.hour, test.index.minute])['bikes_available']

ax = by_time_of_day.median().plot(kind='bar', figsize=(12,6))
by_time_of_day.mean().plot(kind='line', color='g', ax=ax)
plt.show()

In [None]:
# presumably the station_ids with the most morning-commute trip starts — TODO confirm against the trip analysis
top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]

In [None]:
# overlay each station's mean utilization by (day of week, hour) on one shared set of axes
ax = None

for station in [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]:
    test = status_data[status_data.station_id == station]
    weekly_profile = test.groupby([test.index.dayofweek, test.index.hour])['utilization'].mean()

    if ax is None:
        # the first station creates the figure; the others draw onto its axes
        ax = weekly_profile.plot(kind='line', figsize=(12,6))
    else:
        weekly_profile.plot(kind='line', ax=ax)

plt.show()