# Data Investigation - Status Data

### Import Data

In [3]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

In [4]:
file = '../clean_data/status_data_15min_mean_cleaned.csv'
chunksize = 10000

# Count the data rows up front so progress can be reported as "chunk i of N".
# The file is opened in a `with` block so the handle is closed again, and the
# header line is excluded because read_csv does not return it as a data row.
with open(file, 'r') as line_source:
    num_rows = max(sum(1 for _ in line_source) - 1, 0)
num_chunks = math.ceil(num_rows / chunksize)

# report roughly ten times over the whole import; never use a step of 0,
# which would make the modulo below raise ZeroDivisionError
progress_step = max(1, math.ceil(num_chunks / 10))

# import file in chunks
chunks = []
for chunk_counter, chunk in enumerate(
        pd.read_csv(file, chunksize=chunksize, parse_dates=['time']), start=1):

    # append chunk to chunks list
    chunks.append(chunk)

    # progress line for the first chunk, every `progress_step`-th chunk and the last chunk
    if chunk_counter == 1 or chunk_counter % progress_step == 0 or chunk_counter == num_chunks:
        print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

# concatenate once after the loop, then index the combined frame by its 'time' column
status_15min = pd.concat(chunks).set_index('time')

status_15min.info()

	[22:24:40.755816] finished chunk 1 of 2345
	[22:24:48.165002] finished chunk 235 of 2345
	[22:24:54.018791] finished chunk 470 of 2345
	[22:25:00.078369] finished chunk 705 of 2345
	[22:25:05.833652] finished chunk 940 of 2345
	[22:25:11.502158] finished chunk 1175 of 2345
	[22:25:17.710693] finished chunk 1410 of 2345
	[22:25:23.177233] finished chunk 1645 of 2345
	[22:25:28.667707] finished chunk 1880 of 2345
	[22:25:34.341166] finished chunk 2115 of 2345
	[22:25:39.858090] finished chunk 2345 of 2345
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23449611 entries, 2014-01-31 09:30:00 to 2016-08-31 23:45:00
Data columns (total 5 columns):
station_id         int64
bikes_available    float64
docks_available    float64
dock_count         int64
utilization        float64
dtypes: float64(3), int64(2)
memory usage: 1.0 GB


In [6]:
# preview the first 24 rows of the time-indexed 15-minute status frame
status_15min.head(24)

Unnamed: 0_level_0,station_id,bikes_available,docks_available,dock_count,utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-31 09:30:00,2,6.0,17.0,27,0.62963
2014-01-31 09:45:00,2,3.0,8.5,27,0.314815
2014-01-31 10:00:00,2,2.0,5.666667,27,0.209877
2014-01-31 10:15:00,2,0.0,0.0,27,0.0
2014-01-31 10:30:00,2,0.0,0.0,27,0.0
2014-01-31 10:45:00,2,0.0,0.0,27,0.0
2014-01-31 11:00:00,2,0.0,0.0,27,0.0
2014-01-31 11:15:00,2,0.0,0.0,27,0.0
2014-01-31 11:30:00,2,0.0,0.0,27,0.0
2014-01-31 11:45:00,2,0.0,0.0,27,0.0


### Clean And Resample Data

In [None]:
# convert time to datetime
# NOTE(review): status_01 .. status_04 are not created in any cell visible in
# this notebook - confirm where they are loaded before running this section.
def _with_parsed_time(frame, time_format):
    """Return a copy of `frame` whose 'time' column is parsed with `time_format`."""
    parsed = frame.copy()
    parsed['time'] = pd.to_datetime(parsed['time'], format=time_format)
    return parsed


# timestamp layouts used by the four inputs; only the third one is month-first
year_first_format = "%Y/%m/%d %H:%M:%S"
month_first_format = "%m/%d/%Y %H:%M:%S"

status_01_copy = _with_parsed_time(status_01, year_first_format)
status_02_copy = _with_parsed_time(status_02, year_first_format)
status_03_copy = _with_parsed_time(status_03, month_first_format)
status_04_copy = _with_parsed_time(status_04, year_first_format)



In [None]:
# check column dtypes of the second frame after the 'time' conversion
status_02_copy.info()

In [None]:
# stack the four time-parsed frames into one; rows keep the order of this list
time_parsed_frames = [status_01_copy, status_02_copy, status_03_copy, status_04_copy]
status_time_clean = pd.concat(time_parsed_frames)
status_time_clean.info()


In [None]:
# append total_docks column: row-wise sum of bikes_available and docks_available
status_time_clean['total_docks'] = status_time_clean['bikes_available'].add(
    status_time_clean['docks_available'])
status_time_clean.info()

In [None]:
# preview the combined frame, including the new total_docks column
status_time_clean.head()

In [None]:
# set time column as index
status_time_clean.set_index('time', inplace=True)

status_data = status_time_clean.copy()

> From the analysis of the trip data, the key commute hours for Subscribers are:
- Morning commute hours: 07:00-11:00
- Evening commute hours: 16:00-20:00

> From the analysis of the trip data, the key stations for Subscribers are:
- Morning commute stations
    - `top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]`
    - `top_am_commute_end_terms   = [51, 55, 60, 61, 63, 65, 69, 70, 74, 77]`


- Evening commute stations
    - `top_pm_commute_start_terms = [55, 60, 61, 64, 65, 67, 69, 70, 74, 77]`
    - `top_pm_commute_end_terms   = [39, 50, 55, 60, 61, 65, 69, 70, 74, 77]`


In [None]:
# prune only morning commute hours from subscribers [07:00 - 11:00]
am_commute_start = datetime.time(7, 0)
am_commute_end = datetime.time(11, 0)

# between_time keeps both endpoints by default. The include_start/include_end
# keywords were deprecated in pandas 1.4 and removed in 2.0, so they are left
# out here; the default gives the same inclusive window on every version.
morning_commute_status = status_data.between_time(start_time=am_commute_start,
                                                  end_time=am_commute_end)

morning_commute_status.info()

In [None]:
# prune only evening commute hours from subscribers [16:00 - 20:00]
pm_commute_start = datetime.time(16, 0)
pm_commute_end = datetime.time(20, 0)

# between_time keeps both endpoints by default. The include_start/include_end
# keywords were deprecated in pandas 1.4 and removed in 2.0, so they are left
# out here; the default gives the same inclusive window on every version.
evening_commute_status = status_data.between_time(start_time=pm_commute_start,
                                                  end_time=pm_commute_end)

evening_commute_status.info()


In [None]:
# prune only important stations for each commute period
top_am_commute_start_terms = [50, 54, 55, 61, 67, 69, 70, 73, 74, 77]
top_am_commute_end_terms = [51, 55, 60, 61, 63, 65, 69, 70, 74, 77]
top_pm_commute_start_terms = [55, 60, 61, 64, 65, 67, 69, 70, 74, 77]
top_pm_commute_end_terms = [39, 50, 55, 60, 61, 65, 69, 70, 74, 77]


def _rows_for_stations(frame, station_ids):
    """Return an independent copy of the rows of `frame` whose station_id is in `station_ids`."""
    keep = frame.station_id.isin(station_ids)
    return frame[keep].copy()


# one subset per commute window (morning / evening) and trip direction (start / end)
morning_commute_status_start = _rows_for_stations(morning_commute_status, top_am_commute_start_terms)
morning_commute_status_end = _rows_for_stations(morning_commute_status, top_am_commute_end_terms)
evening_commute_status_start = _rows_for_stations(evening_commute_status, top_pm_commute_start_terms)
evening_commute_status_end = _rows_for_stations(evening_commute_status, top_pm_commute_end_terms)

In [None]:
# group the morning start-station rows by station and hour of day, then plot
# docks_available for the groups (no aggregation is applied before plotting)
am_start_by_station_hour = morning_commute_status_start.groupby(
    ['station_id', morning_commute_status_start.index.hour])
am_start_by_station_hour['docks_available'].plot(figsize=(12, 6))
plt.show()

In [None]:
# mean docks_available per (station, hour, minute) for the morning start stations
am_start_time_of_day = [
    'station_id',
    morning_commute_status_start.index.hour,
    morning_commute_status_start.index.minute,
]

# Select the column before aggregating: only docks_available is averaged, which
# avoids computing means for every other column (and failing on any
# non-numeric column under pandas >= 2.0).
mean_docks = morning_commute_status_start.groupby(am_start_time_of_day)['docks_available'].mean()
mean_docks.plot(figsize=(12, 6))
plt.show()

In [None]:
# one figure per morning start station: mean bikes_available by time of day
for i in pd.unique(morning_commute_status_start.station_id):
    station_rows = morning_commute_status_start[morning_commute_status_start.station_id == i]

    # select bikes_available before aggregating so only that column is averaged
    mean_bikes = station_rows.groupby(
        ['station_id', station_rows.index.hour, station_rows.index.minute])['bikes_available'].mean()

    # title each figure with its station id so the figures can be told apart
    mean_bikes.plot(figsize=(12, 3), title='station_id %s' % i)
    plt.show()

In [None]:
# summary (index range, columns, dtypes, memory) of the full time-indexed status frame
status_data.info()






In [None]:
# one figure per station in status_data: mean bikes_available by time of day
for i in pd.unique(status_data.station_id):
    station_rows = status_data[status_data.station_id == i]

    # select bikes_available before aggregating so only that column is averaged
    mean_bikes = station_rows.groupby(
        ['station_id', station_rows.index.hour, station_rows.index.minute])['bikes_available'].mean()

    # title each figure with its station id so the figures can be told apart
    mean_bikes.plot(figsize=(12, 3), title='station_id %s' % i)
    plt.show()

In [None]:
# mean bikes_available by time of day (hour, minute) across all rows of status_data;
# the column is selected before aggregating so only bikes_available is averaged
mean_bikes_by_time = status_data.groupby(
    [status_data.index.hour, status_data.index.minute])['bikes_available'].mean()
mean_bikes_by_time.plot(figsize=(12, 3))
plt.show()

In [None]:

# mean bikes_available by time of day for the evening and morning commute windows,
# drawn on one set of axes; the column is selected before aggregating so only
# bikes_available is averaged
evening_mean_bikes = evening_commute_status.groupby(
    [evening_commute_status.index.hour, evening_commute_status.index.minute])['bikes_available'].mean()
morning_mean_bikes = morning_commute_status.groupby(
    [morning_commute_status.index.hour, morning_commute_status.index.minute])['bikes_available'].mean()

ax = evening_mean_bikes.plot(figsize=(12, 6))
morning_mean_bikes.plot(ax=ax)

# legend entries follow the plotting order above: evening first, then morning
plt.legend(['evening', 'morning'],loc='best')
plt.show()

In [None]:

# summed bikes_available by time of day for the evening and morning commute windows,
# drawn on one set of axes; the column is selected before aggregating so only
# bikes_available is summed
evening_total_bikes = evening_commute_status.groupby(
    [evening_commute_status.index.hour, evening_commute_status.index.minute])['bikes_available'].sum()
morning_total_bikes = morning_commute_status.groupby(
    [morning_commute_status.index.hour, morning_commute_status.index.minute])['bikes_available'].sum()

ax = evening_total_bikes.plot(figsize=(12, 6))
morning_total_bikes.plot(ax=ax)

# legend entries follow the plotting order above: evening first, then morning
plt.legend(['evening', 'morning'],loc='best')
plt.show()

In [None]:
# rows for station 73 only
status_73_data = status_data[status_data.station_id == 73]

# maximum bikes_available seen at each time of day (hour, minute); the column is
# selected before aggregating so only bikes_available is reduced
max_bikes_73 = status_73_data.groupby(
    [status_73_data.index.hour, status_73_data.index.minute])['bikes_available'].max()
max_bikes_73.plot(figsize=(12, 6))
plt.show()