# Data Investigation - Status

In [2]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

### Import Data

In [None]:
print('Loading Status Data...')

status_01 = pd.DataFrame()
status_02 = pd.DataFrame()
status_03 = pd.DataFrame()
status_04 = pd.DataFrame()

try:
    file_path_slug = '../../../datasets/bayareabikeshare/*_status_data.csv'

    # glob all files
    file_list = glob(file_path_slug)

    status_import = pd.DataFrame()

    counter = 1

    # load data from each file
    for file in file_list:

        chunks = []
        chunk_counter = 1
        chunksize = 10000
        num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)

        # import file in chunks
        for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True):

            # append chunk to chunks list
            chunks.append(chunk)

            if chunk_counter == 1 or chunk_counter % math.ceil(num_chunks/10) == 0 or chunk_counter == num_chunks:
                print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
            chunk_counter += 1

        if counter == 1:
            status_01 = pd.concat(chunks)
        elif counter == 2:
            status_02 = pd.concat(chunks)
        elif counter == 3:
            status_03 = pd.concat(chunks)
        elif counter == 4:
            status_04 = pd.concat(chunks)
        else:
            pass


        print('Finished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')
    

    



In [None]:
print('[%s] Started status_01[\'time\'] Cleaning' % datetime.datetime.now().time())
status_01['time']   = pd.to_datetime(status_01['time'],   format="%Y/%m/%d %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

print('[%s] Started status_02[\'time\'] Cleaning' % datetime.datetime.now().time())
status_02['time']   = pd.to_datetime(status_02['time'],   format="%Y/%m/%d %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

print('[%s] Started status_03[\'time\'] Cleaning' % datetime.datetime.now().time())
status_03['time']   = pd.to_datetime(status_03['time'],   format="%m/%d/%Y %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

print('[%s] Started status_04[\'time\'] Cleaning' % datetime.datetime.now().time())
status_04['time']   = pd.to_datetime(status_04['time'],   format="%Y/%m/%d %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

In [None]:
s1 = status_01.copy()
s2 = status_02.copy()
s3 = status_03.copy()
s4 = status_04.copy()

In [None]:
station_data = pd.DataFrame()
station_data = pd.read_csv('../clean_data/station_data_cleaned.csv')

In [None]:
s1.head()

In [None]:
station_data.head()

In [None]:
s1_merged = pd.merge(s1, station_data, left_on='station_id', right_on='station_id', how='left')
s2_merged = pd.merge(s2, station_data, left_on='station_id', right_on='station_id', how='left')
s3_merged = pd.merge(s3, station_data, left_on='station_id', right_on='station_id', how='left')
s4_merged = pd.merge(s4, station_data, left_on='station_id', right_on='station_id', how='left')

In [None]:
s1_merged.head()

In [None]:
s1_merged.drop(['installation', 'lat_long'], axis=1, inplace=True)
print('\t[%s] s1_merged Complete!\n' % datetime.datetime.now().time())
s2_merged.drop(['installation', 'lat_long'], axis=1, inplace=True)
print('\t[%s] s2_merged Complete!\n' % datetime.datetime.now().time())
s3_merged.drop(['installation', 'lat_long'], axis=1, inplace=True)
print('\t[%s] s3_merged Complete!\n' % datetime.datetime.now().time())
s4_merged.drop(['installation', 'lat_long'], axis=1, inplace=True)
print('\t[%s] s4_merged Complete!\n' % datetime.datetime.now().time())

In [None]:
print('[%s] Starting concat' % datetime.datetime.now().time())
status_data = pd.concat([s1_merged, s2_merged, s3_merged, s4_merged])
print('\t[%s]Complete!\n' % datetime.datetime.now().time())
status_data.head()

In [None]:
status_data.info()

In [None]:
# df.dropna(subset=[1]) 

status_data.dropna(subset=['dockcount'], inplace=True)
status_data.info()

In [None]:
print('[%s] Writing to File' % datetime.datetime.now().time())
status_data.to_csv('../clean_data/status_data_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

# Load Cleaned File

In [3]:
print('Loading Cleaned Status Data...')

status_data = pd.DataFrame()
file = '../clean_data/status_data_cleaned.csv'

try:

    chunks = []
    chunk_counter = 1
    chunksize = 10000
    num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)

    # import file in chunks
    for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True, parse_dates=['time']):

        # append chunk to chunks list
        chunks.append(chunk)

        if chunk_counter == 1 or chunk_counter % math.ceil(num_chunks/10) == 0 or chunk_counter == num_chunks:
            print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
        chunk_counter += 1

    print('Concatenating Chunks')
    status_data = pd.concat(chunks)
    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')

Loading Cleaned Status Data...
	[13:38:03.993795] finished chunk 1 of 10751
	[13:38:42.163956] finished chunk 1076 of 10751
	[13:39:21.636168] finished chunk 2152 of 10751
	[13:40:02.563629] finished chunk 3228 of 10751
	[13:40:48.489060] finished chunk 4304 of 10751
	[13:41:34.597598] finished chunk 5380 of 10751
	[13:42:21.216513] finished chunk 6456 of 10751
	[13:43:06.946821] finished chunk 7532 of 10751
	[13:43:52.744544] finished chunk 8608 of 10751
	[13:44:39.958804] finished chunk 9684 of 10751
	[13:45:26.679440] finished chunk 10751 of 10751
Concatenating Chunks
Data Loaded Successfully!


In [4]:
status_data.head()

Unnamed: 0.1,Unnamed: 0,station_id,bikes_available,docks_available,time,name,lat,long,dockcount,landmark,zip_code
0,0,2,12,15,2014-03-01 00:00:00,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0
1,1,2,12,15,2014-03-01 00:01:00,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0
2,2,2,12,15,2014-03-01 00:02:00,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0
3,3,2,12,15,2014-03-01 00:03:00,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0
4,4,2,12,15,2014-03-01 00:04:00,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0


In [5]:
status_data.set_index('time', inplace=True)

status_data = status_data[status_data.lat.notnull()]

status_data['station_utilization'] = status_data['docks_available'] / status_data['dockcount'] * 100.

In [6]:
status_data.head()

Unnamed: 0_level_0,Unnamed: 0,station_id,bikes_available,docks_available,name,lat,long,dockcount,landmark,zip_code,station_utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-03-01 00:00:00,0,2,12,15,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0,55.555556
2014-03-01 00:01:00,1,2,12,15,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0,55.555556
2014-03-01 00:02:00,2,2,12,15,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0,55.555556
2014-03-01 00:03:00,3,2,12,15,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0,55.555556
2014-03-01 00:04:00,4,2,12,15,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,95113.0,55.555556


In [7]:
status_data.drop(['Unnamed: 0', 'name', 'lat', 'long', 'landmark', 'zip_code'], axis=1, inplace=True)

In [8]:
status_data.head()

Unnamed: 0_level_0,Unnamed: 0,station_id,bikes_available,docks_available,dockcount,zip_code,station_utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-03-01 00:00:00,0,2,12,15,27.0,95113.0,55.555556
2014-03-01 00:01:00,1,2,12,15,27.0,95113.0,55.555556
2014-03-01 00:02:00,2,2,12,15,27.0,95113.0,55.555556
2014-03-01 00:03:00,3,2,12,15,27.0,95113.0,55.555556
2014-03-01 00:04:00,4,2,12,15,27.0,95113.0,55.555556


In [10]:
status_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 107501452 entries, 2014-03-01 00:00:00 to 2014-02-28 23:59:00
Data columns (total 6 columns):
station_id             int64
bikes_available        int64
docks_available        int64
dockcount              float64
zip_code               float64
station_utilization    float64
dtypes: float64(3), int64(3)
memory usage: 5.6 GB


In [11]:
status_data['dockcount'] = status_data['dockcount'].astype('int')
status_data['zip_code'] = status_data['zip_code'].astype('int')

status_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 107501452 entries, 2014-03-01 00:00:00 to 2014-02-28 23:59:00
Data columns (total 6 columns):
station_id             int64
bikes_available        int64
docks_available        int64
dockcount              int64
zip_code               int64
station_utilization    float64
dtypes: float64(1), int64(5)
memory usage: 5.6 GB


In [12]:
status_data.head()

Unnamed: 0_level_0,station_id,bikes_available,docks_available,dockcount,zip_code,station_utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-03-01 00:00:00,2,12,15,27,95113,55.555556
2014-03-01 00:01:00,2,12,15,27,95113,55.555556
2014-03-01 00:02:00,2,12,15,27,95113,55.555556
2014-03-01 00:03:00,2,12,15,27,95113,55.555556
2014-03-01 00:04:00,2,12,15,27,95113,55.555556


In [14]:
status_data.head()

Unnamed: 0_level_0,station_id,bikes_available,docks_available,dockcount,station_utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-03-01 00:00:00,2,12,15,27,55.555556
2014-03-01 00:01:00,2,12,15,27,55.555556
2014-03-01 00:02:00,2,12,15,27,55.555556
2014-03-01 00:03:00,2,12,15,27,55.555556
2014-03-01 00:04:00,2,12,15,27,55.555556


In [15]:
status_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 107501452 entries, 2014-03-01 00:00:00 to 2014-02-28 23:59:00
Data columns (total 5 columns):
station_id             int64
bikes_available        int64
docks_available        int64
dockcount              int64
station_utilization    float64
dtypes: float64(1), int64(4)
memory usage: 4.8 GB


In [16]:
print('[%s] Writing to File' % datetime.datetime.now().time())
status_data.to_csv('../clean_data/status_data_cleaned_slim.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[14:20:23.167952] Writing to File
	[14:34:29.068686]Complete!



## Resampling

### Daily Mean

In [17]:
print('[%s] Resampling Daily Mean' % datetime.datetime.now().time())
status_daily_mean = status_data.groupby(['station_id', 'dockcount']).resample('D').mean().fillna(0).rolling(window=3, min_periods=1).mean()
status_daily_mean.index = status_daily_mean.index.droplevel(0)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[14:36:52.556932] Resampling Daily Mean
	[14:40:05.738097]Complete!



In [19]:
status_daily_mean.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 75834 entries, (27, 2013-08-29 00:00:00) to (35, 2016-08-31 00:00:00)
Data columns (total 5 columns):
station_id             75834 non-null float64
bikes_available        75834 non-null float64
docks_available        75834 non-null float64
dockcount              75834 non-null float64
station_utilization    75834 non-null float64
dtypes: float64(5)
memory usage: 3.1 MB


In [20]:
print('[%s] Writing to File' % datetime.datetime.now().time())
status_daily_mean.to_csv('../clean_data/status_data_daily_mean_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[14:40:34.242599] Writing to File
	[14:40:35.306466]Complete!



### 15 Minute Mean

In [21]:
print('[%s] Resampling 15min Mean' % datetime.datetime.now().time())
status_15min_mean = status_data.groupby(['station_id', 'dockcount']).resample('15T').mean().fillna(0).rolling(window=3, min_periods=1).mean()
status_15min_mean.index = status_15min_mean.index.droplevel(0)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
status_15min_mean.to_csv('../clean_data/status_data_15min_mean_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[14:40:40.353006] Resampling 15min Mean
	[14:45:20.532549]Complete!

[14:45:20.542960] Writing to File
	[14:48:29.003775]Complete!



In [22]:
status_15min_mean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,station_id,bikes_available,docks_available,dockcount,station_utilization
dockcount,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
27,2013-08-29 12:00:00,2.0,2.0,25.0,27.0,92.592593
27,2013-08-29 12:15:00,2.0,2.0,25.0,27.0,92.592593
27,2013-08-29 12:30:00,2.0,2.0,25.0,27.0,92.592593
27,2013-08-29 12:45:00,2.0,2.0,25.0,27.0,92.592593
27,2013-08-29 13:00:00,2.0,2.076923,24.923077,27.0,92.307692


In [76]:
test = status_15min_mean[status_15min_mean.docks_available > status_15min_mean.dockcount]

In [23]:
status_15min_mean.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7276249 entries, (27, 2013-08-29 12:00:00) to (35, 2016-08-31 23:45:00)
Data columns (total 5 columns):
station_id             float64
bikes_available        float64
docks_available        float64
dockcount              float64
station_utilization    float64
dtypes: float64(5)
memory usage: 313.1 MB


## PreCleaned Files

### Load Status Data

In [24]:
file = '../clean_data/status_data_15min_mean_cleaned.csv'

status_15min_mean_temp = pd.DataFrame()

chunks = []
chunk_counter = 1
chunksize = 10000
num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)

# import file in chunks
for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True, parse_dates=['time']):

    # append chunk to chunks list
    chunks.append(chunk)

    if chunk_counter == 1 or chunk_counter % math.ceil(num_chunks/10) == 0 or chunk_counter == num_chunks:
        print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
    chunk_counter += 1

status_15min_mean_temp = pd.concat(chunks)

status_15min_mean_temp.info()


	[14:49:36.843031] finished chunk 1 of 728
	[14:49:39.672829] finished chunk 73 of 728
	[14:49:42.559801] finished chunk 146 of 728
	[14:49:45.552416] finished chunk 219 of 728
	[14:49:48.515669] finished chunk 292 of 728
	[14:49:51.341204] finished chunk 365 of 728
	[14:49:54.166579] finished chunk 438 of 728
	[14:49:57.066713] finished chunk 511 of 728
	[14:49:59.886279] finished chunk 584 of 728
	[14:50:02.810409] finished chunk 657 of 728
	[14:50:05.523198] finished chunk 728 of 728
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7276249 entries, 0 to 7276248
Data columns (total 7 columns):
dockcount              int64
time                   datetime64[ns]
station_id             float64
bikes_available        float64
docks_available        float64
dockcount.1            float64
station_utilization    float64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 444.1 MB


In [26]:
status_15min_mean_temp.drop(['dockcount.1'], axis=1, inplace=True)
status_15min_mean_temp.set_index('time', inplace=True)

In [27]:
status_15min_mean_temp.head()

Unnamed: 0_level_0,dockcount,station_id,bikes_available,docks_available,station_utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-08-29 12:00:00,27,2.0,2.0,25.0,92.592593
2013-08-29 12:15:00,27,2.0,2.0,25.0,92.592593
2013-08-29 12:30:00,27,2.0,2.0,25.0,92.592593
2013-08-29 12:45:00,27,2.0,2.0,25.0,92.592593
2013-08-29 13:00:00,27,2.0,2.076923,24.923077,92.307692


### Analysis

In [28]:
status_15min = status_15min_mean_temp.copy()

In [71]:
status_15min
status_15min['dockcount'] = status_15min['dockcount'].astype('int')
status_15min['station_id'] = status_15min['station_id'].astype('int')
status_15min['bikes_available'] = status_15min['bikes_available'].astype('int')
status_15min['docks_available'] = status_15min['docks_available'].astype('int')

In [72]:
status_15min.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7276249 entries, 2013-08-29 12:00:00 to 2016-08-31 23:45:00
Data columns (total 5 columns):
dockcount              int64
station_id             int64
bikes_available        int64
docks_available        int64
station_utilization    float64
dtypes: float64(1), int64(4)
memory usage: 493.1 MB


In [74]:
test = status_15min[status_15min.docks_available > status_15min.dockcount]
test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3960 entries, 2015-06-05 13:15:00 to 2013-08-29 12:00:00
Data columns (total 5 columns):
dockcount              3960 non-null int64
station_id             3960 non-null int64
bikes_available        3960 non-null int64
docks_available        3960 non-null int64
station_utilization    3960 non-null float64
dtypes: float64(1), int64(4)
memory usage: 185.6 KB


In [75]:
test

Unnamed: 0_level_0,dockcount,station_id,bikes_available,docks_available,station_utilization
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-05 13:15:00,25,22,0,26,105.422222
2015-06-05 13:30:00,25,22,0,26,107.644444
2015-06-05 13:45:00,25,22,0,26,108.000000
2015-06-05 14:00:00,25,22,0,26,108.000000
2015-06-05 14:15:00,25,22,0,26,108.000000
2015-06-05 14:30:00,25,22,0,26,108.000000
2015-06-05 14:45:00,25,22,0,26,108.000000
2015-06-05 15:00:00,25,22,0,26,108.000000
2015-06-05 15:15:00,25,22,0,26,108.000000
2015-06-05 15:30:00,25,22,0,26,108.000000
