# Data Investigation - Status

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

### Import Data

In [None]:
print('Loading Status Data...')

status_01 = pd.DataFrame()
status_02 = pd.DataFrame()
status_03 = pd.DataFrame()
status_04 = pd.DataFrame()

try:
    file_path_slug = '../../../datasets/bayareabikeshare/*_status_data.csv'

    # glob all files
    file_list = glob(file_path_slug)

    status_import = pd.DataFrame()

    counter = 1

    # load data from each file
    for file in file_list:

        chunks = []
        chunk_counter = 1
        chunksize = 10000
        num_chunks = math.ceil(sum(1 for row in open(file, 'r'))/chunksize)

        # import file in chunks
        for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True):

            # append chunk to chunks list
            chunks.append(chunk)

            if chunk_counter == 1 or chunk_counter % math.ceil(num_chunks/10) == 0 or chunk_counter == num_chunks:
                print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
            chunk_counter += 1

        if counter == 1:
            status_01 = pd.concat(chunks)
        elif counter == 2:
            status_02 = pd.concat(chunks)
        elif counter == 3:
            status_03 = pd.concat(chunks)
        elif counter == 4:
            status_04 = pd.concat(chunks)
        else:
            pass


        print('Finished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')
    

    



In [None]:
s1 = status_01.copy()
s2 = status_02.copy()
s3 = status_03.copy()
s4 = status_04.copy()

In [None]:
print('[%s] Started status_01[\'time\'] Cleaning' % datetime.datetime.now().time())
s1['time']   = pd.to_datetime(s1['time'],   format="%Y/%m/%d %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

print('[%s] Started status_02[\'time\'] Cleaning' % datetime.datetime.now().time())
s2['time']   = pd.to_datetime(s2['time'],   format="%Y/%m/%d %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

print('[%s] Started status_03[\'time\'] Cleaning' % datetime.datetime.now().time())
s3['time']   = pd.to_datetime(s3['time'],   format="%m/%d/%Y %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

print('[%s] Started status_04[\'time\'] Cleaning' % datetime.datetime.now().time())
s4['time']   = pd.to_datetime(s4['time'],   format="%Y/%m/%d %H:%M:%S").values.astype('<M8[m]')
print('\t[%s] Complete!\n' % datetime.datetime.now().time())

In [None]:
print('[%s] Starting concat' % datetime.datetime.now().time())
status_data = pd.concat([s1, s2, s3, s4])
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
status_data.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

In [None]:
status_data.head()

In [None]:
status_data.set_index('time', inplace=True)

In [None]:
status_data.head()

## Resampling

### Daily Mean

In [None]:
print('[%s] Resampling Daily Mean' % datetime.datetime.now().time())
status_daily_mean = status_data.groupby('station_id').resample('D').mean().fillna(0).rolling(window=3, min_periods=1).mean()
status_daily_mean.index = status_daily_mean.index.droplevel(0)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
status_daily_mean.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_daily_mean_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

### 15 Minute Mean

In [None]:
print('[%s] Resampling 15min Mean' % datetime.datetime.now().time())
status_15min_mean = status_data.groupby('station_id').resample('15T').mean().fillna(0).rolling(window=3, min_periods=1).mean()
status_15min_mean.index = status_15min_mean.index.droplevel(0)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
status_15min_mean.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_15min_mean_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

## PreCleaned Files

### Load Status Data

In [17]:
status_15min_mean = pd.DataFrame
status_15min_mean = pd.read_csv('../clean_data/status_data_15min_mean_cleaned.csv', index_col='time', parse_dates=['time'])

status_15min_mean['station_id'] = status_15min_mean['station_id'].astype('int')


### Load Station Data

In [20]:
station_data = pd.DataFrame()
station_data = pd.read_csv('../clean_data/station_data_cleaned.csv')

In [21]:
status_15min_mean.head(3)

Unnamed: 0_level_0,station_id,bikes_available,docks_available
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-08-29 12:00:00,2,2.0,25.0
2013-08-29 12:15:00,2,2.0,25.0
2013-08-29 12:30:00,2,2.0,25.0


In [22]:
station_data.head(3)

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,95113,"(37.329732, -121.90178200000001)"
1,3,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,95113,"(37.330698, -121.888979)"
2,4,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,95113,"(37.333988, -121.894902)"


### Join Data to compute utilization

In [42]:
status_loc_info = pd.merge(status_15min_mean, station_data, on='station_id')
status_loc_info['utilization'] = status_loc_info['docks_available'] / status_loc_info['dockcount']
status_loc_info.drop(['lat', 'long', 'installation', 'lat_long'], axis=1, inplace=True)

In [43]:
status_loc_info.head(10)

Unnamed: 0,station_id,bikes_available,docks_available,name,dockcount,landmark,zip_code,utilization
0,2,2.0,25.0,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.925926
1,2,2.0,25.0,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.925926
2,2,2.0,25.0,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.925926
3,2,2.0,25.0,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.925926
4,2,2.076923,24.923077,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.923077
5,2,2.410256,24.589744,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.910731
6,2,2.74359,24.25641,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.898386
7,2,2.846154,24.153846,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.894587
8,2,2.512821,24.487179,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.906933
9,2,2.179487,24.820513,San Jose Diridon Caltrain Station,27.0,San Jose,95113,0.919278


In [44]:
print('[%s] Writing to File' % datetime.datetime.now().time())
status_loc_info.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_15min_mean_location_cleaned.csv', encoding='utf-8')
status_loc_info.to_csv('../clean_data/status_data_15min_mean_location_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[22:51:29.854678] Writing to File
	[22:54:38.444307]Complete!

