# Data Investigation - Status

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

### Import Data

In [2]:
print('Loading Status Data...')

# One frame per raw export. Each stays an empty DataFrame unless a file is
# loaded into its slot below.
status_01 = pd.DataFrame()
status_02 = pd.DataFrame()
status_03 = pd.DataFrame()
status_04 = pd.DataFrame()

try:
    file_path_slug = '../../../datasets/bayareabikeshare/*_status_data.csv'

    # glob all files
    # NOTE(review): glob() gives no ordering guarantee, so which file ends up
    # in status_01..status_04 may differ between machines - confirm before
    # relying on a particular slot holding a particular export.
    file_list = glob(file_path_slug)

    # Not used in this cell; kept in case other cells reference it.
    status_import = pd.DataFrame()

    chunksize = 10000

    # load data from each file
    for counter, file in enumerate(file_list, start=1):

        # Count the lines once to size the progress report. The context
        # manager closes the handle (the bare open() previously leaked it).
        with open(file, 'r') as handle:
            num_lines = sum(1 for _ in handle)

        # read_csv consumes the first line as the header, so it is not a data
        # row; clamp to 1 so the progress arithmetic below never divides by 0.
        num_chunks = max(1, math.ceil((num_lines - 1) / chunksize))
        progress_step = max(1, math.ceil(num_chunks / 10))

        chunks = []

        # import file in chunks
        reader = pd.read_csv(file, chunksize=chunksize, iterator=True)
        for chunk_counter, chunk in enumerate(reader, start=1):

            # append chunk to chunks list
            chunks.append(chunk)

            # Report the first chunk, roughly every tenth of the file, and the last chunk.
            if chunk_counter == 1 or chunk_counter % progress_step == 0 or chunk_counter == num_chunks:
                print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))

        # Only the first four files have a slot; any further files are read
        # but not stored.
        if counter == 1:
            status_01 = pd.concat(chunks)
        elif counter == 2:
            status_02 = pd.concat(chunks)
        elif counter == 3:
            status_03 = pd.concat(chunks)
        elif counter == 4:
            status_04 = pd.concat(chunks)

        print('Finished file! (%d of %d)' % (counter, len(file_list)))

    print('Data Loaded Successfully!')

except Exception as err:
    # Report what failed and re-raise so execution stops here instead of
    # letting later cells run against empty frames.
    print('oops... something went wrong importing the data :(')
    print('\t%s: %s' % (type(err).__name__, err))
    raise
    

    



Loading Status Data...
	[19:37:59.939231] finished chunk 1 of 1835
	[19:38:01.941357] finished chunk 184 of 1835
	[19:38:04.001189] finished chunk 368 of 1835
	[19:38:06.028219] finished chunk 552 of 1835
	[19:38:08.088627] finished chunk 736 of 1835
	[19:38:10.268740] finished chunk 920 of 1835
	[19:38:12.348546] finished chunk 1104 of 1835
	[19:38:14.407318] finished chunk 1288 of 1835
	[19:38:16.520816] finished chunk 1472 of 1835
	[19:38:18.700287] finished chunk 1656 of 1835
	[19:38:20.892560] finished chunk 1835 of 1835
Finished file! (1 of 4)
	[19:38:32.838601] finished chunk 1 of 3665
	[19:38:37.961790] finished chunk 367 of 3665
	[19:38:43.271058] finished chunk 734 of 3665
	[19:38:48.537630] finished chunk 1101 of 3665
	[19:38:53.913744] finished chunk 1468 of 3665
	[19:38:59.660930] finished chunk 1835 of 3665
	[19:39:05.317785] finished chunk 2202 of 3665
	[19:39:11.040887] finished chunk 2569 of 3665
	[19:39:16.897358] finished chunk 2936 of 3665
	[19:39:22.800571] finishe

In [3]:
# Take independent copies of the loaded frames so that in-place edits to
# s1..s4 do not alter status_01..status_04.
s1, s2, s3, s4 = (
    loaded.copy() for loaded in (status_01, status_02, status_03, status_04)
)

In [6]:
# Timestamp layouts used by the raw exports; each is tried in order.
_STATUS_TIME_FORMATS = ("%Y/%m/%d %H:%M:%S", "%m/%d/%Y %H:%M:%S")


def _parse_status_time(raw_times, formats=_STATUS_TIME_FORMATS):
    """Parse a raw 'time' column and truncate it to whole minutes.

    Each layout in ``formats`` is tried in turn, so the result does not depend
    on which export happened to be loaded into which frame.

    Parameters
    ----------
    raw_times : pandas.Series
        The unparsed 'time' column of one status frame.
    formats : sequence of str
        strftime-style layouts to attempt, in order.

    Returns
    -------
    numpy.ndarray
        Values of dtype ``datetime64[m]`` (seconds discarded).

    Raises
    ------
    ValueError
        If none of ``formats`` parses the column.
    """
    last_error = None
    for fmt in formats:
        try:
            parsed = pd.to_datetime(raw_times, format=fmt)
        except (ValueError, TypeError) as err:
            last_error = err
            continue
        # Casting to minute resolution drops the seconds component.
        return parsed.values.astype('<M8[m]')
    raise ValueError(
        "'time' column matches none of the expected formats: %s" % (formats,)
    ) from last_error


# The frames are modified in place: `frame` is the same object as s1..s4.
for label, frame in (('status_01', s1), ('status_02', s2), ('status_03', s3), ('status_04', s4)):
    print('[%s] Started %s[\'time\'] Cleaning' % (datetime.datetime.now().time(), label))
    frame['time'] = _parse_status_time(frame['time'])
    print('\t[%s] Complete!\n' % datetime.datetime.now().time())

[20:26:53.353496] Started status_01['time'] Cleaning
	[20:26:56.228288]Complete!

[20:26:56.228531] Started status_02['time'] Cleaning
	[20:27:01.152647]Complete!

[20:27:01.153027] Started status_03['time'] Cleaning
	[20:27:05.299268]Complete!

[20:27:05.299538] Started status_04['time'] Cleaning
	[20:27:07.215406]Complete!



In [7]:
print('[%s] Starting concat' % datetime.datetime.now().time())
# ignore_index=True gives the combined frame one fresh 0..N-1 index. Without
# it, each source frame's own row labels are kept, so labels repeat and the
# index column written to the CSV below is not unique.
status_data = pd.concat([s1, s2, s3, s4], ignore_index=True)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
# The index is still written as the first (unnamed) CSV column.
status_data.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[20:27:25.031690] Starting concat
	[20:27:46.880513]Complete!

[20:27:46.884507] Writing to File
	[20:39:35.387372]Complete!



In [8]:
# Preview the first five rows of the combined frame.
status_data.head(n=5)

Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,12,15,2014-03-01 00:00:00
1,2,12,15,2014-03-01 00:01:00
2,2,12,15,2014-03-01 00:02:00
3,2,12,15,2014-03-01 00:03:00
4,2,12,15,2014-03-01 00:04:00


In [12]:
# Index the readings by timestamp so they can be resampled by time below.
# Guarded so that re-running this cell is a no-op: once 'time' is the index it
# is no longer a column, and an unconditional set_index would raise KeyError.
if 'time' in status_data.columns:
    status_data = status_data.set_index('time')

In [13]:
# Preview the first five rows now that 'time' is the index.
status_data.head(n=5)

Unnamed: 0_level_0,station_id,bikes_available,docks_available
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-03-01 00:00:00,2,12,15
2014-03-01 00:01:00,2,12,15
2014-03-01 00:02:00,2,12,15
2014-03-01 00:03:00,2,12,15
2014-03-01 00:04:00,2,12,15


## Resampling

### Daily Mean

In [67]:
print('[%s] Resampling Daily Mean' % datetime.datetime.now().time())
# Per station: average the two count columns per calendar day, then smooth
# with a 3-period rolling mean.
# The rolling step is grouped by station so a window never mixes the last rows
# of one station with the first rows of the next, and station_id is moved back
# to a column at the end instead of being dropped with the index level.
status_daily_mean = (
    status_data
    .groupby('station_id')[['bikes_available', 'docks_available']]
    .resample('D')
    .mean()
    # NOTE(review): periods with no readings are treated as 0 - confirm intended.
    .fillna(0)
    .groupby(level='station_id')
    .transform(lambda col: col.rolling(window=3, min_periods=1).mean())
    .reset_index(level='station_id')
)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
status_daily_mean.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_daily_mean_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[21:31:25.534197] Resampling Daily Mean
	[21:33:31.618103]Complete!

[21:33:31.622463] Writing to File
	[21:33:33.106404]Complete!



### 15 Minute Mean

In [69]:
print('[%s] Resampling 15min Mean' % datetime.datetime.now().time())
# Per station: average the two count columns per 15-minute bucket, then smooth
# with a 3-period rolling mean.
# The rolling step is grouped by station so a window never mixes the last rows
# of one station with the first rows of the next, and station_id is moved back
# to a column at the end instead of being dropped with the index level.
status_15min_mean = (
    status_data
    .groupby('station_id')[['bikes_available', 'docks_available']]
    .resample('15min')  # same frequency as the older '15T' alias
    .mean()
    # NOTE(review): periods with no readings are treated as 0 - confirm intended.
    .fillna(0)
    .groupby(level='station_id')
    .transform(lambda col: col.rolling(window=3, min_periods=1).mean())
    .reset_index(level='station_id')
)
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

print('[%s] Writing to File' % datetime.datetime.now().time())
status_15min_mean.to_csv('../../../datasets/bayareabikeshare/CLEANED/status_data_15min_mean_cleaned.csv', encoding='utf-8')
print('\t[%s]Complete!\n' % datetime.datetime.now().time())

[21:38:06.372187] Resampling 15min Mean
	[21:40:50.505325]Complete!

[21:40:50.510883] Writing to File
	[21:42:06.968426]Complete!

