In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

sns.set()

In [2]:
# IDs of the stations located in San Francisco
sf_stations = [
    39, 41, 42, 45, 46, 47, 48, 49, 50, 51,
    54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
    74, 75, 76, 77, 82, 90, 91,
]

## Load Trip Data

In [3]:
print('Loading Trip Data...')

file_path_slug = '../../datasets/bayareabikeshare/*_trip_data.csv'

# labels applied positionally to each chunk once 'Trip ID' has become the index
trip_columns = ['Duration', 'Start Date', 'Start Station', 'Start Terminal', 'End Date',
                'End Station', 'End Terminal', 'Bike #', 'Subscriber Type', 'Zip Code']

# fallback so `trip` is still a DataFrame if loading fails below
trip = pd.DataFrame()

try:
    # glob all files
    file_list = glob(file_path_slug)
    if not file_list:
        # fail with a clear reason instead of pd.concat's "No objects to concatenate"
        raise FileNotFoundError('no files match %s' % file_path_slug)

    chunks = []

    # load data from each file
    for counter, path in enumerate(file_list, start=1):

        # import file in chunks of 10,000 rows
        for chunk in pd.read_csv(path, chunksize=10000):
            chunk = chunk.set_index('Trip ID')
            chunk.columns = trip_columns
            chunks.append(chunk)

        print('\tfinished file! (%d of %d)' % (counter, len(file_list)))

    # concat chunks
    trip = pd.concat(chunks)

    print('Data Loaded Successfully!')
except Exception as err:
    # `Exception` (not a bare except) lets a kernel interrupt stop the load,
    # and the cause is reported instead of being silently discarded
    print('oops... something went wrong loading the data :()')
    print('\t%s: %s' % (type(err).__name__, err))

trip.info()

Loading Trip Data...
	finished file! (1 of 4)
	finished file! (2 of 4)
	finished file! (3 of 4)
	finished file! (4 of 4)
Data Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
Int64Index: 983648 entries, 913465 to 198776
Data columns (total 10 columns):
Duration           983648 non-null int64
Start Date         983648 non-null object
Start Station      983648 non-null object
Start Terminal     983648 non-null int64
End Date           983648 non-null object
End Station        983648 non-null object
End Terminal       983648 non-null int64
Bike #             983648 non-null int64
Subscriber Type    983648 non-null object
Zip Code           976838 non-null object
dtypes: int64(4), object(6)
memory usage: 82.6+ MB


## Load Weather Data

In [4]:
print('Loading Weather Data...')

file_path_slug = '../../datasets/bayareabikeshare/*_weather_data.csv'

# labels applied positionally to every chunk before 'Date' becomes the index
weather_columns = ['Date', 'Max_Temperature_F', 'Mean_Temperature_F', 'Min_TemperatureF', 'Max_Dew_Point_F',
                   'MeanDew_Point_F', 'Min_Dewpoint_F', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
                   'Max_Sea_Level_Pressure_In', 'Mean_Sea_Level_Pressure_In', 'Min_Sea_Level_Pressure_In',
                   'Max_Visibility_Miles', 'Mean_Visibility_Miles', 'Min_Visibility_Miles',
                   'Max_Wind_Speed_MPH', 'Mean_Wind_Speed_MPH', 'Max_Gust_Speed_MPH', 'Precipitation_In',
                   'Cloud_Cover', 'Events', 'Wind_Dir_Degrees', 'zip']

# fallback so `weather` is still a DataFrame if loading fails below
weather = pd.DataFrame()

try:
    # glob all files
    file_list = glob(file_path_slug)
    if not file_list:
        # fail with a clear reason instead of pd.concat's "No objects to concatenate"
        raise FileNotFoundError('no files match %s' % file_path_slug)

    chunks = []

    # load data from each file
    for counter, path in enumerate(file_list, start=1):

        # import file in chunks of 10,000 rows
        for chunk in pd.read_csv(path, chunksize=10000):
            chunk.columns = weather_columns
            chunk = chunk.set_index('Date')
            chunks.append(chunk)

        print('\tfinished file! (%d of %d)' % (counter, len(file_list)))

    # concat chunks
    weather = pd.concat(chunks)

    print('Data Loaded Successfully!')
except Exception as err:
    # `Exception` (not a bare except) lets a kernel interrupt stop the load,
    # and the cause is reported instead of being silently discarded
    print('oops... something went wrong loading the data :()')
    print('\t%s: %s' % (type(err).__name__, err))

weather.info()

Loading Weather Data...
	finished file! (1 of 4)
	finished file! (2 of 4)
	finished file! (3 of 4)
	finished file! (4 of 4)
Data Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
Index: 5495 entries, 3/1/2014 to 8/31/2015
Data columns (total 23 columns):
Max_Temperature_F             5491 non-null float64
Mean_Temperature_F            5491 non-null float64
Min_TemperatureF              5491 non-null float64
Max_Dew_Point_F               5441 non-null float64
MeanDew_Point_F               5441 non-null float64
Min_Dewpoint_F                5441 non-null float64
Max_Humidity                  5441 non-null float64
Mean_Humidity                 5441 non-null float64
Min_Humidity                  5441 non-null float64
Max_Sea_Level_Pressure_In     5494 non-null float64
Mean_Sea_Level_Pressure_In    5494 non-null float64
Min_Sea_Level_Pressure_In     5494 non-null float64
Max_Visibility_Miles          5473 non-null float64
Mean_Visibility_Miles         5473 non-null float64
Min_Visi

## Load Status Data

In [5]:
print('Loading Status Data...')

file_path_slug = '../../datasets/bayareabikeshare/*02_status_data.csv'

# glob all files
file_list = glob(file_path_slug)
if not file_list:
    # pd.concat would otherwise fail with "No objects to concatenate"
    raise FileNotFoundError('no status files match %s' % file_path_slug)

chunks = []

# load data from each file
for counter, path in enumerate(file_list, start=1):
    # no date parsing is requested here, so 'time' is read as plain text;
    # it is converted later with an explicit format string
    chunks.append(pd.read_csv(path))
    print('\tfinished file! (%d of %d)' % (counter, len(file_list)))

# concat the per-file frames
status = pd.concat(chunks)

print('Data Loaded Successfully!')

status.info()

Loading Status Data...
	finished file! (1 of 1)
Data Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16994602 entries, 0 to 16994601
Data columns (total 4 columns):
station_id         int64
bikes_available    int64
docks_available    int64
time               object
dtypes: int64(3), object(1)
memory usage: 648.3+ MB


## Load Station Data

In [6]:
print('Loading Station Data...')

file_path_slug = '../../datasets/bayareabikeshare/*_station_data.csv'

# fallback so `station` is still a DataFrame if loading fails below
station = pd.DataFrame()

try:
    # glob all files
    file_list = glob(file_path_slug)
    if not file_list:
        # fail with a clear reason instead of pd.concat's "No objects to concatenate"
        raise FileNotFoundError('no files match %s' % file_path_slug)

    chunks = []

    # load data from each file
    for counter, path in enumerate(file_list, start=1):

        # import file in chunks of 10,000 rows
        for chunk in pd.read_csv(path, chunksize=10000):
            chunks.append(chunk)

        print('\tfinished file! (%d of %d)' % (counter, len(file_list)))

    # concat chunks
    station = pd.concat(chunks)

    print('Data Loaded Successfully!')
except Exception as err:
    # `Exception` (not a bare except) lets a kernel interrupt stop the load,
    # and the cause is reported instead of being silently discarded
    print('oops... something went wrong loading the data :()')
    print('\t%s: %s' % (type(err).__name__, err))

station.info()

Loading Trip Data...
	finished file! (1 of 4)
	finished file! (2 of 4)
	finished file! (3 of 4)
	finished file! (4 of 4)
Data Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 69
Data columns (total 7 columns):
station_id      276 non-null float64
name            276 non-null object
lat             276 non-null float64
long            276 non-null float64
dockcount       276 non-null float64
landmark        276 non-null object
installation    276 non-null object
dtypes: float64(4), object(3)
memory usage: 75.1+ KB


In [7]:
# preview the first five rows of `station`
station.head()

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation
0,2.0,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013
1,3.0,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013
2,4.0,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013
3,5.0,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013
4,6.0,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013


## Cleanup Data

### Clean Trip Data

In [8]:
# Remove rows whose column values repeat an earlier row (the 'Trip ID' index
# is not part of the comparison), then rows in which every column is missing.
trip = trip.drop_duplicates(keep='first').dropna(how='all')
trip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 983647 entries, 913465 to 198776
Data columns (total 10 columns):
Duration           983647 non-null int64
Start Date         983647 non-null object
Start Station      983647 non-null object
Start Terminal     983647 non-null int64
End Date           983647 non-null object
End Station        983647 non-null object
End Terminal       983647 non-null int64
Bike #             983647 non-null int64
Subscriber Type    983647 non-null object
Zip Code           976837 non-null object
dtypes: int64(4), object(6)
memory usage: 82.6+ MB


### Clean Weather Data

In [9]:
# Remove rows whose column values repeat an earlier row (the 'Date' index is
# not part of the comparison), then rows in which every column is missing.
weather = weather.drop_duplicates(keep='first').dropna(how='all')
weather.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5495 entries, 3/1/2014 to 8/31/2015
Data columns (total 23 columns):
Max_Temperature_F             5491 non-null float64
Mean_Temperature_F            5491 non-null float64
Min_TemperatureF              5491 non-null float64
Max_Dew_Point_F               5441 non-null float64
MeanDew_Point_F               5441 non-null float64
Min_Dewpoint_F                5441 non-null float64
Max_Humidity                  5441 non-null float64
Mean_Humidity                 5441 non-null float64
Min_Humidity                  5441 non-null float64
Max_Sea_Level_Pressure_In     5494 non-null float64
Mean_Sea_Level_Pressure_In    5494 non-null float64
Min_Sea_Level_Pressure_In     5494 non-null float64
Max_Visibility_Miles          5473 non-null float64
Mean_Visibility_Miles         5473 non-null float64
Min_Visibility_Miles          5473 non-null float64
Max_Wind_Speed_MPH            5494 non-null float64
Mean_Wind_Speed_MPH           5494 non-null float64
Ma

### Clean Station Data

In [10]:
# Clean the station table in one pass:
#   1. drop rows that exactly repeat an earlier row, then rows that are entirely empty
#   2. parse 'installation' as datetimes, cast 'dockcount' to int and
#      'station_id' to an integer-formatted string
#   3. index the result by 'station_id'
# `assign` builds a new frame, so no column is written into a derived frame
# and no defensive `.copy()` is needed. `infer_datetime_format` is omitted
# because it is deprecated in pandas 2.x.
# NOTE(review): rows that differ in any column survive drop_duplicates, so a
# station_id may appear more than once in the index — confirm this is intended.
station = (
    station
    .drop_duplicates(keep='first')
    .dropna(how='all')
    .assign(
        installation=lambda df: pd.to_datetime(df['installation']),
        dockcount=lambda df: df['dockcount'].astype('int'),
        station_id=lambda df: df['station_id'].astype('int').astype('str'),
    )
    .set_index('station_id')
)
station.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83 entries, 2 to 25
Data columns (total 6 columns):
name            83 non-null object
lat             83 non-null float64
long            83 non-null float64
dockcount       83 non-null int64
landmark        83 non-null object
installation    83 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 4.5+ KB


In [11]:
# preview the first ten rows of `station`
station.head(10)


Unnamed: 0_level_0,name,lat,long,dockcount,landmark,installation
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,2013-08-06
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,2013-08-05
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,2013-08-06
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,2013-08-05
6,San Pedro Square,37.336721,-121.894074,15,San Jose,2013-08-07
7,Paseo de San Antonio,37.333798,-121.886943,15,San Jose,2013-08-07
8,San Salvador at 1st,37.330165,-121.885831,15,San Jose,2013-08-05
9,Japantown,37.348742,-121.894715,15,San Jose,2013-08-05
10,San Jose City Hall,37.337391,-121.886995,15,San Jose,2013-08-06
11,MLK Library,37.335885,-121.88566,19,San Jose,2013-08-06


### Clean Status Data

In [None]:
# Remove rows whose column values repeat an earlier row, then rows in which
# every column is missing.
status = status.drop_duplicates(keep='first').dropna(how='all')
status.info()

In [12]:
# preview the first five rows of `status`
status.head()

Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013/08/29 12:06:01
1,2,2,25,2013/08/29 12:07:01
2,2,2,25,2013/08/29 12:08:01
3,2,2,25,2013/08/29 12:09:01
4,2,2,25,2013/08/29 12:10:01


In [13]:
# dock total per reading: bikes currently docked plus docks currently free
status['total_docks'] = status['bikes_available'].add(status['docks_available'])

In [None]:
# Parse the 'time' strings with an explicit format. pd.to_datetime already
# returns a new Series, so no extra `.copy()` of the column is made.
status['time'] = pd.to_datetime(status['time'], format='%Y/%m/%d %H:%M:%S')
status.info()

In [None]:
# status['bike_availability_ratio'] = status['bikes_available'] / (status['bikes_available'] + status['docks_available']) * 100.0

# status.head()

In [None]:
# resample to hourly mean by station
status_station_grouped = status.groupby(['station_id', status.time.dt.hour*3]).mean()
status_station_grouped.head()



In [None]:
# One figure per station: mean bike-availability ratio (%) by hour of day.
# The ratio is computed here from 'bikes_available' and 'docks_available', so
# the loop does not depend on a precomputed 'bike_availability_ratio' column.
# The loop variable is `station_id` so the `station` DataFrame is not overwritten.
# sort=False keeps stations in order of first appearance in `status`.
for station_id, readings in status.groupby('station_id', sort=False):
    capacity = readings['bikes_available'] + readings['docks_available']
    ratio = readings['bikes_available'] / capacity * 100.0

    # average the ratio within each hour of the day
    hourly_ratio = ratio.groupby(readings['time'].dt.hour).mean()

    # nothing to draw when every hourly value is missing
    if hourly_ratio.dropna().empty:
        print('station %s has no data, skipping...' % station_id)
        continue

    fig, ax = plt.subplots(figsize=(20, 5))
    hourly_ratio.plot(ax=ax)
    ax.set_title('Station %s Bike Availability Ratio' % station_id)
    ax.set_xlabel('Hour of day')
    ax.set_ylabel('Bikes available (% of docks)')
    plt.show()
    # release the figure so one open figure per station does not accumulate
    plt.close(fig)