In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

sns.set()

In [2]:
# IDs of the stations located in San Francisco
sf_stations = [
    39, 41, 42, 45, 46, 47, 48, 49, 50, 51,
    54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
    74, 75, 76, 77, 82, 90, 91,
]

## Load Trip Data

In [None]:
print('Loading Trip Data...')

# Every CSV matching this pattern is read and stacked into a single `trip` frame.
file_path_slug = '../../datasets/bayareabikeshare/*_trip_data.csv'

# Labels applied to each chunk's columns once 'Trip ID' has been moved to the index.
trip_columns = ['Duration', 'Start Date', 'Start Station', 'Start Terminal', 'End Date',
                'End Station', 'End Terminal', 'Bike #', 'Subscriber Type', 'Zip Code']

# Fallback value: if loading fails, later cells still find a (empty) `trip` frame.
trip = pd.DataFrame()

try:
    # glob() returns matches in arbitrary order; sorting makes the row order reproducible.
    file_list = sorted(glob(file_path_slug))
    if not file_list:
        raise FileNotFoundError('no files match %s' % file_path_slug)

    chunks = []

    # load data from each file
    for counter, file in enumerate(file_list, start=1):

        # import file in chunks of 10,000 rows
        for chunk in pd.read_csv(file, chunksize=10000):
            chunk = chunk.set_index('Trip ID')
            chunk.columns = trip_columns
            chunks.append(chunk)

        print('\tfinished file! (%d of %d)'% (counter, len(file_list)))

    # concat chunks
    trip = pd.concat(chunks)

    print('Data Loaded Successfully!')
except Exception as err:
    # Best effort: keep the notebook running, but say what actually failed.
    # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
    print('oops... something went wrong loading the data :(')
    print('\t%s: %s' % (type(err).__name__, err))

trip.info()

## Load Weather Data

In [None]:
print('Loading Weather Data...')

# Every CSV matching this pattern is read and stacked into a single `weather` frame.
file_path_slug = '../../datasets/bayareabikeshare/*_weather_data.csv'

# Labels applied to each chunk's columns before 'Date' is moved to the index.
weather_columns = ['Date', 'Max_Temperature_F', 'Mean_Temperature_F', 'Min_TemperatureF', 'Max_Dew_Point_F',
                   'MeanDew_Point_F', 'Min_Dewpoint_F', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
                   'Max_Sea_Level_Pressure_In', 'Mean_Sea_Level_Pressure_In', 'Min_Sea_Level_Pressure_In',
                   'Max_Visibility_Miles', 'Mean_Visibility_Miles', 'Min_Visibility_Miles',
                   'Max_Wind_Speed_MPH', 'Mean_Wind_Speed_MPH', 'Max_Gust_Speed_MPH', 'Precipitation_In',
                   'Cloud_Cover', 'Events', 'Wind_Dir_Degrees', 'zip']

# Fallback value: if loading fails, later cells still find a (empty) `weather` frame.
weather = pd.DataFrame()

try:
    # glob() returns matches in arbitrary order; sorting makes the row order reproducible.
    file_list = sorted(glob(file_path_slug))
    if not file_list:
        raise FileNotFoundError('no files match %s' % file_path_slug)

    chunks = []

    # load data from each file
    for counter, file in enumerate(file_list, start=1):

        # import file in chunks of 10,000 rows
        for chunk in pd.read_csv(file, chunksize=10000):
            chunk.columns = weather_columns
            chunk = chunk.set_index('Date')
            chunks.append(chunk)

        print('\tfinished file! (%d of %d)'% (counter, len(file_list)))

    # concat chunks
    weather = pd.concat(chunks)

    print('Data Loaded Successfully!')
except Exception as err:
    # Best effort: keep the notebook running, but say what actually failed.
    # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
    print('oops... something went wrong loading the data :()')
    print('\t%s: %s' % (type(err).__name__, err))

weather.info()

## Load Status Data

In [None]:
print('Loading Status Data...')

# Every CSV matching this pattern is read whole and stacked into a single `status` frame.
file_path_slug = '../../datasets/bayareabikeshare/*_status_data.csv'

# glob() returns matches in arbitrary order; sorting makes the row order reproducible.
file_list = sorted(glob(file_path_slug))

# Fail with a message that names the pattern, rather than pd.concat's
# "No objects to concatenate" when nothing matched.
if not file_list:
    raise FileNotFoundError('no status files match %s' % file_path_slug)

frames = []

# load data from each file (one frame per file, no chunking)
for counter, file in enumerate(file_list, start=1):
    # parse_dates=True asks pandas to parse the index only; no index_col is given here.
    # NOTE(review): confirm this flag is still wanted.
    frames.append(pd.read_csv(file, parse_dates=True))
    print('\tfinished file! (%d of %d)'% (counter, len(file_list)))

# concat the per-file frames; each keeps its own row labels
status = pd.concat(frames)

print('Data Loaded Successfully!')

status.info()

## Load Station Data

In [None]:
print('Loading Station Data...')

# Every CSV matching this pattern is read and stacked into a single `station` frame.
file_path_slug = '../../datasets/bayareabikeshare/*_station_data.csv'

# Fallback value: if loading fails, later cells still find a (empty) `station` frame.
station = pd.DataFrame()

try:
    # glob() returns matches in arbitrary order; sorting makes the row order reproducible.
    file_list = sorted(glob(file_path_slug))
    if not file_list:
        raise FileNotFoundError('no files match %s' % file_path_slug)

    chunks = []

    # load data from each file
    for counter, file in enumerate(file_list, start=1):

        # import file in chunks of 10,000 rows
        chunks.extend(pd.read_csv(file, chunksize=10000))

        print('\tfinished file! (%d of %d)'% (counter, len(file_list)))

    # concat chunks
    station = pd.concat(chunks)

    print('Data Loaded Successfully!')
except Exception as err:
    # Best effort: keep the notebook running, but say what actually failed.
    # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
    print('oops... something went wrong loading the data :()')
    print('\t%s: %s' % (type(err).__name__, err))

station.info()

In [None]:
# Preview the first rows of the freshly loaded station table.
station.head()

## Cleanup Data

### Clean Trip Data

In [None]:
# Remove repeated rows (keeping the first occurrence), then rows in which every
# column is missing, and print a summary of what is left.
# NOTE(review): drop_duplicates compares column values only; the 'Trip ID' index
# is not part of the comparison.
trip = (
    trip
    .drop_duplicates(keep='first')
    .dropna(how='all')
)
trip.info()

### Clean Weather Data

In [None]:
# DataFrame.drop_duplicates() compares column values only and ignores the index.
# The weather frame is indexed by 'Date', so two different days with identical
# readings would be collapsed into one row.  Bring the index into the comparison
# so that only true repeats (same date AND same values) are removed; the first
# occurrence is kept.
is_repeat = weather.reset_index().duplicated(keep='first').to_numpy()
weather = weather.loc[~is_repeat]

# Drop rows in which every column is missing, then print a summary.
weather = weather.dropna(how='all')
weather.info()

### Clean Station Data

In [None]:
# Clean the station table in one pass:
#   1. drop repeated rows (first occurrence kept) and rows that are entirely missing,
#   2. parse 'installation' as datetimes,
#   3. force 'dockcount' to int,
#   4. normalise 'station_id' to an integer-formatted string and use it as the index.
# `infer_datetime_format` is deprecated in pandas 2.x (format inference is the
# default there), so it is no longer passed; the trailing `.copy()` was a no-op.
station = (
    station
    .drop_duplicates(keep='first')
    .dropna(how='all')
    .assign(
        installation=lambda df: pd.to_datetime(df['installation']),
        dockcount=lambda df: df['dockcount'].astype('int'),
        station_id=lambda df: df['station_id'].astype('int').astype('str'),
    )
    .set_index('station_id')
)
station.info()

In [None]:
# Display up to the first 90 rows of the cleaned station table.
station.head(90)


### Clean Status Data

In [None]:
# Remove repeated rows (keeping the first occurrence), then rows in which every
# column is missing, and print a summary of what is left.
status = (
    status
    .drop_duplicates(keep='first')
    .dropna(how='all')
)
status.info()

In [None]:
# Preview the first rows of the de-duplicated status table.
status.head()

In [None]:
# Total docks per reading = bikes currently parked + docks currently free.
status['total_docks'] = status['bikes_available'].add(status['docks_available'])

In [None]:
# Preview the status table with the new 'total_docks' column.
status.head()

In [None]:
# Convert the 'time' column to datetimes using an explicit
# year/month/day hour:minute:second layout, then print the frame summary.
parsed_time = pd.to_datetime(status['time'], format='%Y/%m/%d %H:%M:%S')
status['time'] = parsed_time.copy()
status.info()

In [None]:
# Share of docks holding a bike, expressed as a percentage of total docks.
availability_fraction = status['bikes_available'].div(status['total_docks'])
status['bike_availability_ratio'] = availability_fraction.mul(100.0)

status.head()

In [None]:
# Mean of every column per (station, hour of day).
# The grouping key is the plain hour (0-23).  It was previously multiplied by 3,
# which produced the same groups but labelled them 0, 3, ..., 69 instead of the
# hour they represent.
hour_of_day = status['time'].dt.hour
status_station_grouped = status.groupby(['station_id', hour_of_day]).mean()
status_station_grouped.head()



In [None]:
# One figure per station: mean bike-availability ratio for each hour of the day.
# The loop variable is `station_id`, not `station`, so the `station` DataFrame
# defined earlier in the notebook is not overwritten by this cell.
for station_id in status['station_id'].unique():
    try:
        station_status = status[status['station_id'] == station_id]

        # Select the ratio column before aggregating so only that column is averaged.
        hourly_ratio = (
            station_status
            .groupby(station_status['time'].dt.hour)['bike_availability_ratio']
            .mean()
        )

        hourly_ratio.plot(figsize=(20,5))
        plt.title('Station %s Bike Availability Ratio' % station_id)
        plt.show()

    except Exception as err:
        # Best effort: skip the station, but report the real cause.
        # `Exception` (not a bare except) lets a kernel interrupt stop the loop.
        print('station %s has no data, skipping...' % station_id)
        print('\t%s: %s' % (type(err).__name__, err))