In [1]:
import numpy as np
import pandas as pd
import json
# import bike_project as bp

In [20]:
urls = {
        'Ballard' : 'https://data.seattle.gov/resource/47yq-6ugv.json',
        'Capitol_Hill' : 'https://data.seattle.gov/resource/j4vh-b42a.json',
        'Central_2nd_Ave' : 'https://data.seattle.gov/resource/avwm-i8ym.json',
# no data until 2019        'Central_7th_Ave' : 'https://data.seattle.gov/resource/qfzg-zmyj.json', 
        'West_Seattle' : 'https://data.seattle.gov/resource/mefu-7eau.json',
        'I90_lid' : 'https://data.seattle.gov/resource/u38e-ybnc.json',
        'NE_Seattle' : 'https://data.seattle.gov/resource/2z5v-ecg8.json',
        'Myrtle_Edwards' : 'https://data.seattle.gov/resource/4qej-qvrz.json'
        }

bike_shops = {
        'Ballard' : 3,
        'Capitol_Hill' : 2,
        'Central_2nd_Ave' : 5,
#         'Central_7th_Ave' : 6,
        'West_Seattle' : 3,
        'I90_lid' : 5,
        'NE_Seattle' :6 ,
        'Myrtle_Edwards' : 3
}


## Functions written for Seattle Spins: bike trip counter analysis

In [2]:
def r_w_bike_trips(path, location_name, num_shops, app_token, limit=50000, offset=50000):
    '''
    Reads an API accessible dataset given url path and other url call variables.
    Writes out to a json file

    Parameters:
    -----------
    path (str): url path beginning with https:
    location_name (str): name to assign to the trip counts column
    token (str): app_token
    limit (int): max rows to request
    offset (int): starting point for gathering rows

    Output:
    --------
    json-formatted file named 'location_name.json'
    '''
    df_a = pd.read_json( f'{path}?$limit={limit}&$offset={0}&$$app_token={app_token}')
    df_b = pd.read_json( f'{path}?$limit={limit}&$offset={offset}&$$app_token={app_token}')
    df = df_a.append(df_b)    
    # rename trip_count column to location_name
    df[location_name] = df.iloc[:, -2:].sum(axis=1)   
    
    # calculate date, month, year, dow, commuter (boolean), trip count am peak, trip count other times
    df['short_date'] = pd.DatetimeIndex(df['date']).date
    df['month'] = pd.DatetimeIndex(df['date']).month
    df['year'] = pd.DatetimeIndex(df['date']).year
    df['dow'] = pd.DatetimeIndex(df['date']).dayofweek
    df['hour'] = pd.DatetimeIndex(df['date']).hour
    df['am_commuter'] = df['dow'].isin([0,1,2,3,4]) & (df['hour'].isin([5,6,7,8,9]))
    df[f'{location_name}_am_peak'] = np.where(df['am_commuter']==True, df[location_name], 0)
    df[f'{location_name}_other'] = np.where(df['am_commuter']==False, df[location_name], 0)
    
    # collapse table by date and create sum counts for commuter and (other-(2 x commuter))
    df_by_date = df.groupby(['short_date', 'month', 'year', 'dow']).agg(
                                        {f'{location_name}_am_peak':'sum',
                                         f'{location_name}_other':'sum'
                                          }).reset_index()
    
    # commuters travel 2 ways--remove assumed pm commuter trips from trip count other
    df[f'{location_name}_other'] = df[f'{location_name}_other'] - df[f'{location_name}_am_peak']
    
    # add in count of nearby bike shops
    df_by_date[f'{location_name}_bike_shops'] = num_shops
    df_by_date.to_json(f'data/{location_name}.json', date_format='iso')
    
    #test output to be sure the read operation went ok
    print(df_by_date.head())
    

In [39]:
def merge_counter_locations(location_dict):
    '''
    Loops through a dict of counter locations, creating a json file path for each 
    location and merging counter files by date to create a master bike trip
    count file with 'am_peak' and 'other' bike trips for each location for each day.

    Parameters:
    ----------
    location_dict (dict): dict with root names of json files

    Returns:
    -------
    df (dataframe): master bike trip count file
    '''
    count = 1
    for k,v in urls.items():
        if count == 1:
            df = pd.read_json(f'data/{k}.json')
        else:
            df_next = pd.read_json(f'data/{k}.json')
            df = df.merge(df_next, how="left", left_on=["short_date", "month", "year", "dow"], right_on=['short_date', "month", "year", "dow"])
        count += 1
    df['date'] = pd.DatetimeIndex(df['short_date']).date
    return df


In [40]:
def add_weather(df, weather_file, keep_cols):
    '''
    Reads in daily weather data from a NOAA-generated source and merges it
    into a dataframe by date
    Parameters:
    ----------
    df (dataframe): destination dataframe
    weather_file (.csv): source weather .csv file
    keep_cols (list): list of weather attributes to keep

    Returns:
    -------
    Updated dataframe
    '''
    df_seattle_weather = pd.read_csv(weather_csv, usecols=keep_cols)
    df_seattle_weather['date'] = pd.DatetimeIndex(df_seattle_weather["DATE"]).date
    df = df.merge(df_seattle_weather, how="left", left_on="date", right_on='date')
    df.index = df['date']
    return df

In [49]:
def add_daily_summary_data(df, bike_shops_d):
    '''
    Adds daily summary statistics for trip counts
    Parameters:
    ----------
    df (dataframe): pandas table of bike counts by station
    bike_shops_d (dict): dict with number of bike shops in vicinity of bike trip counter location
    
    Returns:
    -------
    df (dataframe): df with summary statistics added
    '''
    few_am_peak, many_am_peak, few_other, many_other = [], [], [], []

    for k,v in bike_shops.items():
        if v <= 3:
            few_am_peak.append('{}_am_peak'.format(k))
            few_other.append(f'{k}_other')
        else:
            many_am_peak.append(f'{k}_am_peak')
            many_other.append(f'{k}_other')

    df['few_am_peak_ttl'] = df[few_am_peak].sum(axis=1)
    df['many_am_peak_ttl'] = df[many_am_peak].sum(axis=1)
    df['few_other_ttl'] = df[few_other].sum(axis=1)
    df['many_other_ttl'] = df[many_other].sum(axis=1)
    df['am_peak_ttl'] = df['few_am_peak_ttl'] + df['many_am_peak_ttl']
    df['other_ttl'] = df['few_other_ttl'] + df['many_other_ttl']
    return df
    

## Function calls follow.

#### This function call to 'r_w_bike_trips' reads externally-sourced raw data and writes one json output file per location. 

In [42]:
# No need to rerun this cell--it reads data from external sources and creates locally-stored json files. 

limit = 30000
offset = 30000

with open('data/app_token.txt', 'r') as t_file: 
    app_token = t_file.read() 

for k, v in urls.items():
    r_w_bike_trips(v, k, bike_shops[k], app_token, limit, offset)

   short_date  month  year  dow  Ballard_am_peak  Ballard_other  \
0  2014-01-01      1  2014    2             15.0          369.0   
1  2014-01-02      1  2014    3             80.0          375.0   
2  2014-01-03      1  2014    4             86.0          491.0   
3  2014-01-04      1  2014    5              0.0          555.0   
4  2014-01-05      1  2014    6              0.0          488.0   

   Ballard_bike_shops  
0                   3  
1                   3  
2                   3  
3                   3  
4                   3  
   short_date  month  year  dow  Capitol_Hill_am_peak  Capitol_Hill_other  \
0  2014-01-01      1  2014    2                   5.0               115.0   
1  2014-01-02      1  2014    3                  49.0               140.0   
2  2014-01-03      1  2014    4                  45.0               152.0   
3  2014-01-04      1  2014    5                   0.0               161.0   
4  2014-01-05      1  2014    6                   0.0               

#### This function call brings all locations together into one master dataframe.

In [53]:
df = merge_counter_locations(urls)


#### This function call appends daily weather data.


In [54]:
weather_csv = 'data/weather.csv'
use_cols = ['DATE',"PRCP","TAVG","TMAX","TMIN"]
add_weather(df, weather_csv, use_cols)

Unnamed: 0_level_0,short_date,month,year,dow,Ballard_am_peak,Ballard_other,Ballard_bike_shops,Capitol_Hill_am_peak,Capitol_Hill_other,Capitol_Hill_bike_shops,...,NE_Seattle_bike_shops,Myrtle_Edwards_am_peak,Myrtle_Edwards_other,Myrtle_Edwards_bike_shops,date,DATE,PRCP,TAVG,TMAX,TMIN
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,2014-01-01T00:00:00.000Z,1,2014,2,15,369,3,5.0,115.0,2.0,...,6,27,423,3,2014-01-01,2014-01-01,0.00,42.0,45.0,38.0
2014-01-02,2014-01-02T00:00:00.000Z,1,2014,3,80,375,3,49.0,140.0,2.0,...,6,235,354,3,2014-01-02,2014-01-02,0.16,45.0,51.0,43.0
2014-01-03,2014-01-03T00:00:00.000Z,1,2014,4,86,491,3,45.0,152.0,2.0,...,6,248,524,3,2014-01-03,2014-01-03,0.06,45.0,48.0,37.0
2014-01-04,2014-01-04T00:00:00.000Z,1,2014,5,0,555,3,0.0,161.0,2.0,...,6,0,665,3,2014-01-04,2014-01-04,0.00,41.0,46.0,33.0
2014-01-05,2014-01-05T00:00:00.000Z,1,2014,6,0,488,3,0.0,188.0,2.0,...,6,0,538,3,2014-01-05,2014-01-05,0.00,37.0,47.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-26,2020-06-26T00:00:00.000Z,6,2020,4,18,117,3,0.0,0.0,2.0,...,6,243,1388,3,2020-06-26,2020-06-26,0.00,70.0,82.0,57.0
2020-06-27,2020-06-27T00:00:00.000Z,6,2020,5,0,33,3,0.0,0.0,2.0,...,6,0,449,3,2020-06-27,2020-06-27,0.56,62.0,64.0,56.0
2020-06-28,2020-06-28T00:00:00.000Z,6,2020,6,0,137,3,0.0,0.0,2.0,...,6,0,1766,3,2020-06-28,2020-06-28,0.34,58.0,71.0,52.0
2020-06-29,2020-06-29T00:00:00.000Z,6,2020,0,13,123,3,0.0,4.0,2.0,...,6,177,1109,3,2020-06-29,2020-06-29,0.00,62.0,72.0,54.0


#### This function call adds daily summary statistics for bike trip counts.

In [55]:
add_daily_summary_data(df, bike_shops)

Unnamed: 0,short_date,month,year,dow,Ballard_am_peak,Ballard_other,Ballard_bike_shops,Capitol_Hill_am_peak,Capitol_Hill_other,Capitol_Hill_bike_shops,...,Myrtle_Edwards_am_peak,Myrtle_Edwards_other,Myrtle_Edwards_bike_shops,date,few_am_peak_ttl,many_am_peak_ttl,few_other_ttl,many_other_ttl,am_peak_ttl,other_ttl
0,2014-01-01T00:00:00.000Z,1,2014,2,15,369,3,5.0,115.0,2.0,...,27,423,3,2014-01-01,51.0,91.0,1011.0,935.0,142.0,1946.0
1,2014-01-02T00:00:00.000Z,1,2014,3,80,375,3,49.0,140.0,2.0,...,235,354,3,2014-01-02,398.0,163.0,1039.0,631.0,561.0,1670.0
2,2014-01-03T00:00:00.000Z,1,2014,4,86,491,3,45.0,152.0,2.0,...,248,524,3,2014-01-03,424.0,206.0,1344.0,727.0,630.0,2071.0
3,2014-01-04T00:00:00.000Z,1,2014,5,0,555,3,0.0,161.0,2.0,...,0,665,3,2014-01-04,0.0,0.0,1589.0,1658.0,0.0,3247.0
4,2014-01-05T00:00:00.000Z,1,2014,6,0,488,3,0.0,188.0,2.0,...,0,538,3,2014-01-05,0.0,0.0,1333.0,1133.0,0.0,2466.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340,2020-06-26T00:00:00.000Z,6,2020,4,18,117,3,0.0,0.0,2.0,...,243,1388,3,2020-06-26,261.0,576.0,1505.0,2570.0,837.0,4075.0
2341,2020-06-27T00:00:00.000Z,6,2020,5,0,33,3,0.0,0.0,2.0,...,0,449,3,2020-06-27,0.0,0.0,482.0,1122.0,0.0,1604.0
2342,2020-06-28T00:00:00.000Z,6,2020,6,0,137,3,0.0,0.0,2.0,...,0,1766,3,2020-06-28,0.0,0.0,1903.0,3738.0,0.0,5641.0
2343,2020-06-29T00:00:00.000Z,6,2020,0,13,123,3,0.0,4.0,2.0,...,177,1109,3,2020-06-29,190.0,418.0,1236.0,2429.0,608.0,3665.0


In [58]:
df[['am_peak_ttl','other_ttl']].sum()

am_peak_ttl    2043853.0
other_ttl      7593003.0
dtype: float64