In [1]:
import numpy as np
import pandas as pd
import json
# import bike_project as bp

In [2]:
def read_bike_trips(path, location_name, num_shops, app_token, limit=50000, offset=50000):
    '''
    Reads an API accessible dataset given url path and other url call variables
    Parameters:
    -----------
    path (str): url path beginning with https:
    location_name (str): name to assign to the trip counts column
    token (str): app_token
    limit (int): max rows to request
    offset (int): starting point for gathering rows

    Returns:
    --------
    pandas dataframe
    '''
    df_a = pd.read_json( f'{path}?$limit={limit}&$offset={0}&$$app_token={app_token}')
    df_b = pd.read_json( f'{path}?$limit={limit}&$offset={offset}&$$app_token={app_token}')
    df = df_a.append(df_b)    
    # rename trip_count column to location_name
    df[location_name] = df.iloc[:, -2:].sum(axis=1)   
    
    # calculate date, month, year, dow, commuter (boolean), trip count am peak, trip count other times
    df['short_date'] = pd.DatetimeIndex(df['date']).date
    df['month'] = pd.DatetimeIndex(df['date']).month
    df['year'] = pd.DatetimeIndex(df['date']).year
    df['dow'] = pd.DatetimeIndex(df['date']).dayofweek
    df['hour'] = pd.DatetimeIndex(df['date']).hour
    df['am_commuter'] = df['dow'].isin([7,1,2,3,4]) & (df['hour'].isin([5,6,7,8,9]))
    df[f'{location_name}_am_peak'] = np.where(df['am_commuter']==True, df[location_name], 0)
    df[f'{location_name}_other'] = np.where(df['am_commuter']==False, df[location_name], 0)
    
    # collapse table by date and create sum counts for commuter and (other-(2 x commuter))
    df_by_date = df.groupby(['short_date', 'month', 'year', 'dow']).agg(
                                        {f'{location_name}_am_peak':'sum',
                                         f'{location_name}_other':'sum'
                                          }).reset_index()
    
    # commuters travel 2 ways--remove assumed pm commuter trips from trip count other
    df[f'{location_name}_other'] = df[f'{location_name}_other'] - df[f'{location_name}_am_peak']
    
    # add in count of nearby bike shops
    df_by_date[f'{location_name}_bike_shops'] = num_shops
    df_by_date.to_json(f'data/{location_name}.json')
    print(df_by_date.head())
    

In [3]:
urls = {
        'Ballard' : 'https://data.seattle.gov/resource/47yq-6ugv.json',
        'Capitol_Hill' : 'https://data.seattle.gov/resource/j4vh-b42a.json',
        'Central_2nd_Ave' : 'https://data.seattle.gov/resource/avwm-i8ym.json',
# no data until 2019        'Central_7th_Ave' : 'https://data.seattle.gov/resource/qfzg-zmyj.json', 
        'West_Seattle' : 'https://data.seattle.gov/resource/mefu-7eau.json',
        'I90_lid' : 'https://data.seattle.gov/resource/u38e-ybnc.json',
        'NE_Seattle' : 'https://data.seattle.gov/resource/2z5v-ecg8.json',
        'Myrtle_Edwards' : 'https://data.seattle.gov/resource/4qej-qvrz.json'
        }

bike_shops = {
        'Ballard' : 3,
        'Capitol_Hill' : 2,
        'Central_2nd_Ave' : 5,
        'Central_7th_Ave' : 6,
        'West_Seattle' : 3,
        'I90_lid' : 5,
        'NE_Seattle' :6 ,
        'Myrtle_Edwards' : 3
}


In [4]:
limit = 30000
offset = 30000

with open('data/app_token.txt', 'r') as t_file: 
    app_token = t_file.read() 

for k, v in urls.items():
    read_bike_trips(v, k, bike_shops[k], app_token, limit, offset)

   short_date  month  year  dow  Ballard_am_peak  Ballard_other  \
0  2014-01-01      1  2014    2             15.0          369.0   
1  2014-01-02      1  2014    3             80.0          375.0   
2  2014-01-03      1  2014    4             86.0          491.0   
3  2014-01-04      1  2014    5              0.0          555.0   
4  2014-01-05      1  2014    6              0.0          488.0   

   Ballard_bike_shops  
0                   3  
1                   3  
2                   3  
3                   3  
4                   3  
   short_date  month  year  dow  Capitol_Hill_am_peak  Capitol_Hill_other  \
0  2014-01-01      1  2014    2                   5.0               115.0   
1  2014-01-02      1  2014    3                  49.0               140.0   
2  2014-01-03      1  2014    4                  45.0               152.0   
3  2014-01-04      1  2014    5                   0.0               161.0   
4  2014-01-05      1  2014    6                   0.0               

In [5]:
count = 1
for k,v in urls.items():
    if count == 1:
        df = pd.read_json(f'data/{k}.json')
    else:
        df_next = pd.read_json(f'data/{k}.json')
        df = df.merge(df_next, how="left", left_on=["short_date", "month", "year", "dow"], right_on=['short_date', "month", "year", "dow"])
    count += 1

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2345 entries, 0 to 2344
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   short_date                  2345 non-null   int64  
 1   month                       2345 non-null   int64  
 2   year                        2345 non-null   int64  
 3   dow                         2345 non-null   int64  
 4   Ballard_am_peak             2345 non-null   int64  
 5   Ballard_other               2345 non-null   int64  
 6   Ballard_bike_shops          2345 non-null   int64  
 7   Capitol_Hill_am_peak        2314 non-null   float64
 8   Capitol_Hill_other          2314 non-null   float64
 9   Capitol_Hill_bike_shops     2314 non-null   float64
 10  Central_2nd_Ave_am_peak     1980 non-null   float64
 11  Central_2nd_Ave_other       1980 non-null   float64
 12  Central_2nd_Ave_bike_shops  1980 non-null   float64
 13  West_Seattle_am_peak        2345 

In [7]:
df['year']

0       2014
1       2014
2       2014
3       2014
4       2014
        ... 
2340    2020
2341    2020
2342    2020
2343    2020
2344    2020
Name: year, Length: 2345, dtype: int64