<h1>Prepare Dataset for analysis</h1>
11/17/23<br>
This notebook appends snapshots of Divvy station status with similar file names (e.g. station_status_2023_11_16*.csv) into one csv<br>
and then merges station info and geography info<br>

### next steps
in api-request.py, should rename timestamp to time_reported and remove that code here

# Define Time Period to Combine

In [1]:
#update this to combine different files
filename_import = "station_status_2023_11_17*.csv"
filename_export = "station-status-hourly-2023-11-17.csv"

# import libraries

In [2]:
import pandas as pd
import requests
from datetime import datetime
from pathlib import Path

# read data

### request station data

In [3]:
url = f"https://gbfs.lyft.com/gbfs/2.3/chi/en/station_information.json?"

#request data
response = requests.get(url)
data = response.json()
response

<Response [200]>

In [4]:
df_station_info = pd.DataFrame(data['data']['stations'], columns=['station_id','name','lat','lon','capacity'])
df_station_info.head()

Unnamed: 0,station_id,name,lat,lon,capacity
0,a3b48c07-a135-11e9-9cda-0a87ae2ba916,Leavitt St & Chicago Ave,41.895501,-87.682017,19
1,a3b222fc-a135-11e9-9cda-0a87ae2ba916,Racine Ave & Garfield Blvd,41.794228,-87.655073,11
2,a3af9a83-a135-11e9-9cda-0a87ae2ba916,Drake Ave & Montrose Ave,41.961154,-87.716569,15
3,a3b2d7d9-a135-11e9-9cda-0a87ae2ba916,Western Blvd & 48th Pl,41.805661,-87.683392,11
4,a3b11480-a135-11e9-9cda-0a87ae2ba916,Laramie Ave & Kinzie St,41.887832,-87.755527,11


### read station geography info
I prepared this in QGIS (point-in-polygon, mapping stations to community areas)

In [5]:
df_station_geography = pd.read_csv(
    "../data/station_info_w_communities.csv", usecols=['station_id','community'])
df_station_geography

Unnamed: 0,community,station_id
0,DOUGLAS,a3aae35a-a135-11e9-9cda-0a87ae2ba916
1,DOUGLAS,a3a75d8a-a135-11e9-9cda-0a87ae2ba916
2,DOUGLAS,a3a859c4-a135-11e9-9cda-0a87ae2ba916
3,DOUGLAS,a3ad2843-a135-11e9-9cda-0a87ae2ba916
4,DOUGLAS,a3a781a6-a135-11e9-9cda-0a87ae2ba916
...,...,...
1647,EDISON PARK,1806749752967643708
1648,EDISON PARK,1806749752967643704
1649,EDISON PARK,1806753657092916076
1650,EDISON PARK,1806749761557578336


### read station staus data files matching specified pattern

In [6]:
df_station_status = pd.DataFrame()

In [7]:
for file_path in Path('../data').glob(filename_import):
    #read next file
    df = pd.read_csv(file_path)
    
    #timestamp for data retrieval. This should later be moved to the api-request.py script called in GitHub actions
    time_retrieved_str = file_path.name[15:32]

    date_object = datetime.strptime(time_retrieved_str, '%Y_%m_%d_%I%M%p')
    time_retrieved = date_object.strftime('%m/%d/%Y %I:%M %p')
    
    df["time_retrieved"]= time_retrieved
    df.rename(columns={'timestamp': 'time_reported'}, inplace=True)
    
    #append this file
    df_station_status = pd.concat([df_station_status,df], ignore_index=True)
df_station_status.head()

Unnamed: 0,num_bikes_disabled,num_scooters_available,num_docks_available,num_docks_disabled,is_installed,is_renting,num_bikes_available,num_scooters_unavailable,num_ebikes_available,station_id,is_returning,time_reported,n_classic,n_electric,n_scooters,time_retrieved
0,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,1,11/17/2023 01:02 PM,8,1,0,11/17/2023 01:04 PM
1,0,0.0,6,0,1,1,14,1.0,1,a3a3b731-a135-11e9-9cda-0a87ae2ba916,1,11/17/2023 01:02 PM,13,1,0,11/17/2023 01:04 PM
2,3,0.0,6,0,1,1,5,0.0,1,792ceba0-564f-4a2c-bcb5-90da781ffeef,1,11/17/2023 01:02 PM,4,1,0,11/17/2023 01:04 PM
3,0,0.0,7,0,1,1,8,0.0,2,a3afd294-a135-11e9-9cda-0a87ae2ba916,1,11/17/2023 01:02 PM,6,2,0,11/17/2023 01:04 PM
4,0,0.0,6,0,1,1,13,0.0,6,a3a8804b-a135-11e9-9cda-0a87ae2ba916,1,11/17/2023 01:02 PM,7,6,0,11/17/2023 01:04 PM


# Prepare Data for Analysis

### Merge Community Info into Station info

In [8]:
df_station_info = pd.merge(df_station_info, df_station_geography, on='station_id', how='left')
df_station_info.head()

Unnamed: 0,station_id,name,lat,lon,capacity,community
0,a3b48c07-a135-11e9-9cda-0a87ae2ba916,Leavitt St & Chicago Ave,41.895501,-87.682017,19,WEST TOWN
1,a3b222fc-a135-11e9-9cda-0a87ae2ba916,Racine Ave & Garfield Blvd,41.794228,-87.655073,11,NEW CITY
2,a3af9a83-a135-11e9-9cda-0a87ae2ba916,Drake Ave & Montrose Ave,41.961154,-87.716569,15,ALBANY PARK
3,a3b2d7d9-a135-11e9-9cda-0a87ae2ba916,Western Blvd & 48th Pl,41.805661,-87.683392,11,NEW CITY
4,a3b11480-a135-11e9-9cda-0a87ae2ba916,Laramie Ave & Kinzie St,41.887832,-87.755527,11,AUSTIN


### Merge Station Info into Station Status

In [9]:
df_stations = pd.merge(df_station_status, df_station_info, on='station_id')
df_stations.head()

Unnamed: 0,num_bikes_disabled,num_scooters_available,num_docks_available,num_docks_disabled,is_installed,is_renting,num_bikes_available,num_scooters_unavailable,num_ebikes_available,station_id,...,time_reported,n_classic,n_electric,n_scooters,time_retrieved,name,lat,lon,capacity,community
0,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 01:02 PM,8,1,0,11/17/2023 01:04 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK
1,0,0.0,2,0,1,1,9,0.0,2,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 01:04 AM,7,2,0,11/17/2023 01:05 AM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK
2,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 02:06 PM,8,1,0,11/17/2023 02:06 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK
3,0,0.0,2,0,1,1,9,0.0,2,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 02:06 AM,7,2,0,11/17/2023 02:07 AM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK
4,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 03:03 PM,8,1,0,11/17/2023 03:05 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK


### calculate key metrics

In [10]:
#add calculated columns
df_stations['is_no_classic']=(df_stations['n_classic']==0)
df_stations['is_no_docks']=(df_stations['num_docks_available']==0)
df_stations['is_problem_station']= df_stations['is_no_classic'] | df_stations['is_no_docks']

df_stations.head()

Unnamed: 0,num_bikes_disabled,num_scooters_available,num_docks_available,num_docks_disabled,is_installed,is_renting,num_bikes_available,num_scooters_unavailable,num_ebikes_available,station_id,...,n_scooters,time_retrieved,name,lat,lon,capacity,community,is_no_classic,is_no_docks,is_problem_station
0,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,0,11/17/2023 01:04 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False
1,0,0.0,2,0,1,1,9,0.0,2,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,0,11/17/2023 01:05 AM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False
2,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,0,11/17/2023 02:06 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False
3,0,0.0,2,0,1,1,9,0.0,2,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,0,11/17/2023 02:07 AM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False
4,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,0,11/17/2023 03:05 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False


In [11]:
#identify and remove public racks
df_stations['is_public_rack']=df_stations['name'].str[:11]=='Public Rack'
df_stations_divvy = df_stations[df_stations['is_public_rack']==False]
df_stations_divvy.head()

Unnamed: 0,num_bikes_disabled,num_scooters_available,num_docks_available,num_docks_disabled,is_installed,is_renting,num_bikes_available,num_scooters_unavailable,num_ebikes_available,station_id,...,time_retrieved,name,lat,lon,capacity,community,is_no_classic,is_no_docks,is_problem_station,is_public_rack
0,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 01:04 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False,False
1,0,0.0,2,0,1,1,9,0.0,2,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 01:05 AM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False,False
2,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 02:06 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False,False
3,0,0.0,2,0,1,1,9,0.0,2,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 02:07 AM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False,False
4,0,0.0,2,0,1,1,9,0.0,1,a3b10a40-a135-11e9-9cda-0a87ae2ba916,...,11/17/2023 03:05 PM,Pulaski Rd & Lake St,41.885409,-87.726491,11,WEST GARFIELD PARK,False,False,False,False


In [14]:
df_stations_divvy.keys()

Index(['num_bikes_disabled', 'num_scooters_available', 'num_docks_available',
       'num_docks_disabled', 'is_installed', 'is_renting',
       'num_bikes_available', 'num_scooters_unavailable',
       'num_ebikes_available', 'station_id', 'is_returning', 'time_reported',
       'n_classic', 'n_electric', 'n_scooters', 'time_retrieved', 'name',
       'lat', 'lon', 'capacity', 'community', 'is_no_classic', 'is_no_docks',
       'is_problem_station', 'is_public_rack'],
      dtype='object')

# Export for analysis

In [15]:
df_stations_divvy.to_csv(f"../results/{filename_export}", index=False)

In [None]:
#df_station_info.to_csv(f"../data/station_info.csv", index=False)