# Walkability

# Setup

In [1]:
import pandas as pd

In [2]:
walkability_df = pd.read_csv("https://edg.epa.gov/EPADataCommons/public/OA/EPA_SmartLocationDatabase_V3_Jan_2021_Final.csv")

In [3]:
walkability_df

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,1,4.811300e+11,4.811300e+11,48,113,7825,4,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.184697,0.000476,0.137707,6,14,15,17,14.000000,3110.360820,2.978361e+05
1,2,4.811300e+11,4.811300e+11,48,113,7825,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.323221,0.000801,0.231868,3,10,12,14,10.833333,3519.469110,4.849451e+05
2,3,4.811300e+11,4.811300e+11,48,113,7825,3,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.314628,0.000736,0.213146,1,1,7,17,8.333333,1697.091802,1.067059e+05
3,4,4.811300e+11,4.811300e+11,48,113,7824,1,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.229821,0.000708,0.205018,16,10,17,17,15.666667,2922.609204,4.818284e+05
4,5,4.811300e+11,4.811300e+11,48,113,7824,2,206.0,"Dallas-Fort Worth, TX-OK",19100.0,...,0.164863,0.000433,0.125296,4,7,11,14,10.166667,3731.971773,6.876848e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220735,220736,7.803100e+11,7.803100e+11,78,30,961000,2,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,20,1,7.333333,3414.446949,3.355857e+05
220736,220737,7.803100e+11,7.803100e+11,78,30,961000,3,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,20,1,7.333333,2421.025608,2.924305e+05
220737,220738,7.803100e+11,7.803100e+11,78,30,961000,5,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,20,1,7.333333,1955.909418,1.619395e+05
220738,220739,7.803100e+11,7.803100e+11,78,30,960700,3,,,,...,-99999.000000,-99999.000000,-99999.000000,1,1,10,1,4.000000,16896.768870,1.038966e+07


# Data Cleaning

There's way too many columns in this dataset. What does everything mean?

Documentation: https://www.epa.gov/sites/default/files/2021-06/documents/epa_sld_3.0_technicaldocumentationuserguide_may2021.pdf

Some things we might want: 
- location data ==> STATEFP
- demographics ==> TotPop
- transit access ==> D2B_E8MIXA, D3b, D4a, D4b025, D4b050
- destination accessibility ==> D5br, D5be, NatWalkInd

In [4]:
walk_df = walkability_df[
    ['STATEFP',
     'D2B_E8MIXA', 'D3B', 'D4A', 'D4B025', 'D4B050',
     'D5BR', 'D5BE', 'NatWalkInd']]
walk_df

Unnamed: 0,STATEFP,D2B_E8MIXA,D3B,D4A,D4B025,D4B050,D5BR,D5BE,NatWalkInd
0,48,0.662091,115.981747,362.10,0.0,0.000000,135362,53504,14.000000
1,48,0.554458,80.145600,718.84,0.0,0.009516,236885,90089,10.833333
2,48,0.000000,24.272717,398.31,0.0,0.000000,230587,82815,8.333333
3,48,0.553831,141.604424,386.24,0.0,0.515377,168433,79657,15.666667
4,48,0.459064,65.307963,638.37,0.0,0.248922,120826,48682,10.166667
...,...,...,...,...,...,...,...,...,...
220735,78,0.000000,370.600642,-99999.00,0.0,0.000000,-99999,-99999,7.333333
220736,78,0.000000,383.915317,-99999.00,0.0,0.000000,-99999,-99999,7.333333
220737,78,0.000000,314.650951,-99999.00,0.0,0.000000,-99999,-99999,7.333333
220738,78,0.000000,54.219116,-99999.00,0.0,0.000000,-99999,-99999,4.000000


| Field Name | Description  | Data Source |
| ---------- | :-- | :-- |
| STATEFP    | State FIPS code  | 2020 Census TIGER/Line |
| D2B_E8MIXA |The mix of employment types in a block group (such as retail, office, or industrial). Higher values correlate with more walk trips. |
| D3B        |Street intersection density (pedestrian-oriented intersections). Higher intersection density is correlated with more walk trips. |
| D4A        | Distance from the population-weighted centroid to nearest transit stop (meters) | 2020 GTFS, 2020 CTOD |
| D4B025     | Proportion of CBG employment within ¼ mile of fixed-guideway transit stop   | 2020 GTFS, 2020 CTOD, 2018 USGS PAD-US, SLD unprotected area polygons  |
| D4B050     | Proportion of CBG employment within ½ mile of fixed-<br>guideway transit stop                                                                                      | 2020 GTFS, 2020 CTOD, 2018<br>USGS PAD-US                                                                                     |
| D5BR       | Jobs within 45-minute transit commute, distance decay<br>(walk network travel time, GTFS schedules) weighted                                                       | 2020 TravelTime API, 2017<br>Census LEHD, 2020 GTFS                                                                           |
| D5BE       | Working age population within 45-minute transit<br>commute, time decay (walk network travel time, GTFS<br>schedules) weighted                                      | 2020 TravelTime API, 2018<br>Census ACS, 2020 GTFS                                                                            |
| NatWalkInd | Walkability index comprised of weighted sum of the ranked values of [D2a_EpHHm] (D2A_Ranked), [D2b_E8MixA] (D2B_Ranked), [D3b] (D3B_Ranked) and [D4a] (D4A_Ranked) | Derived from other SLD<br>variables                                                                                           |

Since these are by city, but our other datasets are by state, we will collapse cities into their states
1. Replace state codes with state names
2. Average metric values

In [5]:
walk_df.describe()

Unnamed: 0,STATEFP,D2B_E8MIXA,D3B,D4A,D4B025,D4B050,D5BR,D5BE,NatWalkInd
count,220740.0,220740.0,220740.0,220740.0,220740.0,220740.0,220740.0,220740.0,220740.0
mean,28.62319,0.531642,78.256064,-57132.809682,0.029835,0.066813,44138.97,152.0207,9.541628
std,16.386075,0.221917,83.199666,49684.780296,0.144842,0.228685,385456.8,203859.0,4.373952
min,1.0,0.0,0.0,-99999.0,0.0,0.0,-99999.0,-99999.0,1.0
25%,13.0,0.397387,13.27715,-99999.0,0.0,0.0,-99999.0,-99999.0,5.833333
50%,29.0,0.570694,60.496455,-99999.0,0.0,0.0,-99999.0,-99999.0,9.166667
75%,42.0,0.701531,116.505177,340.64,0.0,0.0,47751.25,31746.5,13.166667
max,78.0,0.99398,5289.518414,1207.0,1.0,1.0,6963967.0,2697797.0,20.0


In [6]:
# create a dictionary map to convert state codes
fips = pd.read_csv("STATEFPS.csv")
fips_abr = fips["Abr"].values
fips_code =fips["FIPS"].values
fips_dict = dict(zip(fips_code, fips_abr))

In [7]:
# filter out states.territories not in code 
walk_df = walk_df[walk_df["STATEFP"].isin(fips["FIPS"])]
walk_df = walk_df[walk_df["STATEFP"].notna()]
walk_df = walk_df.replace({"STATEFP" : fips_dict})

In [8]:
# not all states have all metrics, so let's filter to positive values only
walk_df = walk_df.groupby("STATEFP").agg(lambda x: x[x>0].mean())
walk_df

Unnamed: 0_level_0,D2B_E8MIXA,D3B,D4A,D4B025,D4B050,D5BR,D5BE,NatWalkInd
STATEFP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AK,0.552704,46.665058,502.789939,4.9e-05,0.000197,50234.6,276.58011,8.204744
AL,0.532267,37.900557,462.952639,,,15096.04,5033.032653,6.8312
AR,0.547573,41.377094,549.742464,0.19512,0.33869,26965.94,10081.834906,6.722559
AZ,0.530323,91.559774,556.426466,0.3125,0.58497,65861.01,42820.979055,10.104197
CA,0.571791,113.788509,482.944305,0.319048,0.524269,160437.7,114664.596189,12.22497
CO,0.579195,96.974266,541.78812,0.220197,0.42317,102538.4,64043.920225,10.530861
CT,0.573835,56.589947,464.678433,0.166857,0.373722,44294.45,26866.644231,10.084462
DC,0.500691,160.734818,263.704391,0.404095,0.704895,442742.0,247796.115813,14.471481
DE,0.560712,68.859402,532.070841,0.150458,0.342986,56307.21,35739.929577,10.481417
FL,0.577565,82.713595,552.172718,0.279499,0.461687,69952.47,49812.244836,10.470168
