In [1]:
import pandas as pd
import altair as alt
from vega_datasets import data

In [2]:
#pd.set_option('display.max_columns', 50)

In [4]:
# Load the raw training data from the zipped CSV (path is relative to this notebook)
TRAIN_DATA_PATH = '../data/train_data.zip'
df = pd.read_csv(TRAIN_DATA_PATH)

In [8]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


In [9]:
# Select primary key and neighbourhood columns.
# `.assign` returns a brand-new frame, so the added columns never get written
# onto a slice of `df` (which would raise SettingWithCopyWarning and leave it
# ambiguous whether `df` itself is modified).
# NOTE(review): the positional range 498:673 is hard-coded; it silently selects
# different columns if the layout of the input file changes — confirm it still
# matches the neighbourhood block (a label-based slice would be more robust).
df_neigh = df.iloc[:, 498:673].assign(
    unacast_session_count=df['unacast_session_count'],
    external_id=df['external_id'],
    month=df['month'],
    year=df['year'],
)

In [10]:
# Reorder columns so `external_id`, `month` & `year` come first.
# The key columns are named explicitly instead of being taken as "the last
# three", so re-running this cell leaves the order unchanged rather than
# rotating three unrelated columns to the front.
key_cols = ['external_id', 'month', 'year']
cols = key_cols + [col for col in df_neigh.columns if col not in key_cols]
df_neigh = df_neigh[cols]

In [11]:
# Remove the `country` column, which is not needed here
df_neigh = df_neigh.drop(columns=['country'])

In [12]:
# Proportion of missing values per column, highest first
missing = df_neigh.isna()
num_missing = missing.sum().sort_values(ascending=False)
prop_missing = num_missing / len(df_neigh)

# Tidy two-column frame: one row per column name with its missing proportion
df_prop_missing = (
    prop_missing
    .rename('prop')
    .rename_axis('col_name')
    .reset_index()
)

In [13]:
# Columns with the highest proportion of missing values (top five)
df_prop_missing.head(n=5)

Unnamed: 0,col_name,prop
0,clean_intersection_density_km,1.0
1,node_density_km,1.0
2,clean_intersection_count_osid,1.0
3,node_density_km_osdw,1.0
4,intersection_density_km_osdw,1.0


In [14]:
# Distribution of proportion of missing values.
# Explicit axis and chart titles so the figure is readable on its own
# (the default x-axis label would be the raw field name "prop (binned)").
alt.Chart(df_prop_missing).mark_bar().encode(
    alt.X('prop:Q', bin=True, title='Proportion of missing values (binned)'),
    alt.Y('count()', title='Number of columns')
).properties(
    title='Distribution of missing-value proportions across columns'
)

In [15]:
# Identify columns missing more than 20% of their values
high_missing_mask = df_prop_missing['prop'] > 0.2
df_prop_missing[high_missing_mask]

Unnamed: 0,col_name,prop
0,clean_intersection_density_km,1.0
1,node_density_km,1.0
2,clean_intersection_count_osid,1.0
3,node_density_km_osdw,1.0
4,intersection_density_km_osdw,1.0
5,street_density_km_osid,1.0
6,edge_density_km_osid,1.0
7,intersection_density_km_osid,1.0
8,node_density_km_osid,1.0
9,edge_density_km_osdw,1.0


In [20]:
# Summary of number of places of interest near playgrounds.
# Only the February 2019 rows are kept before summarising.
# NOTE(review): presumably this leaves one row per playground — confirm.
in_feb_2019 = (df_neigh['month'] == 2) & (df_neigh['year'] == 2019)
df_unique = df_neigh[in_feb_2019]
df_unique.loc[:, 'alcohol':'tourism'].describe()

Unnamed: 0,alcohol,amenity,bakery,bank,bar,cafe,camp_site,car_repair,childcare,clothes_store,...,pharmacy,picnic_site,place_of_worship,police,post_office,restaurant,shop,supermarket,swimming_pool,tourism
count,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,...,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0
mean,0.034717,0.692737,0.039904,0.118516,0.138867,0.153631,0.005188,0.0415,0.006784,0.085794,...,0.048683,0.027534,0.655626,0.039505,0.069832,0.622506,0.644852,0.096967,0.057861,0.169194
std,0.252714,2.208621,0.334336,0.6053,1.386515,1.012468,0.091413,0.270789,0.086826,0.861358,...,0.278333,0.301096,1.515299,0.208683,0.270122,3.448676,2.807171,0.44919,0.63705,1.168492
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,40.0,9.0,10.0,44.0,28.0,3.0,5.0,2.0,21.0,...,4.0,8.0,18.0,3.0,2.0,98.0,52.0,8.0,14.0,33.0


- Median is 0 for every place of interest
    - Can we infer something from the absence of certain kinds of businesses?

In [21]:
# Summary statistics for the playground-to-road/school distance columns
df_distance = df_unique.loc[:, 'distance_to_C':'distance_to_nearest_school']
df_distance.describe()

Unnamed: 0,distance_to_C,distance_to_I,distance_to_M,distance_to_O,distance_to_S,distance_to_U,distance_to_nearest_school
count,2309.0,2309.0,2309.0,2309.0,2309.0,2309.0,2309.0
mean,17400.153828,13564.65,78.474019,76058.3,1986.205409,72425.02,2258.246709
std,27869.79634,74559.32,78.888509,470099.6,6085.353064,484456.8,4468.447873
min,0.31,44.44,0.0,24.42,4.75,36.31,3.62
25%,1887.15,1618.01,34.79,4218.23,370.61,1013.76,416.75
50%,9451.51,4006.54,61.54,10926.63,945.9,3516.78,1162.73
75%,22763.52,11315.53,97.08,22586.84,2450.47,10440.86,2655.62
max,615435.97,2682853.0,1213.82,3776539.0,268116.39,3906135.0,135919.11


- Distances are large values
    - Something to consider if we decide to use algorithms that require feature scaling

In [22]:
# Summary statistics for the walk and bike scores
score_cols = ['walk_score', 'bike_score']
df_unique[score_cols].describe()

Unnamed: 0,walk_score,bike_score
count,2273.0,2265.0
mean,33.328201,41.054746
std,24.195735,17.518431
min,0.0,1.0
25%,13.0,29.0
50%,30.0,39.0
75%,51.0,51.0
max,100.0,100.0


In [249]:
# Walking networks: identifier/location columns joined (on the row index)
# with the block of `*_osdw` columns, then the NaN count per column.
df_loc = df_unique[['external_id', 'longitude', 'latitude']]
df_walk = df_loc.join(
    df_unique.loc[:, 'n_osdw':'streets_per_node_proportion_8_osdw']
)
print(df_walk.isna().sum())

external_id                              0
longitude                                0
latitude                                 0
n_osdw                                 236
m_osdw                                 236
k_avg_osdw                             236
intersection_count_osdw                236
streets_per_node_avg_osdw              236
edge_length_total_osdw                 236
edge_length_avg_osdw                   236
street_length_total_osdw               236
street_length_avg_osdw                 236
street_segments_count_osdw             236
node_density_km_osdw                  2506
intersection_density_km_osdw          2506
edge_density_km_osdw                  2506
street_density_km_osdw                2506
circuity_avg_osdw                      236
self_loop_proportion_osdw              236
clean_intersection_count_osdw         2506
clean_intersection_density_km_osdw    2506
streets_per_node_counts_0_osdw         236
streets_per_node_counts_1_osdw         236
streets_per

Check that there are 236 playgrounds that are completely missing OSMnx data related to walking networks:

In [250]:
# True only if, for every row lacking `n_osdw`, all columns after the three
# identifier/location columns are NaN as well
walk_rows_without_n = df_walk[df_walk['n_osdw'].isna()]
walk_rows_without_n.iloc[:, 3:].isna().all().all()

True

In [251]:
# Playgrounds that are completely missing walking network data
walk_is_missing = df_walk['n_osdw'].isna()
df_walk_missing = df_walk.loc[walk_is_missing]
df_walk_missing

Unnamed: 0,external_id,longitude,latitude,n_osdw,m_osdw,k_avg_osdw,intersection_count_osdw,streets_per_node_avg_osdw,edge_length_total_osdw,edge_length_avg_osdw,...,streets_per_node_counts_8_osdw,streets_per_node_proportion_0_osdw,streets_per_node_proportion_1_osdw,streets_per_node_proportion_2_osdw,streets_per_node_proportion_3_osdw,streets_per_node_proportion_4_osdw,streets_per_node_proportion_5_osdw,streets_per_node_proportion_6_osdw,streets_per_node_proportion_7_osdw,streets_per_node_proportion_8_osdw
15,FM00170822,-88.196035,41.595238,,,,,,,,...,,,,,,,,,,
266,MR00098513,-79.991272,33.425063,,,,,,,,...,,,,,,,,,,
403,FM00171092,-97.073783,32.625273,,,,,,,,...,,,,,,,,,,
410,MR00111088,-98.418530,29.806990,,,,,,,,...,,,,,,,,,,
413,MR00102373,-83.249613,42.287850,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48799,MR00098653,-81.124837,34.318880,,,,,,,,...,,,,,,,,,,
48891,FM00160117,-115.179558,36.140171,,,,,,,,...,,,,,,,,,,
49618,1803910,-73.810522,42.860227,,,,,,,,...,,,,,,,,,,
49696,MR00097431,-95.699479,30.380611,,,,,,,,...,,,,,,,,,,


Repeat the same procedure for biking networks:

In [252]:
# Biking networks: identifier/location columns joined (on the row index)
# with the unsuffixed network columns, then the NaN count per column.
df_bike = df_loc.join(
    df_unique.loc[:, 'n':'streets_per_node_proportion_8']
)
print(df_bike.isna().sum())

external_id                         0
longitude                           0
latitude                            0
n                                 236
m                                 236
k_avg                             236
intersection_count                236
streets_per_node_avg              236
edge_length_total                 236
edge_length_avg                   236
street_length_total               236
street_length_avg                 236
street_segments_count             236
node_density_km                  2506
intersection_density_km          2506
edge_density_km                  2506
street_density_km                2506
circuity_avg                      236
self_loop_proportion              236
clean_intersection_count         2506
clean_intersection_density_km    2506
streets_per_node_counts_0         236
streets_per_node_counts_1         236
streets_per_node_counts_2         236
streets_per_node_counts_3         236
streets_per_node_counts_4         268
streets_per_

Same pattern is observed. Check that there are 236 playgrounds completely missing OSMnx data related to bike networks:

In [253]:
# True only if, for every row lacking `n`, all columns after the three
# identifier/location columns are NaN as well
bike_rows_without_n = df_bike[df_bike['n'].isna()]
bike_rows_without_n.iloc[:, 3:].isna().all().all()

True

In [254]:
# Playgrounds that are completely missing biking network data
bike_is_missing = df_bike['n'].isna()
df_bike_missing = df_bike.loc[bike_is_missing]
df_bike_missing

Unnamed: 0,external_id,longitude,latitude,n,m,k_avg,intersection_count,streets_per_node_avg,edge_length_total,edge_length_avg,...,streets_per_node_counts_8,streets_per_node_proportion_0,streets_per_node_proportion_1,streets_per_node_proportion_2,streets_per_node_proportion_3,streets_per_node_proportion_4,streets_per_node_proportion_5,streets_per_node_proportion_6,streets_per_node_proportion_7,streets_per_node_proportion_8
15,FM00170822,-88.196035,41.595238,,,,,,,,...,,,,,,,,,,
266,MR00098513,-79.991272,33.425063,,,,,,,,...,,,,,,,,,,
403,FM00171092,-97.073783,32.625273,,,,,,,,...,,,,,,,,,,
410,MR00111088,-98.418530,29.806990,,,,,,,,...,,,,,,,,,,
413,MR00102373,-83.249613,42.287850,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48799,MR00098653,-81.124837,34.318880,,,,,,,,...,,,,,,,,,,
48891,FM00160117,-115.179558,36.140171,,,,,,,,...,,,,,,,,,,
49618,1803910,-73.810522,42.860227,,,,,,,,...,,,,,,,,,,
49696,MR00097431,-95.699479,30.380611,,,,,,,,...,,,,,,,,,,


Repeat the same procedure for driving routes:

In [255]:
# Driving routes: identifier/location columns joined (on the row index)
# with the block of `*_osid` columns, then the NaN count per column.
df_drive = df_loc.join(
    df_unique.loc[:, 'n_osid':'streets_per_node_proportion_7_osid']
)
print(df_drive.isna().sum())

external_id                              0
longitude                                0
latitude                                 0
n_osid                                 240
m_osid                                 240
k_avg_osid                             240
intersection_count_osid                240
streets_per_node_avg_osid              240
edge_length_total_osid                 240
edge_length_avg_osid                   240
street_length_total_osid               240
street_length_avg_osid                 240
street_segments_count_osid             240
node_density_km_osid                  2506
intersection_density_km_osid          2506
edge_density_km_osid                  2506
street_density_km_osid                2506
circuity_avg_osid                      240
self_loop_proportion_osid              240
clean_intersection_count_osid         2506
clean_intersection_density_km_osid    2506
streets_per_node_counts_0_osid         240
streets_per_node_counts_1_osid         240
streets_per

It appears that there are 240 playgrounds completely missing OSMnx data related to driving routes. Let's check:

In [256]:
# True only if, for every row lacking `n_osid`, all columns after the three
# identifier/location columns are NaN as well
drive_rows_without_n = df_drive[df_drive['n_osid'].isna()]
drive_rows_without_n.iloc[:, 3:].isna().all().all()

True

In [257]:
# Playgrounds that are completely missing driving route data
drive_is_missing = df_drive['n_osid'].isna()
df_drive_missing = df_drive.loc[drive_is_missing]
df_drive_missing

Unnamed: 0,external_id,longitude,latitude,n_osid,m_osid,k_avg_osid,intersection_count_osid,streets_per_node_avg_osid,edge_length_total_osid,edge_length_avg_osid,...,streets_per_node_counts_6_osid,streets_per_node_counts_7_osid,streets_per_node_proportion_0_osid,streets_per_node_proportion_1_osid,streets_per_node_proportion_2_osid,streets_per_node_proportion_3_osid,streets_per_node_proportion_4_osid,streets_per_node_proportion_5_osid,streets_per_node_proportion_6_osid,streets_per_node_proportion_7_osid
15,FM00170822,-88.196035,41.595238,,,,,,,,...,,,,,,,,,,
266,MR00098513,-79.991272,33.425063,,,,,,,,...,,,,,,,,,,
403,FM00171092,-97.073783,32.625273,,,,,,,,...,,,,,,,,,,
410,MR00111088,-98.418530,29.806990,,,,,,,,...,,,,,,,,,,
413,MR00102373,-83.249613,42.287850,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48799,MR00098653,-81.124837,34.318880,,,,,,,,...,,,,,,,,,,
48891,FM00160117,-115.179558,36.140171,,,,,,,,...,,,,,,,,,,
49618,1803910,-73.810522,42.860227,,,,,,,,...,,,,,,,,,,
49696,MR00097431,-95.699479,30.380611,,,,,,,,...,,,,,,,,,,


*Observations related to OSMnx data*

Walking
- `m_osdw` / `n_osdw` * 2 ≈ `k_avg_osdw`
- `street_length_total_osdw` * 2 ≈ `edge_length_total_osdw`
- `edge_length_avg_osdw` ≈ `street_length_avg_osdw`

Biking

- `m` / `n` * 2 ≈ `k_avg`

In [23]:
# Summary statistics for the crime-rate columns
df_crime = df_unique.loc[:, 'violent_crime':'motor_vehicle_theft']
df_crime.describe()

Unnamed: 0,violent_crime,criminal_homicide,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft
count,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0
mean,331.798762,4.193613,38.697126,75.633134,212.057565,2064.488982,324.918363,1536.409062,206.936846
std,326.336004,6.918084,33.381932,90.285153,228.236959,1430.691573,268.634873,1078.661369,203.942911
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,105.5,0.0,15.0,12.6,54.9,980.9,128.0,744.3,58.9
50%,250.1,1.8,31.9,45.5,152.7,1910.0,275.1,1410.1,143.6
75%,443.8,6.0,54.9,98.0,283.5,2870.3,435.8,2076.2,308.8
max,2007.8,109.3,332.6,836.8,1747.5,12004.7,2369.9,9078.8,1645.7


*Other considerations:*

- Will `income_class`, `density_class` and `climate` be captured by census and weather data?