In [177]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Optional: uncomment to show up to 50 columns when displaying wide frames
#pd.set_option('display.max_columns', 50)
# Lift Altair's row-count limit so the full frames used below can be charted
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [None]:
# Load the training data; the path is relative to the notebook's working directory
df = pd.read_csv('../data/train_data.zip')

In [None]:
# Peek at the first rows to check the load and the column layout
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


In [None]:
# Select primary key and neighbourhood columns.
# NOTE: the neighbourhood block is taken by position (columns 498-672), so this
# depends on the column order of the loaded CSV.
# `.copy()` gives df_neigh its own data: adding columns to a bare `iloc` slice
# raises SettingWithCopyWarning and leaves it unclear whether `df` is affected.
df_neigh = df.iloc[:, 498:673].copy()
df_neigh['unacast_session_count'] = df['unacast_session_count']
df_neigh['external_id'] = df['external_id']
df_neigh['month'] = df['month']
df_neigh['year'] = df['year']

In [None]:
# Reorder columns so `external_id`, `month` & `year` come first.
# The key columns are picked by name rather than by rotating the last three
# columns, so re-running this cell leaves the order unchanged.
key_cols = ['external_id', 'month', 'year']
cols = key_cols + [c for c in df_neigh.columns if c not in key_cols]
df_neigh = df_neigh[cols]

In [None]:
# Remove the `country` column, which is not needed for this analysis
df_neigh = df_neigh.drop(columns=['country'])

In [None]:
# Proportion of missing values per column, sorted from most to least missing
missing = df_neigh.isna()
num_missing = missing.sum().sort_values(ascending=False)
prop_missing = num_missing / len(df_neigh)
# Long-form table (one row per column name) for querying and charting
df_prop_missing = (
    prop_missing
    .rename_axis('col_name')
    .reset_index(name='prop')
)

In [None]:
# Entries 50-59 of the sorted proportions (positional slice)
prop_missing.iloc[50:60]

streets_per_node_counts_2_osid        0.096169
streets_per_node_proportion_2_osid    0.096169
streets_per_node_proportion_3_osid    0.096169
streets_per_node_counts_3_osid        0.096169
m_osid                                0.095770
self_loop_proportion_osid             0.095770
edge_length_avg_osid                  0.095770
streets_per_node_proportion_1_osid    0.095770
streets_per_node_counts_0_osid        0.095770
circuity_avg_osid                     0.095770
dtype: float64

In [None]:
# Columns with the highest share of missing values (the table is sorted descending)
df_prop_missing.head()

Unnamed: 0,col_name,prop
0,clean_intersection_density_km,1.0
1,node_density_km,1.0
2,clean_intersection_count_osid,1.0
3,node_density_km_osdw,1.0
4,intersection_density_km_osdw,1.0


In [None]:
# Histogram: how many columns fall into each missing-proportion bin
missing_hist = alt.Chart(df_prop_missing).mark_bar().encode(
    x=alt.X('prop:Q', bin=True),
    y=alt.Y('count()', title='Number of columns'),
)
missing_hist

In [None]:
# Columns where more than 20% of the values are missing
df_prop_missing[df_prop_missing['prop'] > 0.2]

Unnamed: 0,col_name,prop
0,clean_intersection_density_km,1.0
1,node_density_km,1.0
2,clean_intersection_count_osid,1.0
3,node_density_km_osdw,1.0
4,intersection_density_km_osdw,1.0
5,street_density_km_osid,1.0
6,edge_density_km_osid,1.0
7,intersection_density_km_osid,1.0
8,node_density_km_osid,1.0
9,edge_density_km_osdw,1.0


What are the clean variables?


In [None]:
# Summary of number of places of interest near playgrounds.
# Only the February 2019 rows are kept (presumably one row per playground - verify).
in_feb_2019 = (df_neigh['month'] == 2) & (df_neigh['year'] == 2019)
df_unique = df_neigh[in_feb_2019]
df_unique.loc[:, 'alcohol':'tourism'].describe()

Unnamed: 0,alcohol,amenity,bakery,bank,bar,cafe,camp_site,car_repair,childcare,clothes_store,...,pharmacy,picnic_site,place_of_worship,police,post_office,restaurant,shop,supermarket,swimming_pool,tourism
count,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,...,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0,2506.0
mean,0.034717,0.692737,0.039904,0.118516,0.138867,0.153631,0.005188,0.0415,0.006784,0.085794,...,0.048683,0.027534,0.655626,0.039505,0.069832,0.622506,0.644852,0.096967,0.057861,0.169194
std,0.252714,2.208621,0.334336,0.6053,1.386515,1.012468,0.091413,0.270789,0.086826,0.861358,...,0.278333,0.301096,1.515299,0.208683,0.270122,3.448676,2.807171,0.44919,0.63705,1.168492
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,40.0,9.0,10.0,44.0,28.0,3.0,5.0,2.0,21.0,...,4.0,8.0,18.0,3.0,2.0,98.0,52.0,8.0,14.0,33.0


In [27]:
# Count the playgrounds that have no amenity of any kind near them
amenity_totals = df_unique.loc[:, 'alcohol':'tourism'].sum(axis=1)
amenity_totals.eq(0).sum()

875

In [39]:
# Correlation of each amenity type with the target.
# Amenity columns are picked by position (10-46) from df_unique's column order,
# but the values come from the full frame `df`, i.e. all months.
amenity_cols = df_unique.columns[10:47].tolist()
df_amen = df[amenity_cols].join(df['unacast_session_count'])

In [45]:
# Correlation of every column with the target, strongest first.
# The target's self-correlation is dropped by label, not by position: a positional
# `[:-1]` removes the wrong row once more columns have been appended to df_amen.
df_amen.corr()['unacast_session_count'].drop('unacast_session_count').sort_values(ascending=False)

restaurant                  0.224898
cafe                        0.210442
alcohol                     0.207067
bar                         0.191859
fast_food                   0.189773
supermarket                 0.188776
bakery                      0.185682
shop                        0.185602
convenience_store           0.181395
amenity                     0.164743
laundry_or_dry_cleaning     0.159887
clothes_store               0.142488
ice_cream                   0.137340
museum_or_gallery           0.135595
community_centre            0.133017
bank                        0.127938
pharmacy                    0.112607
hairdresser_or_beauty       0.105988
park                        0.099185
hotel                       0.099054
childcare                   0.096826
fitness_or_sports_centre    0.088969
post_office                 0.086175
place_of_worship            0.084818
office                      0.077607
tourism                     0.072798
healthcare                  0.061859
p

In [38]:
# Aggregate amenity features and their correlation with the target.
# `B09001e1` is used as the population figure (going by the feature names below).
amenity_total = df_amen.loc[:, 'alcohol':'tourism'].sum(axis=1)
population = df['B09001e1']
df_amen['total_amenities'] = amenity_total
df_amen['total_amenities_to_population'] = df_amen['total_amenities'] / population
df_amen['total_amenities_multiplied_population'] = df_amen['total_amenities'] * population
summary_cols = [
    "unacast_session_count",
    "total_amenities",
    "total_amenities_to_population",
    'total_amenities_multiplied_population',
]
df_amen[summary_cols].corr()

Unnamed: 0,unacast_session_count,total_amenities,total_amenities_to_population,total_amenities_multiplied_population
unacast_session_count,1.0,0.227574,0.040597,0.158489
total_amenities,0.227574,1.0,0.315675,0.638294
total_amenities_to_population,0.040597,0.315675,1.0,0.034565
total_amenities_multiplied_population,0.158489,0.638294,0.034565,1.0


- Median is 0 for every place of interest
    - Can we infer something from the absence of certain kinds of businesses?

In [15]:
# Summary statistics for the playground-to-road / playground-to-school distances
distance_cols = df_unique.loc[:, 'distance_to_C':'distance_to_nearest_school']
distance_cols.describe()

Unnamed: 0,distance_to_C,distance_to_I,distance_to_M,distance_to_O,distance_to_S,distance_to_U,distance_to_nearest_school
count,2309.0,2309.0,2309.0,2309.0,2309.0,2309.0,2309.0
mean,17400.153828,13564.65,78.474019,76058.3,1986.205409,72425.02,2258.246709
std,27869.79634,74559.32,78.888509,470099.6,6085.353064,484456.8,4468.447873
min,0.31,44.44,0.0,24.42,4.75,36.31,3.62
25%,1887.15,1618.01,34.79,4218.23,370.61,1013.76,416.75
50%,9451.51,4006.54,61.54,10926.63,945.9,3516.78,1162.73
75%,22763.52,11315.53,97.08,22586.84,2450.47,10440.86,2655.62
max,615435.97,2682853.0,1213.82,3776539.0,268116.39,3906135.0,135919.11


- Distances are large values
    - Something to consider if we decide to use algorithms that require feature scaling

In [16]:
# Summary statistics for the walk and bike scores
df_unique[['walk_score', 'bike_score']].describe()

Unnamed: 0,walk_score,bike_score
count,2273.0,2265.0
mean,33.328201,41.054746
std,24.195735,17.518431
min,0.0,1.0
25%,13.0,29.0
50%,30.0,39.0
75%,51.0,51.0
max,100.0,100.0


In [17]:
# Playground id and coordinates, reused by the biking/driving cells below
df_loc = df_unique[['external_id', 'longitude', 'latitude']]
# Walking-network (`_osdw`) statistics, joined to the location columns on the row index
walk_stats = df_unique.loc[:, 'n_osdw':'streets_per_node_proportion_8_osdw']
df_walk = df_loc.join(walk_stats)
# Missing-value count per column
print(df_walk.isna().sum())

external_id                              0
longitude                                0
latitude                                 0
n_osdw                                 236
m_osdw                                 236
k_avg_osdw                             236
intersection_count_osdw                236
streets_per_node_avg_osdw              236
edge_length_total_osdw                 236
edge_length_avg_osdw                   236
street_length_total_osdw               236
street_length_avg_osdw                 236
street_segments_count_osdw             236
node_density_km_osdw                  2506
intersection_density_km_osdw          2506
edge_density_km_osdw                  2506
street_density_km_osdw                2506
circuity_avg_osdw                      236
self_loop_proportion_osdw              236
clean_intersection_count_osdw         2506
clean_intersection_density_km_osdw    2506
streets_per_node_counts_0_osdw         236
streets_per_node_counts_1_osdw         236
streets_per

Check that there are 236 playgrounds that are completely missing OSMnx data related to walking networks:

In [18]:
# For rows lacking `n_osdw`, is every network column (everything after the 3 location columns) missing too?
df_walk.loc[df_walk['n_osdw'].isna()].iloc[:, 3:].isna().all(axis=None)

True

In [19]:
# Playgrounds with no walking-network data at all
no_walk_data = df_walk['n_osdw'].isna()
df_walk_missing = df_walk.loc[no_walk_data]
df_walk_missing

Unnamed: 0,external_id,longitude,latitude,n_osdw,m_osdw,k_avg_osdw,intersection_count_osdw,streets_per_node_avg_osdw,edge_length_total_osdw,edge_length_avg_osdw,...,streets_per_node_counts_8_osdw,streets_per_node_proportion_0_osdw,streets_per_node_proportion_1_osdw,streets_per_node_proportion_2_osdw,streets_per_node_proportion_3_osdw,streets_per_node_proportion_4_osdw,streets_per_node_proportion_5_osdw,streets_per_node_proportion_6_osdw,streets_per_node_proportion_7_osdw,streets_per_node_proportion_8_osdw
15,FM00170822,-88.196035,41.595238,,,,,,,,...,,,,,,,,,,
266,MR00098513,-79.991272,33.425063,,,,,,,,...,,,,,,,,,,
403,FM00171092,-97.073783,32.625273,,,,,,,,...,,,,,,,,,,
410,MR00111088,-98.418530,29.806990,,,,,,,,...,,,,,,,,,,
413,MR00102373,-83.249613,42.287850,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48799,MR00098653,-81.124837,34.318880,,,,,,,,...,,,,,,,,,,
48891,FM00160117,-115.179558,36.140171,,,,,,,,...,,,,,,,,,,
49618,1803910,-73.810522,42.860227,,,,,,,,...,,,,,,,,,,
49696,MR00097431,-95.699479,30.380611,,,,,,,,...,,,,,,,,,,


Repeat the same procedure for biking networks:

In [20]:
# Biking-network statistics (unsuffixed columns), joined to the location columns on the row index
bike_stats = df_unique.loc[:, 'n':'streets_per_node_proportion_8']
df_bike = df_loc.join(bike_stats)
# Missing-value count per column
print(df_bike.isna().sum())

external_id                         0
longitude                           0
latitude                            0
n                                 236
m                                 236
k_avg                             236
intersection_count                236
streets_per_node_avg              236
edge_length_total                 236
edge_length_avg                   236
street_length_total               236
street_length_avg                 236
street_segments_count             236
node_density_km                  2506
intersection_density_km          2506
edge_density_km                  2506
street_density_km                2506
circuity_avg                      236
self_loop_proportion              236
clean_intersection_count         2506
clean_intersection_density_km    2506
streets_per_node_counts_0         236
streets_per_node_counts_1         236
streets_per_node_counts_2         236
streets_per_node_counts_3         236
streets_per_node_counts_4         268
streets_per_

The same pattern is observed. Check that there are 236 playgrounds completely missing OSMnx data related to bike networks:

In [21]:
# For rows lacking `n`, is every network column (everything after the 3 location columns) missing too?
df_bike.loc[df_bike['n'].isna()].iloc[:, 3:].isna().all(axis=None)

True

In [22]:
# Playgrounds with no biking-network data at all
no_bike_data = df_bike['n'].isna()
df_bike_missing = df_bike.loc[no_bike_data]
df_bike_missing

Unnamed: 0,external_id,longitude,latitude,n,m,k_avg,intersection_count,streets_per_node_avg,edge_length_total,edge_length_avg,...,streets_per_node_counts_8,streets_per_node_proportion_0,streets_per_node_proportion_1,streets_per_node_proportion_2,streets_per_node_proportion_3,streets_per_node_proportion_4,streets_per_node_proportion_5,streets_per_node_proportion_6,streets_per_node_proportion_7,streets_per_node_proportion_8
15,FM00170822,-88.196035,41.595238,,,,,,,,...,,,,,,,,,,
266,MR00098513,-79.991272,33.425063,,,,,,,,...,,,,,,,,,,
403,FM00171092,-97.073783,32.625273,,,,,,,,...,,,,,,,,,,
410,MR00111088,-98.418530,29.806990,,,,,,,,...,,,,,,,,,,
413,MR00102373,-83.249613,42.287850,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48799,MR00098653,-81.124837,34.318880,,,,,,,,...,,,,,,,,,,
48891,FM00160117,-115.179558,36.140171,,,,,,,,...,,,,,,,,,,
49618,1803910,-73.810522,42.860227,,,,,,,,...,,,,,,,,,,
49696,MR00097431,-95.699479,30.380611,,,,,,,,...,,,,,,,,,,


Repeat the same procedure for driving routes:

In [23]:
# Driving-network (`_osid`) statistics, joined to the location columns on the row index
drive_stats = df_unique.loc[:, 'n_osid':'streets_per_node_proportion_7_osid']
df_drive = df_loc.join(drive_stats)
# Missing-value count per column
print(df_drive.isna().sum())

external_id                              0
longitude                                0
latitude                                 0
n_osid                                 240
m_osid                                 240
k_avg_osid                             240
intersection_count_osid                240
streets_per_node_avg_osid              240
edge_length_total_osid                 240
edge_length_avg_osid                   240
street_length_total_osid               240
street_length_avg_osid                 240
street_segments_count_osid             240
node_density_km_osid                  2506
intersection_density_km_osid          2506
edge_density_km_osid                  2506
street_density_km_osid                2506
circuity_avg_osid                      240
self_loop_proportion_osid              240
clean_intersection_count_osid         2506
clean_intersection_density_km_osid    2506
streets_per_node_counts_0_osid         240
streets_per_node_counts_1_osid         240
streets_per

It appears that there are 240 playgrounds completely missing OSMnx data related to driving routes. Let's check:

In [24]:
# For rows lacking `n_osid`, is every network column (everything after the 3 location columns) missing too?
df_drive.loc[df_drive['n_osid'].isna()].iloc[:, 3:].isna().all(axis=None)

True

In [25]:
# Playgrounds with no driving-route data at all
no_drive_data = df_drive['n_osid'].isna()
df_drive_missing = df_drive.loc[no_drive_data]
df_drive_missing

Unnamed: 0,external_id,longitude,latitude,n_osid,m_osid,k_avg_osid,intersection_count_osid,streets_per_node_avg_osid,edge_length_total_osid,edge_length_avg_osid,...,streets_per_node_counts_6_osid,streets_per_node_counts_7_osid,streets_per_node_proportion_0_osid,streets_per_node_proportion_1_osid,streets_per_node_proportion_2_osid,streets_per_node_proportion_3_osid,streets_per_node_proportion_4_osid,streets_per_node_proportion_5_osid,streets_per_node_proportion_6_osid,streets_per_node_proportion_7_osid
15,FM00170822,-88.196035,41.595238,,,,,,,,...,,,,,,,,,,
266,MR00098513,-79.991272,33.425063,,,,,,,,...,,,,,,,,,,
403,FM00171092,-97.073783,32.625273,,,,,,,,...,,,,,,,,,,
410,MR00111088,-98.418530,29.806990,,,,,,,,...,,,,,,,,,,
413,MR00102373,-83.249613,42.287850,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48799,MR00098653,-81.124837,34.318880,,,,,,,,...,,,,,,,,,,
48891,FM00160117,-115.179558,36.140171,,,,,,,,...,,,,,,,,,,
49618,1803910,-73.810522,42.860227,,,,,,,,...,,,,,,,,,,
49696,MR00097431,-95.699479,30.380611,,,,,,,,...,,,,,,,,,,


*Observations related to OSMnx data*

Correlations:
    - Walking
        - `m_osdw` / `n_osdw` * 2 ≈ `k_avg_osdw`
        - `edge_length_avg_osdw` ≈ `street_length_avg_osdw`
    - Biking
        - `m` / `n` * 2 ≈ `k_avg`
        - `edge_length_avg` ≈ `street_length_avg`
    - Driving
        - `m_osid` / `n_osid` * 2 ≈ `k_avg_osid`
        - `edge_length_avg_osid` ≈ `street_length_avg_osid`

In [26]:
def plot_street_networks(columns, title, data=None):
    """Display side-by-side histograms of one statistic across several network columns.

    Parameters
    ----------
    columns : list of str
        Column names to compare; each one becomes its own facet. Callers in
        this notebook pass the bike, walk (`_osdw`) and drive (`_osid`)
        variants of the same statistic.
    title : str
        X-axis title shared by all facets.
    data : pandas.DataFrame, optional
        Frame to read `columns` from. Defaults to the notebook-level
        `df_unique`, so existing two-argument calls behave as before.

    Returns
    -------
    None
        The chart is rendered with `display` instead of being returned, so a
        call placed on the last line of a cell is not shown twice.
    """
    if data is None:
        # Resolved at call time, so the function picks up the current df_unique
        data = df_unique

    # Long format: one row per (variable, value) pair, as the facet encoding needs
    source = data.loc[:, columns].melt()

    chart = alt.Chart(source).mark_bar().encode(
        alt.X('value:Q', bin=alt.Bin(maxbins=30), title=title),
        alt.Y('count()', title='Number of playgrounds'),
        alt.Color('variable:N')
    ).properties(
        width=300
    ).facet(
        column='variable:N'
    )

    display(chart)

In [27]:
# Visualizing the number of nodes (`n`) for bike paths, walk paths, and driving routes.
# `n` is the node count (the denominator in k_avg ≈ 2 * m / n); intersections are
# tracked separately in the `intersection_count*` columns, so the axis is labelled "nodes".

plot_street_networks(['n', 'n_osdw','n_osid'], 'Number of nodes')

In [28]:
# Average number of streets emanating from each node,
# compared across bike paths, walk paths, and driving routes

plot_street_networks(
    columns=['streets_per_node_avg', 'streets_per_node_avg_osdw', 'streets_per_node_avg_osid'],
    title='Average number of streets',
)

In [29]:
# Total edge length of the graph, in meters,
# compared across bike paths, walk paths, and driving routes

plot_street_networks(
    columns=['edge_length_total', 'edge_length_total_osdw', 'edge_length_total_osid'],
    title='Sum of all edge lengths (m)',
)

In [30]:
# Mean edge length of the graph, in meters,
# compared across bike paths, walk paths, and driving routes

plot_street_networks(
    columns=['edge_length_avg', 'edge_length_avg_osdw', 'edge_length_avg_osid'],
    title='Mean edge length (m)',
)

In [31]:
# Visualizing the total length of all edges in the undirected representation, in meters
# for bike paths, walk paths, and driving routes.
# The axis title names a length, not a count of edges, to match what is plotted.

plot_street_networks(['street_length_total',
                      'street_length_total_osdw',
                      'street_length_total_osid'], 'Total street length (m)')

In [32]:
# Visualizing the mean edge length in the undirected representation, in meters
# for bike paths, walk paths, and driving routes.
# Titled "street length" so this chart is not confused with the `edge_length_avg` chart.

plot_street_networks(['street_length_avg',
                      'street_length_avg_osdw',
                      'street_length_avg_osid'], 'Mean street length (m)')

In [33]:
# Number of edges in the undirected representation,
# compared across bike paths, walk paths, and driving routes

plot_street_networks(
    columns=['street_segments_count', 'street_segments_count_osdw', 'street_segments_count_osid'],
    title='Number of edges',
)

In [34]:
# Proportion of nodes with 1 street emanating from them,
# compared across bike paths, walk paths, and driving routes

plot_street_networks(
    columns=[f'streets_per_node_proportion_1{suffix}' for suffix in ('', '_osdw', '_osid')],
    title='Proportions of nodes',
)

In [35]:
# Proportion of nodes with 2 streets emanating from them,
# compared across bike paths, walk paths, and driving routes
plot_street_networks(
    columns=[f'streets_per_node_proportion_2{suffix}' for suffix in ('', '_osdw', '_osid')],
    title='Proportions of nodes',
)

In [36]:
# Proportion of nodes with 3 streets emanating from them,
# compared across bike paths, walk paths, and driving routes
plot_street_networks(
    columns=[f'streets_per_node_proportion_3{suffix}' for suffix in ('', '_osdw', '_osid')],
    title='Proportions of nodes',
)

In [37]:
# Proportion of nodes with 4 streets emanating from them,
# compared across bike paths, walk paths, and driving routes
plot_street_networks(
    columns=[f'streets_per_node_proportion_4{suffix}' for suffix in ('', '_osdw', '_osid')],
    title='Proportions of nodes',
)

In [38]:
# Summary statistics for the crime-rate columns
crime_cols = df_unique.loc[:, 'violent_crime':'motor_vehicle_theft']
crime_cols.describe()

Unnamed: 0,violent_crime,criminal_homicide,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft
count,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0,2505.0
mean,331.798762,4.193613,38.697126,75.633134,212.057565,2064.488982,324.918363,1536.409062,206.936846
std,326.336004,6.918084,33.381932,90.285153,228.236959,1430.691573,268.634873,1078.661369,203.942911
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,105.5,0.0,15.0,12.6,54.9,980.9,128.0,744.3,58.9
50%,250.1,1.8,31.9,45.5,152.7,1910.0,275.1,1410.1,143.6
75%,443.8,6.0,54.9,98.0,283.5,2870.3,435.8,2076.2,308.8
max,2007.8,109.3,332.6,836.8,1747.5,12004.7,2369.9,9078.8,1645.7


In [39]:
# Per-column summary statistics for the neighbourhood features, plus each
# column's correlation with the target and its coefficient of variation.
neigh_desc = df_neigh.describe().T
# Restrict to numeric columns explicitly: df_neigh also holds string columns
# (e.g. `external_id`), which DataFrame.corr() rejects on pandas >= 2.0.
neigh_desc["session_corr"] = (
    df_neigh.select_dtypes(include="number").corr()["unacast_session_count"]
)
# Coefficient of variation (std / mean); columns whose mean is 0 come out as NaN/inf
neigh_desc["CoV"] = neigh_desc["std"] / neigh_desc["mean"]

In [40]:
# Histogram of each feature's correlation with the session count
corr_hist = alt.Chart(neigh_desc).mark_bar().encode(
    x=alt.X("session_corr:Q", bin=alt.Bin(maxbins=30), title="Correlation (binned)"),
    y=alt.Y('count()', title="Count"),
)
corr_hist.properties(title='Histogram - Correlation with Session Count')

In [41]:
# Features ranked by coefficient of variation, most variable first
neigh_desc.sort_values("CoV", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,session_corr,CoV
camp_site,50120.0,0.005188,0.091396,0.0,0.0,0.0,0.0,3.000000,-0.006473,17.618247
streets_per_node_proportion_2_osdw,45400.0,0.000020,0.000310,0.0,0.0,0.0,0.0,0.010417,-0.002615,15.634859
childcare,50120.0,0.006784,0.086810,0.0,0.0,0.0,0.0,2.000000,0.096826,12.796781
streets_per_node_counts_2_osdw,45400.0,0.007930,0.093530,0.0,0.0,0.0,0.0,2.000000,0.000697,11.795191
community_centre,50120.0,0.013966,0.155393,0.0,0.0,0.0,0.0,4.000000,0.133017,11.126142
...,...,...,...,...,...,...,...,...,...,...
street_density_km_osid,0.0,,,,,,,,,
clean_intersection_count_osid,0.0,,,,,,,,,
clean_intersection_density_km_osid,0.0,,,,,,,,,
streets_per_node_counts_0_osid,45320.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,,


In [42]:
# Histogram of the per-feature standard deviation.
# The title no longer says "(std/mean)": this chart plots `std`, not the ratio.
alt.Chart(neigh_desc).mark_bar().encode(
    alt.X("std:Q", bin=alt.Bin(maxbins=30)),
    y=alt.Y('count()', title="Count")).properties(
    title='Histogram - Standard deviation'
)


In [43]:
# Features whose standard deviation exceeds 200,000
neigh_desc[neigh_desc['std'] > 200000]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,session_corr,CoV
distance_to_O,46180.0,76058.296748,470002.887858,24.42,4218.23,10926.63,22586.84,3776539.49,0.032987,6.179508
distance_to_U,46180.0,72425.024786,484357.154691,36.31,1013.76,3516.78,10440.86,3906134.56,0.033962,6.687704


In [44]:
# Histogram of the per-feature coefficient of variation (std / mean).
# The statistic is the coefficient of *variation*; the title is corrected accordingly.
alt.Chart(neigh_desc).mark_bar().encode(
    alt.X("CoV:Q", bin=alt.Bin(maxbins=30)),
    y=alt.Y('count()', title="Count")).properties(
    title='Histogram - Coefficient of Variation (std/mean)'
)


In [45]:
# Features whose coefficient of variation exceeds 14
neigh_desc[neigh_desc['CoV'] > 14]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,session_corr,CoV
camp_site,50120.0,0.005188,0.091396,0.0,0.0,0.0,0.0,3.0,-0.006473,17.618247
streets_per_node_proportion_2_osdw,45400.0,2e-05,0.00031,0.0,0.0,0.0,0.0,0.010417,-0.002615,15.634859


Fit a PCA model and plot the proportion of total variance explained by the first $k$ components. This analysis only includes numerical features and the data has been scaled.

In [195]:
# Candidate PCA inputs: the label-based column range from `longitude` to `houses_per_sq_km`.
# NOTE(review): `df_neigh_pca` is reassigned by the following cells (drop, impute, scale),
# so those cells must be run in order, once each, after this one.
df_neigh_pca = df_neigh.loc[:, 'longitude':'houses_per_sq_km']

In [196]:
# Features excluded from PCA (the original note labels them as having string input type).
# The density / clean-intersection statistics recur for each network flavour
# (no suffix, `_osdw`, `_osid`), so those names are generated from one base list.
density_stat_names = [
    'node_density_km',
    'intersection_density_km',
    'edge_density_km',
    'street_density_km',
    'clean_intersection_count',
    'clean_intersection_density_km',
]
string_features = (
    ['closest_place_distance', 'closest_place_category',
     'income_class', 'density_class', 'climate']
    + density_stat_names
    + ['streets_per_node_counts_8', 'streets_per_node_proportion_8']
    + [f'{stat}_osdw' for stat in density_stat_names]
    + [f'{stat}_osid' for stat in density_stat_names]
    + ['county']
)

In [197]:
# Remove the excluded features listed in `string_features` before PCA
df_neigh_pca = df_neigh_pca.drop(columns=string_features)

In [198]:
# Replace each NaN with the mean of its column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df_neigh_pca)
df_neigh_pca = imputer.transform(df_neigh_pca)

In [199]:
# Standardize the features before performing PCA
scaler = StandardScaler()
scaler.fit(df_neigh_pca)
df_neigh_pca = scaler.transform(df_neigh_pca)

In [200]:
# Cumulative proportion of variance explained by the first k components, k = 1..80.
# A single fit is enough: the leading explained-variance ratios do not depend on how
# many components are kept, so a running sum gives the value for every k.
# svd_solver='full' keeps the result deterministic (the default 'auto' may choose a
# randomized solver, and no random_state is set here).
max_k = min(80, *df_neigh_pca.shape)  # PCA cannot keep more components than rows/columns
pca = PCA(n_components=max_k, svd_solver='full')
pca.fit(df_neigh_pca)
pca_dict = {
    'k': list(range(1, max_k + 1)),
    'prop': np.cumsum(pca.explained_variance_ratio_).tolist(),
}

In [201]:
# One row per k with the cumulative explained-variance proportion
pca_df = pd.DataFrame(pca_dict)

variance_curve = alt.Chart(pca_df).mark_line().encode(
    x=alt.X('k:Q'),
    y=alt.Y('prop:Q'),
)
variance_curve.properties(title='Proportion of total variance explained by k principal components')

Let's try running PCA again. We'll try imputing the missing values with 0.

In [205]:
# Same feature selection as the mean-imputed run
df_pca_zero = (
    df_neigh
    .loc[:, 'longitude':'houses_per_sq_km']
    .drop(columns=string_features)
)

# Replace every NaN with 0
imputer_zero = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer_zero.fit(df_pca_zero)
df_pca_zero = imputer_zero.transform(df_pca_zero)

In [206]:
# Standardize the zero-imputed features before performing PCA
scaler_zero = StandardScaler()
scaler_zero.fit(df_pca_zero)
df_pca_zero = scaler_zero.transform(df_pca_zero)

In [208]:
# Cumulative proportion of variance explained by the first k components (zero-imputed data).
# A single fit is enough: the leading explained-variance ratios do not depend on how
# many components are kept, so a running sum gives the value for every k.
# svd_solver='full' keeps the result deterministic (the default 'auto' may choose a
# randomized solver, and no random_state is set here).
max_k_zero = min(80, *df_pca_zero.shape)  # PCA cannot keep more components than rows/columns
pca = PCA(n_components=max_k_zero, svd_solver='full')
pca.fit(df_pca_zero)
pca_dict_zero = {
    'k': list(range(1, max_k_zero + 1)),
    'prop': np.cumsum(pca.explained_variance_ratio_).tolist(),
}

In [209]:
# One row per k with the cumulative explained-variance proportion (zero-imputed run)
pca_df_zero = pd.DataFrame(pca_dict_zero)

variance_curve_zero = alt.Chart(pca_df_zero).mark_line().encode(
    x=alt.X('k:Q'),
    y=alt.Y('prop:Q'),
)
variance_curve_zero.properties(title='Proportion of total variance explained by k principal components')

*Summary*
- I found 34 columns that were missing data for 90% of rows
    - I'll add these feature names to the Google sheet
- The median count of places of interest near playgrounds is 0 across all establishments
    - Distributions appear to be skewed
    - Perhaps it's possible to infer something from the absence of certain types of businesses. For example, the absence of `alcohol`, `bar`, and `police` might suggest that the playground is located in a family-friendly area.
- 236 playgrounds missing walking network data (592-627); 236 playgrounds missing biking network data (556-591); 240 playgrounds missing driving route data (628-661)
- Some features derived from OSMnx data are linear transformations of one another
- Distributions of crime rates appear to be less skewed
- Many of the categorical features (i.e. input type is `string`) in this subset are to be placed on the chopping block
- Performing PCA on the numerical features (146 columns) revealed that 90% of the total variance could be explained by about 50 principal components. About 60 principal components are required to explain 95% of the total variance in those 146 columns.