In [1]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

Note: this notebook was mostly used to generate plots for the proposal and final report.

### Visualizations for the proposal

In [2]:
# Lift Altair's default cap on the number of rows a chart may embed
# (presumably some of the frames plotted below exceed it -- confirm).
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Load the dataset used for the proposal-stage plots (zipped CSV; the path is
# relative to this notebook's directory).
df = pd.read_csv('../data/old_train_data.zip')

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


In [5]:
# Share of missing values in every column, ordered from most to least missing
missing = df.isnull()
num_missing = missing.sum().sort_values(ascending=False)
prop_missing = num_missing.div(len(df))

# Long-form table: one row per column (`col_name`) with its missing share (`prop`)
df_prop_missing = prop_missing.rename_axis('col_name').reset_index(name='prop')

In [6]:
# Columns with less than 20% of their values missing
df_20 = df_prop_missing[df_prop_missing['prop'] < 0.2]
len(df_20)

662

In [7]:
# Columns with a missing share in [20%, 40%)
df_20_40 = df_prop_missing.loc[lambda d: (d['prop'] >= 0.2) & (d['prop'] < 0.4)]
len(df_20_40)

19

In [8]:
# Columns with a missing share in [40%, 60%)
df_40_60 = df_prop_missing.loc[lambda d: (d['prop'] >= 0.4) & (d['prop'] < 0.6)]
len(df_40_60)

31

In [9]:
# Columns with a missing share in [60%, 80%)
df_60_80 = df_prop_missing.loc[lambda d: (d['prop'] >= 0.6) & (d['prop'] < 0.8)]
len(df_60_80)

37

In [10]:
# Columns with at least 80% of their values missing
df_80_100 = df_prop_missing[df_prop_missing['prop'] >= 0.8]
len(df_80_100)

112

In [11]:
# Proportion of missing values per column, as a one-column frame (`Nan_rate`)
# indexed by column name. The histogram cells below plot this frame.
nan_rate = (df.isna().sum(axis=0) / df.shape[0]).to_frame().rename(columns={0: "Nan_rate"})

# Upper edges of the 20%-wide buckets.
vals = [0.2, 0.4, 0.6, 0.8, 1]

# Count columns per bucket using the same edges as the `query` cells above:
# [0, 0.2), [0.2, 0.4), [0.4, 0.6), [0.6, 0.8), [0.8, 1].
# `np.histogram` treats every bin as left-closed/right-open except the last,
# which is closed on both sides, so a rate of exactly 0.2 or 1.0 is counted
# exactly once. The previous `<` / `>` pair dropped values equal to 0.2, used
# `*` on boolean frames (which triggers a numexpr warning), and relied on
# positional `[0]` indexing of a label-indexed Series.
bucket_counts, _ = np.histogram(nan_rate["Nan_rate"], bins=[0] + vals)
count_list = bucket_counts.tolist()

count_list

  f"evaluating in Python space because the {repr(op_str)} "


[662, 19, 31, 37, 112]

In [12]:
# Sanity check: every column should fall in exactly one bucket, so the bucket
# counts must add up to 861 (presumably the number of columns in `df` -- confirm).
print(count_list, sum(count_list))

[662, 19, 31, 37, 112] 861


In [13]:
# Plot histogram binned by 10%
# `Nan_rate` holds proportions in [0, 1]; format the x-axis ticks as
# percentages so they agree with the "% NaN" axis title.
alt.Chart(nan_rate).mark_bar().encode(
    alt.X("Nan_rate:Q",
          bin=alt.Bin(maxbins=10),
          axis=alt.Axis(format='%'),
          title="% NaN in column"),
    y=alt.Y('count()', title="Count")).properties(
    title='Number of columns by percentage of missing values'
)

In [14]:
# Plot histogram binned by 20%
# `Nan_rate` holds proportions in [0, 1]; format the x-axis ticks as
# percentages so they agree with the "% NaN" axis title.
alt.Chart(nan_rate).mark_bar().encode(
    alt.X("Nan_rate:Q",
          bin=alt.Bin(maxbins=5),
          axis=alt.Axis(format='%'),
          title="% NaN in column"),
    y=alt.Y('count()', title="Count")).properties(
    title='Number of columns by percentage of missing values'
)

### EDA on how to impute `NaN` values

In [15]:
# Slice out the OSMnx columns (label-based slice, both endpoints included)
df_osm = df.loc[:, 'n':'streets_per_node_proportion_7_osid']

# Per-column missing-value counts, largest first, then as a share of all rows
missing = df_osm.isnull()
num_missing = missing.sum().sort_values(ascending=False)
prop_missing = num_missing.div(len(df_osm))

# Names of the columns that have some, but at most 30%, of their values missing
to_impute_osm = prop_missing.loc[lambda p: p.gt(0) & p.le(0.3)].index.tolist()

In [16]:
# For every OSMnx column selected for imputation, count how many distinct
# `external_id`s have at least one row where that column is missing.
osm_dict = {
    'column': list(to_impute_osm),
    'n': [
        df.loc[df[col].isna(), 'external_id'].nunique()
        for col in to_impute_osm
    ],
}

df_osm_nan = pd.DataFrame(osm_dict)

In [17]:
# One row per OSMnx column in `to_impute_osm`: `n` is the number of distinct
# `external_id`s (playgrounds) with at least one missing value in that column.
df_osm_nan

Unnamed: 0,column,n
0,streets_per_node_counts_4_osid,291
1,streets_per_node_proportion_4_osid,291
2,streets_per_node_counts_4,268
3,streets_per_node_counts_4_osdw,268
4,streets_per_node_proportion_4,268
...,...,...
61,m_osdw,236
62,n_osdw,236
63,streets_per_node_proportion_3,236
64,streets_per_node_proportion_2,236


In [18]:
# Isolate the `Poor_mental_health_days` column (note the capital P) as a
# one-column frame so its distribution can be charted in the next cell.
test = df[['Poor_mental_health_days']]

In [19]:
# Histogram of `Poor_mental_health_days` with at most 20 bins
alt.Chart(test).mark_bar().encode(
    x=alt.X('Poor_mental_health_days', bin=alt.Bin(maxbins=20)),
    y=alt.Y('count()'),
)

### Visualizations for the final report

In [26]:
# Load the playground statistics used for the final-report figures.
# NOTE: this rebinds `df`, replacing the proposal dataset loaded earlier in the
# notebook; every cell from here on operates on this new frame.
df = pd.read_csv('../data/playground_stats.zip')

In [27]:
# Drop the rows that are both in year 2019 and in a month after September
df = df.loc[~((df['year'] == 2019) & (df['month'] > 9))]

In [28]:
# FIGURE 1 IN REPORT

# Histogram (at most 50 bins) of the target, `unacast_session_count`
plot_1 = alt.Chart(df.loc[:, ['unacast_session_count']]).mark_bar().encode(
    x=alt.X('unacast_session_count:Q', bin=alt.Bin(maxbins=50), title='Target'),
    y=alt.Y('count()'),
)

plot_1

In [29]:
# FIGURE 2 IN REPORT

# Share of exact zeros in every column, ordered from most to least sparse
sparse = df.eq(0)
num_sparse = sparse.sum().sort_values(ascending=False)
prop_sparse = num_sparse.div(len(df))
df_prop_sparse = prop_sparse.rename_axis('col_name').reset_index(name='prop')

# Histogram of the per-column zero share (at most 10 bins)
plot_2 = alt.Chart(df_prop_sparse).mark_bar().encode(
    x=alt.X('prop:Q', bin=alt.Bin(maxbins=10), title='Proportion of zeros'),
    y=alt.Y('count()'),
)

plot_2

In [30]:
# Share of missing values in every column of the report dataset,
# ordered from most to least missing
missing = df.isnull()
num_missing = missing.sum().sort_values(ascending=False)
prop_missing = num_missing / len(df)

# Two-column table: column name (`col_name`) and its missing share (`prop`)
df_prop_missing = pd.DataFrame({
    'col_name': prop_missing.index,
    'prop': prop_missing.to_numpy(),
})

In [31]:
# FIGURE 3 IN REPORT

# Histogram of the per-column missing share (at most 10 bins)
plot_3 = alt.Chart(df_prop_missing).mark_bar().encode(
    x=alt.X('prop:Q', bin=alt.Bin(maxbins=10), title='Proportion of missing values'),
    y=alt.Y('count()'),
)

plot_3

In [32]:
# FIGURE 4 IN REPORT

# Calculate how many observations are available for each playground.
# Build the null mask from the target column only; `df.notna()[...]` computed
# a mask for every column of the frame just to keep one. This also matches how
# the Figure 5 cell filters on the same column.
not_na_count = df[df['unacast_session_count'].notna()]

# Rows with an observed target per `external_id`
not_na_count_pg = not_na_count['external_id'].value_counts()
plot_dict = {'n_available': not_na_count_pg}

# Plot how many observations are available for each playground
# (log-scaled y-axis: number of playgrounds at each observation count)
plot_4 = (alt
          .Chart(pd.DataFrame(plot_dict))
          .mark_bar()
          .encode(
              alt.X('n_available:Q', 
                    title='Number of observations available'),
              alt.Y('count()', 
                    title='Count of Records (log)',
                    scale=alt.Scale(type='log'))
          )
         )

plot_4

In [33]:
# FIGURE 5 IN REPORT

# Month/year of every row with an observed target, plus a `date` column set to
# the first day of that month
df_avail_time = (
    df.loc[df['unacast_session_count'].notna(), ['month', 'year']]
    .assign(date=lambda d: pd.to_datetime(d[['year', 'month']].assign(DAY=1)))
)

# Bar chart of available observations per month, coloured and faceted by year
plot_5 = alt.Chart(df_avail_time).mark_bar().encode(
    x=alt.X('month:O', title='Month'),
    y=alt.Y('count()'),
    color=alt.Color('year:O', title='Year'),
).facet(column='year:O')

plot_5