In [18]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [15]:
# Lift Altair's row limit on chart datasets; later cells chart columns taken
# straight from df, which has tens of thousands of rows.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [16]:
# Load the training data from a zipped CSV (path is relative to the notebook's
# working directory).
df = pd.read_csv('../data/train_data.zip')

In [17]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1807558,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,37.0,418.67,0.116368,0.025516,0.090852
1,1806729,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,63.0,710.0,0.175774,0.02017,0.155604
2,MR00096403,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,15.0,241.24,0.112038,0.006893,0.105145
3,1806350,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,45.0,468.51,0.120088,0.003619,0.116469
4,FM00169477,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,83.0,326.05,0.132935,0.025298,0.107637


In [5]:
# Proportion (0-1) of missing values per column, most-incomplete columns first
missing = df.isna()
num_missing = missing.sum().sort_values(ascending=False)
prop_missing = num_missing / len(df)

# Two-column frame: `col_name` (column label) and `prop` (share missing)
df_prop_missing = (
    prop_missing
    .rename('prop')
    .rename_axis('col_name')
    .reset_index()
)

In [6]:
# Columns with less than 20% of their values missing
df_20 = df_prop_missing.loc[df_prop_missing['prop'] < 0.2]
len(df_20)

662

In [7]:
# Columns with 20% (inclusive) to 40% (exclusive) of their values missing
df_20_40 = df_prop_missing.loc[
    (df_prop_missing['prop'] >= 0.2) & (df_prop_missing['prop'] < 0.4)
]
len(df_20_40)

19

In [8]:
# Columns with 40% (inclusive) to 60% (exclusive) of their values missing
df_40_60 = df_prop_missing.loc[
    (df_prop_missing['prop'] >= 0.4) & (df_prop_missing['prop'] < 0.6)
]
len(df_40_60)

31

In [9]:
# Columns with 60% (inclusive) to 80% (exclusive) of their values missing
df_60_80 = df_prop_missing.loc[
    (df_prop_missing['prop'] >= 0.6) & (df_prop_missing['prop'] < 0.8)
]
len(df_60_80)

37

In [10]:
# Columns with at least 80% of their values missing
df_80_100 = df_prop_missing.loc[df_prop_missing['prop'] >= 0.8]
len(df_80_100)

112

In [13]:
# Proportion (0-1) of missing values per column, as a one-column frame
nan_rate = (df.isna().sum(axis=0) / df.shape[0]).to_frame(name="Nan_rate")

# Upper edges of the 20%-wide bins
vals = [0.2, 0.4, 0.6, 0.8, 1]

# Count columns per bin. np.histogram makes every bin half-open [lo, hi) except
# the last, which is closed ([0.8, 1.0]). This matches the `query` cells above
# and guarantees a column sitting exactly on an edge (e.g. a rate of 0.2) is
# counted once; the previous `< 0.2` / `> 0.2` pair silently dropped it.
count_list = np.histogram(nan_rate["Nan_rate"], bins=[0] + vals)[0].tolist()

count_list

[662, 19, 31, 37, 112]

In [15]:
# Every column of df must fall in exactly one bin, so the bin counts have to
# add up to the number of columns. Assert it instead of eyeballing the print.
print(count_list, sum(count_list))
assert sum(count_list) == df.shape[1], (
    f"bins cover {sum(count_list)} of {df.shape[1]} columns"
)

[662, 19, 31, 37, 112] 861


In [16]:
# Histogram of the per-column missing-value share, using at most 10 bins
# (maxbins is an upper bound, not an exact bin count).
# Nan_rate holds fractions in [0, 1]; format the axis ticks as percentages so
# they agree with the "% NaN" axis title.
alt.Chart(nan_rate).mark_bar().encode(
    x=alt.X(
        "Nan_rate:Q",
        bin=alt.Bin(maxbins=10),
        axis=alt.Axis(format=".0%"),
        title="% NaN in column",
    ),
    y=alt.Y("count()", title="Count"),
).properties(
    title='Number of columns by percentage of missing values'
)

In [17]:
# Histogram of the per-column missing-value share, using at most 5 bins
# (maxbins is an upper bound, not an exact bin count).
# Nan_rate holds fractions in [0, 1]; format the axis ticks as percentages so
# they agree with the "% NaN" axis title.
alt.Chart(nan_rate).mark_bar().encode(
    x=alt.X(
        "Nan_rate:Q",
        bin=alt.Bin(maxbins=5),
        axis=alt.Axis(format=".0%"),
        title="% NaN in column",
    ),
    y=alt.Y("count()", title="Count"),
).properties(
    title='Number of columns by percentage of missing values'
)

# EDA on how to impute `NaN` values

In [227]:
# Restrict to the OSMnx columns: a label slice from 'n' through
# 'streets_per_node_proportion_7_osid'
df_osm = df.loc[:, 'n':'streets_per_node_proportion_7_osid']
missing = df_osm.isna()

# Per-column count of missing values, largest first
num_missing = missing.sum().sort_values(ascending=False)

# Per-column proportion of missing values
prop_missing = num_missing / len(df_osm)

# Columns that have at least one missing value but no more than 30% missing
to_impute_osm = (
    prop_missing
    .loc[lambda share: (share > 0) & (share <= 0.3)]
    .index
    .to_list()
)

In [228]:
# For every OSMnx column selected for imputation, count how many distinct
# `external_id` values have at least one row where that column is missing
osm_dict = {
    'column': list(to_impute_osm),
    'n': [
        df.loc[df[name].isnull(), 'external_id'].nunique()
        for name in to_impute_osm
    ],
}

df_osm_nan = pd.DataFrame(osm_dict)

In [229]:
# Per OSMnx column: number of distinct `external_id` values that have the
# column missing (external_id presumably identifies a playground - confirm)
df_osm_nan

Unnamed: 0,column,n
0,streets_per_node_counts_4_osid,123
1,streets_per_node_proportion_4_osid,123
2,streets_per_node_counts_4,96
3,streets_per_node_counts_4_osdw,96
4,streets_per_node_proportion_4,96
...,...,...
61,m_osdw,61
62,n_osdw,61
63,streets_per_node_proportion_3,61
64,streets_per_node_proportion_2,61


In [292]:
# Single-column frame holding only `Poor_mental_health_days`, for charting
test = df.loc[:, ['Poor_mental_health_days']]

In [293]:
# Histogram of `Poor_mental_health_days` using at most 20 bins
alt.Chart(test).mark_bar().encode(
    x=alt.X('Poor_mental_health_days', bin=alt.Bin(maxbins=20)),
    y=alt.Y('count()'),
)

In [23]:
# Inspect the raw `Democrats_08_Votes` column
df['Democrats_08_Votes']

0        105656.0
1        648230.0
2          7421.0
3        205984.0
4         62755.0
           ...   
50115    422989.0
50116     65096.0
50117    648230.0
50118    660085.0
50119    346549.0
Name: Democrats_08_Votes, Length: 50120, dtype: float64