In [111]:
import pandas as pd
from pathlib import Path
import numpy as np
import os


In [112]:
# Resolve paths relative to the directory the notebook is run from;
# the raw NETS extract lives in the sibling "data" directory.
root_path = Path.cwd()
data_dir = root_path.parent / "data"
data_path = data_dir / "recvd_net_vars_v7_20180829.csv"

In [113]:
# Peek at the first line of the raw CSV only, so the available column
# names can be listed without loading the whole file.
with open(data_path, "r") as header_file:
    header_line = header_file.readline()

cols = header_line.strip().split(",")
cols

['adr_net_behid_u_2014',
 'adr_net_dunsnumber_x_2014',
 'adr_net_behloc_x_2014',
 'adr_net_firstyear_x_2014',
 'adr_net_lastyear_x_2014',
 'adr_net_behsic_x_2014',
 'adr_net_company_x_2014',
 'adr_net_tradename_x_2014',
 'adr_net_adl_c_2014',
 'adr_net_adp_c_2014',
 'adr_net_edu_c_2014',
 'adr_net_med_c_2014',
 'adr_net_pav_c_2014',
 'adr_net_pwd_c_2014',
 'adr_net_piz_c_2014',
 'adr_net_bkn_c_2014',
 'adr_net_eat_c_2014',
 'adr_net_bks_c_2014',
 'adr_net_met_c_2014',
 'adr_net_fvm_c_2014',
 'adr_net_nat_c_2014',
 'adr_net_fsh_c_2014',
 'adr_net_cnv_c_2014',
 'adr_net_bds_c_2014',
 'adr_net_smk_c_2014',
 'adr_net_gry_c_2014',
 'adr_net_bar_c_2014',
 'adr_net_liq_c_2014',
 'adr_net_urg_c_2014',
 'adr_net_hpc_c_2014',
 'adr_net_res_c_2014',
 'adr_net_dds_c_2014',
 'adr_net_mul_c_2014',
 'adr_net_vpa_c_2014',
 'adr_net_mpa_c_2014',
 'adr_net_bnk_c_2014',
 'adr_net_crd_c_2014',
 'adr_net_des_c_2014',
 'adr_net_nut_c_2014',
 'adr_net_beu_c_2014',
 'adr_net_lib_c_2014',
 'adr_net_rel_c_2014'

## We need the following variables:

### Identifying characteristics
- adr_net_behid_u_2014
- adr_net_dunsnumber_x_2014
- adr_net_behloc_x_2014

### Time characteristics
- adr_net_firstyear_x_2014
- adr_net_lastyear_x_2014

### NETS categories
- Medical Neighborhoods - adr_net_acth_c_2014 - ACT (All clinical treatment)
- Walking Destinations Neighborhoods - adr_net_walh_c_2014 - WAL (walkability)
- Fast food Neighborhoods - adr_net_ffah_c_2014 - FFA (all fast food)

### Geographic characteristics
- c10_cen_uid_u_2010
- m10_cen_uid_u_2010

Only records within the Philadelphia CBSA (code 37980) are kept.

# Reading / Tidying

In [114]:
# Columns to load from the raw extract: identifiers, first/last year, the
# three NETS category columns, and the two census geography ids.
# The geography id was previously listed here as "t10_cen_uid_u_2010", but the
# variable list in the markdown above and every later cell use
# "c10_cen_uid_u_2010", so the later cells failed on a fresh run.
cols = [
    "adr_net_dunsnumber_x_2014",
    "adr_net_behid_u_2014",
    "adr_net_firstyear_x_2014",
    "adr_net_lastyear_x_2014",
    "adr_net_acth_c_2014",
    "adr_net_walh_c_2014",
    "adr_net_ffah_c_2014",
    "c10_cen_uid_u_2010",
    "m10_cen_uid_u_2010",
]

# CBSA code used to select the Philadelphia rows (compared as text).
PHILLY_CBSA = "37980"

# Location of the cached Philadelphia-only subset.
philly_cache_path = root_path.parent / "data" / "nets_philly_ACT_FFA_WAL.csv"

# Lazy, chunked reader over the full raw file (10**6 rows per chunk).
# The CBSA column is read as text so the comparison against PHILLY_CBSA does
# not depend on the dtype pandas would otherwise infer chunk by chunk.
df = pd.read_csv(
    data_path,
    chunksize=10**6,
    usecols=cols,
    dtype={"m10_cen_uid_u_2010": str},
)


try:
    df_philly = pd.read_csv(philly_cache_path, usecols=cols)

# If we don't already have it, make it and write it to disk
except IOError:
    # Filter each chunk, then concatenate once: DataFrame.append inside a loop
    # re-copies the accumulated frame on every iteration and no longer exists
    # in pandas >= 2.0.
    philly_chunks = [
        chunk[chunk["m10_cen_uid_u_2010"] == PHILLY_CBSA]
        for chunk in df
    ]
    df_philly = pd.concat(philly_chunks)

    df_philly.to_csv(philly_cache_path)
   

In [109]:
# Number of rows in the Philadelphia subset.
df_philly.shape[0]

1464683

In [110]:
# Distinct c10 geography ids present in the Philadelphia subset.
df_philly["c10_cen_uid_u_2010"].unique()

array([42017, 42101, 34007, 10003, 34015, 42045, 24015, 42029, 42091,
       34005, 34033], dtype=int64)

In [93]:
# Collapse to one row per (DUNS number, c10 geography id), keeping the
# earliest first-year and the latest last-year seen for that pair.
# Columns are renamed by name rather than by position: the old
# set_axis(..., inplace=False) call raises TypeError on pandas >= 2.0 and
# silently depended on the ordering of the aggregation dict.
grouped_census = (
    df_philly
    .groupby(["adr_net_dunsnumber_x_2014", "c10_cen_uid_u_2010"])
    .agg({
        "adr_net_firstyear_x_2014": "min",
        "adr_net_lastyear_x_2014": "max",
    })
    .rename(columns={
        "adr_net_firstyear_x_2014": "enter_year",
        "adr_net_lastyear_x_2014": "exit_year",
    })
)

grouped_census.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,enter_year,exit_year
adr_net_dunsnumber_x_2014,c10_cen_uid_u_2010,Unnamed: 2_level_1,Unnamed: 3_level_1
1000731,42017,2001,2006
1000798,42017,2001,2002
1000822,42101,2001,2005
1000863,42101,2007,2014
1000921,42101,2001,2014


In [101]:
# Number of (DUNS number, c10 geography id) pairs.
grouped_census.shape[0]

1163170

In [94]:
def _count_rows_per_geo_year(spells, year_col):
    """Count rows of ``spells`` per (c10 geography id, ``year_col``) pair.

    The result is a Series named ``year_col`` whose two index levels are
    labelled "c10_cen_uid_u_2010" and "year".
    """
    flat = spells.reset_index(drop=False)
    counts = flat.groupby(["c10_cen_uid_u_2010", year_col]).size()
    counts = counts.rename_axis(["c10_cen_uid_u_2010", "year"], axis="index")
    return counts.rename(year_col)


# Same tally for both ends of each row's year span.
enter_year = _count_rows_per_geo_year(grouped_census, "enter_year")
exit_year = _count_rows_per_geo_year(grouped_census, "exit_year")

In [103]:
# Number of (c10 geography id, year) combinations with at least one exit.
exit_year.shape[0]

275

In [95]:
# Put the enter and exit counts side by side, then turn the integer year
# level into a timestamp so the index can be resampled by calendar period.
yearly_counts = pd.concat([enter_year, exit_year], axis=1).reset_index(drop=False)
yearly_counts["year"] = pd.to_datetime(yearly_counts["year"], format="%Y")
df_all = yearly_counts.set_index(["c10_cen_uid_u_2010", "year"])

In [100]:
# Sum the enter/exit counts into 10-year bins within each c10 geography id.
# NOTE(review): the bin edges appear to be derived from each group's own
# earliest year, so groups may not share the same decade boundaries - confirm.
ten_yr = (
    df_all
    .groupby(level=0)
    .resample("10Y", level=1)
    .sum()
)

# Show every geography id, restricted to bins labelled within 1990-2014.
idx = pd.IndexSlice
ten_yr.loc[idx[:, "1/1/1990":"12/31/2014"], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,enter_year,exit_year
c10_cen_uid_u_2010,year,Unnamed: 2_level_1,Unnamed: 3_level_1
10003,1990-12-31,15956,683
10003,2000-12-31,21077,14699
10003,2010-12-31,39007,25005
24015,1990-12-31,2231,73
24015,2000-12-31,2870,1715
24015,2010-12-31,6394,3715
34005,1990-12-31,12997,618
34005,2000-12-31,17507,12141
34005,2010-12-31,27714,19255
34007,1990-12-31,17794,993


##### groupby