In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt
import sys
import os
from utils import *

In [2]:
raw_census_data_dir = '/share/garg/gs665/networks_underreporting/d01_data/d01_raw/d03_population-data/2020/'
save_path = '/share/garg/311_data/sb2377/clean_codebase/tract_demographics.csv'

In [3]:
covariates_to_save = ['log_population_density',
                      'log_population',
                      'log_income_median',
                      'education_bachelors_pct',
                      'race_white_nh_pct',
                      'age_median',
                      'households_renteroccupied_pct',
                      'highschool_degree',
                      'college_degree',
                      'graduate_degree',
                      'non_hispanic_white',
                      'african_american',
                      'asian',
                      'neg_log_income_median',
                      'population_density',
                      'population',
                      'neg_income_median',
                      'income_median',
                     ]
covariates_to_filter_on = covariates_to_save[:-2] # income_median null value is 1

In [4]:
census_gdf, final_graph, census_gdf_raw = generate_graph_census()

Using the default year of 2021
Using FIPS code '36' for input 'NY'
Using FIPS code '061' for input 'New York'
Using FIPS code '005' for input 'Bronx'
Using FIPS code '047' for input 'Kings'
Using FIPS code '081' for input 'Queens'
Using FIPS code '085' for input 'Richmond'


In [5]:
def read_table(table, raw_census_data_dir=raw_census_data_dir):
    df = pd.read_csv(f'{raw_census_data_dir}{table}.csv', low_memory=False).iloc[1:]
    df['GEOID'] = df['GEO_ID']
    df = df.set_index(df['GEOID'].apply(lambda x: str(x)[-11:]))
    return df

In [6]:
census_covariates = pd.DataFrame()

In [7]:
# Population:
pop_table = read_table('P1')
pop_col = pop_table['P1_001N'].astype(int)

census_covariates['population'] = pop_col
census_covariates['log_population'] = np.log(pop_col)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# Income:
ACS_inc_table = read_table('S1901')
med_inc = ACS_inc_table.loc[:,'S1901_C01_012E']
med_inc_col = med_inc.replace({'-':1, '2,500-':2500, '250,000+': 250000}).astype(int)

census_covariates['income_median'] = med_inc_col
census_covariates['log_income_median'] = np.log(med_inc_col)
census_covariates['neg_log_income_median'] = -1 * census_covariates['log_income_median']
census_covariates['neg_income_median'] = -1 * census_covariates['income_median']

# Categorical income:
census_covariates['income_qt'] = pd.qcut(census_covariates['income_median'], 4, labels=['lowest', '2nd lowest', '2nd highest', 'highest'])

In [9]:
# Education:
bachelors_or_more = [f'B15003_0{n:02}E' for n in range(22, 26)]
highschool_or_more = [f'B15003_0{n:02}E' for n in range(17, 26)]
college_or_more = [f'B15003_0{n:02}E' for n in range(21, 26)]
graduate_or_more = [f'B15003_0{n:02}E' for n in range(23, 26)]

ACS_ed_table = read_table('B15003')
total_ed = ACS_ed_table.loc[:,'B15003_001E'].astype(int)

census_covariates['education_bachelors_pct'] = ACS_ed_table.loc[:,bachelors_or_more].astype(int).sum(axis=1)/total_ed
census_covariates['highschool_degree'] = ACS_ed_table.loc[:,highschool_or_more].astype(int).sum(axis=1)/total_ed
census_covariates['college_degree'] = ACS_ed_table.loc[:,college_or_more].astype(int).sum(axis=1)/total_ed
census_covariates['graduate_degree'] = ACS_ed_table.loc[:,graduate_or_more].astype(int).sum(axis=1)/total_ed

In [10]:
# Race:
race_ethnicity_table = read_table('P9')
total_ethnicity = race_ethnicity_table.loc[:,'P9_001N'].astype(int)
census_covariates['race_white_nh_pct'] = race_ethnicity_table.loc[:,'P9_005N'].astype(int)/total_ethnicity
census_covariates['non_hispanic_white'] = race_ethnicity_table.loc[:,'P9_005N'].astype(int)/total_ethnicity
census_covariates['african_american'] = race_ethnicity_table.loc[:,'P9_006N'].astype(int)/total_ethnicity
census_covariates['asian'] = race_ethnicity_table.loc[:,'P9_008N'].astype(int)/total_ethnicity

In [11]:
# Median age:
age_table = read_table('P13')
census_covariates['age_median'] = age_table.loc[:,'P13_001N'].replace({'-':0}).astype(float)

In [12]:
# Households occupied by owner and renter:
household_table = read_table('B25003')
census_covariates['households_renteroccupied_pct'] = household_table.loc[:,'B25003_003E'].astype(int)/household_table.loc[:,'B25003_001E'].astype(int)

In [13]:
# Population density:
# get area
census_covariates.reset_index(inplace=True)
census_covariates = pd.merge(census_covariates, census_gdf, on='GEOID', how='left')
census_covariates = gpd.GeoDataFrame(census_covariates, geometry='geometry').to_crs('EPSG:2263')
census_covariates['area'] = census_covariates['geometry'].area
# population density = population / area
census_covariates['population_density'] = census_covariates['population'] / census_covariates['area']
census_covariates['log_population_density'] = np.log(census_covariates['population_density'])
census_covariates.set_index('GEOID', inplace=True)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [14]:
#Some small adjustments to keep covariantes at the geographic level:
geoid_len = len(str(census_covariates.index.values[0]))
census_gdf.GEOID = census_gdf.GEOID.apply(lambda x: int(str(x)[:geoid_len]))
census_covariates['GEOID'] = census_covariates.index
census_covariates.GEOID = census_covariates.GEOID.apply(lambda x: int(str(x)[:geoid_len]))
nyc_census_tracts = census_gdf['GEOID'].unique()

#Merge:
covariates_gdf = census_covariates[census_covariates['GEOID'].isin(nyc_census_tracts)]

#Select covariates:
covariates_arr = covariates_gdf[['GEOID'] + covariates_to_save]
covariates_arr.replace([np.inf, -np.inf], np.nan, inplace=True)
covariates_arr.fillna(0, inplace=True)

# drop rows where all covariates are not specified
covariates_arr = covariates_arr[(covariates_arr[covariates_to_save] != 0).any(axis=1)]

# normalize covariates
for c in covariates_to_save:
    covariates_arr['normalized_{}'.format(c)] = (covariates_arr[c] - covariates_arr[c].mean()) / covariates_arr[c].std()
    
covariates_arr.reset_index(drop= True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covariates_arr.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covariates_arr.fillna(0, inplace=True)


In [16]:
# remove rows where all 
covariates_arr = covariates_arr[(covariates_arr[covariates_to_filter_on] != 0).any(axis=1)]

In [17]:
assert(len(covariates_arr[covariates_arr.isna().any(axis=1)]) == 0)

In [19]:
covariates_arr.to_csv(save_path)