In [1]:
from pygris import blocks, tracts, block_groups
from libpysal import weights
import networkx as nx
import pandas as pd

In [2]:
# paths to files
# All inputs/outputs live in one directory; change `data_dir` to relocate them.
data_dir = '/share/garg/311_data/sb2377/clean_codebase'
covar_file = f'{data_dir}/tract_demographics.csv'      # input: covariates table with a GEOID column
graph_save_file = f'{data_dir}/nyc_tracts.graphml'     # output: filtered graph
df_save_file = f'{data_dir}/nyc_census_graph.csv'      # output: filtered census table

In [3]:
def generate_graph_census(census_unit='tracts',
                          state='NY',
                          counties=['New York','Bronx','Kings','Queens','Richmond'],
                          weight_scheme='rook',
                          remove_high_degree_nodes=False,
                          remove_long_edges=False,
                          remove_zeropop=False,
                          remove_parks=False,
                          tresh_degree=9,
                          tresh_edgelength=2_000,
                          tresh_parkarea=0.75,
                          tresh_population=1):
    """Build a contiguity graph over census geographies and keep its largest component.

    Parameters
    ----------
    census_unit : str
        One of 'tracts', 'blocks', 'block groups' or 'blockgroups'
        (case-insensitive); selects which pygris loader is called.
    state : str
        Passed to the pygris loader as ``state``.
    counties : list of str, str or None
        Passed to the pygris loader as ``county``.
    weight_scheme : str
        'rook' or 'queen' (case-insensitive); selects the libpysal
        contiguity weights class.
    remove_high_degree_nodes, remove_long_edges, remove_zeropop, remove_parks : bool
        When True, the matching ``trim_graph_*`` helper is applied to the
        raw graph before the largest component is extracted.
    tresh_degree, tresh_edgelength, tresh_parkarea, tresh_population
        Thresholds forwarded unchanged to the corresponding trim helper.

    Returns
    -------
    census_gdf
        Rows of ``census_gdf_raw`` selected by position for the nodes of
        the largest component, with the index reset.
    final_graph
        The largest component with nodes relabeled to consecutive integers.
        The relabeling follows ``graph.nodes()`` order, the same order used
        to select the rows of ``census_gdf``.
    census_gdf_raw
        The unfiltered geodataframe returned by pygris.
    graph
        The largest component (a copy) with its original node labels.

    Raises
    ------
    AssertionError
        If ``census_unit`` or ``weight_scheme`` is not a supported value.
    ValueError
        If the (possibly trimmed) graph has no nodes.
    """
    # Validate both selectors up front so a typo fails before the download.
    unit_key = census_unit.lower()
    loaders = {
        'tracts': tracts,
        'blocks': blocks,
        'block groups': block_groups,
        'blockgroups': block_groups,
    }
    assert unit_key in loaders, (
        f"census_unit must be one of {sorted(loaders)}, got {census_unit!r}")

    scheme_key = weight_scheme.lower()
    weight_classes = {'rook': weights.Rook, 'queen': weights.Queen}
    assert scheme_key in weight_classes, (
        f"weight_scheme must be one of {sorted(weight_classes)}, got {weight_scheme!r}")

    # Hand the loader its own list so the shared default list in the
    # signature can never be mutated downstream. Strings and None are
    # passed through untouched.
    if counties is None or isinstance(counties, str):
        county_arg = counties
    else:
        county_arg = list(counties)

    #Collect the full census geography data:
    census_gdf_raw = loaders[unit_key](state=state, county=county_arg)

    #Get the weights:
    spatial_weights = weight_classes[scheme_key].from_dataframe(
        census_gdf_raw, silence_warnings=True)

    #Convert weights to graph:
    graph_raw = spatial_weights.to_networkx()

    #Trim the graph from outliers:
    # NOTE(review): the trim_graph_* helpers are not defined in the cells
    # visible here; enabling any remove_* flag requires them to be in scope.
    if remove_high_degree_nodes:
        graph_raw = trim_graph_degree(graph_raw, tresh_degree)
    if remove_long_edges:
        graph_raw = trim_graph_edge(graph_raw, census_gdf_raw, tresh_edgelength)
    if remove_parks:
        graph_raw = trim_graph_parks(graph_raw, census_gdf_raw, tresh_parkarea)
    if remove_zeropop:
        graph_raw = trim_graph_pop(graph_raw, census_gdf_raw, tresh_population)

    #Collect the largest connected component:
    graph_largest_component = max(nx.connected_components(graph_raw),
                                  key=len, default=None)
    if graph_largest_component is None:
        raise ValueError('graph has no nodes; cannot extract a largest connected component')
    graph = graph_raw.subgraph(graph_largest_component).copy()

    #Filter the gdf:
    # Node labels are used as row positions into the raw geodataframe.
    census_gdf = census_gdf_raw.iloc[list(graph.nodes())].reset_index(drop=True)
    final_graph = nx.convert_node_labels_to_integers(graph)

    return census_gdf, final_graph, census_gdf_raw, graph

In [4]:
# load files
# Covariates table; only its 'GEOID' column is used in this notebook, to decide
# which graph nodes / census rows are kept.
# NOTE(review): presumably tract-level (file is named tract_demographics.csv) — confirm.
covariates_arr = pd.read_csv(covar_file)

In [5]:
# generate graph
# With the defaults this builds a rook-contiguity graph over census tracts for
# the five listed NY counties and keeps only the largest connected component.
# Returned, in order: the geodataframe restricted to that component (index
# reset), the component relabeled to consecutive integers, the unfiltered
# geodataframe, and the component with its original node labels.
census_gdf, final_graph, census_gdf_raw, graph = generate_graph_census()

Using the default year of 2021
Using FIPS code '36' for input 'NY'
Using FIPS code '061' for input 'New York'
Using FIPS code '005' for input 'Bronx'
Using FIPS code '047' for input 'Kings'
Using FIPS code '081' for input 'Queens'
Using FIPS code '085' for input 'Richmond'


In [6]:
# remove nodes from graph that had invalid covariate data
# NOTE: this cell is not idempotent. It mutates and then relabels `final_graph`,
# so re-running it requires re-running the graph-generation cell first.
nodes = covariates_arr['GEOID'].unique()
# Explicit 64-bit cast: 'int' is platform-dependent, and census GEOIDs
# (e.g. 36061...) are too long for a 32-bit integer.
census_gdf['GEOID'] = census_gdf['GEOID'].astype('int64')
all_node_idxs = set(census_gdf.index)
kept_node_idxs = set(census_gdf[census_gdf['GEOID'].isin(nodes)].index)
filtered_node_idxs = all_node_idxs.difference(kept_node_idxs)
final_graph.remove_nodes_from(filtered_node_idxs)

# update labeling
# Sort before numbering: set iteration order is an implementation detail, while
# the dataframe is later filtered in ascending index order. Sorting guarantees
# that new label i is the i-th kept row.
kept_node_idxs = sorted(kept_node_idxs)
mapping = {node: i for i, node in enumerate(kept_node_idxs)}
final_graph = nx.relabel_nodes(final_graph, mapping)

In [8]:
# remove nodes from df that had invalid covariate data
# Reset the index so the table carries a consecutive 0..k-1 index, the same
# label range the relabeled graph uses, instead of the gapped pre-filter index.
# Row order is unchanged, so the CSV written with index=False is unaffected.
census_gdf = census_gdf[census_gdf['GEOID'].isin(nodes)].reset_index(drop=True)

In [9]:
# Sanity check: no remaining row may contain a missing value in any column.
assert not census_gdf.isna().any(axis=1).any()

In [12]:
# Persist the filtered, relabeled graph.
# NOTE(review): GraphML stores node ids as text, so readers probably need
# nx.read_graphml(..., node_type=int) to get integer labels back — confirm.
nx.write_graphml(final_graph, graph_save_file)

In [None]:
# Save the filtered census table. The index is not written, so row order is the
# only positional information carried into the CSV.
census_gdf.to_csv(df_save_file, index=False)