In [2]:
import os
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import pandas as pd
import altair
import seaborn
import folium
from folium import plugins
from IPython.display import display
RANDOM_SEED = 511
rng = np.random.default_rng(RANDOM_SEED)
seaborn.set_theme(style="darkgrid")
altair.renderers.enable('mimetype')
db_url = os.getenv('DB_URL')
%load_ext autoreload
%autoreload 2
import geopandas
import geoplot

# 1. QAing District Geometries
* us congress
* ny state senate
* ny state assembly
* nyc city council

and also:
* nyc cd (maybe)
* nyc zip code

Note: districts boundaries mostly include water areas but zip_codes don't. 
Community districts ARE so we can't do any fractional-part things accurately.

In [20]:
# sql = """
# SELECT 
#     office,
#     district as d,
#     name,
#     party,
#     boundary
# FROM representatives
# WHERE office =
#     'House'
#     --'State Senate'
#     --'State Assembly'
#     --'City Council'
# """

# sql = """
#     select 
#         zip_code as d, boundary, neighborhood_name
#     from zip_codes
#     where boundary is not null
# """

# sql = """
#     select 
#         borough_district_code as d, boundary, community_board_name
#     from community_districts
#     where boundary is not null
# """


sql = """
    select 
        uhf_code as d, boundary, uhf_name
    from uhfs
    where boundary is not null
"""


df = geopandas.GeoDataFrame.from_postgis(sql, db_url, geom_col='boundary')
# df['district'] = pd.Categorical(df['district'])
print(df.shape)
# print(df.isnull().sum())
display(df.head(3).T)

df.explore(
    legend=True,
    column='d',
    categorical=True,
    style_kwds={'stroke': False}
)

(42, 3)


Unnamed: 0,0,1,2
d,403,410,302
boundary,MULTIPOLYGON Z (((-73.83935495462532 40.765056...,MULTIPOLYGON Z (((-73.76701648957598 40.614217...,POLYGON Z ((-73.93431460013473 40.824307119457...
uhf_name,Flushing - Clearview,Rockaway,Central Harlem - Morningside Heights


# 2. Aggregate Statistics by District



For each representative (council, senate, assembly, house):
* for their district
+ for Harlem (or East Harlem alone)
+ for each borough
+ for NYC as a whole

* Population + % of total
* OTP patients + % of total + % of population + rank in city
    * by zip-code, estimated
* OTP admissions + % of total + % of population + rank in city
    * need to use admissions to compare to patients
    * can geolocate by boundary
* OTP patients / OTP admissions
* OTP enrollment (or capacity) + % of total + % of population + rank in city
    * can geolocate by boundary
* Mean travel time and fraction
    * skip for now.
* ODs + % of total + % of population + rank in city?
    * by UHF--also need to estimate.

* drug arrests?
* felony arrests?

since ODs are by UHF and a lot of these are by zip code
we have to use overlaps with the districts... let's also measure:
* (district area) / (union of district area with everything it intersects with)
to see if we should throw some of these statistics out.



## QA

In [98]:

sql = """
select * from  public.representative_districts_analysis
where district_type = 'State Senate' order by patient_admissions_3_2019
"""
df = pd.read_sql(sql, con=db_url)
print(df.shape)
df.head(3).T

(28, 16)


Unnamed: 0,0,1,2
district_type,State Senate,State Senate,State Senate
district,13,16,21
patient_admissions_3_2019,87.630766,150.037249,179.776404
harlem_patient_admissions_3_2019,17.821945,20.686073,14.663183
opioid_burden_2019,220.44356,329.043648,427.187821
overdose_deaths_2020,19.440966,34.186137,30.868564
ct_otp_programs,1,0,1
total_admissions_3_2019,102.0,0.0,100.0
total_admissions_3_2019_from_zip,44.896537,129.568282,153.735795
otp_capacity,300,0,400


In [105]:
df['population_err_zip'] = (df['pop_from_zip'] - df['population_2020']) / df['population_2020']
df['population_err_uhf'] = (df['pop_from_uhf'] - df['population_2020']) / df['population_2020']
df['total_admissions_3_2019_err'] = (df['total_admissions_3_2019_from_zip'] - df['total_admissions_3_2019'] / df['total_admissions_3_2019']).replace(0, np.nan)
df['otp_capacity_err'] = (df['otp_capacity_from_zip'] - df['otp_capacity']) / df['otp_capacity'].replace(0, np.nan)
df['avg_daily_enrollment_2019_err'] = (df['avg_daily_enrollment_2019_from_zip'] - df['avg_daily_enrollment_2019']) / df['avg_daily_enrollment_2019'].replace(0, np.nan)
measures = [
    # 'population_2020',
    # 'pop_from_zip',
    # 'pop_from_uhf',
    # 'total_admissions_3_2019_from_zip',
    # 'total_admissions_3_2019',
    # 'otp_capacity_from_zip',
    # 'otp_capacity',
    # 'avg_daily_enrollment_2019_from_zip',
    # 'avg_daily_enrollment_2019'

    'population_err_zip',
    'population_err_uhf',
    'total_admissions_3_2019_err',
    'otp_capacity_err',
    'avg_daily_enrollment_2019_err'
]
df[measures]

# import scipy
# def regress(x, y):
#     data = df[~df[x].isnull() & ~df[y].isnull()]
#     r, p = scipy.stats.pearsonr(data[x], data[y])
#     return r, p
# r_df = pd.DataFrame([(measures[i], measures[j], *regress(measures[i], measures[j])) for i in range(len(measures)) for j in range(i)], 
#                     columns=['x', 'y', 'r', 'p'])
# r_df
# p = seaborn.pairplot(
#     df[measures],
#     corner=True
# )

Unnamed: 0,population_err_zip,population_err_uhf,total_admissions_3_2019_err,otp_capacity_err,avg_daily_enrollment_2019_err
0,-0.312533,-0.427097,43.896537,-0.559838,-0.559838
1,0.037482,-0.179974,,,
2,-0.19546,-0.363579,152.735795,0.47493,0.238503
3,-0.237526,-0.267486,,,
4,0.035721,0.017001,,,
5,0.043816,0.178438,386.989379,-0.297461,-0.287563
6,-0.077674,0.081412,,,
7,-0.289115,-0.256382,,,
8,-0.070283,-0.12699,,,
9,0.000142,-0.097086,520.56731,-0.06507,-0.073047
