In [None]:
# Imports and notebook-wide configuration.
import os
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import pandas as pd
import altair
import seaborn
# Fixed seed so that anything drawn from `rng` is repeatable across runs.
RANDOM_SEED = 511
rng = np.random.default_rng(RANDOM_SEED)
seaborn.set_theme(style="darkgrid")
# Select Altair's 'mimetype' renderer for chart output.
altair.renderers.enable('mimetype')
# Database connection URL is read from the environment. os.getenv returns None
# when DB_URL is unset, so the query cells below need it to be exported first.
db_url = os.getenv('DB_URL')
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Geospatial libraries (imported after the magics in this cell).
import geopandas
import geoplot

# 1. Visualizing Arrest Counts by Block

Along with police precinct locations and opioid treatment programs (OTPs).

In [None]:
## Felony arrests
# NOTE(review): neither query in this cell filters on offense severity, so
# "felony" may describe the upstream tables rather than this query - confirm.

# Earlier version of the query, kept for reference. It counted arrests per
# census block directly from `arrests`, keeping only arrests whose
# distance_to_precinct_meters exceeds 125, and filled blocks with no arrests with 0.
# sql = """
# SELECT
#     bctcb2020,
#     cdta_name,
#     borough_district_code,
#     boundary,
#     coalesce(ct_arrests, 0) as ct_arrests,
#     districts.district_name
# FROM census_blocks
#     JOIN districts USING (borough_district_code)
#     LEFT JOIN (
#         SELECT bctcb2020, count(*) AS ct_arrests
#         FROM arrests
#         WHERE distance_to_precinct_meters > 125
#         GROUP BY 1
#     ) AS a USING (bctcb2020)
# """


# Current version: one row per census block joined to its district and to the
# `arrests_by_block` table. `adjusted_arrests` is exposed as `ct_arrests`;
# `raw_arrests`, `precinct` and `near_precinct` are carried along unchanged.
# The inner JOIN drops any block that has no row in `arrests_by_block`.
sql = """
SELECT
    bctcb2020,
    cdta_name,
    borough_district_code,
    districts.district_name,
    boundary,
    adjusted_arrests as ct_arrests,
    raw_arrests,
    precinct,
    near_precinct
FROM census_blocks
    JOIN districts USING (borough_district_code)
    JOIN arrests_by_block AS a USING (bctcb2020)
"""

# Load as a GeoDataFrame: `boundary` is the geometry column and the block id
# (`bctcb2020`) is the index. Later cells reuse `df.crs` for their own loads.
df = geopandas.GeoDataFrame.from_postgis(sql, db_url, geom_col='boundary', index_col='bctcb2020')
print(df.shape)
# Transposed preview of the first three rows (one column per block).
df.head(3).T

In [None]:
# Precinct records from `nypd_precincts`: identifiers, address, coordinates and
# the `location` column used as the geometry.
precinct_loc_sql = """
SELECT
    precinct,
    precinct_name,
    full_address,
    latitude, 
    longitude,
    location
FROM nypd_precincts
"""
# Indexed by precinct. The CRS is taken from the census-block frame `df`, so
# this cell must run after the cell that defines `df`.
precincts_locs_df = geopandas.GeoDataFrame.from_postgis(
    precinct_loc_sql, db_url, 
    geom_col='location', index_col='precinct', 
    crs=df.crs
)
print(precincts_locs_df.shape)
# Transposed preview of the first three precincts.
precincts_locs_df.head(3).T


In [None]:
# Precinct records joined to `nypd_precinct_geometries` on `precinct`, with
# `boundary` as the geometry column and a constant `indicator` column equal to 1.
# NOTE(review): nothing in this notebook reads `indicator` - presumably a
# convenience column for plotting; confirm or drop.
precinect_geom_sql = """
SELECT
    precinct,
    precinct_name,
    full_address,
    boundary,
    1 as indicator
FROM nypd_precincts
    join nypd_precinct_geometries using (precinct)
"""
# Indexed by precinct. The CRS is taken from the census-block frame `df`, so
# this cell must run after the cell that defines `df`.
precincts_geom_df = geopandas.GeoDataFrame.from_postgis(
    precinect_geom_sql, db_url, 
    geom_col='boundary', index_col='precinct', 
    crs=df.crs
)
print(precincts_geom_df.shape)
# Transposed preview of the first three precincts.
precincts_geom_df.head(3).T


In [None]:
# Rows of `programs` whose category is 'Opioid Treatment Program' and that have
# a latitude. The point geometry is built in SQL from (longitude, latitude)
# with SRID 4326 and cast to GEOGRAPHY.
otp_locs_sql = """
SELECT
    program_number,
    program_name,
    _record_source,
    capacity_estimate,
    address_full,
    program_status,
    latitude, 
    longitude,
    ST_SetSRID(ST_POINT(longitude, latitude), 4326) :: GEOGRAPHY AS location
FROM programs
WHERE program_category = 'Opioid Treatment Program'
    AND latitude IS NOT NULL

"""
# Indexed by program number. The CRS is taken from `df`.
# NOTE(review): assumes df.crs corresponds to SRID 4326, matching the points
# built in the query above - confirm.
otp_locs_df = geopandas.GeoDataFrame.from_postgis(
    otp_locs_sql, db_url, 
    geom_col='location', index_col='program_number', 
    crs=df.crs
)
print(otp_locs_df.shape)
# Transposed preview of the first three programs.
otp_locs_df.head(3).T


In [None]:
# Keep only the census blocks in districts 107-112.
manhattan = df[df.borough_district_code.isin([107, 108, 109, 110, 111, 112])]

# Interactive choropleth of per-block arrest counts, with block outlines
# switched off. The result is bound to `arrests_map` rather than `map` so the
# Python builtin `map` is not shadowed for the rest of the kernel session.
arrests_map = manhattan.explore(
    column='ct_arrests',
    legend=True,
    style_kwds={'stroke': False},
)
# Overlay precinct locations as small blue markers on the same map.
arrests_map = precincts_locs_df.explore(
    m=arrests_map,
    marker_kwds={'radius': 2.5, 'color': 'blue', 'fill': True},
)
# OTP overlay, currently disabled:
# otp_locs_df.explore(m=arrests_map, marker_kwds={'radius': 2.5, 'color': 'red', 'fill': True})
arrests_map


# 2. Spreading out arrests in the radius of the police precinct.

Arrests in vicinity of precinct, unless removed, stand out on the map. Options are:
* remove arrests in vicinity of precinct (then they'll stand out as too low)
* spread those arrests around the other census blocks (CBs) in the precinct in proportion to their area
    (i.e. assume a uniform arrest density across the precinct)
* or in proportion to their existing arrest counts, after setting the police precinct CB to the mean
    * problem: precincts on boundaries of multiple census blocks.
* set any CBs *intersecting* the 100m radius of a precinct equal to the precinct mean.


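Done now: the query in section 1 reads the pre-adjusted `adjusted_arrests` column (alongside `raw_arrests`) from `arrests_by_block`.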

# 3. Arrest Trends

Arrests in the immediate vicinity (within 250 m) of 125th and Park.

In [None]:
# Monthly arrests near a fixed point (aliased `park_and_125th`), per offense
# category, for arrest years 2010 onward and four selected categories.
# "Arrests" is not the raw monthly count: it is the average of the monthly
# counts over the current row and up to three preceding rows of the same
# category. The window is row-based, and a (month, category) pair with no
# arrests produces no row, so it can span more than four calendar months.
# The distance threshold of 250 is compared against ST_Distance on a
# GEOGRAPHY point - presumably metres; confirm `arrest_location` is geography.
sql = """
with params as (
    select
        ST_SetSRID(ST_POINT(-73.93904425257058, 40.80507350925206), 4326) :: GEOGRAPHY 
            AS park_and_125th
)
select
    date_trunc('month', arrest_date) as month,
    offense_category,
    avg(count(*)) over (
        partition by offense_category 
        order by date_trunc('month', arrest_date) asc
        rows between 3 preceding and current row
    ) as "Arrests"
from 
    arrests,
    params
where 
    ST_Distance(arrest_location, park_and_125th) < 250
    and date_part('year', arrest_date) >= 2010
    and offense_category in (
        'Property', 
        'Disorder', 
        'Drugs', 
        'Major'
    )
group by 1, 2
"""

arrests_df = pd.read_sql(sql, con=db_url)
# Normalise the month column to timezone-aware (UTC) datetimes.
arrests_df["month"] = pd.to_datetime(arrests_df["month"], utc=True)
# Leftover from an earlier wide-format query: the query above returns no
# columns containing '1k' or 'Felony', so these selectors are unused.
# ct_cols = [c for c in arrests_df.columns if 'Arrests' in c and '1k' not in c and 'Felony' not in c]
# rate_cols = [c for c in arrests_df.columns if '1k' in c and 'Felony' not in c]

print(arrests_df.shape)
# Transposed preview of the first three rows.
arrests_df.head(3).T

In [None]:
# Trailing-average monthly arrests near Park & 125th, one line per offense category.
(altair.Chart(arrests_df)
    # point=True overlays a marker on every data point and lets it inherit the
    # line's colour encoding. The previous OverlayMarkDef(color='year') set a
    # literal mark colour named "year", which is neither a valid colour nor a
    # column of arrests_df.
    .mark_line(point=True)
    .encode(
        x='month:T',
        y='Arrests',
        color='offense_category',
    ).properties(
        width=800,
        height=300
    )
)

In [None]:
# Monthly 'Drugs' arrests relative to a baseline, for four regions:
#   * 'Park and 125th' - arrests within `radius_meters` of the fixed point,
#   * district 111 (labelled with its district_name),
#   * 'Upper Manhattan' - districts 109, 110 and 111,
#   * 'Manhattan' - every district code below 200.
# Each region's baseline is its average monthly arrest count over the months
# between baseline_start_date and baseline_end_date; for multi-district regions
# the per-district baselines are summed. The final select reports the raw
# count, count / baseline, and a three-row trailing average of that ratio.
#
# The date filters use `>=` so that arrests on start_date itself (2010-01-01)
# are counted; with `>` the first baseline month was undercounted, and the
# cell disagreed with the earlier query's `date_part('year', ...) >= 2010`.
#
# NOTE(review): `between` is inclusive at both ends, so the baseline also
# includes the month starting on baseline_end_date (2014-01) - confirm intended.
# NOTE(review): the CTE name `pre2020_baseline` does not match the 2010-2014
# baseline window configured in `params`.
sql = """
with
params as (
    select
        ST_SetSRID(ST_POINT(-73.93904425257058, 40.80507350925206), 4326) :: GEOGRAPHY AS park_and_125th,
        250 as radius_meters,
        '2010-01-01'::date as start_date,
        '2010-01-01'::date as baseline_start_date,
        '2014-01-01'::date as baseline_end_date,
        'Drugs' as arrest_category

), arrests_by_district_month as (
    select
        borough_district_code,
        date_trunc('month', arrest_date) as month,
        count(*) as arrests
    from arrests, params
    where offense_category = arrest_category
        and arrest_date >= start_date
    group by 1, 2
),
park_and_125th_arrests as (
    select
        date_trunc('month', arrest_date) as month,
        count(*) as arrests
    from
        arrests,
        params
    where
        ST_Distance(arrest_location, park_and_125th) < radius_meters
        and arrest_date >= start_date
        and offense_category = arrest_category
    group by 1
),
park_and_125th_baseline as (
    select
        avg(arrests) as arrest_baseline
    from park_and_125th_arrests,
        params
    where month between baseline_start_date and baseline_end_date
),
pre2020_baseline as (
    select
        borough_district_code,
        avg(arrests) as arrest_baseline
    from arrests_by_district_month,
        params
    where month between baseline_start_date and baseline_end_date
    group by 1
), all_regions as (
    select
        month,
        'Park and 125th' as region,
        arrests,
        arrest_baseline as baseline
    from park_and_125th_arrests, park_and_125th_baseline

    union all

    select
        month,
        districts.district_name as region,
        sum(arrests) as arrests,
        sum(arrest_baseline) as baseline
    from arrests_by_district_month
        join pre2020_baseline using (borough_district_code)
        join districts using (borough_district_code)
    where
        borough_district_code = 111 /*in (109, 110, 111)*/
    group by 1, 2

    union all


    select
        month,
        'Upper Manhattan' as region,
        sum(arrests) as arrests,
        sum(arrest_baseline) as baseline
    from arrests_by_district_month
        join pre2020_baseline using (borough_district_code)
    where
        borough_district_code in (109, 110, 111)
    group by 1, 2

    union all

    select
        month,
        'Manhattan' as region,
        sum(arrests) as arrests,
        sum(arrest_baseline) as baseline
    from arrests_by_district_month
        join pre2020_baseline using (borough_district_code)
    where
        borough_district_code < 200
    group by 1, 2
)

select
    month,
    region,
    arrests,
    arrests::float/baseline as "Arrests over Baseline",
    avg(arrests::float/baseline) over (partition by region order by month rows between 2 preceding and current row)
        as "Trailing Avg Arrests over Baseline"
from all_regions
"""

arrests_by_district_df = pd.read_sql(sql, con=db_url)
# Normalise the month column to timezone-aware (UTC) datetimes.
arrests_by_district_df["month"] = pd.to_datetime(arrests_by_district_df["month"], utc=True)

print(arrests_by_district_df.shape)
# Transposed preview of the first three rows.
arrests_by_district_df.head(3).T

In [None]:
# Trailing-average arrests-over-baseline ratio by month, one line per region.
(altair.Chart(arrests_by_district_df)
    # point=True overlays a marker on every data point and lets it inherit the
    # line's colour encoding. The previous OverlayMarkDef(color='year') set a
    # literal mark colour named "year", which is neither a valid colour nor a
    # column of arrests_by_district_df.
    .mark_line(point=True)
    .encode(
        x='month:T',
        y="Trailing Avg Arrests over Baseline",
        color='region',
    ).properties(
        width=800,
        height=300
    )
)

# 4. Arrests Along 125th St.

Could do a better version of this by selecting for arrests in this group 
but then plotting vs longitude...

In [None]:

# Yearly 'Drugs' arrest counts per avenue segment along 125th St.
#   * `avenue_areas` sums the area of the blocks in `blocks_along_125` whose
#     south_edge is 124 or 125, per avenue_number / east_edge ("Avenue").
#   * `arrests_by_avenue_month` counts arrests per avenue_number and year
#     (despite the `_month` suffix it is grouped by year), restricted to
#     arrest months 1-9.
#     NOTE(review): presumably the month filter keeps years comparable with a
#     partial final year - confirm.
#   * "Arrest Density" divides each count by the segment's area expressed as a
#     fraction of the largest segment area in the joined result.
sql = """
with avenue_areas as (
    select
        avenue_number,
        east_edge as "Avenue",
        sum(ST_Area(boundary)) as area
    from blocks_along_125
        join census_blocks using (bctcb2020)
    where south_edge in (124, 125)
    group by 1, 2
        
), arrests_by_avenue_month as (
    select
        b.avenue_number,
        date_trunc('year', arrest_date) as year,
        count(*) as arrests
    from arrests
        join blocks_along_125 as b using (bctcb2020)
    where b.south_edge in (124, 125)
        and offense_category = 'Drugs'
        and date_part('month', arrest_date) <= 9
    group by 1, 2
)
select
    avenue_number,
    "Avenue",
    year,
    arrests,
    arrests::float / (area / max(area) over ()) as "Arrest Density"
from arrests_by_avenue_month
    join avenue_areas using (avenue_number)
"""

arrest_125_df = pd.read_sql(sql, con=db_url)
# Replace the year-start timestamp returned by date_trunc with the integer year.
arrest_125_df["year"] = pd.to_datetime(arrest_125_df ["year"], utc=True).dt.year
print(arrest_125_df.shape)
# Transposed preview of the first three rows.
arrest_125_df.head(3).T


In [None]:
# Arrest density by avenue for a handful of comparison years, with each
# avenue's name drawn as a vertical text label.
data = arrest_125_df.loc[arrest_125_df["year"].isin([2010, 2015, 2021, 2022])]

# One line per year; avenue_number on x, density on y.
lines = (
    altair.Chart(data)
    .mark_line()
    .encode(x='avenue_number', y="Arrest Density", color='year:N')
    .properties(width=800, height=300)
)

# Avenue names rotated 90 degrees and pinned at the constant y value 120.
# NOTE(review): white text is only legible on a dark chart background -
# confirm the theme in use.
avenues = (
    altair.Chart(data)
    .mark_text(baseline='bottom', color='white', angle=90)
    .encode(
        x="avenue_number",
        text="Avenue",
        y=altair.datum(120),
    )
)

# Layer the labels on top of the lines.
lines + avenues

In [None]:
# Raw arrest counts per avenue as bars, one facet column per selected year.
data = arrest_125_df.loc[arrest_125_df["year"].isin([2010, 2015, 2019, 2021, 2022])]

altair.Chart(data).mark_bar().encode(
    # Avenues are ordered by descending avenue_number rather than by name.
    x=altair.X(
        'Avenue',
        sort=altair.EncodingSortField(field='avenue_number', order='descending'),
    ),
    y="arrests",
    column='year:N',
)


What about a density plot by year?

In [None]:
# Position of each 'Drugs' arrest along 125th St.
#   * `vector_of_125th` builds a unit vector from (lat0, lon0) to (lat1, lon1);
#     `unit_x` is its latitude component and `unit_y` its longitude component.
#   * `arrests_by_avenue_month` returns one row per arrest (it does not
#     aggregate, despite its name) in the blocks along 125th with south_edge
#     124 or 125 and arrest month 1-9, with the arrest's lat/lon.
#   * `distance_along_125th_st` is the scalar projection of the arrest's offset
#     from (lat0, lon0) onto the unit vector, i.e. the dot product
#         (lat - lat0) * unit_x + (lon - lon0) * unit_y
#     The terms must be ADDED; multiplying them (as this cell did before)
#     yields a value that is not a position along the street.
# NOTE(review): the projection is computed on raw degrees, so the result is in
# degree-like units and treats a degree of latitude and of longitude as equal
# lengths. Scale the longitude terms (or project to a metric CRS) if true
# distances are needed.
sql = """
with vector_of_125th as (
    select
        lat0,
        lon0,
        (lat1 - lat0)
            / sqrt((lat1 - lat0) * (lat1 - lat0) + (lon1 - lon0) * (lon1 - lon0))
            as unit_x,
        (lon1 - lon0)
            / sqrt((lat1 - lat0) * (lat1 - lat0) + (lon1 - lon0) * (lon1 - lon0))
            as unit_y
    from (
        select
            40.801739 as lat0,
            -73.93122 as lon0,
            40.811344 as lat1,
            -73.95399 as lon1
    ) pts
),
arrests_by_avenue_month as (
    select
        date_part('year', arrest_date) as year,
        offense_category,
        avenue_number,
        ST_Y(arrest_location::geometry) as lat,
        ST_X(arrest_location::geometry) as lon,
        bctcb2020,
        east_edge,
        south_edge
    from arrests
        join blocks_along_125 as b using (bctcb2020)
    where b.south_edge in (124, 125)
        and date_part('month', arrest_date) <= 9
)
select
    (lat - lat0) * unit_x + (lon - lon0) * unit_y
        as distance_along_125th_st,
    offense_category,
    year
from arrests_by_avenue_month,
    vector_of_125th
where
    offense_category = 'Drugs'
"""

arrest_125_df2 = pd.read_sql(sql, con=db_url)
print(arrest_125_df2.shape)
# Transposed preview of the first three rows.
arrest_125_df2.head(3).T

In [None]:
# Kernel density estimate of arrest position along 125th St, one curve per year.
data = arrest_125_df2.loc[arrest_125_df2["year"].isin([2010, 2015, 2019, 2021, 2022])]

lines = (
    altair.Chart(data)
    .mark_line()
    # Estimate a separate density for each year; the transform emits the
    # evaluation points under the original field name and the estimated
    # values under 'arrests'.
    .transform_density(
        'distance_along_125th_st',
        groupby=['year'],
        as_=['distance_along_125th_st', 'arrests'],
    )
    .encode(x='distance_along_125th_st:Q', y='arrests:Q', color='year:N')
    .properties(width=800, height=300)
)

lines

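I *think* these are normalized correctly, but I'm not sure. (By default the density transform estimates a probability density for each year, so each curve integrates to 1: the plot compares *where* along the street arrests occur, not how many there were.)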