In [11]:
import pandas as pd
import numpy as np
import feather
import pickle
import re
import sqlite3
import geopandas as gpd
import altair as alt

# optional libs to run other non-core code
from polyfuzz import PolyFuzz
from polyfuzz.models import EditDistance, TFIDF, Embeddings
from flair.embeddings import TransformerWordEmbeddings

# note pandarallel works well on mac but has issue with windows
# see requirements for windows  - https://github.com/nalepae/pandarallel
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

pd.options.display.max_columns = None
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# connect to the database
# note: connects to/creates a db file with the name in the quotes if does not exist
con = sqlite3.connect('streetsofnyc.db')
cur = con.cursor()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
query='''
Select Count(`Summons Number`),L_CD, cd_short_title,poverty_rate,unemployment,pct_non_white_nh,pct_foreign_born  
FROM ticketstreetdem
GROUP BY L_CD
HAVING `ISSUE DATE` LIKE '%2020%'
'''

test=pd.read_sql_query(query,con)

In [14]:
df=test[~test['poverty_rate'].isnull()].copy()
df['Boro']=df['cd_short_title'].apply(lambda x: x.split()[0])

In [15]:
df

Unnamed: 0,Count(`Summons Number`),L_CD,cd_short_title,poverty_rate,unemployment,pct_non_white_nh,pct_foreign_born,Boro
1,377901,101,Manhattan CD 1,8.8,2.8,27.8,23.3,Manhattan
2,694262,102,Manhattan CD 2,8.8,2.8,27.8,23.3,Manhattan
3,480288,103,Manhattan CD 3,19.3,3.6,66.7,34.5,Manhattan
4,697508,104,Manhattan CD 4,11.3,3.5,40.9,30.8,Manhattan
5,1328251,105,Manhattan CD 5,11.3,3.5,40.9,30.8,Manhattan
6,548687,106,Manhattan CD 6,9.8,2.5,30.3,23.2,Manhattan
7,699726,107,Manhattan CD 7,9.2,3.3,32.0,22.1,Manhattan
8,843079,108,Manhattan CD 8,7.2,2.0,24.7,23.6,Manhattan
9,248383,109,Manhattan CD 9,20.7,3.8,72.4,34.6,Manhattan
10,325736,110,Manhattan CD 10,20.2,6.2,85.8,23.4,Manhattan


In [35]:
bar = alt.Chart(df).mark_bar().encode(
    x='Count(`Summons Number`)',
    y='L_CD',
    color='Boro'
).properties(
    height=500,
    width=100
)
bar

In [40]:
a=alt.Chart(df).mark_point().encode(
    x='pct_non_white_nh',
    y='poverty_rate',
    color='Boro',
    tooltip='L_CD'
).properties(
    height=500,
    width=500
)

In [41]:
a|bar

In [25]:
b=alt.Chart(df).mark_point().encode(
    x='Count(`Summons Number`)',
    y='unemployment',
    color='Boro',
    tooltip='L_CD'
)

In [26]:
c=alt.Chart(df).mark_point().encode(
    x='Count(`Summons Number`)',
    y='pct_non_white_nh',
    color='Boro',
    tooltip='L_CD'
)

In [27]:
d=alt.Chart(df).mark_point().encode(
    x='Count(`Summons Number`)',
    y='pct_foreign_born',
    color='Boro',
    tooltip='L_CD'
)

In [48]:
query='''
Select `Violation Code`, `Clean Violation Des`,Count(`Summons Number`) AS No_of_Tix,RW_Type,FeatureTyp 
FROM ticketstreetdem
GROUP BY RW_Type,FeatureTyp,`Violation Code`
HAVING `ISSUE DATE` LIKE '%2020%'
ORDER BY Count(`Summons Number`) DESC
'''

result=pd.read_sql_query(query,con)

In [51]:
result.head(50)

Unnamed: 0,Violation Code,Clean Violation Des,No_of_Tix,RW_TYPE,FeatureTyp
0,21,NO PARKING-STREET CLEANING,3815119,1,0
1,38,FAIL TO DSPLY MUNI METER RECPT,3016343,1,0
2,14,NO STANDING-DAY/TIME LIMITS,1923249,1,0
3,20,NO PARKING-DAY/TIME LIMITS,1517304,1,0
4,40,FIRE HYDRANT,1192211,1,0
5,46,DOUBLE PARKING,1174029,1,0
6,37,EXPIRED MUNI METER,1076130,1,0
7,71,INSP. STICKER-EXPIRED/MISSING,922617,1,0
8,19,NO STANDING-BUS STOP,769958,1,0
9,16,NO STANDING-EXC. TRUCK LOADING,659104,1,0


## Unused Code for Reference

In [None]:
%%time

query='''
CREATE TABLE LION_Dem AS
SELECT 
b.OBJECTID,b.Street,b.FeatureTyp,b.SegmentTyp,b.NonPed,b.TrafDir,b.LocStatus,b.LZip,b.RZip,b.LBoro,b.RBoro,
b.L_CD,b.R_CD,b.CurveFlag,b.Radius,b.RW_Type,b.PhysicalID,b.StreetWidt,b.BikeLane,b.BIKE_Trafd,b.Number_Tra,
b.Number_Par,b.Number_Tot,b.Posted_Spe,b.Truck_Rout,b.c_lowadd,b.c_highadd,b.StreetCode,
a.cd_short_title,a.cd_tot_bldgs,a.cd_tot_resunits,a.crime_count,a.crime_per_1000,a.lep_rate,a.lots_commercial_office,a.lots_industrial_manufacturing,
a.lots_mixed_use,a.lots_open_space,a.lots_parking,a.lots_total,a.mean_commute,a.over65_rate,a.under18_rate,
a.pct_bach_deg,a.pct_foreign_born,a.pct_hh_rent_burd,a.pct_white_nh,a.pct_black_nh,(100-a.pct_white_nh) AS pct_non_white_nh,
a.poverty_rate,a.unemployment 
FROM LION b
LEFT OUTER JOIN cd_indic a
ON b.L_CD = a.borocd
'''

con.execute(query)

## Merge Tickets with Streets and Demographics to New Table

Query to create new table ticketstreetdem - combines tickets data with street and demographic details

In [None]:
%%time

query='''
CREATE TABLE IF NOT EXISTS ticketstreetdem AS
SELECT a.`Summons Number`,a.`Violation Code`,a.`Clean Violation Des`,a.`Issue Date`,a.`Violation Time`,b.*
FROM tickets a
LEFT OUTER JOIN LION_Dem b
ON a.Street1LU = b.StreetCode
WHERE b.c_lowadd<=a.`House Number Clean`
AND b.c_highadd>=a.`House Number Clean`
'''
con.execute(query)

In [None]:
query='''
Select * from ticketstreetdem
WHERE `ISSUE DATE` LIKE '%2020%'
'''

test=pd.read_sql_query(query,con)

In [None]:
test

In [None]:
%%time

query='''
SELECT a.`Summons Number`,a.`Violation Code`,a.`Clean Violation Des`,a.`Issue Date`,a.`Violation Time`,
b.OBJECTID,b.Street,b.FeatureTyp,b.SegmentTyp,b.NonPed,b.TrafDir,b.LocStatus,b.LZip,b.RZip,b.LBoro,b.RBoro,
b.L_CD,b.R_CD,b.CurveFlag,b.Radius,b.RW_Type,b.PhysicalID,b.StreetWidt,b.BikeLane,b.BIKE_Trafd,b.Number_Tra,
b.Number_Par,b.Number_Tot,b.Posted_Spe,b.Truck_Rout
FROM ticket20 a
LEFT OUTER JOIN LION b
ON a.Street1LU = b.StreetCode
WHERE b.c_lowadd<=a.`House Number Clean`
AND b.c_highadd>=a.`House Number Clean`
'''

summon_object20 = pd.read_sql_query(query,con)
summon_object20