In [5]:
import pandas as pd
import numpy as np
import feather
import pickle
import re
import sqlite3
import geopandas as gpd

# optional libs to run other non-core code
from polyfuzz import PolyFuzz
from polyfuzz.models import EditDistance, TFIDF, Embeddings
from flair.embeddings import TransformerWordEmbeddings

# note pandarallel works well on mac but has issue with windows
# see requirements for windows  - https://github.com/nalepae/pandarallel
from pandarallel import pandarallel
# Initialise pandarallel with progress bars enabled.
pandarallel.initialize(progress_bar=True)

# Display settings: show every column and print floats with three decimals.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = lambda value: '%.3f' % value

# Connect to the SQLite database file; sqlite3 creates the file when it does
# not exist yet. `con` and `cur` are used by the SQL cells further down.
con = sqlite3.connect('streetsofnyc.db')
cur = con.cursor()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


## Load Ticket Data from Preprocessing

i.e. we only load the data that we can match

In [2]:
# Read the pre-processed (reduced) ticket extracts for 2020 and 2019.
ticket20, ticket19 = map(
    feather.read_dataframe,
    ('ticket_reduced20.feather', 'ticket_reduced19.feather'),
)

## Function to Clean Ticket Dataframes

See function description for cleaning details

In [3]:
def clean_tickets(df, violation_dict=None):
    '''Clean a ticket dataframe in place and return it.

    The following steps are applied:
    1. Map a clean violation description based on violation code
    2. Map a borocode from violation county
    3. Add leading zeros back to street code - to ensure 5 digit
    4. (disabled) Clean Boro Code using the most common value per street code
    5. Create street code lookup strings to match LION data
    6. Clean recorded house numbers - to narrow down the street segment

    Parameters
    ----------
    df : pandas.DataFrame
        Ticket data containing the columns 'Violation Code',
        'Violation County', 'Street Code1', 'Street Code2', 'Street Code3'
        and 'House Number'. The frame is modified in place.
    violation_dict : dict, optional
        Mapping of violation code -> description. When omitted, the lookup
        is downloaded from the NYC Open Data endpoint (needs network access).

    Returns
    -------
    pandas.DataFrame
        The same object as `df`, with the columns 'Clean Violation Des',
        'Boro Code', 'Street1LU', 'Street2LU', 'Street3LU' and
        'House Number Clean' added and the street codes zero-padded.
    '''

    # 1. get violation codes/dictionary (only fetched when none was supplied)
    if violation_dict is None:
        violation_codes = pd.read_json("https://data.cityofnewyork.us/resource/ncbg-6agr.json")
        violation_dict = violation_codes.set_index('code')['definition'].to_dict()

    # 1. map violation description
    df['Clean Violation Des'] = df['Violation Code'].map(violation_dict)

    # 2. create boro_dict
    boro_dict = {'NY':'1',
                 'MN':'1',
                 'BX':'2',
                 'K':'3',
                 'K F':'3',
                 'BK':'3',
                 'KINGS':'3',
                 'Q':'4',
                 'P' : '0',
                 'QN':'4',
                 'QUEEN':'4',
                 'QNS':'4',
                 'R':'5',
                 'ST':'5',
                 'None':'0'
                }

    # 2. map boro code - counties missing from boro_dict stay NaN until the
    # lookup strings below have been built
    boro = df['Violation County'].map(boro_dict)
    df['Boro Code'] = boro

    # 3. Add back leading zeros to street code
    for code_col in ('Street Code1', 'Street Code2', 'Street Code3'):
        df[code_col] = df[code_col].apply('{0:0>5}'.format)

    # 4. Clean Boro Code
#     update_borodict=df.groupby('Street Code1')['Boro Code'].agg(pd.Series.mode).to_frame().to_dict()
#     df['New Boro Code'] = df['Street Code1'].map(update_borodict['Boro Code'])

    # 5. Create Street Code Lookup String (boro digit + 5 digit street code).
    # A missing boro gives NaN; only Street1LU gets the '000000' placeholder,
    # Street2LU/Street3LU keep NaN.
    # Results are assigned back to the frame: a chained
    # `df[col].replace(..., inplace=True)` works on a temporary object under
    # pandas Copy-on-Write and would leave the frame unchanged.
    df['Street1LU'] = (boro + df['Street Code1'].astype(str)).fillna('000000')
    df['Street2LU'] = boro + df['Street Code2'].astype(str)
    df['Street3LU'] = boro + df['Street Code3'].astype(str)

    # Unmapped boro becomes the string '0', the same type as every other
    # boro code (an int 0 here would leave the column with mixed types).
    df['Boro Code'] = boro.fillna('0')

    # 6. Clean House Numbers
    # We do not need unit numbers, remove '-'
    # Some input errors where '-' was missing so remove white space
    # Some addresses have alphabets (e.g. 196A) we only want the numeric portion
    # Some have no addresses - replace with 0
    # Some house numbers were only letters (e.g. N,W,E) which leave an empty
    # string once the letters are stripped - also converted to 0
    digits = (df['House Number']
              .str.split('-').str[0]
              .str.split(' ').str[0]
              .str.replace('[^0-9]', '', regex=True))
    has_digits = digits.notna() & (digits != '')
    df['House Number Clean'] = digits.where(has_digits, '0').astype(float)

    return df

## Clean Ticket Data - 2020 and 2019

In [4]:
# Clean both years with the same routine (2020 first, then 2019).
ticket20_clean, ticket19_clean = map(clean_tickets, (ticket20, ticket19))

In [6]:
# Stack the 2020 rows on top of the 2019 rows; the original indices are kept.
combined_clean = pd.concat(objs=[ticket20_clean, ticket19_clean])

## Data Merging

Pandas is not well suited to filtering and merging a dataset this large with `apply`: every ticket triggers its own lookup against LION, so the total run time grows linearly with the number of tickets (O(n)) and does not scale to 12mn data points (estimated to take 8 days without parallelisation, 3 days with it). Instead, we do the merging and joining in SQL, which takes ~4-5 mins for the dataset.

### Creating the databases

The code that creates the database tables takes a while to run if we load all the columns (1-2 hrs); efficiency can be improved by dropping the columns we will not use. There is no need to re-run it if no new cleaning steps have been added - we can simply connect to the existing database.

In [7]:
%%time

# Write (or overwrite) the combined tickets as the 'tickets' table; the
# dataframe index is not stored.
combined_clean.to_sql(name='tickets', con=con, if_exists='replace', index=False)

  method=method,


CPU times: user 5min 48s, sys: 7min 5s, total: 12min 53s
Wall time: 43min 24s


# Old/Unused Code

In [None]:
%%time

# create ticket20 table in database from df - the active line writes ALL columns;
# the commented-out variant below writes only the three columns needed for the join
ticket20_clean.to_sql('ticket20',con,if_exists='replace',index=False)
# ticket20_clean[['Summons Number','House Number Clean','Street1LU']].rename(columns={'Summons Number':'Summons_Number','House Number Clean':'House_Number_Clean'}).to_sql('ticket20',con,if_exists='replace',index=False)

# create ticket19 table in database from df - the active line writes ALL columns;
# the commented-out variant below writes only the three columns needed for the join
ticket19_clean.to_sql('ticket19',con,if_exists='replace',index=False)
# ticket19_clean[['Summons Number','House Number Clean','Street1LU']].rename(columns={'Summons Number':'Summons_Number','House Number Clean':'House_Number_Clean'}).to_sql('ticket19',con,if_exists='replace',index=False)

## Load LION Database - Street and Street Features

In [3]:
lion = gpd.read_file("Data/LION/LION.shp")

# Some address bounds contain '-'; we only want the portion before the dash,
# stored as a float in a new column.
address_columns = {
    'l_lowadd': 'LLo_Hyphen',
    'l_highadd': 'LHi_Hyphen',
    'r_lowadd': 'RLo_Hyphen',
    'r_highadd': 'RHi_Hyphen',
}
for numeric_col, raw_col in address_columns.items():
    lion[numeric_col] = lion[raw_col].str.split('-').str[0].astype(float)

# get combined lower and upper limit for street segment

# Treat a low bound of 0 as missing so it cannot win the row-wise minimum.
# The columns are already float here, so the numeric 0 has to be matched
# (replacing the string '0' never matched anything), and the result is
# assigned back instead of relying on a chained inplace replace.
for low_col in ('r_lowadd', 'l_lowadd'):
    lion[low_col] = lion[low_col].replace(0, np.nan)

lion['c_lowadd'] = lion[['l_lowadd', 'r_lowadd']].min(axis=1, skipna=True)
lion['c_highadd'] = lion[['l_highadd', 'r_highadd']].max(axis=1)

lion.head(5)

Unnamed: 0,OBJECTID,Street,SAFStreetN,FeatureTyp,SegmentTyp,IncExFlag,RB_Layer,NonPed,TrafDir,TrafSrc,SpecAddr,FaceCode,SeqNum,StreetCode,SAFStreetC,LGC1,LGC2,LGC3,LGC4,LGC5,LGC6,LGC7,LGC8,LGC9,BOE_LGC,SegmentID,SegCount,LocStatus,LZip,RZip,LBoro,RBoro,L_CD,R_CD,LATOMICPOL,RATOMICPOL,LCT2010,LCT2010Suf,RCT2010,RCT2010Suf,LCB2010,LCB2010Suf,RCB2010,RCB2010Suf,LCT2000,LCT2000Suf,RCT2000,RCT2000Suf,LCB2000,LCB2000Suf,RCB2000,RCB2000Suf,LCT1990,LCT1990Suf,RCT1990,RCT1990Suf,LAssmDist,LElectDist,RAssmDist,RElectDist,SplitElect,LSchlDist,RSchlDist,SplitSchl,LSubSect,RSubSect,SanDistInd,MapFrom,MapTo,BoroBndry,MH_RI_Flag,XFrom,YFrom,XTo,YTo,ArcCenterX,ArcCenterY,CurveFlag,Radius,NodeIDFrom,NodeIDTo,NodeLevelF,NodeLevelT,ConParity,Twisted,RW_TYPE,PhysicalID,GenericID,NYPDID,FDNYID,LBlockFace,RBlockFace,LegacyID,Status,StreetWidt,StreetWi_1,StreetWi_2,BikeLane,BIKE_TRAFD,ACTIVE_FLA,POSTED_SPE,Snow_Prior,Number_Tra,Number_Par,Number_Tot,Carto_Disp,FCC,ROW_Type,LLo_Hyphen,LHi_Hyphen,RLo_Hyphen,RHi_Hyphen,FromLeft,ToLeft,FromRight,ToRight,Join_ID,L_PD_Servi,R_PD_Servi,TRUCK_ROUT,Shape__Len,geometry,l_lowadd,l_highadd,r_lowadd,r_highadd,c_lowadd,c_highadd
0,1,EAST 168 STREET,,0,U,,B,,T,DOT,,2510,3070,226700,,1,,,,,,,,,1,78126,1,X,10456,10456,2.0,2.0,203,203,402,101,149,,185,,3001,,2000,,149,,137,,4000,,1000,,149,,137,,79,40,79,40,,9,9,,1B,1B,,3D,3D,,,1010964,241812,1011265,241555,0,0,,0,47740,9045677,M,M,,,1,35231.0,30694.0,,,1422600653,1422602017,78126,2,34.0,34.0,,,,,25,S,2,,4,,,,599.0,699.0,596.0,716.0,599,699,596,716,2251001000000,,,,396.031,"LINESTRING (-73.90347 40.83036, -73.90238 40.8...",599.0,699.0,596.0,716.0,596.0,716.0
1,2,WEST 192 STREET,,0,U,,B,,A,DOT,,7984,40,274810,,1,,,,,,,,,1,79796,1,,10468,10468,2.0,2.0,207,207,302,104,265,,265,,2000,,1004,,265,,265,,3001,,1003,,265,,265,,78,45,78,59,,10,10,,1A,1A,,3C,3C,,,1011577,255024,1011335,255164,0,0,,0,48679,48678,M,M,,,1,35248.0,30711.0,,,1522607129,1522607721,79796,2,30.0,30.0,,,,,25,S,1,,3,,,,58.0,98.0,63.0,99.0,58,98,63,99,2798401000000,,,,279.361,"LINESTRING (-73.90120 40.86662, -73.90207 40.8...",58.0,98.0,63.0,99.0,58.0,99.0
2,3,UNION AVENUE,,0,U,,B,,W,DOT,,7280,130,270420,,1,,,,,,,,,1,77356,4,X,10459,10459,2.0,2.0,203,203,402,401,135,,131,,2000,,3006,,135,,131,,4000,,4001,,135,,131,,79,46,79,26,,12,12,,1A,1A,,6C,6C,,,1011601,239640,1011786,240230,0,0,,0,47288,47822,M,M,,,1,35252.0,30715.0,,,1422603726,1422604132,77356,2,34.0,34.0,,,,,25,S,1,,3,,,,1017.0,1079.0,1016.0,1084.0,1017,1079,1016,1084,2728001000000,,,,618.327,"LINESTRING (-73.90118 40.82440, -73.90051 40.8...",1017.0,1079.0,1016.0,1084.0,1016.0,1084.0
3,4,UNION AVENUE,BEHAGEN PLAYGROUND COMFORT STA,0,U,,B,,W,DOT,X,7280,130,270420,212795.0,1,,,,,,,,,1,77356,4,X,10459,10459,2.0,2.0,203,203,402,401,135,,131,,2000,,3006,,135,,131,,4000,,4001,,135,,131,,79,46,79,26,,12,12,,1A,1A,,6C,6C,,,1011601,239640,1011786,240230,0,0,,0,47288,47822,M,M,,,1,35252.0,30715.0,,,1422603726,1422604132,77356,2,34.0,34.0,,,,,25,S,1,,3,,,,,,,,0,0,0,0,21279502000000X,,,,618.327,"LINESTRING (-73.90118 40.82440, -73.90051 40.8...",,,,,,
4,5,UNION AVENUE,BEHAGEN PLAYGROUND FIELD NORTH,0,U,,B,,W,DOT,X,7280,130,270420,212795.0,1,,,,,,,,,1,77356,4,X,10459,10459,2.0,2.0,203,203,402,401,135,,131,,2000,,3006,,135,,131,,4000,,4001,,135,,131,,79,46,79,26,,12,12,,1A,1A,,6C,6C,,,1011601,239640,1011786,240230,0,0,,0,47288,47822,M,M,,,1,35252.0,30715.0,,,1422603726,1422604132,77356,2,34.0,34.0,,,,,25,S,1,,3,,,,,,,,0,0,0,0,21279503000000X,,,,618.327,"LINESTRING (-73.90118 40.82440, -73.90051 40.8...",,,,,,


In [None]:
%%time

# create LION table in database from df - the geometry column is dropped
# first because geometry data is not supported in SQLite
lion.drop(columns=['geometry']).to_sql(name='LION', con=con, if_exists='replace', index=False)

### Joining the Tickets to LION

In [7]:
%%time

# Join every 2020 ticket to the LION street segment(s) whose StreetCode equals
# the ticket's Street1LU and whose combined address range
# [c_lowadd, c_highadd] contains the cleaned house number.
# NOTE: the WHERE clause filters on columns of b, so tickets without a
# matching segment are dropped and the LEFT OUTER JOIN behaves like an inner
# join. A ticket matching several segments appears once per segment (see the
# repeated Summons Number in the saved output).
query='''
SELECT a.`Summons Number`,a.`Violation Code`,a.`Clean Violation Des`,a.`Issue Date`,a.`Violation Time`,
b.OBJECTID,b.Street,b.FeatureTyp,b.SegmentTyp,b.NonPed,b.TrafDir,b.LocStatus,b.LZip,b.RZip,b.LBoro,b.RBoro,
b.L_CD,b.R_CD,b.CurveFlag,b.Radius,b.RW_Type,b.PhysicalID,b.StreetWidt,b.BikeLane,b.BIKE_Trafd,b.Number_Tra,
b.Number_Par,b.Number_Tot,b.Posted_Spe,b.Truck_Rout
FROM ticket20 a
LEFT OUTER JOIN LION b
ON a.Street1LU = b.StreetCode
WHERE b.c_lowadd<=a.`House Number Clean`
AND b.c_highadd>=a.`House Number Clean`
'''

# Run the join inside SQLite and load the result into a dataframe.
summon_object20 = pd.read_sql_query(query,con)
summon_object20

CPU times: user 5min 35s, sys: 4min 29s, total: 10min 4s
Wall time: 13min 12s


Unnamed: 0,Summons Number,Violation Code,Clean Violation Des,Issue Date,Violation Time,OBJECTID,Street,FeatureTyp,SegmentTyp,NonPed,TrafDir,LocStatus,LZip,RZip,LBoro,RBoro,L_CD,R_CD,CurveFlag,Radius,RW_TYPE,PhysicalID,StreetWidt,BikeLane,BIKE_TRAFD,Number_Tra,Number_Par,Number_Tot,POSTED_SPE,TRUCK_ROUT
0,1477633194,16,NO STANDING-EXC. TRUCK LOADING,05/08/1972 12:00:00 AM,0523P,51995,43 STREET,0,U,,A,,11232,11232,3.000,3.000,307,307,,0,1,65509.000,30.000,3,TF,1,,3,25,2
1,1449715424,98,OBSTRUCTING DRIVEWAY,08/29/1977 12:00:00 AM,0428P,45690,UNION STREET,0,U,,A,,11233,11233,3.000,3.000,308,308,,0,1,91579.000,30.000,,,1,,3,25,
2,1455779155,20,NO PARKING-DAY/TIME LIMITS,10/03/1988 12:00:00 AM,0625A,58955,CLERMONT AVENUE,0,U,,T,,11205,11205,3.000,3.000,302,302,,0,1,90379.000,32.000,2,FT,2,,4,25,
3,1458800908,21,NO PARKING-STREET CLEANING,01/03/1990 12:00:00 AM,1106A,63230,DIVISION AVENUE,0,U,,T,X,11211,11211,3.000,3.000,301,301,,0,1,58987.000,40.000,,,2,,4,25,
4,1458800908,21,NO PARKING-STREET CLEANING,01/03/1990 12:00:00 AM,1106A,67035,DIVISION AVENUE,0,U,,T,X,11211,11211,3.000,3.000,301,301,,0,1,58988.000,40.000,,,2,,4,25,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9192049,1474552328,21,NO PARKING-STREET CLEANING,03/03/2031 12:00:00 AM,0857A,110290,FREDERICK DOUGLASS BOULEVARD,0,U,,T,,10039,10039,1.000,1.000,110,110,,0,1,131837.000,78.000,,,4,,6,25,
9192050,1449651586,98,OBSTRUCTING DRIVEWAY,12/30/2031 12:00:00 AM,1124A,48164,HERKIMER STREET,0,U,,W,,11233,11233,3.000,3.000,303,303,,0,1,71282.000,34.000,,,1,,3,25,
9192051,1458428930,19,NO STANDING-BUS STOP,01/03/2040 12:00:00 AM,1111A,3691,3 AVENUE,0,U,,T,X,10455,10455,2.000,2.000,201,201,,0,1,35995.000,48.000,,,2,,4,25,2
9192052,1446438314,40,FIRE HYDRANT,01/17/2049 12:00:00 AM,0210A,8236,LYDIG AVENUE,0,U,,A,X,10462,10462,2.000,2.000,211,211,,0,1,84945.000,34.000,,,1,,3,25,


In [8]:
%%time

# Same join as for 2020, run against the ticket19 table: match each ticket to
# the LION segment(s) with the same street code whose combined address range
# [c_lowadd, c_highadd] contains the cleaned house number.
# NOTE: the WHERE clause filters on columns of b, so tickets without a
# matching segment are dropped and the LEFT OUTER JOIN behaves like an inner
# join. A ticket matching several segments appears once per segment.
query='''
SELECT a.`Summons Number`,a.`Violation Code`,a.`Clean Violation Des`,a.`Issue Date`,a.`Violation Time`,
b.OBJECTID,b.Street,b.FeatureTyp,b.SegmentTyp,b.NonPed,b.TrafDir,b.LocStatus,b.LZip,b.RZip,b.LBoro,b.RBoro,
b.L_CD,b.R_CD,b.CurveFlag,b.Radius,b.RW_Type,b.PhysicalID,b.StreetWidt,b.BikeLane,b.BIKE_Trafd,b.Number_Tra,
b.Number_Par,b.Number_Tot,b.Posted_Spe,b.Truck_Rout
FROM ticket19 a
LEFT OUTER JOIN LION b
ON a.Street1LU = b.StreetCode
WHERE b.c_lowadd<=a.`House Number Clean`
AND b.c_highadd>=a.`House Number Clean`
'''

# Run the join inside SQLite and load the result into a dataframe.
summon_object19 = pd.read_sql_query(query,con)
summon_object19

CPU times: user 6min 56s, sys: 7min 10s, total: 14min 6s
Wall time: 19min 13s


Unnamed: 0,Summons Number,Violation Code,Clean Violation Des,Issue Date,Violation Time,OBJECTID,Street,FeatureTyp,SegmentTyp,NonPed,TrafDir,LocStatus,LZip,RZip,LBoro,RBoro,L_CD,R_CD,CurveFlag,Radius,RW_TYPE,PhysicalID,StreetWidt,BikeLane,BIKE_TRAFD,Number_Tra,Number_Par,Number_Tot,POSTED_SPE,TRUCK_ROUT
0,1442479759,21,NO PARKING-STREET CLEANING,07/13/2018,1157A,60164,HERZL STREET,0,U,,A,,11212,11212,3.000,3.000,316,316,,0,1,89511.000,30.000,,,1,,3,25,
1,1442480397,21,NO PARKING-STREET CLEANING,07/13/2018,0916A,72982,SUTTER AVENUE,0,U,,T,X,11212,11212,3.000,3.000,316,316,,0,1,71448.000,50.000,,,2,,4,25,
2,1442480403,21,NO PARKING-STREET CLEANING,07/13/2018,0918A,72982,SUTTER AVENUE,0,U,,T,X,11212,11212,3.000,3.000,316,316,,0,1,71448.000,50.000,,,2,,4,25,
3,1442480415,21,NO PARKING-STREET CLEANING,07/13/2018,0922A,72976,SUTTER AVENUE,0,U,,T,X,11212,11212,3.000,3.000,316,316,,0,1,71447.000,50.000,,,2,,4,25,
4,1442480415,21,NO PARKING-STREET CLEANING,07/13/2018,0922A,72977,SUTTER AVENUE,0,U,,T,X,11212,11212,3.000,3.000,316,316,,0,1,71447.000,50.000,,,2,,4,25,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11098464,8764852570,46,DOUBLE PARKING,06/22/2019,0707A,90700,AMSTERDAM AVENUE,0,U,,W,X,10023,10023,1.000,1.000,107,107,,0,1,3932.000,60.000,1,FT,4,,6,25,2
11098465,8764852581,71,INSP. STICKER-EXPIRED/MISSING,06/22/2019,0708A,90700,AMSTERDAM AVENUE,0,U,,W,X,10023,10023,1.000,1.000,107,107,,0,1,3932.000,60.000,1,FT,4,,6,25,2
11098466,8764852593,46,DOUBLE PARKING,06/22/2019,0709A,90700,AMSTERDAM AVENUE,0,U,,W,X,10023,10023,1.000,1.000,107,107,,0,1,3932.000,60.000,1,FT,4,,6,25,2
11098467,8764852623,20,NO PARKING-DAY/TIME LIMITS,06/22/2019,0713A,91036,WEST 102 STREET,0,U,,T,,10025,10025,1.000,1.000,107,107,,0,1,81913.000,30.000,,,1,,3,25,


## Save the joined ticket/LION results (2020 and 2019) to pickle

In [9]:
# Persist both join results so the SQL join does not have to be re-run.
for joined_frame, pickle_path in (
    (summon_object20, 'ticketstreet20.pickle'),
    (summon_object19, 'ticketstreet19.pickle'),
):
    joined_frame.to_pickle(pickle_path)

KeyboardInterrupt: 

## Apply Optimization

## Merging Ticket Data with LION - Optimization

The key issue: using columns of the ticket data to filter LION once per ticket does not scale to 12mn rows - the run time was observed to grow linearly with the number of rows (O(n)). We were unable to vectorize the filtering task, and even if that had been possible it would still have taken 2-3 days to go through all 12mn rows.

**Conclusion** Pandas is not the right place to do this operation

In [143]:
def get_objectid(house, street, connection=None):
    '''Return the OBJECTIDs of the LION rows matching a street code and house number.

    A row matches when its StreetCode equals `street` and `house` lies within
    its combined address range [c_lowadd, c_highadd].

    Parameters
    ----------
    house : float
        Cleaned house number. A missing value (NaN/None) matches nothing.
    street : str
        Street lookup code (e.g. a ticket's 'Street1LU' value). A missing
        value matches nothing.
    connection : sqlite3.Connection, optional
        Database holding the LION table. Defaults to the notebook-level
        connection `con`.

    Returns
    -------
    list
        OBJECTID values of the matching rows (empty when nothing matches).
    '''
    db = con if connection is None else connection

    # Bind the values as parameters instead of formatting them into the SQL
    # text: no injection through the inputs, and missing values are bound as
    # NULL (which matches no row) rather than producing invalid SQL.
    house_param = None if pd.isna(house) else float(house)
    street_param = None if pd.isna(street) else str(street)

    query = '''
        SELECT OBJECTID FROM LION
        WHERE StreetCode = ?
        AND c_lowadd <= ?
        AND c_highadd >= ?
        '''
    result = pd.read_sql_query(query, db, params=(street_param, house_param, house_param))
    return result['OBJECTID'].values.tolist()

def func(x, connection=None):
    '''Row-wise wrapper around get_objectid for DataFrame.apply(axis=1).

    Parameters
    ----------
    x : mapping or pandas.Series
        A ticket row providing 'House Number Clean' and 'Street1LU'.
    connection : sqlite3.Connection, optional
        Passed through to get_objectid.

    Returns
    -------
    list
        OBJECTID values of the matching LION rows.
    '''
    return get_objectid(x['House Number Clean'], x['Street1LU'], connection)

In [152]:
# Independent copies of the first N cleaned 2020 tickets, used as
# progressively larger benchmark inputs.
test_tiny=ticket20_clean.iloc[:10].copy()
test_small=ticket20_clean.iloc[:100].copy()
test_big=ticket20_clean.iloc[:1000].copy()
test_large=ticket20_clean.iloc[:10000].copy()

In [116]:
%%timeit

# Both comparisons are parenthesised: `&` binds tighter than `<=`, so without
# the second pair of parentheses the expression was evaluated as
# (street_mask & lion['LLo_Hyphen']) <= housenumber.
# NOTE(review): LLo_Hyphen is the raw LION column; the numeric `l_lowadd`
# derived when LION is loaded may be the intended operand - confirm.
lion[(lion['StreetCode']==streetcode)&(lion['LLo_Hyphen']<=housenumber)]

724 ms ± 27.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [140]:
%%timeit

# One SQL lookup per ticket: pair each house number with its street code.
test_tiny['try1']=list(map(get_objectid, test_tiny['House Number Clean'], test_tiny['Street1LU']))
test_tiny

663 ms ± 39.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [141]:
%%timeit

# One SQL lookup per ticket: pair each house number with its street code.
test_small['try1']=list(map(get_objectid, test_small['House Number Clean'], test_small['Street1LU']))
test_small

6.01 s ± 39.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [142]:
%%timeit

# One SQL lookup per ticket: pair each house number with its street code.
test_big['try1']=list(map(get_objectid, test_big['House Number Clean'], test_big['Street1LU']))
test_big

1min 2s ± 1.39 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [173]:
# %%timeit

# Row-wise variant: hand each ticket row to func, which runs the lookup.
test_tiny['try2']=test_tiny.apply(func,axis='columns')
test_tiny

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation,Clean Violation Des,Boro Code,Street1LU,Street2LU,Street3LU,House Number Clean,try2
0,1477633194,J58JKX,NJ,PAS,05/08/1972 12:00:00 AM,16,SDN,HONDA,P,8730,5130,5280,0,72.0,72,504,342924,T504,0,0523P,,K,F,270.0,43 ST,,0,408,E2,,YYYYYBB,0800A,0400P,BK,0.0,0,-,0,,,,,,NO STANDING-EXC. TRUCK LOADING,3,308730,305130.0,305280.0,270.0,[51995]
1,1449715424,KRE6058,PA,PAS,08/29/1977 12:00:00 AM,98,SUBN,ME/BE,P,86530,71800,73110,0,77.0,77,77,961115,0077,0,0428P,,K,F,1953.0,UNION ST,,0,408,F1,,BBBBBBB,ALL,ALL,BLK,0.0,0,-,0,,,,,,OBSTRUCTING DRIVEWAY,3,386530,371800.0,373110.0,1953.0,[45690]
2,1455779155,444326R,NJ,PAS,10/03/1988 12:00:00 AM,20,SDN,LEXUS,P,27030,41330,69230,0,88.0,88,730,535422,T730,0,0625A,,K,O,45.0,CLERMONT AVENUE,,0,408,D,,BBBBBBB,ALL,ALL,BLACK,0.0,0,-,0,,,,,,NO PARKING-DAY/TIME LIMITS,3,327030,341330.0,369230.0,45.0,[58955]
3,1458800908,F728330,OH,PAS,01/03/1990 12:00:00 AM,21,SDN,CHEVR,P,33030,93630,58730,0,90.0,90,301,355074,T301,0,1106A,,K,F,218.0,DIVISION AVE,,0,408,C,,BYBBYBB,1100A,1230P,,0.0,0,-,0,,,,,,NO PARKING-STREET CLEANING,3,333030,393630.0,358730.0,218.0,"[63230, 67035, 73883]"
4,1466038676,FMY9090,NY,PAS,02/14/1990 12:00:00 AM,21,SUBN,JEEP,S,45130,23930,68130,20210915,90.0,90,0,668676,KNBO,0,1253A,,K,F,850.0,GRAND ST,,0,408,D1,,BYBBYBB,1200A,0300A,GREY,0.0,2015,-,0,,,,,,NO PARKING-STREET CLEANING,3,345130,323930.0,368130.0,850.0,[42499]
5,1440657920,KDG0693,PA,PAS,07/21/1990 12:00:00 AM,14,SUBN,HYUN,P,33440,62200,0,20191231,100.0,100,100,963999,0100,0,0525P,,Q,,,B 99 ST,SHORE FRONT PKWY,0,408,C,,BBBBBBB,ALL,ALL,GY,0.0,0,-,0,,,,,,NO STANDING-DAY/TIME LIMITS,4,433440,462200.0,400000.0,0.0,[]
6,1460987810,79928MG,NY,COM,09/19/1990 12:00:00 AM,48,DELV,INTER,P,0,0,0,20200531,1.0,1,401,958976,0401,0,1120A,,NY,,,W/S/O WASHINGTON ST,S/O SPRING ST,0,408,E9,,BBBBBBB,ALL,ALL,WH,0.0,2015,-,0,,,,,,BIKE LANE,1,100000,100000.0,100000.0,0.0,[]
7,1449130203,JJJ8186,NY,PAS,10/14/1990 12:00:00 AM,14,SUBN,BMW,P,10020,27480,27540,20210710,52.0,52,52,964971,0052,0,0320A,,BX,F,2734.0,BAINBRIDGE AVE,,0,408,F2,,BBBBBBB,ALL,ALL,BLK,0.0,2010,-,0,,,,,,NO STANDING-DAY/TIME LIMITS,2,210020,227480.0,227540.0,2734.0,"[14632, 17387]"
8,1451300189,DKD6024,NC,PAS,07/25/1991 12:00:00 AM,98,SDN,FORD,P,11280,54137,5430,20190930,68.0,68,68,945183,0068,0,0843P,,K,F,372.0,94 ST,,0,408,C3,,BBBBBBB,ALL,ALL,GREY,0.0,0,-,0,,,,,,OBSTRUCTING DRIVEWAY,3,311280,354137.0,305430.0,372.0,[55227]
9,1464768973,6542FR,99,PAS,01/01/2000 12:00:00 AM,40,SUBN,CMCKU,P,18070,25390,27790,0,17.0,17,405,930288,0405,0,0739P,,,F,65.0,E 54 ST,,0,408,C,,BBBBBBB,ALL,ALL,RED,0.0,0,-,2,,,,,,FIRE HYDRANT,0,0,,,65.0,[]


In [160]:
%%time

# Same row-wise lookup, with the rows spread over the pandarallel workers.
small_matches = test_small.parallel_apply(func,axis=1)
test_small['try2'] = small_matches
test_small

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25), Label(value='0 / 25'))), HBox…

CPU times: user 177 ms, sys: 297 ms, total: 474 ms
Wall time: 3.05 s


Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation,Clean Violation Des,Boro Code,Street1LU,Street2LU,Street3LU,House Number Clean,try2
0,1477633194,J58JKX,NJ,PAS,05/08/1972 12:00:00 AM,16,SDN,HONDA,P,08730,05130,05280,0,72.000,72,504,342924,T504,0000,0523P,,K,F,270,43 ST,,0,408,E2,,YYYYYBB,0800A,0400P,BK,0.000,0,-,0,,,,,,NO STANDING-EXC. TRUCK LOADING,3,308730,305130,305280,270.000,[51995]
1,1449715424,KRE6058,PA,PAS,08/29/1977 12:00:00 AM,98,SUBN,ME/BE,P,86530,71800,73110,0,77.000,77,77,961115,0077,0000,0428P,,K,F,1953,UNION ST,,0,408,F1,,BBBBBBB,ALL,ALL,BLK,0.000,0,-,0,,,,,,OBSTRUCTING DRIVEWAY,3,386530,371800,373110,1953.000,[45690]
2,1455779155,444326R,NJ,PAS,10/03/1988 12:00:00 AM,20,SDN,LEXUS,P,27030,41330,69230,0,88.000,88,730,535422,T730,0000,0625A,,K,O,45,CLERMONT AVENUE,,0,408,D,,BBBBBBB,ALL,ALL,BLACK,0.000,0,-,0,,,,,,NO PARKING-DAY/TIME LIMITS,3,327030,341330,369230,45.000,[58955]
3,1458800908,F728330,OH,PAS,01/03/1990 12:00:00 AM,21,SDN,CHEVR,P,33030,93630,58730,0,90.000,90,301,355074,T301,0000,1106A,,K,F,218,DIVISION AVE,,0,408,C,,BYBBYBB,1100A,1230P,,0.000,0,-,0,,,,,,NO PARKING-STREET CLEANING,3,333030,393630,358730,218.000,"[63230, 67035, 73883]"
4,1466038676,FMY9090,NY,PAS,02/14/1990 12:00:00 AM,21,SUBN,JEEP,S,45130,23930,68130,20210915,90.000,90,0,668676,KNBO,0000,1253A,,K,F,850,GRAND ST,,0,408,D1,,BYBBYBB,1200A,0300A,GREY,0.000,2015,-,0,,,,,,NO PARKING-STREET CLEANING,3,345130,323930,368130,850.000,[42499]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1457406860,HNE5745,NY,PAS,08/05/2000 12:00:00 AM,78,VAN,FORD,P,19366,29345,45226,20210409,121.000,121,121,961676,0121,0000,0320A,,R,F,38,BOWDOIN ST,,0,408,F1,,BBBBBBB,ALL,ALL,WHITE,0.000,2008,-,0,,,,,,NGHT PKG ON RESID STR-COMM VEH,5,519366,529345,545226,38.000,[215148]
96,1459216313,80226MM,NY,COM,08/06/2000 12:00:00 AM,14,DELV,FRUEH,P,17910,10410,25390,20191031,,0,401,956410,0401,0000,0410P,,NY,F,16,E 46TH STREET,,0,408,C,,BBBBBBB,0400,0700P,BROWN,0.000,2009,-,0,,,,,,NO STANDING-DAY/TIME LIMITS,1,117910,110410,125390,16.000,[96195]
97,1459216271,83450MH,NY,COM,08/06/2000 12:00:00 AM,14,DELV,FRUEH,P,18090,10410,25390,20191031,,0,401,956410,0401,0000,0506P,,NY,O,2,E 55TH STREET,,0,408,F1,,BBBBBBB,ALL,ALL,BROWN,0.000,2016,-,0,,,,,,NO STANDING-DAY/TIME LIMITS,1,118090,110410,125390,2.000,[92514]
98,1461065227,89320MJ,NY,COM,08/07/2000 12:00:00 AM,18,,,P,67030,40230,44430,0,70.000,70,401,915100,0401,0000,1023A,,K,,2036,NOSTRAND AVE,,0,408,J6,,BBBBBBB,0700A,0700P,BW,0.000,0,-,0,,,,,,NO STANDING-BUS LANE,3,367030,340230,344430,2036.000,[42106]


In [165]:
%prun -l 10 test_small['try2']=test_small.apply(func,axis=1)

 

In [150]:
%%timeit

# Same row-wise lookup, with the rows spread over the pandarallel workers.
big_matches = test_big.parallel_apply(func,axis=1)
test_big['try2'] = big_matches
test_big

22.2 s ± 425 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [161]:
%%time
# Parallel row-wise lookup on the 10,000-row sample.
large_matches = test_large.parallel_apply(func,axis=1)
test_large['try2'] = large_matches

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2500), Label(value='0 / 2500'))), …

CPU times: user 8.73 s, sys: 1.92 s, total: 10.6 s
Wall time: 4min 33s


## Fuzzy String Match Using BERT for Street Names

### To map ticket street names to centerline street names

We build one BERT model per boro and match within the boro for better accuracy. Ultimately this was not used, because the matching was not 100% accurate and over 2000 entries still had to be cleaned manually.

In [None]:
# Create separate from and to lists by Boro for better match accuracy

# takes just over an hour to run 10 BERT models - so we save them to disk (pickle)

# UNCOMMENT EVERYTHING BELOW HERE TO RERUN THE BERT FUZZY MATCH 


# from_list_b11=ticket20_clean[ticket20_clean['Boro Code']=='1']['Street1Clean'].dropna().unique().tolist()
# from_list_b21=ticket20_clean[ticket20_clean['Boro Code']=='2']['Street1Clean'].dropna().unique().tolist()
# from_list_b31=ticket20_clean[ticket20_clean['Boro Code']=='3']['Street1Clean'].dropna().unique().tolist()
# from_list_b41=ticket20_clean[ticket20_clean['Boro Code']=='4']['Street1Clean'].dropna().unique().tolist()
# from_list_b51=ticket20_clean[ticket20_clean['Boro Code']=='5']['Street1Clean'].dropna().unique().tolist()

# from_list_b12=ticket19_clean[ticket19_clean['Boro Code']=='1']['Street1Clean'].dropna().unique().tolist()
# from_list_b22=ticket19_clean[ticket19_clean['Boro Code']=='2']['Street1Clean'].dropna().unique().tolist()
# from_list_b32=ticket19_clean[ticket19_clean['Boro Code']=='3']['Street1Clean'].dropna().unique().tolist()
# from_list_b42=ticket19_clean[ticket19_clean['Boro Code']=='4']['Street1Clean'].dropna().unique().tolist()
# from_list_b52=ticket19_clean[ticket19_clean['Boro Code']=='5']['Street1Clean'].dropna().unique().tolist()

# from_list_b1 = list(set(from_list_b11 + from_list_b12))
# from_list_b2 = list(set(from_list_b21 + from_list_b22))
# from_list_b3 = list(set(from_list_b31 + from_list_b32))
# from_list_b4 = list(set(from_list_b41 + from_list_b42))
# from_list_b5 = list(set(from_list_b51 + from_list_b52))


# match_list_b1 = cl_df[cl_df['BOROCODE']==1]['FULL_STREE'].unique().tolist()
# match_list_b2 = cl_df[cl_df['BOROCODE']==2]['FULL_STREE'].unique().tolist()
# match_list_b3 = cl_df[cl_df['BOROCODE']==3]['FULL_STREE'].unique().tolist()
# match_list_b4 = cl_df[cl_df['BOROCODE']==4]['FULL_STREE'].unique().tolist()
# match_list_b5 = cl_df[cl_df['BOROCODE']==5]['FULL_STREE'].unique().tolist()


# # 'Pipeline' to run Bert/tfidf/edit for each boro

# embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
# bert = Embeddings(embeddings, min_similarity=0,model_id="BERT")
# tfidf = TFIDF(min_similarity=0,model_id='TF-IDF')
# edit = EditDistance(model_id='EDIT_DIST')
# string_models = [bert, tfidf, edit]


# model_b1 = PolyFuzz(string_models)
# model_b1.match(from_list_b1, match_list_b1)

# model_b2 = PolyFuzz(string_models)
# model_b2.match(from_list_b2, match_list_b2)

# model_b3 = PolyFuzz(string_models)
# model_b3.match(from_list_b3, match_list_b3)

# model_b4 = PolyFuzz(string_models)
# model_b4.match(from_list_b4, match_list_b4)

# model_b5 = PolyFuzz(string_models)
# model_b5.match(from_list_b5, match_list_b5)

# pickle.dump(model_b1, open('BERT Models/model_b1_2020.sav','wb'))
# pickle.dump(model_b2, open('BERT Models/model_b2_2020.sav','wb'))
# pickle.dump(model_b3, open('BERT Models/model_b3_2020.sav','wb'))
# pickle.dump(model_b4, open('BERT Models/model_b4_2020.sav','wb'))
# pickle.dump(model_b5, open('BERT Models/model_b5_2020.sav','wb'))


In [None]:
# Load saved models to save time

def _load_pickled_model(path):
    '''Load one pickled PolyFuzz model and close the file afterwards.

    NOTE: unpickling can execute arbitrary code - only load model files that
    were produced by this project.
    '''
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

# One saved model per boro (1-5).
model_b1 = _load_pickled_model('BERT Models/model_b1_2020.sav')
model_b2 = _load_pickled_model('BERT Models/model_b2_2020.sav')
model_b3 = _load_pickled_model('BERT Models/model_b3_2020.sav')
model_b4 = _load_pickled_model('BERT Models/model_b4_2020.sav')
model_b5 = _load_pickled_model('BERT Models/model_b5_2020.sav')


In [None]:
# visualise models

# Draw the precision/recall plot of each boro model, boro 1 through 5.
for boro_model in (model_b1, model_b2, model_b3, model_b4, model_b5):
    boro_model.visualize_precision_recall()

In [None]:
# get match results

# Pull the BERT match table out of each boro model, in boro order.
matchb1, matchb2, matchb3, matchb4, matchb5 = (
    boro_model.get_matches('BERT')
    for boro_model in (model_b1, model_b2, model_b3, model_b4, model_b5)
)

In [None]:
# get counts of violations for each street 

def _violations_per_street(tickets, boro):
    '''Return {street name: number of summonses} for one boro, largest first.

    NOTE(review): relies on a 'Street1Clean' column; the code creating it is
    commented out in this notebook - confirm it exists before running.
    '''
    in_boro = tickets[tickets['Boro Code']==boro]
    per_street = in_boro.groupby('Street1Clean').agg({'Summons Number':'count'})
    ranked = per_street.sort_values(by=['Summons Number'],ascending=False)
    return ranked.to_dict()['Summons Number']

# 2020 tickets, boro 1-5
count20b1 = _violations_per_street(ticket20_clean, '1')
count20b2 = _violations_per_street(ticket20_clean, '2')
count20b3 = _violations_per_street(ticket20_clean, '3')
count20b4 = _violations_per_street(ticket20_clean, '4')
count20b5 = _violations_per_street(ticket20_clean, '5')

# 2019 tickets, boro 1-5
count19b1 = _violations_per_street(ticket19_clean, '1')
count19b2 = _violations_per_street(ticket19_clean, '2')
count19b3 = _violations_per_street(ticket19_clean, '3')
count19b4 = _violations_per_street(ticket19_clean, '4')
count19b5 = _violations_per_street(ticket19_clean, '5')


In [None]:
# Per-boro match tables and the matching per-street violation counts.
match = [matchb1, matchb2, matchb3, matchb4, matchb5]
count_20 = [count20b1, count20b2, count20b3, count20b4, count20b5]
count_19 = [count19b1, count19b2, count19b3, count19b4, count19b5]

# For every boro, attach the 2020 and 2019 violation counts of each matched
# 'From' street plus its share of the boro total, then order the table (in
# place) by the 2020 count, largest first.
for boro_matches, counts20, counts19 in zip(match, count_20, count_19):
    for year, street_counts in (('20', counts20), ('19', counts19)):
        count_col = 'No of Violations ' + year
        boro_matches[count_col] = boro_matches['From'].map(street_counts)
        year_total = boro_matches[count_col].sum()
        boro_matches['Proportion of total ' + year] = boro_matches[count_col]/year_total
    boro_matches.sort_values(by=['No of Violations 20'],ascending=False,inplace=True)

In [None]:
# folder that receives one csv per borough match table
filepath='BERT Models/BERT Matches/'

# file stems, listed in the same order as the frames in `match`
match_list = ['matchb1', 'matchb2', 'matchb3', 'matchb4', 'matchb5']

for boro_matches, file_stem in zip(match, match_list):
    boro_matches.to_csv(f"{filepath}{file_stem}.csv")

In [None]:
#     # 5. Match Street Codes to Primary Street Name
#     df['Street1'] = df['Street1LU'].map(stnames_dict)
#     df['Street2'] = df['Street2LU'].map(stnames_dict)
#     df['Street3'] = df['Street3LU'].map(stnames_dict)
    
#     # 6. Clean Street Names for better BERT matching
    
#     # clean white space
#     df['Street1Clean'] = df['Street1'].str.replace(' +',' ',regex=True)
#     df['Street2Clean'] = df['Street2'].str.replace(' +',' ',regex=True)
#     df['Street3Clean'] = df['Street3'].str.replace(' +',' ',regex=True)
    
#     # standardize some common street name patterns
#     df['Street1Clean']=df['Street1Clean'].str.replace(r"(EAST) ([0-9]+) (STREET)",'E \g<2> ST',regex=True)
#     df['Street1Clean']=df['Street1Clean'].str.replace(r"(WEST) ([0-9]+) (STREET)",'W \g<2> ST',regex=True)
#     df['Street2Clean']=df['Street2Clean'].str.replace(r"(EAST) ([0-9]+) (STREET)",'E \g<2> ST',regex=True)
#     df['Street2Clean']=df['Street2Clean'].str.replace(r"(WEST) ([0-9]+) (STREET)",'W \g<2> ST',regex=True)
#     df['Street3Clean']=df['Street3Clean'].str.replace(r"(EAST) ([0-9]+) (STREET)",'E \g<2> ST',regex=True)
#     df['Street3Clean']=df['Street3Clean'].str.replace(r"(WEST) ([0-9]+) (STREET)",'W \g<2> ST',regex=True)
    
#     df['Street1Clean']=df['Street1Clean'].str.replace(r" STREET",' ST',regex=True)
#     df['Street2Clean']=df['Street2Clean'].str.replace(r" STREET",' ST',regex=True)
#     df['Street3Clean']=df['Street3Clean'].str.replace(r" STREET",' ST',regex=True)
    
#     df['Street1Clean']=df['Street1Clean'].str.replace(r" AVENUE",' AVE',regex=True)
#     df['Street2Clean']=df['Street2Clean'].str.replace(r" AVENUE",' AVE',regex=True)
#     df['Street3Clean']=df['Street3Clean'].str.replace(r" AVENUE",' AVE',regex=True)
    
#     df['Street1Clean']=df['Street1Clean'].str.replace(r" PLACE",' PL',regex=True)
#     df['Street2Clean']=df['Street2Clean'].str.replace(r" PLACE",' PL',regex=True)
#     df['Street3Clean']=df['Street3Clean'].str.replace(r" PLACE",' PL',regex=True)
    
#     df['Street1Clean']=df['Street1Clean'].str.replace(r" BOULEVARD",' BLVD',regex=True)
#     df['Street2Clean']=df['Street2Clean'].str.replace(r" BOULEVARD",' BLVD',regex=True)
#     df['Street3Clean']=df['Street3Clean'].str.replace(r" BOULEVARD",' BLVD',regex=True)
    
    
#     # ADD MORE ABOVE HERE IF REQUIRED - remember to add for Street 1 street 2 street 3
    

## Load and Clean Centerline Data

Using LION data instead

In [None]:
# load the centerline extract
# NOTE(review): absolute, machine-specific path - consider a configurable data directory
cl_df= pd.read_csv('/Users/stuartong/uberticketsyelp/Data/Centerline.csv')

# note some address contain '-' we only want the portion before the dash
for source_col, address_col in (('L_LOW_HN','l_lowadd'),
                                ('L_HIGH_HN','l_highadd'),
                                ('R_LOW_HN','r_lowadd'),
                                ('R_HIGH_HN','r_highadd')):
    cl_df[address_col]=cl_df[source_col].str.split('-').str[0]

# some sides of the street do not have an address - i.e. '0' - convert those to np.nan
# so that they are skipped when we evaluate the min
# assign the result back instead of calling replace(..., inplace=True) on cl_df[col]:
# the chained in-place form is deprecated and does not update cl_df under pandas Copy-on-Write
cl_df['r_lowadd']=cl_df['r_lowadd'].replace({'0':np.nan})
cl_df['l_lowadd']=cl_df['l_lowadd'].replace({'0':np.nan})

# combined low/high address across both sides of the street
# NOTE(review): these columns come from .str.split so they look like strings - min/max would then
# compare text ('100' < '20'), not numbers - confirm whether a numeric conversion is wanted
cl_df['c_lowadd']= cl_df[['l_lowadd','r_lowadd']].min(axis=1,skipna=True)
cl_df['c_highadd']=cl_df[['l_highadd','r_highadd']].max(axis=1)

cl_df.head(10)

## Get Street Code:Street Name Dictionary

Using LION Data instead

In [None]:
# read the street name file; the frame is expected to have a single column, renamed to 'Raw'
stnames = pd.read_csv('/Users/stuartong/uberticketsyelp/Data/snd20d/snd20Dcow.txt')
stnames.columns = ['Raw']

# slice the fixed character positions out of each raw record
# (positions of the street name and Lookup prefix come from SND_metadata.pdf)
raw_records = stnames['Raw']
stnames['Street Name'] = raw_records.str[2:34].str.strip()
stnames['Lookup'] = raw_records.str[34:42]

# primary names only: keep the rows whose Lookup contains 'PF'
is_primary = stnames['Lookup'].str.contains('PF')
stnames = stnames.loc[is_primary]

# Lookup -> Street Name dictionary (a repeated Lookup keeps its last Street Name)
stnames_dict= dict(zip(stnames['Lookup'], stnames['Street Name']))
