# Quality Check - Reconstructed Municipal Data

The purpose of this file is to Quality Check the re-constructed municipal data, compiled by RN in the file `reconstruct_municipal.ipynb`

In [1]:
# Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrames
import pandas as pd

# Path Management
import pathlib
import os

# Progress Tracking
from tqdm import tqdm

# Regular expressions
import re

# Maths
import numpy as np

# System
from sys import platform

# Import Geopandas
import geopandas as gpd
from shapely.geometry import Point
from geopandas.tools import geocode
import geopy

In [5]:
# Heatmaps
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [2]:
# Set paths
path = raw_path = pathlib.Path().resolve()

if platform == "windows":
    raw_path = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "Vehicles_2022" / "Compiled"
elif platform == "linux":
    raw_path = pathlib.Path().resolve().parent.parent / "data" / "municipal_dataset_latest"
raw_path.is_dir()

True

# Load data

In [3]:
def reset_recompiled_data():
    recompiled_data = pd.read_csv(raw_path / "2019-21_data_compiled_RN_100323.csv", chunksize = 1000)
    return recompiled_data

In [4]:
original_data = pd.read_csv(raw_path / "2019-21 data compiled.csv", chunksize = 1000)

# Checks

## Check the same files are used

In [112]:
recompiled_data_sources = []
recompiled_data_len = 0

Get all the unique file sources in the recompiled data

In [113]:
i = 0

for chunk in recompiled_data:
    if i %1000 == 0:
        print(f"Currently on chunk {i}")
        
    recompiled_data_len += len(chunk)
    
    sources = list(chunk["record_from"].unique())
    sources_new = [item for item in sources if not item in recompiled_data_sources]
    for item in sources_new:
        recompiled_data_sources.append(item)
        
    i +=1

Currently on chunk 0
Currently on chunk 1000
Currently on chunk 2000
Currently on chunk 3000
Currently on chunk 4000
Currently on chunk 5000


Get all the unique file sources in the original data

In [23]:
original_data_sources = []
original_data_len = 0

In [41]:
i = 0

for chunk in original_data:
    if i %1000 == 0:
        print(f"Currently on chunk {i}")
        
    original_data_len += len(chunk)
    
    sources = list(chunk["record_from"].unique())
    sources_new = [item for item in sources if not item in original_data_sources]
    for item in sources_new:
        original_data_sources.append(item)
        
    i +=1

Currently on chunk 0
Currently on chunk 1000
Currently on chunk 2000
Currently on chunk 3000
Currently on chunk 4000
Currently on chunk 5000


Compare them

In [114]:
diff1 = [item for item in original_data_sources if not item in recompiled_data_sources]
diff2 = [item for item in recompiled_data_sources if not item in original_data_sources]

diff1

['77_Manchester_MVData_2019.csv',
 '045_East_Lyme_MV_21.csv',
 '072_Ledyard_MV_21.xlsx',
 '077_Manchester_MV_21.xls',
 '122_Salisbury_MV_21.xlsx',
 '016_Bridgewater_MV_21.xlsx']

This makes sense - these are the files that got edited. Manchester 2019 and 2021, Salisbury, and Bridgewater

In [115]:
diff2 = [item for item in diff2 if not "ALTERED" in item]

In [116]:
diff2

['010_Bethlehem_MV_21.xlsx',
 '020_Burlington_MV_21.xlsx',
 '024_Chaplin_MV_21.xlsx',
 '034_Danbury_MV_21.xlsx',
 '061_Haddam_MV_21.xlsx',
 '066_Harwinton_MV_21.xlsx',
 '079_Marlborough_MV_21.xlsx',
 '086_Montville_MV_21.xlsx',
 '090_New_Canaan_MV_21.xlsx',
 '131_Southington_MV_21.xlsx',
 '147_Voluntown_MV_21.xlsx',
 '163_Windham_MV_21.xlsx']

Why weren't these items in the original data sources?... They appear to have been skipped or added later. **Note:** This has not been resolved. But because no issues arise, I assume the files should be included. Below, in `investigate_diff2,` I extract some rows from these files and confirm that they look normal - so it's OK to include them.

In [65]:
investigate_diff2 = pd.DataFrame([])
i = 0

for chunk in recompiled_data:
    if i %1000 == 0:
        print(f"Currently on chunk {i}")
        
    required = chunk[chunk["record_from"].isin(diff2)]
        
    excerpt_head = required.head(10)
    excerpt_tail = required.tail(10)
    
    excerpt = pd.concat([excerpt_head, excerpt_tail])
    
    investigate_diff2 = pd.concat([investigate_diff2, excerpt])
        
    i +=1

Currently on chunk 0
Currently on chunk 1000
Currently on chunk 2000
Currently on chunk 3000
Currently on chunk 4000
Currently on chunk 5000


In [74]:
heads = []
for item in diff2:
    df = investigate_diff2[investigate_diff2["record_from"] == item]
    heads.append(df.head(10))

for df in heads:
    display(df)

Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
95937,95937,010_Bethlehem_MV_21.xlsx,37-39 CITY HILL STREET LLC,15 W SHORE DR,BETHLEHEM,CT,6751,2021.0,CHEVR,SILVERAD,2.0,1GC4YUEY1MF128671,15 WEST SHORE DR,,BETHLEHEM,CT,6751.0
95938,95938,010_Bethlehem_MV_21.xlsx,37-39 CITY HILL STREET LLC,15 W SHORE DR,BETHLEHEM,CT,6751,2022.0,LOOK,STLC,10.0,53BLTEA10NP023562,15 WEST SHORE DR,,BETHLEHEM,CT,6751.0
95939,95939,010_Bethlehem_MV_21.xlsx,A & B HARD FLOOR LLC,PO BOX 1387,WATERBURY,CT,6721,2019.0,GMC,SIERRA K,3.0,1GT12SEY7KF230918,21 SKY MEADOW RD,,BETHLEHEM,CT,6751.0
95940,95940,010_Bethlehem_MV_21.xlsx,A MIM S CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2000.0,MACK,CX600,2.0,1M1AE07Y5YW003647,,,,,
95941,95941,010_Bethlehem_MV_21.xlsx,A MIM S CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2012.0,LANDO,T L,10.0,1LH440VHXC1018832,,,,,
95942,95942,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2013.0,PTRB,CONVENTI,2.0,1XPSD79X2DD172162,,,,,
95943,95943,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,1999.0,FONTA,303NDMRV,10.0,4LF4G4825X3508389,,,,,
95944,95944,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,1995.0,LANDO,317,10.0,1LH317VJ9S1007514,,,,,
95945,95945,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2004.0,INTER,4000 SER,2.0,1HTMMAAM14H667912,,,,,
95946,95946,010_Bethlehem_MV_21.xlsx,ABBEY OF REGINA LAUDIS,273 FLANDERS RD,BETHLEHEM,CT,6751,2005.0,HONDA,ACCORD L,1.0,1HGCM56495A149907,,,BETHLEHEM,CT,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
318203,318203,020_Burlington_MV_21.xlsx,A-1 ENTERPRISES LLC,8 MARY ROAD,BURLINGTON,CT,6013,2021.0,FORD,F150 SUP,3,1FTFW1E55MKD52858,,,,,
318204,318204,020_Burlington_MV_21.xlsx,A-1 ENTERPRISES LLC,8 MARY ROAD,BURLINGTON,CT,6013,2019.0,CHEVR,SILVERAD,2,1HTKHPVK5KH406240,8 MARY RD,,BURLINGTON,CT,6013.0
318205,318205,020_Burlington_MV_21.xlsx,ABALAN DOUGLAS L,71 MILFORD ST,BURLINGTON,CT,6013,2011.0,DODGE,GRAND CR,1,2D4RN5DGXBR683379,,,,,
318206,318206,020_Burlington_MV_21.xlsx,ABALAN DOUGLAS L,71 MILFORD ST,BURLINGTON,CT,6013,2004.0,DODGE,RAM 1500,1,1D7HU18D84S564872,,,,,
318207,318207,020_Burlington_MV_21.xlsx,ABALAN RINETTE R,71 MILFORD ST,BURLINGTON,CT,6013,2001.0,HONDA,CR-V EX,1,JHLRD18661C024901,,,,,
318208,318208,020_Burlington_MV_21.xlsx,ABASCAL RICHARD,68 VENICE DR,BURLINGTON,CT,6013,2013.0,TOYOT,TACOMA A,3,5TFUU4EN9DX054329,,,,,
318209,318209,020_Burlington_MV_21.xlsx,ABASCAL RICHARD,68 VENICE DR,BURLINGTON,CT,6013,2011.0,HARLE,FLSTN,12,1HD1JD517BB017457,,,,,
318210,318210,020_Burlington_MV_21.xlsx,ABDELREHIM MOHAMED E,59 BELDEN RD,BURLINGTON,CT,6013,2005.0,HONDA,PILOT EX,1,2HKYF18425H570235,59 BELDEN ROAD,,BURLINGTON,CT,6013.0
318211,318211,020_Burlington_MV_21.xlsx,ABDELREHIM MOHAMED E,59 BELDEN RD,BURLINGTON,CT,6013,2013.0,TOYOT,CAMRY L/,1,4T1BF1FK5DU660406,,,,,
318212,318212,020_Burlington_MV_21.xlsx,ABDELREHIM MOHAMED E,59 BELDEN RD,BURLINGTON,CT,6013,2012.0,HONDA,PILOT EX,1,5FNYF4H55CB053260,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
345894,345894,024_Chaplin_MV_21.xlsx,ABARZUA DAGGETT ASCHLY C,26 BEDLAM RD,CHAPLIN,CT,6235,2003.0,HONDA,VT750DCB,12,JH2RC44503M708944,,,,,
345895,345895,024_Chaplin_MV_21.xlsx,ABARZUA DAGGETT ASCHLY C,26 BEDLAM RD,CHAPLIN,CT,6235,2014.0,VOLKS,PASSAT S,1,1VWBN7A38EC063894,,,,,
345896,345896,024_Chaplin_MV_21.xlsx,ABELIN DAWN L,150 CHEWINK RD,CHAPLIN,CT,6235,2009.0,HONDA,CR-V EX,1,3CZRE48539G703675,,,,,
345897,345897,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021.0,BUICK,ENCORE P,1,KL4CJESM2MB361970,,,,,
345898,345898,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2018.0,BUICK,ENCORE P,1,KL4CJESB7JB694739,,,,,
345899,345899,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019.0,CHEVR,TRAX 1LT,1,KL7CJPSBXKB951800,,,,,
345900,345900,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021.0,GMC,TERRAIN,1,3GKALVEV6ML351991,,,,,
345901,345901,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020.0,CHEVR,SILVERAD,3,1GCRYEED2LZ369807,,,,,
345902,345902,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019.0,GMC,SIERRA L,3,2GTV2LEC7K1105839,,,,,
345903,345903,024_Chaplin_MV_21.xlsx,ADAMS ALEXIS E,20 CAREFREE LN APT 1,CHAPLIN,CT,6235,2018.0,MITSU,OUTLANDE,1,JA4AP3AWXJU023925,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
421301,421301,034_Danbury_MV_21.xlsx,29 FEDERAL ROAD LLC,64 TRIANGLE ST.,DANBURY,CT,6810,2007,CHEVR,EXPRESS,3,1GBGG29U171156648,,,,,
421302,421302,034_Danbury_MV_21.xlsx,2J S LLC,12 STARR AVE 14 STARR AVE,DANBURY,CT,6811,2014,HINO,HINO 268,2,5PVNJ8JT9E4S55479,,,,,
421303,421303,034_Danbury_MV_21.xlsx,2M GENERAL SERVICES LLC,12 TRIANGLE ST UNIT 2,DANBURY,CT,6810,1998,FORD,ECONOLIN,3,1FTRE1429WHA50822,,,,,
421304,421304,034_Danbury_MV_21.xlsx,3M GENERAL SERVICES LLC,13 COTTAGE ST FL 2,DANBURY,CT,6810,2017,FORD,FUSION S,3,3FA6P0HD1HR195797,,,,,
421305,421305,034_Danbury_MV_21.xlsx,4 SEASONSLLC,50 NORTH STREET,DANBURY,CT,6810,2021,CROSS,61435,11,431FS1411M1000212,,,,,
421306,421306,034_Danbury_MV_21.xlsx,4 SEASONSLLC,50 NORTH STREET,DANBURY,CT,6810,2019,CHEVR,EXPRESS,3,1GCWGAFG3K1255867,,,,,
421307,421307,034_Danbury_MV_21.xlsx,404 CAR LINE LLC,89 WALNUT TRL,DANBURY,CT,6811,1999,FORD,EXPEDITI,3,1FMPU18L3XLA05477,,,,,
421308,421308,034_Danbury_MV_21.xlsx,46 SL LLC,16 HAYESTOWN RD UNIT 3104,DANBURY,CT,6811,2020,PORSC,MACAN,1,WP1AA2A57LLB09404,,,,,
421309,421309,034_Danbury_MV_21.xlsx,85 MILL PLAIN ROAD LLC,85 MILL PLAIN RD,DANBURY,CT,6811,2017,RAM,RAM CHAS,70,3C7WRNDL8HG625578,,,,,
421310,421310,034_Danbury_MV_21.xlsx,A & C ROOFING AND SIDING LLC,8 MALLORY ST APT 2,DANBURY,CT,6810,2008,GMC,ACADIA S,1,1GKEV23768J245536,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
938991,938991,061_Haddam_MV_21.xlsx,A & A PROPERTY MAINTENANCE LLC,77 OAK RIDGE DR UNIT 1,HADDAM,CT,6438.0,2019,FORD,F550 SUP,2,1FD0X5HT6KEF91010,,,,,
938992,938992,061_Haddam_MV_21.xlsx,ABARIENTOS ANTONIETA L,17 LARKSPUR DR,HIGGANUM,CT,6441.0,2005,ACURA,MDX,1,2HNYD182X5H552606,,,,,
938993,938993,061_Haddam_MV_21.xlsx,ABARIENTOS CRISPIN,17 LARKSPUR DR,HIGGANUM,CT,6441.0,2011,AUDI,A4 2.0T,1,WAUFFAFL9BA091621,,,,,
938994,938994,061_Haddam_MV_21.xlsx,ABARIENTOS CRISPIN,17 LARKSPUR DR,HIGGANUM,CT,6441.0,2001,PORSC,911 CARR,1,WP0AA299X1S620343,,,,,
938995,938995,061_Haddam_MV_21.xlsx,ABBATELLO STEVEN G,5 MAPLE AVE WEST,HIGGANUM,CT,6441.0,1985,CHEVR,K10,25,2GCEK14H3F1149599,,,,,
938996,938996,061_Haddam_MV_21.xlsx,ABBATELLO STEVEN G,5 MAPLE AVE WEST,HIGGANUM,CT,6441.0,2011,VOLKS,JETTA TD,1,3VWPL7AJ2BM616673,,,,,
938997,938997,061_Haddam_MV_21.xlsx,ABBATELLO STEVEN G,5 MAPLE AVE WEST,HIGGANUM,CT,6441.0,2003,CHEVR,SILVERAD,1,1GCEC14X73Z353572,,,,,
938998,938998,061_Haddam_MV_21.xlsx,ABBOTT ARTHUR H JR,623 CANDLEWOOD HILL RD,HIGGANUM,CT,6441.0,2009,SUBAR,OUTBACK,1,4S4BP61C497330465,,,,,
938999,938999,061_Haddam_MV_21.xlsx,ABBOTT ARTHUR H JR,623 CANDLEWOOD HILL RD,HIGGANUM,CT,6441.0,1997,CHEVR,K1500,3,1GCEK14MXVZ191674,,,,,
938991,938991,061_Haddam_MV_21.xlsx,A & A PROPERTY MAINTENANCE LLC,77 OAK RIDGE DR UNIT 1,HADDAM,CT,6438.0,2019,FORD,F550 SUP,2,1FD0X5HT6KEF91010,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1043778,1043778,066_Harwinton_MV_21.xlsx,.SUPREME INDUSTRIES INC.,216 LOWER BOGUE RD,HARWINTON,CT,6791,2019,CHEVR,SILVERAD,1,1GCUYDED3KZ127370,,,,,
1043779,1043779,066_Harwinton_MV_21.xlsx,.SUPREME INDUSTRIES INC.,216 LOWER BOGUE RD,HARWINTON,CT,6791,2017,CHEVR,SILVERAD,3,1GC1KVEG7HF169684,,,,,
1043780,1043780,066_Harwinton_MV_21.xlsx,362 POST ROAD LLC,216 BOGUE RD,HARWINTON,CT,6791,2018,TIDEW,TP-22-36,11,45LBS2219J2100584,,,,,
1043781,1043781,066_Harwinton_MV_21.xlsx,ABBAMONDI CYNTHIA J,30 HITHER LANE,HARWINTON,CT,6791,2016,VOLVO,XC60 T5,1,YV4612RM7G2831290,,,,,
1043782,1043782,066_Harwinton_MV_21.xlsx,ABBOTTS JEFFREY C,95 WILDCAT HILL RD,HARWINTON,CT,6791,1983,FIAT,PININFAR,25,ZFRAS00B3D5503101,,,,,
1043783,1043783,066_Harwinton_MV_21.xlsx,ABBOTTS JEFFREY C,95 WILDCAT HILL RD,HARWINTON,CT,6791,2013,VOLKS,BEETLE T,1,3VW4A7AT1DM617780,,,,,
1043784,1043784,066_Harwinton_MV_21.xlsx,ABBOTTS JEFFREY C,95 WILDCAT HILL RD,HARWINTON,CT,6791,2016,RAM,RAM TRUC,3,3C6JR7DTXGG302349,,,,,
1043785,1043785,066_Harwinton_MV_21.xlsx,ABELING CHRISTOPHER L,576 MAIN ST,TORRINGTON,CT,6790,2011,HONDA,ELEMENT,1,5J6YH2H75BL004405,,,,,
1043786,1043786,066_Harwinton_MV_21.xlsx,ABLING-JOSEPHSON CORITA P,352 CLEARVIEW AVE,HARWINTON,CT,6791,2012,SUBAR,FORESTER,1,JF2SHADC7CH451624,,,,,
1043787,1043787,066_Harwinton_MV_21.xlsx,ABOU ARRAGE ANDRE,34 SAND HILL LN,GLASTONBURY,CT,6033,2017,HONDA,ACCORD T,1,1HGCR3F90HA007645,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1199237,1199237,079_Marlborough_MV_21.xlsx,,32 EAST HAMPTON RD,MARLBOROUGH,CT,6447,2018,TOYOT,4RUNNER,1,JTEBU5JR9J5607957,32 EAST HAMPTON RD RT 66,50001.0,MARLBOROUGH,CT,6447.0
1199238,1199238,079_Marlborough_MV_21.xlsx,,PO BOX 453,MARLBOROUGH,CT,6447,1999,FORD,F550 SUP,2,1FDAF56F0XEA33195,5 SHERWOOD LANE,50002.0,MARLBOROUGH,CT,6447.0
1199239,1199239,079_Marlborough_MV_21.xlsx,,PO BOX 453,MARLBOROUGH,CT,6447,2001,ECONO,,10,42EDPFB2X11001464,5 SHERWOOD LANE,50003.0,MARLBOROUGH,CT,6447.0
1199240,1199240,079_Marlborough_MV_21.xlsx,,PO BOX 453,MARLBOROUGH,CT,6447,2016,CHEVR,SILVERAD,3,1GC1KUEG4GF278550,5 SHERWOOD LANE,50004.0,MARLBOROUGH,CT,6447.0
1199241,1199241,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2018,PORSC,MACAN,1,WP1AA2A59JLB15167,,50005.0,,,
1199242,1199242,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2015,AUDI,A4 2.0T,1,WAUBFBFL2FN040366,,50006.0,,,
1199243,1199243,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2020,DUCAT,DIAVEL 1,12,ZDMGAHRW2LB003903,,50007.0,,,
1199244,1199244,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2018,FORD,F150 RAP,1,1FTFW1RG0JFE52897,,50008.0,,,
1199245,1199245,079_Marlborough_MV_21.xlsx,,20 CARRIAGE LN,MARLBOROUGH,CT,6447,2016,NISSA,ROGUE S/,1,KNMAT2MV8GP723352,,50009.0,,,
1199246,1199246,079_Marlborough_MV_21.xlsx,,20 APACHE LN,MARLBOROUGH,CT,6447,2019,HONDA,CR-V EXL,1,5J6RW2H8XKA000252,,50010.0,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1364806,1364806,086_Montville_MV_21.xlsx,A & B EXCAVATING L.L.C.,33 COVE RD,UNCASVILLE,CT,6382,1988,INTER,C16,10,1ZFCF1622JB003085,,,,,
1364807,1364807,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,2013,FORD,F550 SUP,2,1FD0W5HT9DEA34887,,,,,
1364808,1364808,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,1997,INTER,4000 SER,2,1HTSCAAM2VH424900,,,,,
1364809,1364809,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,2005,FORD,F350 SUP,2,1FDWX37P75EB40966,,,,,
1364810,1364810,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,1998,MITSU,FE639,2,JW6AAE1H5WL001462,,,,,
1364811,1364811,086_Montville_MV_21.xlsx,A & B TREE SERVICE LLC,PO BOX 335,MONTVILLE,CT,6353,1996,GMC,TOPKICK,2,1GDL7H1P4TJ512163,,,,,
1364812,1364812,086_Montville_MV_21.xlsx,A & B TREE SERVICES LLC,PO BOX 335,MONTVILLE,CT,6353,2012,INTER,4000 SER,2,3HAMMAAM6CL549363,,,,,
1364813,1364813,086_Montville_MV_21.xlsx,A J CABRAL TRUCKING,12 BONVILLE DR,UNCASVILLE,CT,6382,1994,KENWO,CONSTRUC,2,2XKDD69X9RM635618,,,,,
1364814,1364814,086_Montville_MV_21.xlsx,AANENSEN KENNETH T,8 ROBIN LN,OAKDALE,CT,6370,2021,NISSA,MURANO P,1,5N1AZ2DS3MC136494,,,,,
1364815,1364815,086_Montville_MV_21.xlsx,AANENSEN KENNETH T JR,8 ROBIN LN,OAKDALE,CT,6370,2009,HONDA,PILOT EX,1,5FNYF48509B031076,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1431120,1431120,090_New_Canaan_MV_21.xlsx,1330 EAST PUTNAM LLC,PO BOX 811,NEW CANAAN,CT,6840.0,2019.0,CHEVR,EQUINOX,1.0,3GNAXVEX0KS674784,261 ELM ST,50001,NEW CANAAN,CT,6840.0
1431121,1431121,090_New_Canaan_MV_21.xlsx,1ST RIDE & GO CORP,58 PINE ST,NEW CANAAN,CT,6840.0,2016.0,FORD,TRANSIT,8.0,1FBZX2CM2GKA71622,,50002,,,
1431122,1431122,090_New_Canaan_MV_21.xlsx,909 WEST ROAD REVOCABLE LIVING TRUST,909 WEST RD,NEW CANAAN,CT,6840.0,1996.0,JAGUA,XJR,25.0,SAJPX1143TC781554,,50003,,,
1431123,1431123,090_New_Canaan_MV_21.xlsx,A & N PLUMBING AND HEATING LLC,22 DOWN RIVER RD,NEW CANAAN,CT,6840.0,2007.0,FORD,ECONOLIN,3.0,1FTNE24L07DA36751,,50004,,,
1431124,1431124,090_New_Canaan_MV_21.xlsx,A & N PLUMBING AND HEATING LLC,22 DOWN RIVER RD,NEW CANAAN,CT,6840.0,2018.0,RAM,PROMASTE,3.0,3C6TRVAG6JE110384,,50005,,,
1431125,1431125,090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840.0,2015.0,RAM,RAM TRUC,3.0,3C6MR5AJ2FG524435,,50006,,,
1431126,1431126,090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840.0,2007.0,ISUZU,NPR,2.0,JALB4W16277400371,,50007,,,
1431127,1431127,090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840.0,2006.0,CAM,6CAM 18,10.0,5JPBU23296P015771,,50008,,,
1431128,1431128,090_New_Canaan_MV_21.xlsx,A SILLO DEVELOPMENT LLC,691 OLD STAMFORD ROAD,NEW CANAAN,CT,6840.0,2012.0,CHEVR,SILVERAD,2.0,1GB3KZCG1CF113627,691 OLD STAMFORD RD,50009,NEW CANAAN,CT,6840.0
1431129,1431129,090_New_Canaan_MV_21.xlsx,A SILLO DEVELOPMENT LLC,691 OLD STAMFORD ROAD,NEW CANAAN,CT,6840.0,2020.0,CHEVR,SILVERAD,3.0,3GCPYFED0LG246956,691 OLD STAMFORD RD,50010,NEW CANAAN,CT,6840.0


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2076515,2076515,131_Southington_MV_21.xlsx,18FORE18 INC,201 PATTONWOOD DR,SOUTHINGTON,CT,6489.0,2002,USCAR,USCL58SA,11,4X4TSE0112X053369,,,,,
2076516,2076516,131_Southington_MV_21.xlsx,3 BROTHERS PROPERTY PRESERVATION SE,461C COOKE ST,FARMINGTON,CT,6032.0,2013,FORD,F350 SUP,3,1FTRF3B68DEA53496,,,,,
2076517,2076517,131_Southington_MV_21.xlsx,A & A SURPLUS INC.,389 MARION AVE,PLANTSVILLE,CT,6479.0,2008,TOYOT,TUNDRA D,3,5TBBV54198S491700,,,,,
2076518,2076518,131_Southington_MV_21.xlsx,A & A SURPLUS INC.,389 MARION AVE,PLANTSVILLE,CT,6479.0,2012,FREIG,M2 106 M,2,1FVACXDT5CHBV1762,,,,,
2076519,2076519,131_Southington_MV_21.xlsx,A AND A SURPLUS INC,389 MARION AVE,PLANTSVILLE,CT,6479.0,1999,INTER,4000 SER,2,1HTSCAAM3XH688887,,,,,
2076520,2076520,131_Southington_MV_21.xlsx,A AND P KRISHNA CORPORATION,151 QUEEN STREET,SOUTHINGTON,CT,6489.0,2015,MERCE,SPRINTER,3,WD3PF1CC9FP167427,,,,,
2076521,2076521,131_Southington_MV_21.xlsx,A CUT ABOVE GREENCARE LLC,41 HIGHWOOD AVE.,SOUTHINGTON,CT,6489.0,2017,FREED,6X10SA,10,5WKBE101XH1045634,,,,,
2076522,2076522,131_Southington_MV_21.xlsx,A DUIE PYLE INC,PO BOX 564,WEST CHESTER,PA,19381.0,2012,FRHT,CA12,2,1FUBGEDV9CLBK5195,,,,,
2076523,2076523,131_Southington_MV_21.xlsx,A DUIE PYLE INC,PO BOX 564,WEST CHESTER,PA,19381.0,2012,FRHT,CA12,2,1FUBGEDV0CLBK5165,,,,,
2076524,2076524,131_Southington_MV_21.xlsx,A DUIE PYLE INC,PO BOX 564,WEST CHESTER,PA,19381.0,2012,FRHT,CA12,2,1FUBGEDV0CLBK5196,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2375159,2375159,147_Voluntown_MV_21.xlsx,A KAUSCH & SONS LLC,15 BEACH VIEW ROAD EXT,VOLUNTOWN,CT,6384,2006,CHEVR,EXPRESS,2,1GBJG31U761268117,,,,,
2375160,2375160,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,GMC,ACADIA S,1,1GKKNULS2KZ110890,,,,,
2375161,2375161,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,CHEVR,EQUINOX,1,2GNAXUEV3K6194780,,,,,
2375162,2375162,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,CHEVR,SILVERAD,1,1GCRYCEF6KZ368941,,,,,
2375163,2375163,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020,CHEVR,SILVERAD,1,1GCPYFED7LZ165405,,,,,
2375164,2375164,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020,CHEVR,SILVERAD,1,1GCRYBEH8LZ271768,,,,,
2375165,2375165,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,CHEVR,SILVERAD,3,1GCRYBEH8MZ187550,,,,,
2375166,2375166,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,GMC,YUKON DE,3,1GKS2DKL1MR127895,,,,,
2375167,2375167,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,GMC,ACADIA S,1,1GKKNULS2MZ210281,,,,,
2375168,2375168,147_Voluntown_MV_21.xlsx,ADAMS ANGELA D,66 TEN ROD RD,VOLUNTOWN,CT,6384,2014,MERCE,C300 4 M,1,WDDGF8AB8EA949962,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


# Further checks

The original purpose of recompiling the data was to address issues with missing VINs, missing zip codes, and missing street addresses. below, I confirm that these issues have been addressed.

In [5]:
recompiled_data = pd.read_csv(raw_path / "2019-21_data_compiled_RN_100323.csv", chunksize = 1000)

In [6]:
i = 0

missing_df = pd.DataFrame([])

for chunk in recompiled_data:
    
    # Give progress
    if (i % 1000 ==0):
        print(f"Currently on chunk number {i}")
        # display(df)
        
    # Create DF
    # Get the total number for that record_from
    total = chunk.groupby("record_from").count()["Unnamed: 0"].reset_index(name="count")
    
    # Get the number with a missing ZIP
    missing_zips = chunk.groupby("record_from")["zip"].apply(lambda x: x.isna().sum()).reset_index(name = "missing zip")
    
    # Get the number with a missing VIN
    missing_vins = chunk.groupby("record_from")["vehicle_id"].apply(lambda x: x.isna().sum()).reset_index(name = "missing VIN")
    
    # Missing address
    missing_streets = chunk.groupby("record_from")["street"].apply(lambda x: x.isna().sum()).reset_index(name = "missing street")
    
    # Get the number with state not in CT
    state_not_CT = chunk.groupby("record_from")["state"].apply(lambda x: (x!="CT").sum()).reset_index(name ="not CT")
    
    # Merge
    all_columns  = missing_zips.merge(missing_vins,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(missing_streets,
                                   how = 'outer',
                                   left_on = 'record_from',
                                   right_on = 'record_from')
    
    all_columns  = all_columns.merge(state_not_CT,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(total,
                                    how = 'outer',
                                    left_on = 'record_from',
                                    right_on = 'record_from')
    
    
    # Assign this to a DF to save
    missing_df = pd.concat([missing_df, all_columns])
    
    # add to count
    i +=1

missing_df = missing_df.groupby("record_from").sum()
missing_df = missing_df.reset_index()

Currently on chunk number 0
Currently on chunk number 1000
Currently on chunk number 2000
Currently on chunk number 3000
Currently on chunk number 4000
Currently on chunk number 5000


In [7]:
missing_df_new = missing_df.copy(deep = True)

In [8]:
i = 0

missing_df = pd.DataFrame([])

for chunk in original_data:
    
    # Give progress
    if (i % 1000 ==0):
        print(f"Currently on chunk number {i}")
        # display(df)
        
    # Create DF
    # Get the total number for that record_from
    total = chunk.groupby("record_from").count()["Unnamed: 0"].reset_index(name="count")
    
    # Get the number with a missing ZIP
    missing_zips = chunk.groupby("record_from")["zip"].apply(lambda x: x.isna().sum()).reset_index(name = "missing zip")
    
    # Get the number with a missing VIN
    missing_vins = chunk.groupby("record_from")["vehicle_id"].apply(lambda x: x.isna().sum()).reset_index(name = "missing VIN")
    
    # Missing address
    missing_streets = chunk.groupby("record_from")["street"].apply(lambda x: x.isna().sum()).reset_index(name = "missing street")
    
    # Get the number with state not in CT
    state_not_CT = chunk.groupby("record_from")["state"].apply(lambda x: (x!="CT").sum()).reset_index(name ="not CT")
    
    # Merge
    all_columns  = missing_zips.merge(missing_vins,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(missing_streets,
                                   how = 'outer',
                                   left_on = 'record_from',
                                   right_on = 'record_from')
    
    all_columns  = all_columns.merge(state_not_CT,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(total,
                                    how = 'outer',
                                    left_on = 'record_from',
                                    right_on = 'record_from')
    
    
    # Assign this to a DF to save
    missing_df = pd.concat([missing_df, all_columns])
    
    # add to count
    i +=1

missing_df = missing_df.groupby("record_from").sum()
missing_df = missing_df.reset_index()

Currently on chunk number 0
Currently on chunk number 1000
Currently on chunk number 2000
Currently on chunk number 3000
Currently on chunk number 4000
Currently on chunk number 5000


In [9]:
missing_df_original = missing_df.copy(deep=True)

# Check zip codes and towns

The goal of this analysis is to identify discrepancies in addresses for the vehicles listed. Specifically, we want to know with confidence how to allocate EVs to Connecticut. We also wish to match the addresses to L2 data, requiring that we can confidently link a vehicle with its address. We have three variables we can easily use: **state**, **zip code** and **town name**.

* The taxpayer Town Name and State Name are not in CT, but the zip code is --> Probably outside of CT
    * Analysis: Get the Town Names for CT, and check the opposite of this
    * Implications: Will need to look at the addresses. If both the town name and State Name don't belong to CT, then it's likely the address will be outside CT, and we can allocate the entry as being outside CT
* The taxpayer Town Name and State Name are in CT, but the zip code is not --> Need to look into the address
    * Analysis: Get the Town Names for CT, and check for these town names and the State Name as CT. For these, find those with non-CT Zip and investigate.
    * Implications: The ZIP may be wrongly entered, there may be no address, or the address might be outside of CT.
* The taxpayer Town Name is in CT, but the zip code and state are not --> Need to determine if the town name is duplicated across states

## Define required functions

In [None]:
def check_valid_zip(zip_code):
    zip_str = str(zip_code)
    split_zip = re.split("-", zip_str)
    if len(split_zip) == 2:
        if check_valid_zip(split_zip[0]) & check_valid_zip(split_zip[1]):
            return 2
        elif check_valid_zip(split_zip[0]):
            return 3
        else:
            return 0
    elif len(split_zip) == 1:
        # MUST ADDRESS STARTING "Os"
        # starting_o = re.match('^O', split_zip[0])
        if len(split_zip[0]) < 5:
            matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])
            if matched:
                return 1
            else:
                return 0
        else:
            split_zip = [split_zip[0][0:5], split_zip[0][5:]]
            if check_valid_zip(split_zip[0]) & check_valid_zip(split_zip[1]):
                return 2
            elif check_valid_zip(split_zip[0]):
                return 3
            else:
                return 0
    else:
        return 0

In [48]:
def create_valid_zip(zip):
    try:
        zip_str = str(zip)
        zip_str = zip_str.strip()
        has_dot = re.search(r"\.", zip_str)

        # Get rid of decimal places
        if has_dot:
            zip_str = zip_str[0:re.search(r"\.", zip_str).start()]

        split_zip = re.split("-", zip_str)
        
        if len(split_zip) == 2:
            return create_valid_zip(split_zip[0])
        else:
            # If length is less than 4, return na
            if len(zip_str) < 4:
                return np.NaN
                
            # If length is 4 or 5, check it
            elif((len(zip_str) == 5) | (len(zip_str) == 4)):
                matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", zip_str)
                if matched:
                    return matched[0].zfill(5)
                else:
                    return np.NaN
            # If the zip is between 5 and 8 (inclusive) long, we assume the first 4 are the first part
            # And the second 4 are the second part
            # There is no other way to do this... 
            elif((len(zip_str) > 5) & (len(zip_str)<9)):
                return create_valid_zip(zip_str[0:4])
            elif (len(zip_str) == 9):
                return create_valid_zip(zip_str[0:5])
            else:
                return np.NaN
        
    except Exception as e:
        print(e)
        return np.NaN

  matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", zip_str)


In [5]:
def get_valid_zips(zip_df, zip_column_name):
    """
    Inputs: A DataFrame with a column called "zip"
    Returns: A DataFrame of the same length as the input, with three columns: zip, zip_valid_code, and zip_corrected
    """
    # Prepare the list to be used
    zip_list = zip_df[[zip_column_name]].rename(columns = {zip_column_name : "zip"}).reset_index(drop = True)

    # Get validity code
    zip_list.loc[:, "zip_valid_code"] = zip_list.loc[:, "zip"].apply(lambda x: check_valid_zip(x))

    # Get indices
    correct_zips_indices = zip_list[zip_list["zip_valid_code"]==1].index
    invalid_zips_indices = zip_list[zip_list["zip_valid_code"]==0].index
    two_part_zips_indices = zip_list[zip_list["zip_valid_code"]>1].index

    zip_list.loc[correct_zips_indices, "zip_corrected"] = zip_list.loc[correct_zips_indices, "zip"]
    zip_list.loc[invalid_zips_indices, "zip_corrected"] = np.NaN
    zip_list.loc[two_part_zips_indices, "zip_corrected"] = zip_list.loc[two_part_zips_indices, "zip"].astype(str).str[0:5]

    return zip_list[["zip_corrected"]]

def convert_vin_valid(vin):
    try:
        vin_str = str(vin)
        if len(vin_str) < 11:
            return "NA"
        if " " in vin_str[0:11]:
            return "NA"
        else:
            return vin_str[0:8]+"*"+vin_str[9:11]
    except:
        return "NA"

def return_matched_vins(chunk_number, df, vin_column, matching_list):
    match = df.merge(matching_list,
                    left_on = vin_column,
                    right_on = vin_column,
                    how = 'left')
    
    # Get rows of DF where VINS matched
    df_vins_matched = match.loc[match["Manufacturer Name"].notna(), :]
    df_vins_unmatched = match.loc[match["Manufacturer Name"].isna(), :]
    
    # Get length
    len_matched = len(df_vins_matched)
    len_unmatched = len(df_vins_unmatched)
    len_all = len(match)
    
    # Create df
    tally_dict = {"Chunk Number": [chunk_number],
                  "Matched" : [len_matched],
                  "Unmatched" : [len_unmatched],
                  "All" : [len_all]}
    
    match_unmatched_tally = pd.DataFrame(tally_dict)

    return [match, match_unmatched_tally]

def ct_zip(zip):
    try:
        zip_str = str(int(zip)).zfill(5)
        if zip_str[0:2] == "06":
            return True
        else:
            return False
    except:
        return False

# Find CT zips with non-CT addresses

In [49]:
recompiled_data = reset_recompiled_data()

In [50]:
ct_zip_nonct_state = pd.DataFrame()
nonct_zip_ct_state = pd.DataFrame()

i = 0

for chunk in tqdm(recompiled_data):
    # Clean up
    chunk = chunk.drop("Unnamed: 0", axis = 1).reset_index(drop = True)
    
    # Check that the ZIPs and the states match
    chunk_corrected_zips = chunk
    chunk_corrected_zips["zip_corrected"] = chunk_corrected_zips["zip"].apply(lambda x: create_valid_zip(x))
    # get_valid_zips(chunk, "zip")
    # Join them
    #chunk_corrected_zips = chunk.join(valid_zips).reset_index(drop = True)
    
    # Get CT zip codes
    ct_zip_mask = chunk_corrected_zips["zip_corrected"].apply(lambda x: ct_zip(x))
    ct_state_mask = chunk_corrected_zips["state"] == "CT"
    ct_zip_nonct_state_mask = ~ct_state_mask & ct_zip_mask
    nonct_zip_ct_state_mask = ~ct_zip_mask & ct_state_mask
    
    # Get ct zip nonct state
    ct_zip_nonct_state_chunk = chunk_corrected_zips.loc[ct_zip_nonct_state_mask]
    ct_zip_nonct_state_chunk = ct_zip_nonct_state_chunk.dropna(axis = 0, subset = ["zip_corrected"])
    ct_zip_nonct_state = pd.concat([ct_zip_nonct_state, ct_zip_nonct_state_chunk])

    # Get non CT zip ct state
    nonct_zip_ct_state_chunk = chunk_corrected_zips.loc[nonct_zip_ct_state_mask]
    nonct_zip_ct_state_chunk = nonct_zip_ct_state_chunk.dropna(axis = 0, subset = ["zip_corrected"])
    nonct_zip_ct_state = pd.concat([nonct_zip_ct_state, nonct_zip_ct_state_chunk])

5788it [01:24, 68.59it/s]


In [52]:
nonct_zip_ct_state

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
133,003_Ashford_MV_21.xlsx,COSTELLO MARY L,46 CHERRY VALLEY RD,COLUMBIA,CT,3237,2018.0,SUBAR,LEGACY 2,1.0,4S3BNAF6XJ3016489,,,,,,03237
354,003_Ashford_MV_21.xlsx,JOHNS DANIEL B,5888 FOX ST,HARRISBURG,CT,17112,2017.0,HYUND,ELANTRA,1.0,KMHD04LB0HU355868,,,,,,17112
719,003_Ashford_MV_21.xlsx,PLATT VICKI H,5813 ROVER DR,JACKSONVILLE,CT,32244,2012.0,HYUND,ACCENT G,1.0,KMHCU4AE5CU072890,,,,,,32244
723,003_Ashford_MV_21.xlsx,WEIDIG RANDALL A,PO BOX 204,STAFFORD SPRINGS,CT,60768,2011.0,HM,HM,11.0,CTTRL22087,,,,,,60768
757,003_Ashford_MV_21.xlsx,WHITE-INGALLS PATRICIA A,777 WESTFORD RD,ASHFORD,CT,3278,2013.0,VOLKS,JETTA S/,1.0,3VWPP7AJXDM696585,,,,,,03278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,84_Milford_MVData_2020.csv,KJR ENTERPRISE CO,15 RESEACH DR,MILFORD,CT,8900,1995.0,FORD,F SUPER,70.0,1FDLF47G8SEA22495,,,,,,08900
714,85_Monroe_MVData_2020.csv,DAUTRICH STEVEN E,PO BOX 1366,NORTH CONWAY,CT,13860,2006.0,PATRI,PAC8524T,11.0,5NHUPAZ236W024707,,,,,,13860
698,88_Naugatuck_MVData_2020.csv,CHARNEY SEAN,433 CURTISS ST,SOUTHINGTON,CT,16489,2015.0,BELMO,SS101614,11.0,1B9UT1421FL657608,,,,,,16489
723,89_NewBritain_MVData_2020.csv,BABALOLA OLUWAGBENGA S,1 CUNNINGHAM SQ,RHODE ISLAND,CT,2918,2010.0,HONDA,ACCORD L,1.0,1HGCP2F41AA022438,,302536,,,,02918


In [53]:
ct_zip_nonct_state.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / "ct_zip_nonct_state.csv")
nonct_zip_ct_state.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / "nonct_zip_ct_state.csv")

In [21]:
old = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "non_matching_zips_states.csv")

  old = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "non_matching_zips_states.csv")


In [23]:
print(f"These errors come from {old["record_from"].nunique()} underlying files")

These errors come from 111 underlying files


In [24]:
old[old["state"]!="NY"].reset_index().head(40)

Unnamed: 0.1,index,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
0,0,0,009_Bethel_MV_21.xlsx,FLYNN TAYLOR D,28 SIMEON RD UNIT 16D,BETHEL,DC,6801.0,2021.0,SUBAR,,1.0,JF2GTHMC3M8281048,,,,,,6801.0
1,1,1,009_Bethel_MV_21.xlsx,MENDIETA MENDIETA GERMANIA I,6 DRUMMER LN,BETHEL,WA,6801.0,2008.0,HONDA,,1.0,JHMGD37488S002641,6 DRUMMERS LN,,,,,6801.0
2,4,4,014_Branford_MV_21.xlsx,CONNER FREDERICK J JR,19 TERHUNE AVE 1ST,BRANFORD,CO,6405.0,1993.0,DODGE,D-250,3.0,1B7JE26YXPS265330,,55273.0,,,,6405.0
3,7,7,015_Bridgeport_MV_21.xlsx,DUMAS RASHAAN,838 CLARK ST,BRIDGEPORT,NJ,6606.0,2006.0,NISSA,MURANO S,1.0,JN8AZ08W16W535318,,,,,,6606.0
4,8,8,015_Bridgeport_MV_21.xlsx,LUGO-BALOIS JESUS A,368 BIRMINGHAM ST,BRIDGEPORT,MD,6606.0,1998.0,TOYOT,SIENNA L,1.0,4T3ZF13C5WU019942,,,,,,6606.0
5,9,9,015_Bridgeport_MV_21.xlsx,LUGO-BALOIS JESUS A,368 BIRMINGHAM ST,BRIDGEPORT,MD,6606.0,2011.0,HONDA,CIVIC LX,1.0,2HGFA1F5XBH303769,,,,,,6606.0
6,10,10,015_Bridgeport_MV_21.xlsx,PARK CITY WIRELESS LLC,2418 MAIN STREET,BRIDGEPORT,OT,6606.0,2009.0,TOYOT,CAMRY HY,1.0,4T1BB46K09U068660,,,,,,6606.0
7,11,11,015_Bridgeport_MV_21.xlsx,PARK CITY WIRELESS LLC,2418 MAIN STREET,BRIDGEPORT,OT,6606.0,2021.0,TESLA,MODEL Y,1.0,5YJYGDEE0MF070602,,,,,,6606.0
8,13,13,015_Bridgeport_MV_21.xlsx,SILVA MADSON C,40 GREENWOOD ST,BRIDGEPORT,MA,6606.0,2014.0,TOYOT,VENZA LE,1.0,4T3BA3BB6EU050331,,,,,,6606.0
9,14,14,015_Bridgeport_MV_21.xlsx,TIFFANY CLEANING LLC,350 GROVERS AVE 10-C,BRIDGEPORT,OT,6605.0,2016.0,DODGE,GRAND CA,3.0,2C4RDGBG9GR107952,,,,,,6605.0


* There appear to be a few kinds of errors:
    * **CT ZIP, Non CT Address:**
        * E.g. "31 SLIPTOWN RD" / "SHARON" / "NH" / 6010 (31 Sliptown Rd. is a real address in New Hampshire)
        * "12941 N Fox Hollow Dr" / "MARANA" / "AZ" (Real address, but wrong ZIP code"
        * Note that in both of the above instances from 017_Brisol_MV_21 - the underlying file provides no further information on how to parse this information.
        * 79 WOODLAND RD CARIBOU ME 6417 - Not a CT address, CT Zip code. The Deep River underlying file provides no way to fix this. 
    * **Erroneous CT Addresses**
        * E.g. 693 DOGWOOD DR CHESHIRE OT --> Probably means "CT," this is a real address
        * E.g. 152 DAISY LN DURHAM HI 6422 --> 152 Daisy Lane is a CT address. Perhaps "CT" was written wrong? 

## Investigate CT ZIPs non CT state

In [18]:
print(f"Of the {len(ct_zip_nonct_state)} rows, {ct_zip_nonct_state["state"].isna().sum()} are NA for state")
print(f"These are from the following records: {'\n'.join(ct_zip_nonct_state[ct_zip_nonct_state["state"].isna()]["record_from"].unique().tolist())}")

Of the 66090 rows, 65713 are NA for state
These are from the following records: 099_North_Branford_MV_21.xlsx
127_Sherman_MV_21.xlsx
127_Sherman_MVData_2020.csv
22_Canterbury_MVData_2019.csv
4_Avon_MVData_2020.csv
96_Newington_MVData_2020.csv


In [60]:
ct_zip_nonct_state[ct_zip_nonct_state["state"].notna()]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
618,009_Bethel_MV_21.xlsx,FLYNN TAYLOR D,28 SIMEON RD UNIT 16D,BETHEL,DC,6801,2021.0,SUBAR,,1.0,JF2GTHMC3M8281048,,,,,,06801
768,009_Bethel_MV_21.xlsx,MENDIETA MENDIETA GERMANIA I,6 DRUMMER LN,BETHEL,WA,6801,2008.0,HONDA,,1.0,JHMGD37488S002641,6 DRUMMERS LN,,,,,06801
753,009_Bethel_MV_21.xlsx,SHERWOOD BROOKE E,20 HUDSON ST UNIT 14,BETHEL,NY,6801,2002.0,HONDA,,1.0,1HGEM22962L090002,3 KYLE CT,,,,,06801
330,011_Bloomfield_MV_21.xlsx,FURSE ELIZABETH W,2412 PENINSULA 107,FISHERS ISLE,NY,6390,2016.0,BMW,X3 XDRIV,1.0,5UXWX9C57G0D89216,,,,,,06390
702,014_Branford_MV_21.xlsx,CONNER FREDERICK J JR,19 TERHUNE AVE 1ST,BRANFORD,CO,6405,1993.0,DODGE,D-250,3.0,1B7JE26YXPS265330,,55273.0,,,,06405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,93_NewHaven_MVData_2019.csv,NOERAND MARIE M,210 BLATCCHLEY AVE 1ST FLOOR,NEW HAVEN,NY,6513.0,2004,TOYOT,SIENNA C,1,5TDZA23C14S066954,,88723,,,,06513
831,93_NewHaven_MVData_2019.csv,NOERAND MARIE M,210 BLATCCHLEY AVE 1ST FLOOR,NEW HAVEN,NY,6513.0,2007,KIA,SPECTRA,1,KNAFE122875452949,,88724,,,,06513
773,95_NewMilford_MVData_2020.csv,WILSON KEISHA L,75 MILFORD AVE APT 2G,MILFORD,NY,6460,2016,TOYOT,CAMRY LE,1,4T4BF1FK4GR532006,,,,,,06460
309,9_Bethel_MVData_2020.csv,MENDIETA MENDIETA GERMANIA I,6 DRUMMER LN,BETHEL,WA,6801,2008,HONDA,FIT,1,JHMGD37488S002641,,,,,,06801


Note: import appears to be an error - Bethel file has 06801

In [61]:
ct_zip_nonct_state[ct_zip_nonct_state["state"].notna()]["zip"].unique()

array([6801, 6390, 6405, 6604, 6606, 6605, 6010, 6804, '06013', 6019,
       6410, '06415-', 6796, 6753, 6810, '06416', 6811, 6417, 6422, 6108,
       6016, 6029, 6080, 6082, 6426, 6825, 6824, 6032, 6543, 6831, 6830,
       6351, 6340, 6437.0, 6441, 6576, 6516, 6518, 6112, 6118, 6114, 6757,
       '06335', '06234', '06390-0536', '06040', '06042', 6450.0, 6416.0,
       6457.0, 6460, 6468, 6782, 6370, 6053, 6052, 6051, 6770, 6840, 6057,
       6111, 6492, 6515, 6519, 6512, 6513, 6511, 6093, 6026, '06390',
       '06320', '06755', 6776, 6470, 6482, 6473, '6473', 6359,
       '06854     ', '06851     ', '06855     ', 6360, 6371, 6475, 6478,
       6239, 6062, '06060', 6259, 6712, '06712', 6751, 6260, 6015, 6483,
       6484, 6092, 6081, 6844, 6489.0, 6902, '06905', 6906, 6907, 6855,
       6905, 6708, 6379, 6378, 6078, '06705', '06708', '06704', 6375,
       6498, 6119, 6117, '06883', 6880, 6109, '06279', 6897, 6095, 6716,
       6525, 6074, 6903, 6901, 6904, 6883.0, 6066, 6882, 6401.0, 6

In [66]:
ct_zip_nonct_state[ct_zip_nonct_state["state"].notna()].groupby("record_from").count()[["vehicle_id"]].sort_values("vehicle_id", ascending=False)

Unnamed: 0_level_0,vehicle_id
record_from,Unnamed: 1_level_1
135_Stamford_MV_21.xlsx,16
034_Danbury_MV_21.xlsx,14
105_Old_Lyme_MV_21.xlsx,13
093_New_Haven_MV_21.xlsx,12
105_Old_Lyme_MVData_2019.csv,11
...,...
84_Milford_MVData_2020.csv,1
011_Bloomfield_MV_21.xlsx,1
102_North_Stonington_MV_21.xlsx,1
104_Norwich_MV_21.xlsx,1


### Investigate Non CT ZIPS CT state

In [54]:
print(f"This error is present in {nonct_zip_ct_state["record_from"].nunique()} underlying files, suggesting it is generalized")

This error is present in 125 underlying files, suggesting it is generalized


In [55]:
nonct_zip_ct_state.groupby("record_from").count()[["vehicle_id"]].sort_values("vehicle_id", ascending=False)

Unnamed: 0_level_0,vehicle_id
record_from,Unnamed: 1_level_1
103_Norwalk_MV_21.xlsx,491
135_Stamford_MVData_2020.csv,13
135_Stamford_MV_21.xlsx,12
111_Plymouth_MV_21.xlsx,12
034_Danbury_MV_21.xlsx,7
...,...
89_NewBritain_MVData_2020.csv,1
009_Bethel_MV_21.xlsx,1
011_Bloomfield_MV_21.xlsx,1
015_Bridgeport_MV_21.xlsx,1


In [56]:
nonct_zip_ct_state[nonct_zip_ct_state["record_from"].str.contains("Norwalk")]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
974,103_Norwalk_MV_21.xlsx,HEIDEMANN BEVERLY ...,146 W ROCKS RD ...,NORWALK,CT,68512-232,2011.0,YAKIM,VINS,11.0,510SF1114BN009217,,,,,,68512
975,103_Norwalk_MV_21.xlsx,MANDUJANO JAIME ...,58 N TAYLOR AVE ...,NORWALK,CT,68541-410,1977.0,HOLSC,2000,11.0,CTUNKNOWN95699011,,,,,,68541
977,103_Norwalk_MV_21.xlsx,SON YOUNG SANG ...,198 ELY AVE ...,NORWALK,CT,68544-229,2011.0,CHANGZHOU NANXIASHU,TRAILER,11.0,LN2AD001XBJ000038,,,,,,68544
978,103_Norwalk_MV_21.xlsx,STEWARD WILLIAM C ...,13 SYCAMORE ST ...,NORWALK,CT,68552-003,2007.0,LOADR,14U800,11.0,5A4XJRJ1472015660,,,,,,68552
979,103_Norwalk_MV_21.xlsx,REED MICHAEL E ...,146 PONUS AVE ...,NORWALK,CT,68501-832,2006.0,LOADR,OUT14800,11.0,5A4XJRJ1962012902,,,,,,68501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,103_Norwalk_MV_21.xlsx,NORTH GREENY LANDSCAPING LLC,4 TERRY LN # 4,NORWALK,CT,68544-304,2005.0,CHEVR,C4500 C4C042,2.0,1GBE4C1255F517265,,,,,,68544
606,103_Norwalk_MVData_2020.csv,JONES LYNNELLE ...,10 POINT RD WILSON PT ...,NORWALK,CT,00000,1991.0,MERCEDES-BENZ,560SEL,1.0,WDBCA39E1MA580237,,,,,,00000
737,103_Norwalk_MVData_2020.csv,FUSCO AL A ...,10 ROBIN SQ ...,NORWALK,CT,00000,1978.0,HOMEM,HM,11.0,CTUNKNOWN29263511,,,,,,00000
219,103_Norwalk_MVData_2020.csv,JOHNS AUTO CENTER INC ...,161 CONN AVE ...,NORWALK,CT,00000,1978.0,CHEVROLET,CK31003,70.0,CKL3381182236,,,,,,00000


In [57]:
nonct_zip_ct_state[nonct_zip_ct_state["record_from"].str.contains("Norwalk")]["zip"].unique()

array(['68512-232 ', '68541-410 ', '68544-229 ', '68552-003 ',
       '68501-832 ', '68511-221 ', '68516-038 ', '68501-616 ',
       '00000     ', '68501-733 ', '68531-132 ', '68541-067 ',
       '68513-605 ', '68514-466 ', '68512-827 ', '68504-430 ',
       '68512-214 ', '68511-215 ', '68515-328 ', '68512-431 ',
       '68543-216 ', '68502-730 ', '68501-835 ', '68552-103 ',
       '68513-420 ', '68502-722 ', '68515-534 ', '68543-305 ',
       '68543-627 ', '68543-422 ', '68513-008 ', '68552-235 ',
       '68544-714 ', '68502-840 ', '68544-716 ', '68504-320 ',
       '68515-940 ', '68513-108 ', '68511-715 ', '68541-613 ',
       '68513-221 ', '68542-107 ', '68502-408 ', '68543-512 ',
       '68511-042 ', '68516-027 ', '68514-316 ', '68502-728 ',
       '68541-561 ', '68543-548 ', '68516-141 ', '68551-615 ',
       '68544-304 ', '68512-645 ', '68511-536 ', '68544-727 ',
       '68544-308 ', '68543-735 ', '68542-514 ', '68501-704 ',
       '68513-115 ', '69024-108 ', '68552-703 ', '68515

Appears to occur most in **Ridgefield** and **Norwalk.** However **Ridgefield** is erroneous: The zips should be 06877.

**NORWALK:** The import is correct. The VINs are wrong

## Check CT towns with non-CT states

In [88]:
recompiled_data = reset_recompiled_data()

In [90]:
ct_towns = []

# Get list of CT towns
for chunk in tqdm(recompiled_data):
    ct_state_mask = chunk["state"] == "CT"
    ct_town_list = chunk.loc[ct_state_mask, "city"].unique().tolist()
    for town in ct_town_list:
        if not town in ct_towns:
            ct_towns.append(town)

5787it [01:00, 94.97it/s] 


In [93]:
ct_towns = [x.strip() for x in ct_towns if type(x) == str]

In [156]:
ct_towns_raw = ['Andover','Ansonia','Ashford','Avon','Bantam','Barkhamsted',
                'Beacon Falls','Berlin','Bethany','Bethel','Bethlehem','Bloomfield',
                'Bolton','Bozrah','Branford','Bridgeport','Bridgewater','Bristol',
                'Broad Brook','Brookfield','Brooklyn','Burlington','Canaan','Canterbury',
                'Canton','Chaplin','Cheshire','Chester','Clinton','Colchester','Colebrook',
                'Collinsville','Columbia','Cornwall','Coventry','Cromwell','Danbury',
                'Danielson','Darien','Deep River','Derby','Durham','East Granby','East Haddam',
                'East Hampton','East Hartford','East Haven','East Lyme','East Windsor','Eastford','Easton','Ellington','Enfield','Essex','Fairfield','Farmington','Georgetown','Glastonbury','Goshen','Granby','Greenwich','Groton','Guilford','Haddam','Hamden','Hampton','Hartford','Harwinton','Hebron','Higganum','Jewett City','Kent','Killingworth','Lebanon','Ledyard','Litchfield','Madison','Manchester','Mansfield Center','Marlborough','Meriden','Middlebury','Middlefield','Middletown','Milford','Monroe','Montville','Moodus','Moosup','Morris','Mystic','Naugatuck','New Britain','New Canaan','New Fairfield','New Hartford','New Haven','New London','New Milford','Newington','Newtown','Niantic','Norfolk','North Branford','North Granby','North Grosvenordale','North Haven','North Stonington','Norwalk','Norwich','Oakville','Old Lyme','Old Mystic','Old Saybrook','Orange','Oxford','Pawcatuck','Plainfield','Plainville','Plymouth','Pomfret','Poquonock','Portland','Preston','Prospect','Putnam','Redding','Ridgefield','Rocky Hill','Roxbury','Salem','Salisbury','Scotland','Seymour','Sharon','Shelton','Sherman','Simsbury','Somers',
                'South Windham','South Windsor','South Woodstock','Southbury','Southington',
                'Stafford','Stamford','Sterling','Stonington','Stratford','Suffield','Tariffville',
                'Terryville','Thomaston','Thompson','Tolland','Torrington','Trumbull','Voluntown','Wallingford',
                'Washington','Waterbury','Waterford','Watertown','Wauregan','Weatogue','West Hartford','West Haven',
                'West Simsbury','Westbrook','Weston','Westport','Wethersfield','Willimantic','Willington','Wilton',
                'Winchester Center','Windham','Windsor Locks','Windsor','Winsted','Wolcott','Woodbridge','Woodbury','Woodstock']

ct_towns_raw = [x.upper() for x in ct_towns_raw]

In [160]:
# Now get entries where town is a CT town but the state is not

recompiled_data = reset_recompiled_data()

ct_town_nonct_state = pd.DataFrame([])
nonct_town_ct_state = pd.DataFrame([])

In [164]:
for chunk in tqdm(recompiled_data):
    try:
        ct_state_mask = chunk["state"] == "CT"
        
        ct_city_mask = chunk["city"].str.upper().str.strip().isin(ct_towns_raw)
        ct_town_nonct_state_mask = ct_city_mask & ~ct_state_mask
        nonct_town_ct_state_mask = ~ct_city_mask & ct_state_mask

        chunk_ct_town_nonct_state = chunk.loc[ct_town_nonct_state_mask]
        chunk_nonct_town_ct_state = chunk.loc[nonct_town_ct_state_mask]

        ct_town_nonct_state = pd.concat([ct_town_nonct_state, chunk_ct_town_nonct_state])
        nonct_town_ct_state = pd.concat([nonct_town_ct_state, chunk_nonct_town_ct_state])
        
    except:
        pass

4710it [07:04, 11.09it/s]


In [None]:
# Correct zips
valid_zips = get_valid_zips(nonct_town_ct_state, "zip")

# Join them
nonct_town_ct_state = nonct_town_ct_state.join(valid_zips).reset_index(drop = True)

In [30]:
nonct_town_ct_state.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / "nonct_town_ct_state.csv")
ct_town_nonct_state.reset_index(drop = True).to_csv(path.parent.parent / "data"/ "analysis_outputs" / "ct_town_nonct_state.csv")

In [3]:
# Read back in
nonct_town_ct_state = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "nonct_town_ct_state.csv")

  nonct_town_ct_state = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "nonct_town_ct_state.csv")


In [4]:
nonct_town_ct_state

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
0,0,0,55,001_Andover_MV_21.csv,ALLEN CARLY M,125 SOUTH ST APT 283,VERNON ROCKVILLE,CT,6066,2013.0,HONDA,CIVIC SI,1.0,2HGFG4A51DH703262,,,,,,6066.0
1,1,1,391,001_Andover_MV_21.csv,BOUCHARD DANIEL B,42 KEENE PL,UNIONVILLE,CT,6085,2009.0,CHEVR,SILVERAD,3.0,1GCEK29099Z180506,,,,,,6085.0
2,2,2,934,001_Andover_MV_21.csv,EDBERG NATHAN S,77 EAST ST,STAFFORD SPRINGS,CT,6076,2011.0,FORD,F350 SUP,3.0,1FT8W3BT0BEB85196,,,,,,6076.0
3,3,3,1576,001_Andover_MV_21.csv,JAHN MATTHEW M,10 COUNTRY RD,UNIONVILLE,CT,6085,2007.0,CHEVR,IMPALA L,1.0,2G1WT58N679293166,,,,,,6085.0
4,4,4,1892,001_Andover_MV_21.csv,LAVEY LYNN K,141 LEBANON RD,NORTH FRANKLIN,CT,6254,2017.0,ACURA,RDX ADVA,1.0,5J8TB4H70HL014959,,,,,,6254.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362233,362233,362233,5782863,9_Bethel_MVData_2020.csv,REAL MOVING LLC,1 PINE ST,SANDY HOOK,CT,6482,2004.0,INTER,4000 SER,2.0,1HTMMAAM84H612857,,,,,,6482.0
362234,362234,362234,5784057,9_Bethel_MVData_2020.csv,SCRIBNER MILES D,4 TOP PASTURE RD,WASHINGTON DEPOT,CT,6794,2018.0,SUBAR,LEGACY 2,1.0,4S3BNAC62J3020380,,,,,,6794.0
362235,362235,362235,5784605,9_Bethel_MVData_2020.csv,SOMOSKOVEC ROBERT M,PO BOX 177,HAWLEYVILLE,CT,6440,2005.0,JAGUA,X-TYPE 3,1.0,SAJWA51AX5WE52543,,,,,,6440.0
362236,362236,362236,5784606,9_Bethel_MVData_2020.csv,SOMOSKOVEC ROBERT M,PO BOX 177,HAWLEYVILLE,CT,6440,2008.0,FORD,RANGER,1.0,1FTZR15E28PA69914,,,,,,6440.0


In [40]:
# Create a list of non CT towns, with the state CT, and the zip code.
nonct_town_ct_state_zips = nonct_town_ct_state[["city", "state", "zip_corrected"]]
nonct_town_ct_state_zips["city_state_zip_corrected"] = nonct_town_ct_state_zips["city"].str.strip() + ", "+ nonct_town_ct_state_zips["state"] +", "+ nonct_town_ct_state_zips["zip_corrected"].astype(str).str[:-2]
nonct_town_ct_state_zips_list = nonct_town_ct_state_zips["city_state_zip_corrected"].unique().tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonct_town_ct_state_zips["city_state_zip_corrected"] = nonct_town_ct_state_zips["city"].str.strip() + ", "+ nonct_town_ct_state_zips["state"] +", "+ nonct_town_ct_state_zips["zip_corrected"].astype(str).str[:-2]


In [43]:
# Save this to a text file
text_file = open(path.parent.parent / "data" / "analysis_outputs" / "possible_nonct_towns_withzips.txt", "w")
text_file.write(', '.join([x for x in nonct_town_ct_state_zips_list if type(x) == str]))
text_file.close()

In [180]:
# The same thing, but with only the town names and not the zip code and the state name
possible_nonct_towns = [x.strip() for x in nonct_town_ct_state["city"].unique().tolist() if type(x) == str]
text_file = open(path.parent.parent / "data" / "analysis_outputs" / "possible_nonct_towns.txt", "w")
text_file.write(', '.join(possible_nonct_towns))
text_file.close()

In [194]:
poss_nonct = pd.DataFrame(possible_nonct_towns, columns = ["town_names"])
poss_nonct["town_name_state"] = poss_nonct["town_names"] + ", CT"

### Attempt to GeoCode non-CT towns with CT state (without ZIPs)

In [234]:
geocoded_out = gpd.GeoDataFrame([])

In [235]:
count = 0

for i in tqdm(range(84)):
    try:
        to_geocode = poss_nonct["town_name_state"][count:count+10]
    except:
        remainder = len(possible_nonct_towns)-count-1
        to_geocode = poss_nonct["town_name_state"][count:count+remainder]
        
    geo = geocode(to_geocode, provider='nominatim', user_agent='yale', timeout=4)
    geo["town_name_state"] = to_geocode
    geo = geo.reset_index(drop=True)

    geocoded_out = pd.concat([geocoded_out, geo])
    count +=10

100%|██████████████████████████████████████████| 84/84 [07:20<00:00,  5.24s/it]


In [242]:
geocoded_out.to_csv(path.parent.parent / "data" / "analysis_outputs" / "geocoded_ct_towns.csv")

### Attempt to GeoCode non-CT towns with CT state (with ZIPs)

In [48]:
geocoded_out_zips = gpd.GeoDataFrame([])

In [None]:
count = 0

In [None]:
for i in tqdm(range(109, 203)):
    try:
        to_geocode = nonct_town_ct_state_zips_list[count:count+10]
    except:
        remainder = len(nonct_town_ct_state_zips_list)-count-1
        to_geocode = nonct_town_ct_state_zips_list[count:count+remainder]
        
    geo = geocode(to_geocode, provider='nominatim', user_agent='yale', timeout=4)
    geo["city_state_zip"] = to_geocode
    geo = geo.reset_index(drop=True)

    geocoded_out_zips = pd.concat([geocoded_out_zips, geo])
    count +=10

In [67]:
geocoded_out_zips.to_csv(path.parent.parent / "data" / "analysis_outputs" / "geocoded_ct_towns_zips.csv")

In [None]:
# Truth-table analysis
def chunk_truth_table(chunk, town_list):
    ctstate_mask = chunk["state"] == "CT" 
    ctzip_mask = chunk["zip_corrected"].astype(str).str[:-2].str.zfill(5).str[0:2]=="06"
    cttown_mask = chunk["city"].str.upper().str.strip().isin(town_list)
    
    chunk.loc[ctstate_mask, "ctstate_mask"] = 1
    chunk.loc[~ctstate_mask, "ctstate_mask"] = 0

    chunk.loc[ctzip_mask, "ctzip_mask"] = 1
    chunk.loc[~ctzip_mask, "ctzip_mask"] = 0

    chunk.loc[cttown_mask, "cttown_mask"] = 1
    chunk.loc[~cttown_mask, "cttown_mask"] = 0

    chunk["masks"] = (chunk[ctstate_mask] * "ctstate_") + (chunk[ctzip_mask] * "ctzip_") + (chunk[cttown_mask] * "cttown")

    chunk["count"] = 1

    tally = chunk.groupy("masks").sum()[["masks", "count"]]

# Compare Original and New

In [107]:
missing_df_original["missing zip rank"] = missing_df_original["missing zip"].rank(method = 'min', ascending = False)
missing_df_original["missing VIN rank"] = missing_df_original["missing VIN"].rank(method = 'min', ascending = False)
missing_df_original["missing street rank"] = missing_df_original["missing street"].rank(method = 'min', ascending = False)

In [108]:
missing_df_new["missing zip rank"] = missing_df_new["missing zip"].rank(method = 'min', ascending = False)
missing_df_new["missing VIN rank"] = missing_df_new["missing VIN"].rank(method = 'min', ascending = False)
missing_df_new["missing street rank"] = missing_df_new["missing street"].rank(method = 'min', ascending = False)

In [197]:
missing_df_original.sort_values("missing street rank", ascending = True).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count,missing zip rank,missing VIN rank,missing street rank
91,103_Norwalk_MV_21.xlsx,75044,75044,75044,75044,75044,4.0,1.0,1.0
90,103_Norwalk_MVData_2020.csv,71664,71664,71664,71664,71664,5.0,2.0,2.0
290,77_Manchester_MVData_2019.csv,48990,3203,48990,48990,48990,12.0,12.0,3.0
64,077_Manchester_MV_21.xls,44487,3,44487,44487,44487,18.0,37.0,4.0
124,11_Bloomfield_MVData_2019.csv,19643,19643,19643,1630,19643,71.0,6.0,5.0
8,009_Bethel_MV_21.xlsx,17855,1,17855,1678,17855,79.0,44.0,6.0
99,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522,119.0,61.0,7.0
235,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202,123.0,61.0,8.0
169,141_Thompson_MV_21.xlsx,10054,457,10054,10054,10054,151.0,14.0,9.0
28,037_Derby_MV_21.xls,9511,0,9511,9511,9511,161.0,61.0,10.0


In [192]:
missing_df_original_zips_formerge = missing_df_original[["record_from", "missing zip", "missing zip rank"]].sort_values("missing zip rank", ascending = True).head(15)
missing_df_original_VINS_formerge = missing_df_original[["record_from", "missing VIN", "missing VIN rank"]].sort_values("missing VIN rank", ascending = True).head(10)
missing_df_original_streets_formerge = missing_df_original[["record_from", "missing street", "missing street rank"]].sort_values("missing street rank", ascending = True).head(10)

In [193]:
missing_df_new_zips_formerge = missing_df_new[["record_from", "missing zip", "missing zip rank"]].sort_values("missing zip rank", ascending = True).head(15)
missing_df_new_VINS_formerge = missing_df_new[["record_from", "missing VIN", "missing VIN rank"]].sort_values("missing VIN rank", ascending = True).head(10)
missing_df_new_streets_formerge = missing_df_new[["record_from", "missing street", "missing street rank"]].sort_values("missing street rank", ascending = True).head(10)

In [182]:
compare_missing_zips = missing_df_original_zips_formerge.merge(missing_df_new_zips_formerge, how = 'left',
                                                              left_on = "missing zip rank",
                                                              right_on = "missing zip rank",
                                                              suffixes = ("_original", "_recompiled"))

compare_missing_zips = compare_missing_zips.set_index("missing zip rank")

compare_missing_zips["Difference"] = compare_missing_zips["missing zip_recompiled"] - compare_missing_zips["missing zip_original"]

compare_missing_zips = compare_missing_zips.reset_index()

In [172]:
cm = sns.light_palette("green", as_cmap=True, reverse = True)

In [183]:
compare_missing_zips = compare_missing_zips.style.background_gradient(subset = ["Difference"], cmap=cm)
# compare_missing_zips.data = compare_missing_zips.data.drop("Town_original", axis = 1)
compare_missing_zips.data = compare_missing_zips.data.drop("missing zip rank", axis = 1)

In [184]:
total_row_zips = pd.DataFrame(compare_missing_zips.data.sum(), columns=["Total"]).T
total_row_zips.loc["Total", "record_from_original"] = ""
total_row_zips.loc["Total", "record_from_recompiled"] =""

In [186]:
compare_missing_zips.data = pd.concat([compare_missing_zips.data, total_row_zips])

In [188]:
compare_missing_zips.set_caption("Comparison of Missing ZIP Codes: New and Recompiled Data").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
        ('text-align', 'center')]}])

Unnamed: 0,record_from_original,missing zip_original,record_from_recompiled,missing zip_recompiled,Difference
0,15_Bridgeport_MVData_2019.csv,82704,107_Orange_MV_21.xlsx,13522,-69182
1,015_Bridgeport_MV_21.xlsx,80383,27_Clinton_MVData_2020.csv,13202,-67181
2,135_Stamford_MV_21.xlsx,77594,131_Southington_MV_21.xlsx,10089,-67505
3,103_Norwalk_MV_21.xlsx,75044,037_Derby_MV_21.xls,9511,-65533
4,103_Norwalk_MVData_2020.csv,71664,117_Redding_MV_21.xlsx,8171,-63493
5,151_Waterbury_MV_21.xlsx,64647,016_Bridgewater_MV_21_ALTERED.csv,4886,-59761
6,93_NewHaven_MVData_2019.csv,60366,045_East_Lyme_MV_21_ALTERED.csv,4475,-55891
7,093_New_Haven_MV_21.xlsx,57470,44_East_Lyme_MVData_2019.csv,4301,-53169
8,057_Greenwich_MV_21.xlsx,51748,112_Pomfret_MVData_2020.csv,4130,-47618
9,57_Greenwich_MVData_2020.csv,51478,055_Goshen_MV_21.XLSX,4004,-47474


### Comparing missing VINs

In [137]:
compare_missing_VINS = missing_df_original_VINS_formerge.merge(missing_df_new_VINS_formerge, how = 'left',
                                                              left_on = "missing VIN rank",
                                                              right_on = "missing VIN rank",
                                                              suffixes = ("_original", "_recompiled"))

compare_missing_VINS = compare_missing_VINS.set_index("missing VIN rank")

compare_missing_VINS["Difference"] = compare_missing_VINS["missing VIN_recompiled"] - compare_missing_VINS["missing VIN_original"]

compare_missing_VINS = compare_missing_VINS.reset_index()

compare_missing_VINS = compare_missing_VINS.style.background_gradient(subset = ["Difference"], cmap=cm)

compare_missing_VINS.data = compare_missing_VINS.data.drop("missing VIN rank", axis = 1)

Unnamed: 0,record_from_original,missing VIN_original,record_from_recompiled,missing VIN_recompiled,Difference
0,103_Norwalk_MV_21.xlsx,75044,11_Bloomfield_MVData_2019.csv,19643,-55401
1,103_Norwalk_MVData_2020.csv,71664,107_Orange_MVData_2020.csv,14650,-57014
2,126_Shelton_MV_21.xlsx,39086,124_Seymour_MV_21.csv,13860,-25226
3,143_Torrington_MV_21.xlsx,29674,016_Bridgewater_MV_21_ALTERED.csv,4886,-24788
4,143_Torrington_MVData_2020.csv,29674,016_Bridgewater_MV_21_ALTERED.csv,4886,-24788
5,11_Bloomfield_MVData_2019.csv,19643,121_Salem_MVData_2019.csv,218,-19425
6,107_Orange_MVData_2020.csv,14650,101_North_Haven_MV_21.xlsx,84,-14566
7,124_Seymour_MV_21.csv,13860,083_Middletown_MV_21.xlsx,82,-13778
8,016_Bridgewater_MV_21.xlsx,4886,136_Sterling_MVData_2019.csv,69,-4817
9,122_Salisbury_MV_21.xlsx,4390,163_Windham_MV_21.xlsx,37,-4353


In [152]:
total_row = pd.DataFrame(compare_missing_VINS.data.sum(), columns=["Total"]).T

In [155]:
total_row.loc["Total", "record_from_original"] = ""
total_row.loc["Total", "record_from_recompiled"] =""

In [157]:
compare_missing_VINS.data = pd.concat([compare_missing_VINS.data, total_row])

In [159]:
compare_missing_VINS.set_caption("Comparison of Missing VINs: New and Recompiled Data").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
        ('text-align', 'center')]}])

Unnamed: 0,record_from_original,missing VIN_original,record_from_recompiled,missing VIN_recompiled,Difference
0,103_Norwalk_MV_21.xlsx,75044,11_Bloomfield_MVData_2019.csv,19643,-55401
1,103_Norwalk_MVData_2020.csv,71664,107_Orange_MVData_2020.csv,14650,-57014
2,126_Shelton_MV_21.xlsx,39086,124_Seymour_MV_21.csv,13860,-25226
3,143_Torrington_MV_21.xlsx,29674,016_Bridgewater_MV_21_ALTERED.csv,4886,-24788
4,143_Torrington_MVData_2020.csv,29674,016_Bridgewater_MV_21_ALTERED.csv,4886,-24788
5,11_Bloomfield_MVData_2019.csv,19643,121_Salem_MVData_2019.csv,218,-19425
6,107_Orange_MVData_2020.csv,14650,101_North_Haven_MV_21.xlsx,84,-14566
7,124_Seymour_MV_21.csv,13860,083_Middletown_MV_21.xlsx,82,-13778
8,016_Bridgewater_MV_21.xlsx,4886,136_Sterling_MVData_2019.csv,69,-4817
9,122_Salisbury_MV_21.xlsx,4390,163_Windham_MV_21.xlsx,37,-4353


### Comparing Missing Street Addresses

In [194]:
compare_missing_streets = missing_df_original_streets_formerge.merge(missing_df_new_streets_formerge, how = 'left',
                                                      left_on = "missing street rank",
                                                      right_on = "missing street rank",
                                                      suffixes = ("_original", "_recompiled"))

compare_missing_streets = compare_missing_streets.set_index("missing street rank")

compare_missing_streets["Difference"] = compare_missing_streets["missing street_recompiled"] - compare_missing_streets["missing street_original"]

compare_missing_streets = compare_missing_streets.reset_index()

compare_missing_streets = compare_missing_streets.style.background_gradient(subset = ["Difference"], cmap=cm)

compare_missing_streets.data = compare_missing_streets.data.drop("missing street rank", axis = 1)

In [198]:
total_row_streets = pd.DataFrame(compare_missing_streets.data.sum(), columns=["Total"]).T
total_row_streets.loc["Total", "record_from_original"] = ""
total_row_streets.loc["Total", "record_from_recompiled"] =""

In [199]:
compare_missing_streets.data = pd.concat([compare_missing_streets.data, total_row_streets])

In [200]:
compare_missing_streets.set_caption("Comparison of Missing Street Addresses: New and Recompiled Data").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
        ('text-align', 'center')]}])

Unnamed: 0,record_from_original,missing street_original,record_from_recompiled,missing street_recompiled,Difference
0,103_Norwalk_MV_21.xlsx,75044,107_Orange_MV_21.xlsx,13522,-61522
1,103_Norwalk_MVData_2020.csv,71664,045_East_Lyme_MV_21_ALTERED.csv,13325,-58339
2,77_Manchester_MVData_2019.csv,48990,27_Clinton_MVData_2020.csv,13202,-35788
3,077_Manchester_MV_21.xls,44487,037_Derby_MV_21.xls,9511,-34976
4,11_Bloomfield_MVData_2019.csv,19643,016_Bridgewater_MV_21_ALTERED.csv,4886,-14757
5,009_Bethel_MV_21.xlsx,17855,10_Bethlehem_MVData_2019.csv,4760,-13095
6,107_Orange_MV_21.xlsx,13522,112_Pomfret_MVData_2020.csv,4130,-9392
7,27_Clinton_MVData_2020.csv,13202,055_Goshen_MV_21.XLSX,4004,-9198
8,141_Thompson_MV_21.xlsx,10054,098_Norfolk_MV_21.xlsx,1945,-8109
9,037_Derby_MV_21.xls,9511,101_North_Haven_MV_21.xlsx,84,-9427


***

**Consider missing VINs**

In [121]:
 missing_df.sort_values("missing VIN", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count
134,11_Bloomfield_MVData_2019.csv,0,19643,0,1630,19643
108,107_Orange_MVData_2020.csv,0,14650,0,1939,14650
144,124_Seymour_MV_21.csv,0,13860,0,1119,15410
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254
180,141_Thompson_MV_21.xlsx,0,457,7,447,10054
137,121_Salem_MVData_2019.csv,0,218,0,126,4914
97,101_North_Haven_MV_21.xlsx,84,84,84,2978,24425
77,083_Middletown_MV_21.xlsx,82,82,82,3060,35302
168,136_Sterling_MVData_2019.csv,71,69,69,223,4249
224,163_Windham_MV_21.xlsx,37,37,37,848,16516


**MISSING VINs**
* Bloomfield file for 2019 does not contain VINs
* Orange file for 2019 does not contain VINs
* Seymour file has major issues for missing VINs
* Bridgewater file - has this many missing VINs in the underlying file.
* Thompson file - has 457 missing VINs in the underlying file anyway

That is - since the VINs are missing in the underlying file anyway, nothing can be done to fix these missing VINs. So we are good.

**Consider missing street addresses**

In [122]:
 missing_df.sort_values("missing street", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count
109,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522
39,045_East_Lyme_MV_21_ALTERED.csv,4475,0,13325,5171,17616
248,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202
33,037_Derby_MV_21.xls,9511,0,9511,9511,9511
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254
114,10_Bethlehem_MVData_2019.csv,0,0,4760,206,4760
119,112_Pomfret_MVData_2020.csv,4130,0,4130,4130,4130
49,055_Goshen_MV_21.XLSX,4004,0,4004,4004,4004
92,098_Norfolk_MV_21.xlsx,1945,0,1945,1945,1945
97,101_North_Haven_MV_21.xlsx,84,84,84,2978,24425


Orange 2021, Clinton 2020, Derby 2021, Bridgewater 2021 - All missing this info anyway.
Bethlehem 2019, Pomfret 2020, Goshen 2021 - all the same
Norfolk 2019 - is missing 1945 addresses in the underlying file

**Consider missing ZIP codes**

In [123]:
missing_df.sort_values("missing zip", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count
109,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522
248,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202
160,131_Southington_MV_21.xlsx,10089,0,2,12764,42996
33,037_Derby_MV_21.xls,9511,0,9511,9511,9511
129,117_Redding_MV_21.xlsx,8171,0,0,8171,8171
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254
39,045_East_Lyme_MV_21_ALTERED.csv,4475,0,13325,5171,17616
267,44_East_Lyme_MVData_2019.csv,4301,0,0,5215,17448
119,112_Pomfret_MVData_2020.csv,4130,0,4130,4130,4130
49,055_Goshen_MV_21.XLSX,4004,0,4004,4004,4004


* Orange 2021 - **not an error** - the ZIP is genuinely missing in the underlying file
* Clinton - is genuinely missing
* Southington - 10,089 ZIPs are genuinely missing
* Derby - is genuinely missing
* Redding - is genuinely missing in the underlying file
* Bridgewater 2021 - real
* East Lyme 21 - really 4475 are indeed missing.
* Pomfret - Real
* Goshen - Real
* Norfolk 2021 - genuinely missing