# Quality Check - Reconstructed Municipal Data

In [1]:
import pandas as pd

import pathlib
import os

from tqdm import tqdm

In [12]:
raw_path = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "Vehicles_2022" / "Compiled"

# Load data

In [63]:
recompiled_data = pd.read_csv(raw_path / "2019-21_data_compiled_RN_092923.csv", chunksize = 1000)

In [40]:
original_data = pd.read_csv(raw_path / "2019-21 data compiled.csv", chunksize = 1000)

# Checks

## Check the same files are used

In [36]:
recompiled_data_sources = []
recompiled_data_len = 0

Get all the unique file sources in the recompiled data

In [37]:
i = 0

for chunk in recompiled_data:
    if i %1000 == 0:
        print(f"Currently on chunk {i}")
        
    recompiled_data_len += len(chunk)
    
    sources = list(chunk["record_from"].unique())
    sources_new = [item for item in sources if not item in recompiled_data_sources]
    for item in sources_new:
        recompiled_data_sources.append(item)
        
    i +=1

Currently on chunk 0
Currently on chunk 1000
Currently on chunk 2000
Currently on chunk 3000
Currently on chunk 4000
Currently on chunk 5000


Get all the unique file sources in the original data

In [23]:
original_data_sources = []
original_data_len = 0

In [41]:
i = 0

for chunk in original_data:
    if i %1000 == 0:
        print(f"Currently on chunk {i}")
        
    original_data_len += len(chunk)
    
    sources = list(chunk["record_from"].unique())
    sources_new = [item for item in sources if not item in original_data_sources]
    for item in sources_new:
        original_data_sources.append(item)
        
    i +=1

Currently on chunk 0
Currently on chunk 1000
Currently on chunk 2000
Currently on chunk 3000
Currently on chunk 4000
Currently on chunk 5000


Compare them

In [43]:
diff1 = [item for item in original_data_sources if not item in recompiled_data_sources]
diff2 = [item for item in recompiled_data_sources if not item in original_data_sources]

diff1

['77_Manchester_MVData_2019.csv',
 '077_Manchester_MV_21.xls',
 '122_Salisbury_MV_21.xlsx',
 '016_Bridgewater_MV_21.xlsx']

This makes sense - these are the files that got edited. Manchester 2019 and 2021, Salisbury, and Bridgewater

In [46]:
diff2 = [item for item in diff2 if not "ALTERED" in item]

In [47]:
diff2

['010_Bethlehem_MV_21.xlsx',
 '020_Burlington_MV_21.xlsx',
 '024_Chaplin_MV_21.xlsx',
 '034_Danbury_MV_21.xlsx',
 '061_Haddam_MV_21.xlsx',
 '066_Harwinton_MV_21.xlsx',
 '079_Marlborough_MV_21.xlsx',
 '086_Montville_MV_21.xlsx',
 '090_New_Canaan_MV_21.xlsx',
 '131_Southington_MV_21.xlsx',
 '147_Voluntown_MV_21.xlsx',
 '163_Windham_MV_21.xlsx']

Why weren't these items in the original data sources?... They appear to have been skipped or added later.

In [65]:
investigate_diff2 = pd.DataFrame([])
i = 0

for chunk in recompiled_data:
    if i %1000 == 0:
        print(f"Currently on chunk {i}")
        
    required = chunk[chunk["record_from"].isin(diff2)]
        
    excerpt_head = required.head(10)
    excerpt_tail = required.tail(10)
    
    excerpt = pd.concat([excerpt_head, excerpt_tail])
    
    investigate_diff2 = pd.concat([investigate_diff2, excerpt])
        
    i +=1

Currently on chunk 0
Currently on chunk 1000
Currently on chunk 2000
Currently on chunk 3000
Currently on chunk 4000
Currently on chunk 5000


In [74]:
heads = []
for item in diff2:
    df = investigate_diff2[investigate_diff2["record_from"] == item]
    heads.append(df.head(10))

for df in heads:
    display(df)

Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
95937,95937,010_Bethlehem_MV_21.xlsx,37-39 CITY HILL STREET LLC,15 W SHORE DR,BETHLEHEM,CT,6751,2021.0,CHEVR,SILVERAD,2.0,1GC4YUEY1MF128671,15 WEST SHORE DR,,BETHLEHEM,CT,6751.0
95938,95938,010_Bethlehem_MV_21.xlsx,37-39 CITY HILL STREET LLC,15 W SHORE DR,BETHLEHEM,CT,6751,2022.0,LOOK,STLC,10.0,53BLTEA10NP023562,15 WEST SHORE DR,,BETHLEHEM,CT,6751.0
95939,95939,010_Bethlehem_MV_21.xlsx,A & B HARD FLOOR LLC,PO BOX 1387,WATERBURY,CT,6721,2019.0,GMC,SIERRA K,3.0,1GT12SEY7KF230918,21 SKY MEADOW RD,,BETHLEHEM,CT,6751.0
95940,95940,010_Bethlehem_MV_21.xlsx,A MIM S CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2000.0,MACK,CX600,2.0,1M1AE07Y5YW003647,,,,,
95941,95941,010_Bethlehem_MV_21.xlsx,A MIM S CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2012.0,LANDO,T L,10.0,1LH440VHXC1018832,,,,,
95942,95942,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2013.0,PTRB,CONVENTI,2.0,1XPSD79X2DD172162,,,,,
95943,95943,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,1999.0,FONTA,303NDMRV,10.0,4LF4G4825X3508389,,,,,
95944,95944,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,1995.0,LANDO,317,10.0,1LH317VJ9S1007514,,,,,
95945,95945,010_Bethlehem_MV_21.xlsx,A MIMS CORP,218 GUILDS HOLLOW RD,BETHLEHEM,CT,6751,2004.0,INTER,4000 SER,2.0,1HTMMAAM14H667912,,,,,
95946,95946,010_Bethlehem_MV_21.xlsx,ABBEY OF REGINA LAUDIS,273 FLANDERS RD,BETHLEHEM,CT,6751,2005.0,HONDA,ACCORD L,1.0,1HGCM56495A149907,,,BETHLEHEM,CT,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
318203,318203,020_Burlington_MV_21.xlsx,A-1 ENTERPRISES LLC,8 MARY ROAD,BURLINGTON,CT,6013,2021.0,FORD,F150 SUP,3,1FTFW1E55MKD52858,,,,,
318204,318204,020_Burlington_MV_21.xlsx,A-1 ENTERPRISES LLC,8 MARY ROAD,BURLINGTON,CT,6013,2019.0,CHEVR,SILVERAD,2,1HTKHPVK5KH406240,8 MARY RD,,BURLINGTON,CT,6013.0
318205,318205,020_Burlington_MV_21.xlsx,ABALAN DOUGLAS L,71 MILFORD ST,BURLINGTON,CT,6013,2011.0,DODGE,GRAND CR,1,2D4RN5DGXBR683379,,,,,
318206,318206,020_Burlington_MV_21.xlsx,ABALAN DOUGLAS L,71 MILFORD ST,BURLINGTON,CT,6013,2004.0,DODGE,RAM 1500,1,1D7HU18D84S564872,,,,,
318207,318207,020_Burlington_MV_21.xlsx,ABALAN RINETTE R,71 MILFORD ST,BURLINGTON,CT,6013,2001.0,HONDA,CR-V EX,1,JHLRD18661C024901,,,,,
318208,318208,020_Burlington_MV_21.xlsx,ABASCAL RICHARD,68 VENICE DR,BURLINGTON,CT,6013,2013.0,TOYOT,TACOMA A,3,5TFUU4EN9DX054329,,,,,
318209,318209,020_Burlington_MV_21.xlsx,ABASCAL RICHARD,68 VENICE DR,BURLINGTON,CT,6013,2011.0,HARLE,FLSTN,12,1HD1JD517BB017457,,,,,
318210,318210,020_Burlington_MV_21.xlsx,ABDELREHIM MOHAMED E,59 BELDEN RD,BURLINGTON,CT,6013,2005.0,HONDA,PILOT EX,1,2HKYF18425H570235,59 BELDEN ROAD,,BURLINGTON,CT,6013.0
318211,318211,020_Burlington_MV_21.xlsx,ABDELREHIM MOHAMED E,59 BELDEN RD,BURLINGTON,CT,6013,2013.0,TOYOT,CAMRY L/,1,4T1BF1FK5DU660406,,,,,
318212,318212,020_Burlington_MV_21.xlsx,ABDELREHIM MOHAMED E,59 BELDEN RD,BURLINGTON,CT,6013,2012.0,HONDA,PILOT EX,1,5FNYF4H55CB053260,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
345894,345894,024_Chaplin_MV_21.xlsx,ABARZUA DAGGETT ASCHLY C,26 BEDLAM RD,CHAPLIN,CT,6235,2003.0,HONDA,VT750DCB,12,JH2RC44503M708944,,,,,
345895,345895,024_Chaplin_MV_21.xlsx,ABARZUA DAGGETT ASCHLY C,26 BEDLAM RD,CHAPLIN,CT,6235,2014.0,VOLKS,PASSAT S,1,1VWBN7A38EC063894,,,,,
345896,345896,024_Chaplin_MV_21.xlsx,ABELIN DAWN L,150 CHEWINK RD,CHAPLIN,CT,6235,2009.0,HONDA,CR-V EX,1,3CZRE48539G703675,,,,,
345897,345897,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021.0,BUICK,ENCORE P,1,KL4CJESM2MB361970,,,,,
345898,345898,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2018.0,BUICK,ENCORE P,1,KL4CJESB7JB694739,,,,,
345899,345899,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019.0,CHEVR,TRAX 1LT,1,KL7CJPSBXKB951800,,,,,
345900,345900,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021.0,GMC,TERRAIN,1,3GKALVEV6ML351991,,,,,
345901,345901,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020.0,CHEVR,SILVERAD,3,1GCRYEED2LZ369807,,,,,
345902,345902,024_Chaplin_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019.0,GMC,SIERRA L,3,2GTV2LEC7K1105839,,,,,
345903,345903,024_Chaplin_MV_21.xlsx,ADAMS ALEXIS E,20 CAREFREE LN APT 1,CHAPLIN,CT,6235,2018.0,MITSU,OUTLANDE,1,JA4AP3AWXJU023925,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
421301,421301,034_Danbury_MV_21.xlsx,29 FEDERAL ROAD LLC,64 TRIANGLE ST.,DANBURY,CT,6810,2007,CHEVR,EXPRESS,3,1GBGG29U171156648,,,,,
421302,421302,034_Danbury_MV_21.xlsx,2J S LLC,12 STARR AVE 14 STARR AVE,DANBURY,CT,6811,2014,HINO,HINO 268,2,5PVNJ8JT9E4S55479,,,,,
421303,421303,034_Danbury_MV_21.xlsx,2M GENERAL SERVICES LLC,12 TRIANGLE ST UNIT 2,DANBURY,CT,6810,1998,FORD,ECONOLIN,3,1FTRE1429WHA50822,,,,,
421304,421304,034_Danbury_MV_21.xlsx,3M GENERAL SERVICES LLC,13 COTTAGE ST FL 2,DANBURY,CT,6810,2017,FORD,FUSION S,3,3FA6P0HD1HR195797,,,,,
421305,421305,034_Danbury_MV_21.xlsx,4 SEASONSLLC,50 NORTH STREET,DANBURY,CT,6810,2021,CROSS,61435,11,431FS1411M1000212,,,,,
421306,421306,034_Danbury_MV_21.xlsx,4 SEASONSLLC,50 NORTH STREET,DANBURY,CT,6810,2019,CHEVR,EXPRESS,3,1GCWGAFG3K1255867,,,,,
421307,421307,034_Danbury_MV_21.xlsx,404 CAR LINE LLC,89 WALNUT TRL,DANBURY,CT,6811,1999,FORD,EXPEDITI,3,1FMPU18L3XLA05477,,,,,
421308,421308,034_Danbury_MV_21.xlsx,46 SL LLC,16 HAYESTOWN RD UNIT 3104,DANBURY,CT,6811,2020,PORSC,MACAN,1,WP1AA2A57LLB09404,,,,,
421309,421309,034_Danbury_MV_21.xlsx,85 MILL PLAIN ROAD LLC,85 MILL PLAIN RD,DANBURY,CT,6811,2017,RAM,RAM CHAS,70,3C7WRNDL8HG625578,,,,,
421310,421310,034_Danbury_MV_21.xlsx,A & C ROOFING AND SIDING LLC,8 MALLORY ST APT 2,DANBURY,CT,6810,2008,GMC,ACADIA S,1,1GKEV23768J245536,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
938991,938991,061_Haddam_MV_21.xlsx,A & A PROPERTY MAINTENANCE LLC,77 OAK RIDGE DR UNIT 1,HADDAM,CT,6438.0,2019,FORD,F550 SUP,2,1FD0X5HT6KEF91010,,,,,
938992,938992,061_Haddam_MV_21.xlsx,ABARIENTOS ANTONIETA L,17 LARKSPUR DR,HIGGANUM,CT,6441.0,2005,ACURA,MDX,1,2HNYD182X5H552606,,,,,
938993,938993,061_Haddam_MV_21.xlsx,ABARIENTOS CRISPIN,17 LARKSPUR DR,HIGGANUM,CT,6441.0,2011,AUDI,A4 2.0T,1,WAUFFAFL9BA091621,,,,,
938994,938994,061_Haddam_MV_21.xlsx,ABARIENTOS CRISPIN,17 LARKSPUR DR,HIGGANUM,CT,6441.0,2001,PORSC,911 CARR,1,WP0AA299X1S620343,,,,,
938995,938995,061_Haddam_MV_21.xlsx,ABBATELLO STEVEN G,5 MAPLE AVE WEST,HIGGANUM,CT,6441.0,1985,CHEVR,K10,25,2GCEK14H3F1149599,,,,,
938996,938996,061_Haddam_MV_21.xlsx,ABBATELLO STEVEN G,5 MAPLE AVE WEST,HIGGANUM,CT,6441.0,2011,VOLKS,JETTA TD,1,3VWPL7AJ2BM616673,,,,,
938997,938997,061_Haddam_MV_21.xlsx,ABBATELLO STEVEN G,5 MAPLE AVE WEST,HIGGANUM,CT,6441.0,2003,CHEVR,SILVERAD,1,1GCEC14X73Z353572,,,,,
938998,938998,061_Haddam_MV_21.xlsx,ABBOTT ARTHUR H JR,623 CANDLEWOOD HILL RD,HIGGANUM,CT,6441.0,2009,SUBAR,OUTBACK,1,4S4BP61C497330465,,,,,
938999,938999,061_Haddam_MV_21.xlsx,ABBOTT ARTHUR H JR,623 CANDLEWOOD HILL RD,HIGGANUM,CT,6441.0,1997,CHEVR,K1500,3,1GCEK14MXVZ191674,,,,,
938991,938991,061_Haddam_MV_21.xlsx,A & A PROPERTY MAINTENANCE LLC,77 OAK RIDGE DR UNIT 1,HADDAM,CT,6438.0,2019,FORD,F550 SUP,2,1FD0X5HT6KEF91010,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1043778,1043778,066_Harwinton_MV_21.xlsx,.SUPREME INDUSTRIES INC.,216 LOWER BOGUE RD,HARWINTON,CT,6791,2019,CHEVR,SILVERAD,1,1GCUYDED3KZ127370,,,,,
1043779,1043779,066_Harwinton_MV_21.xlsx,.SUPREME INDUSTRIES INC.,216 LOWER BOGUE RD,HARWINTON,CT,6791,2017,CHEVR,SILVERAD,3,1GC1KVEG7HF169684,,,,,
1043780,1043780,066_Harwinton_MV_21.xlsx,362 POST ROAD LLC,216 BOGUE RD,HARWINTON,CT,6791,2018,TIDEW,TP-22-36,11,45LBS2219J2100584,,,,,
1043781,1043781,066_Harwinton_MV_21.xlsx,ABBAMONDI CYNTHIA J,30 HITHER LANE,HARWINTON,CT,6791,2016,VOLVO,XC60 T5,1,YV4612RM7G2831290,,,,,
1043782,1043782,066_Harwinton_MV_21.xlsx,ABBOTTS JEFFREY C,95 WILDCAT HILL RD,HARWINTON,CT,6791,1983,FIAT,PININFAR,25,ZFRAS00B3D5503101,,,,,
1043783,1043783,066_Harwinton_MV_21.xlsx,ABBOTTS JEFFREY C,95 WILDCAT HILL RD,HARWINTON,CT,6791,2013,VOLKS,BEETLE T,1,3VW4A7AT1DM617780,,,,,
1043784,1043784,066_Harwinton_MV_21.xlsx,ABBOTTS JEFFREY C,95 WILDCAT HILL RD,HARWINTON,CT,6791,2016,RAM,RAM TRUC,3,3C6JR7DTXGG302349,,,,,
1043785,1043785,066_Harwinton_MV_21.xlsx,ABELING CHRISTOPHER L,576 MAIN ST,TORRINGTON,CT,6790,2011,HONDA,ELEMENT,1,5J6YH2H75BL004405,,,,,
1043786,1043786,066_Harwinton_MV_21.xlsx,ABLING-JOSEPHSON CORITA P,352 CLEARVIEW AVE,HARWINTON,CT,6791,2012,SUBAR,FORESTER,1,JF2SHADC7CH451624,,,,,
1043787,1043787,066_Harwinton_MV_21.xlsx,ABOU ARRAGE ANDRE,34 SAND HILL LN,GLASTONBURY,CT,6033,2017,HONDA,ACCORD T,1,1HGCR3F90HA007645,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1199237,1199237,079_Marlborough_MV_21.xlsx,,32 EAST HAMPTON RD,MARLBOROUGH,CT,6447,2018,TOYOT,4RUNNER,1,JTEBU5JR9J5607957,32 EAST HAMPTON RD RT 66,50001.0,MARLBOROUGH,CT,6447.0
1199238,1199238,079_Marlborough_MV_21.xlsx,,PO BOX 453,MARLBOROUGH,CT,6447,1999,FORD,F550 SUP,2,1FDAF56F0XEA33195,5 SHERWOOD LANE,50002.0,MARLBOROUGH,CT,6447.0
1199239,1199239,079_Marlborough_MV_21.xlsx,,PO BOX 453,MARLBOROUGH,CT,6447,2001,ECONO,,10,42EDPFB2X11001464,5 SHERWOOD LANE,50003.0,MARLBOROUGH,CT,6447.0
1199240,1199240,079_Marlborough_MV_21.xlsx,,PO BOX 453,MARLBOROUGH,CT,6447,2016,CHEVR,SILVERAD,3,1GC1KUEG4GF278550,5 SHERWOOD LANE,50004.0,MARLBOROUGH,CT,6447.0
1199241,1199241,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2018,PORSC,MACAN,1,WP1AA2A59JLB15167,,50005.0,,,
1199242,1199242,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2015,AUDI,A4 2.0T,1,WAUBFBFL2FN040366,,50006.0,,,
1199243,1199243,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2020,DUCAT,DIAVEL 1,12,ZDMGAHRW2LB003903,,50007.0,,,
1199244,1199244,079_Marlborough_MV_21.xlsx,,2 AVALON LN,MARLBOROUGH,CT,6447,2018,FORD,F150 RAP,1,1FTFW1RG0JFE52897,,50008.0,,,
1199245,1199245,079_Marlborough_MV_21.xlsx,,20 CARRIAGE LN,MARLBOROUGH,CT,6447,2016,NISSA,ROGUE S/,1,KNMAT2MV8GP723352,,50009.0,,,
1199246,1199246,079_Marlborough_MV_21.xlsx,,20 APACHE LN,MARLBOROUGH,CT,6447,2019,HONDA,CR-V EXL,1,5J6RW2H8XKA000252,,50010.0,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1364806,1364806,086_Montville_MV_21.xlsx,A & B EXCAVATING L.L.C.,33 COVE RD,UNCASVILLE,CT,6382,1988,INTER,C16,10,1ZFCF1622JB003085,,,,,
1364807,1364807,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,2013,FORD,F550 SUP,2,1FD0W5HT9DEA34887,,,,,
1364808,1364808,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,1997,INTER,4000 SER,2,1HTSCAAM2VH424900,,,,,
1364809,1364809,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,2005,FORD,F350 SUP,2,1FDWX37P75EB40966,,,,,
1364810,1364810,086_Montville_MV_21.xlsx,A & B EXCAVATING LLC,33 COVE RD,UNCASVILLE,CT,6382,1998,MITSU,FE639,2,JW6AAE1H5WL001462,,,,,
1364811,1364811,086_Montville_MV_21.xlsx,A & B TREE SERVICE LLC,PO BOX 335,MONTVILLE,CT,6353,1996,GMC,TOPKICK,2,1GDL7H1P4TJ512163,,,,,
1364812,1364812,086_Montville_MV_21.xlsx,A & B TREE SERVICES LLC,PO BOX 335,MONTVILLE,CT,6353,2012,INTER,4000 SER,2,3HAMMAAM6CL549363,,,,,
1364813,1364813,086_Montville_MV_21.xlsx,A J CABRAL TRUCKING,12 BONVILLE DR,UNCASVILLE,CT,6382,1994,KENWO,CONSTRUC,2,2XKDD69X9RM635618,,,,,
1364814,1364814,086_Montville_MV_21.xlsx,AANENSEN KENNETH T,8 ROBIN LN,OAKDALE,CT,6370,2021,NISSA,MURANO P,1,5N1AZ2DS3MC136494,,,,,
1364815,1364815,086_Montville_MV_21.xlsx,AANENSEN KENNETH T JR,8 ROBIN LN,OAKDALE,CT,6370,2009,HONDA,PILOT EX,1,5FNYF48509B031076,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
1431120,1431120,090_New_Canaan_MV_21.xlsx,1330 EAST PUTNAM LLC,PO BOX 811,NEW CANAAN,CT,6840.0,2019.0,CHEVR,EQUINOX,1.0,3GNAXVEX0KS674784,261 ELM ST,50001,NEW CANAAN,CT,6840.0
1431121,1431121,090_New_Canaan_MV_21.xlsx,1ST RIDE & GO CORP,58 PINE ST,NEW CANAAN,CT,6840.0,2016.0,FORD,TRANSIT,8.0,1FBZX2CM2GKA71622,,50002,,,
1431122,1431122,090_New_Canaan_MV_21.xlsx,909 WEST ROAD REVOCABLE LIVING TRUST,909 WEST RD,NEW CANAAN,CT,6840.0,1996.0,JAGUA,XJR,25.0,SAJPX1143TC781554,,50003,,,
1431123,1431123,090_New_Canaan_MV_21.xlsx,A & N PLUMBING AND HEATING LLC,22 DOWN RIVER RD,NEW CANAAN,CT,6840.0,2007.0,FORD,ECONOLIN,3.0,1FTNE24L07DA36751,,50004,,,
1431124,1431124,090_New_Canaan_MV_21.xlsx,A & N PLUMBING AND HEATING LLC,22 DOWN RIVER RD,NEW CANAAN,CT,6840.0,2018.0,RAM,PROMASTE,3.0,3C6TRVAG6JE110384,,50005,,,
1431125,1431125,090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840.0,2015.0,RAM,RAM TRUC,3.0,3C6MR5AJ2FG524435,,50006,,,
1431126,1431126,090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840.0,2007.0,ISUZU,NPR,2.0,JALB4W16277400371,,50007,,,
1431127,1431127,090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840.0,2006.0,CAM,6CAM 18,10.0,5JPBU23296P015771,,50008,,,
1431128,1431128,090_New_Canaan_MV_21.xlsx,A SILLO DEVELOPMENT LLC,691 OLD STAMFORD ROAD,NEW CANAAN,CT,6840.0,2012.0,CHEVR,SILVERAD,2.0,1GB3KZCG1CF113627,691 OLD STAMFORD RD,50009,NEW CANAAN,CT,6840.0
1431129,1431129,090_New_Canaan_MV_21.xlsx,A SILLO DEVELOPMENT LLC,691 OLD STAMFORD ROAD,NEW CANAAN,CT,6840.0,2020.0,CHEVR,SILVERAD,3.0,3GCPYFED0LG246956,691 OLD STAMFORD RD,50010,NEW CANAAN,CT,6840.0


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2076515,2076515,131_Southington_MV_21.xlsx,18FORE18 INC,201 PATTONWOOD DR,SOUTHINGTON,CT,6489.0,2002,USCAR,USCL58SA,11,4X4TSE0112X053369,,,,,
2076516,2076516,131_Southington_MV_21.xlsx,3 BROTHERS PROPERTY PRESERVATION SE,461C COOKE ST,FARMINGTON,CT,6032.0,2013,FORD,F350 SUP,3,1FTRF3B68DEA53496,,,,,
2076517,2076517,131_Southington_MV_21.xlsx,A & A SURPLUS INC.,389 MARION AVE,PLANTSVILLE,CT,6479.0,2008,TOYOT,TUNDRA D,3,5TBBV54198S491700,,,,,
2076518,2076518,131_Southington_MV_21.xlsx,A & A SURPLUS INC.,389 MARION AVE,PLANTSVILLE,CT,6479.0,2012,FREIG,M2 106 M,2,1FVACXDT5CHBV1762,,,,,
2076519,2076519,131_Southington_MV_21.xlsx,A AND A SURPLUS INC,389 MARION AVE,PLANTSVILLE,CT,6479.0,1999,INTER,4000 SER,2,1HTSCAAM3XH688887,,,,,
2076520,2076520,131_Southington_MV_21.xlsx,A AND P KRISHNA CORPORATION,151 QUEEN STREET,SOUTHINGTON,CT,6489.0,2015,MERCE,SPRINTER,3,WD3PF1CC9FP167427,,,,,
2076521,2076521,131_Southington_MV_21.xlsx,A CUT ABOVE GREENCARE LLC,41 HIGHWOOD AVE.,SOUTHINGTON,CT,6489.0,2017,FREED,6X10SA,10,5WKBE101XH1045634,,,,,
2076522,2076522,131_Southington_MV_21.xlsx,A DUIE PYLE INC,PO BOX 564,WEST CHESTER,PA,19381.0,2012,FRHT,CA12,2,1FUBGEDV9CLBK5195,,,,,
2076523,2076523,131_Southington_MV_21.xlsx,A DUIE PYLE INC,PO BOX 564,WEST CHESTER,PA,19381.0,2012,FRHT,CA12,2,1FUBGEDV0CLBK5165,,,,,
2076524,2076524,131_Southington_MV_21.xlsx,A DUIE PYLE INC,PO BOX 564,WEST CHESTER,PA,19381.0,2012,FRHT,CA12,2,1FUBGEDV0CLBK5196,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2375159,2375159,147_Voluntown_MV_21.xlsx,A KAUSCH & SONS LLC,15 BEACH VIEW ROAD EXT,VOLUNTOWN,CT,6384,2006,CHEVR,EXPRESS,2,1GBJG31U761268117,,,,,
2375160,2375160,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,GMC,ACADIA S,1,1GKKNULS2KZ110890,,,,,
2375161,2375161,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,CHEVR,EQUINOX,1,2GNAXUEV3K6194780,,,,,
2375162,2375162,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,CHEVR,SILVERAD,1,1GCRYCEF6KZ368941,,,,,
2375163,2375163,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020,CHEVR,SILVERAD,1,1GCPYFED7LZ165405,,,,,
2375164,2375164,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020,CHEVR,SILVERAD,1,1GCRYBEH8LZ271768,,,,,
2375165,2375165,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,CHEVR,SILVERAD,3,1GCRYBEH8MZ187550,,,,,
2375166,2375166,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,GMC,YUKON DE,3,1GKS2DKL1MR127895,,,,,
2375167,2375167,147_Voluntown_MV_21.xlsx,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,GMC,ACADIA S,1,1GKKNULS2MZ210281,,,,,
2375168,2375168,147_Voluntown_MV_21.xlsx,ADAMS ANGELA D,66 TEN ROD RD,VOLUNTOWN,CT,6384,2014,MERCE,C300 4 M,1,WDDGF8AB8EA949962,,,,,


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


# Same checks

In [88]:
recompiled_data = pd.read_csv(raw_path / "2019-21_data_compiled_RN_092923.csv", chunksize = 1000)

In [89]:
missing_df_old = missing_df.copy(deep = True)

In [90]:
i = 0

missing_df = pd.DataFrame([])

for chunk in recompiled_data:
    
    # Give progress
    if (i % 1000 ==0):
        print(f"Currently on chunk number {i}")
        # display(df)
        
    # Create DF
    # Get the total number for that record_from
    total = chunk.groupby("record_from").count()["Unnamed: 0"].reset_index(name="count")
    
    # Get the number with a missing ZIP
    missing_zips = chunk.groupby("record_from")["zip"].apply(lambda x: x.isna().sum()).reset_index(name = "missing zip")
    
    # Get the number with a missing VIN
    missing_vins = chunk.groupby("record_from")["vehicle_id"].apply(lambda x: x.isna().sum()).reset_index(name = "missing VIN")
    
    # Missing address
    missing_streets = chunk.groupby("record_from")["street"].apply(lambda x: x.isna().sum()).reset_index(name = "missing street")
    
    # Get the number with state not in CT
    state_not_CT = chunk.groupby("record_from")["state"].apply(lambda x: (x!="CT").sum()).reset_index(name ="not CT")
    
    # Merge
    all_columns  = missing_zips.merge(missing_vins,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(missing_streets,
                                   how = 'outer',
                                   left_on = 'record_from',
                                   right_on = 'record_from')
    
    all_columns  = all_columns.merge(state_not_CT,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(total,
                                    how = 'outer',
                                    left_on = 'record_from',
                                    right_on = 'record_from')
    
    
    # Assign this to a DF to save
    missing_df = pd.concat([missing_df, all_columns])
    
    # add to count
    i +=1

missing_df = missing_df.groupby("record_from").sum()
missing_df = missing_df.reset_index()

# Get the details out of the from_record column
record_from_split = pd.DataFrame(list(missing_df["record_from"].str.split("_")),
                                 columns = ["number", "town", "suffix", "year", "other"])
record_from_split = record_from_split.drop("other", axis =1 )
record_from_split["year"] = record_from_split["year"].str.replace("\..*$", "")

Currently on chunk number 0


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


Currently on chunk number 1000


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


Currently on chunk number 2000


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


Currently on chunk number 3000


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


Currently on chunk number 4000


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


Currently on chunk number 5000


Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
2691877,2691877,163_Windham_MV_21.xlsx,0 TO 60 MOTORSPORTS LLC,114 PINE ORCHARD RD,BRANFORD,CT,6405,1974,FERRA,365GTB4,25,17993,,,,,
2691878,2691878,163_Windham_MV_21.xlsx,A L FIRE PROTECTION LLC,45 CHERRY ST,EAST HARTFORD,CT,6108,2008,FORD,ECONOLIN,3,1FTSE34L88DA28181,,,,,
2691879,2691879,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2017,FORD,F350 SUP,2,1FT8W3DT7HEF28494,,,,,
2691880,2691880,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2020,BRAVO,SC712TA2,10,542BC1224LB031060,,,,,
2691881,2691881,163_Windham_MV_21.xlsx,A+ PUMPS & SERVICE LLC,42 MACHINE SHOP HILL RD,SOUTH WINDHAM,CT,6266,2021,REISE,TRAILER,10,55L1F2028MN008651,,,,,
2691882,2691882,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2021,VOLKS,ATLAS SE,1,1V2SR2CA8MC548989,,,,,
2691883,2691883,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF14V78KC77911,,,,,
2691884,2691884,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2008,FORD,F150,3,1FTRF12238KB63171,,,,,
2691885,2691885,163_Windham_MV_21.xlsx,AAA DURA GLAZE LLC,292 SCOTLAND RD,WINDHAM,CT,6280,2001,DODGE,DAKOTA,3,1B7FL26X61S310631,,,,,
2691886,2691886,163_Windham_MV_21.xlsx,ABBOTT CARLETON N,143 BOULEVARD RD,NORTH WINDHAM,CT,6256,2008,MITSU,GALANT E,1,4A3AB36F48E015544,,,,,


  record_from_split["year"] = record_from_split["year"].str.replace("\..*$", "")


**Consider missing VINs**

In [87]:
 missing_df.sort_values("missing VIN", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,not CT,count
133,11_Bloomfield_MVData_2019.csv,0,19643,1630,19643
107,107_Orange_MVData_2020.csv,0,14650,1939,14650
143,124_Seymour_MV_21.csv,0,13860,1119,15410
14,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,5074,7254
179,141_Thompson_MV_21.xlsx,0,457,447,10054
136,121_Salem_MVData_2019.csv,0,218,126,4914
96,101_North_Haven_MV_21.xlsx,84,84,2978,24425
76,083_Middletown_MV_21.xlsx,82,82,3060,35302
167,136_Sterling_MVData_2019.csv,71,69,223,4249
103,105_Old_Lyme_MVData_2019.csv,37,37,530,9135


**MISSING VINs**
* Bloomfield file for 2019 does not contain VINs
* Orange file for 2019 does not contain VINs
* Seymour file has major issues for missing VINs
* Bridgewater file - has this many missing VINs in the underlying file.
* Thompson file - has 457 missing VINs in the underlying file anyway

**Consider missing street addresses**

In [91]:
 missing_df.sort_values("missing street", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count
109,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522
248,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202
33,037_Derby_MV_21.xls,9511,0,9511,9511,9511
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254
114,10_Bethlehem_MVData_2019.csv,0,0,4760,206,4760
119,112_Pomfret_MVData_2020.csv,4130,0,4130,4130,4130
49,055_Goshen_MV_21.XLSX,4004,0,4004,4004,4004
92,098_Norfolk_MV_21.xlsx,1945,0,1945,1945,1945
97,101_North_Haven_MV_21.xlsx,84,84,84,2978,24425
77,083_Middletown_MV_21.xlsx,82,82,82,3060,35302


Orange 2021, Clinton 2020, Derby 2021, Bridgewater 2021 - All missing this info anyway.
Bethlehem 2019, Pomfret 2020, Goshen 2021 - all the same
Norfolk 2019 - is missing 1945 addresses in the underlying file

**Consider missing ZIP codes**

In [92]:
missing_df.sort_values("missing zip", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count
323,96_Newington_MVData_2020.csv,27612,0,0,27612,27612
39,045_East_Lyme_MV_21.csv,17616,0,0,17616,17616
3,004_Avon_MV_21.xlsx,15860,0,0,1547,15860
273,4_Avon_MVData_2020.csv,15654,0,0,15654,15654
66,072_Ledyard_MV_21.xlsx,13858,0,0,13858,13858
109,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522
248,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202
160,131_Southington_MV_21.xlsx,10089,0,2,12764,42996
33,037_Derby_MV_21.xls,9511,0,9511,9511,9511
129,117_Redding_MV_21.xlsx,8171,0,0,8171,8171


* Newington - **error** - the column should be "ZIP"
* East Lyme - **partial error** - the Zip code data is under "Address" cont for SOME of them
* Avon 2021 - **error** - the Zip code data is under ZIP
* Avon 2020 - **error** - the ZIP should be under zip1
* Ledyard 2021 - **error** - The Zip is under ADDRESS but must be split first
* Orange 2021 - **not an error** - the ZIP is genuinely missing in the underlying file
* Clinton - is genuinely missing
* Southington - 10,089 ZIPs are genuinely missing
* Derby - is genuinely missing
* Redding - is genuinely missing in the underlying file
* Canterbury 2019 - **error** - the ZIP information is under "ZIP"
* Bridgewater 2021 - real
* East Lyme 2019 - is under Zip1 and Zip2
* Pomfret - Real
* Goshen - Real
* Sherman 2021 - Error - is under zip
* Sherman 2020 - Error - is under ZIP
* Norfolk 2021 - genuinely missing
* Cornwall 2021 - **error** - under ZIP
* Manchester 2019 - **error** - under Unnamed 6 - Unnamed 8