# Quality Check - Reconstructed Municipal Data

The purpose of this file is to Quality Check the re-constructed municipal data, compiled by RN in the file `reconstruct_municipal.ipynb`

In [None]:
!pip install seaborn

In [2]:
# Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrames
import pandas as pd

# Path Management
import pathlib
import os

# Progress Tracking
from tqdm import tqdm

# Regular expressions
import re

# Maths
import numpy as np

# System
from sys import platform
# import platform

# Import Geopandas
import geopandas as gpd
from shapely.geometry import Point
from geopandas.tools import geocode
import geopy

# Heatmaps
import seaborn as sns

In [3]:
# Set paths
path = raw_path = pathlib.Path().resolve()

if platform == "windows":
    raw_path = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "Vehicles_2022" / "Compiled"
elif platform == "linux":
    raw_path = pathlib.Path().resolve().parent.parent / "rn_home"/ "data" / "municipal_dataset_latest"
    vin_detail_output = pathlib.Path().resolve().parent.parent / "rn_home"/ "data" / "vin_matching"
raw_path.is_dir()

True

# Functions

In [4]:
def fetch_vin_data(vin, variables = None):
        """
        Input: An unmatched, but corrected VIN
        Output: A matched VIN or NA
        
        """
        if not variables:
            variables = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]
        else:
            variables = variables
        
        url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{vin.strip()}?format=csv")

        # Download response
        resp_df = pd.read_csv(url)

        # Extract needed
        resp_df = resp_df.loc[:, ["variable", "value"]].T
        # resp_df = resp_df.loc[resp_df["variable"].isin(variables), ["variable", "value"]].T
        resp_df.columns = resp_df.iloc[0]
        resp_df = resp_df.drop("variable", axis = 0)
        resp_df["vin_corrected"] = vin
        
        return resp_df

# 1. Load data

In [5]:
def reset_recompiled_data(version):
    recompiled_data = pd.read_csv(raw_path / f"2019-21_data_compiled_RN_{version}.csv", index_col = 0)
    return recompiled_data

def reset_original_data():
    if platform == "linux":
        original_data = pd.read_csv(path.parent.parent /"data"/"municipal_compilation"/"old_file"/ "2019-21 data compiled.csv", chunksize = 1000)
    else:
        original_data = pd.read_csv(raw_path / "2019-21 data compiled.csv", chunksize = 1000)
    return original_data

In [6]:
recompiled_data = reset_recompiled_data("102323")

  recompiled_data = pd.read_csv(raw_path / f"2019-21_data_compiled_RN_{version}.csv", index_col = 0)


# Checks

## Missing Vins

In [38]:
missing_vins = recompiled_data.loc[recompiled_data["vehicle_id"].isna(), ["record_from", "vehicle_id"]].reset_index().groupby("record_from").count()[["index"]]

In [40]:
missing_vins.sort_values("index", ascending = False)

Unnamed: 0_level_0,index
record_from,Unnamed: 1_level_1
11_Bloomfield_MVData_2019.csv,19643
107_Orange_MVData_2020.csv,14650
124_Seymour_MV_21.csv,13860
016_Bridgewater_MV_21_ALTERED.csv,4886
141_Thompson_MV_21.xlsx,457
121_Salem_MVData_2019.csv,218
101_North_Haven_MV_21.xlsx,84
083_Middletown_MV_21.xlsx,82
136_Sterling_MVData_2019.csv,69
105_Old_Lyme_MVData_2019.csv,37


## 0. SUMMARY

## Check VIN detail

In [None]:
# Get a set of 5000 random vins
vins = recompiled_data["vehicle_id"].unique().tolist()
random_indices = np.random.randint(0, len(vins), 5000)
vins_sample = [vins[i] for i in random_indices]

In [None]:
vins_sample_details = pd.DataFrame([])

for vin in tqdm(vins_sample):
    try:
        chars = fetch_vin_data(vin)
        vins_sample_details = pd.concat([vins_sample_details, chars]).reset_index(drop=True)
    except:
        pass

 30%|███       | 1511/5000 [05:22<10:56,  5.32it/s]

In [None]:
vins_sample_details.to_csv(vin_detail_output / "municipal_vin_details_sample.csv")

In [7]:
vins_sample_details = pd.read_csv(vin_detail_output / "municipal_vin_details_sample.csv", index_col = [0])

  vins_sample_details = pd.read_csv(vin_detail_output / "municipal_vin_details_sample.csv", index_col = [0])


In [16]:
vins_sample_details.columns.tolist()

['Suggested VIN',
 'Error Code',
 'Possible Values',
 'Additional Error Text',
 'Error Text',
 'Vehicle Descriptor',
 'Destination Market',
 'Make',
 'Manufacturer Name',
 'Model',
 'Model Year',
 'Plant City',
 'Series',
 'Trim',
 'Vehicle Type',
 'Plant Country',
 'Plant Company Name',
 'Plant State',
 'Trim2',
 'Series2',
 'Note',
 'Base Price ($)',
 'Non-Land Use',
 'Body Class',
 'Doors',
 'Windows',
 'Wheel Base Type',
 'Track Width (inches)',
 'Gross Vehicle Weight Rating From',
 'Bed Length (inches)',
 'Curb Weight (pounds)',
 'Wheel Base (inches) From',
 'Wheel Base (inches) To',
 'Gross Combination Weight Rating From',
 'Gross Combination Weight Rating To',
 'Gross Vehicle Weight Rating To',
 'Bed Type',
 'Cab Type',
 'Trailer Type Connection',
 'Trailer Body Type',
 'Trailer Length (feet)',
 'Other Trailer Info',
 'Number of Wheels',
 'Wheel Size Front (inches)',
 'Wheel Size Rear (inches)',
 'Entertainment System',
 'Steering Location',
 'Number of Seats',
 'Number of Seat 

In [25]:
vars_wanted = ['Make', 'Manufacturer Name', 'Model',
               'Model Year', 'Trim', 'Base Price ($)',
               'Body Class', 'Doors', 'Curb Weight (pounds)', 'Gross Combination Weight Rating From', 'Battery Energy (kWh) From',
               'Engine Power (kW)',
               'Wheel Base (inches) From', 'Wheel Base (inches) To',
               'Engine Brake (hp) From']

In [26]:
missing_details = vins_sample_details.notna().sum()
missing_details_select = pd.DataFrame(missing_details.loc[vars_wanted]).reset_index().rename({0: "Number Available"}, axis =1)
missing_details_select["Percentage"] = missing_details_select.loc[:, "Number Available"]/5000

In [27]:
missing_details_select.sort_values("Percentage", ascending = False)

Unnamed: 0,index,Number Available,Percentage
1,Manufacturer Name,4500,0.9
0,Make,4498,0.8996
3,Model Year,4494,0.8988
2,Model,4441,0.8882
6,Body Class,4438,0.8876
7,Doors,3588,0.7176
14,Engine Brake (hp) From,2432,0.4864
4,Trim,1877,0.3754
12,Wheel Base (inches) From,898,0.1796
5,Base Price ($),657,0.1314


In [28]:
print(missing_details_select.sort_values("Percentage", ascending = False).to_latex(index=False))

\begin{tabular}{lrr}
\toprule
index & Number Available & Percentage \\
\midrule
Manufacturer Name & 4500 & 0.900000 \\
Make & 4498 & 0.899600 \\
Model Year & 4494 & 0.898800 \\
Model & 4441 & 0.888200 \\
Body Class & 4438 & 0.887600 \\
Doors & 3588 & 0.717600 \\
Engine Brake (hp) From & 2432 & 0.486400 \\
Trim & 1877 & 0.375400 \\
Wheel Base (inches) From & 898 & 0.179600 \\
Base Price ($) & 657 & 0.131400 \\
Curb Weight (pounds) & 41 & 0.008200 \\
Wheel Base (inches) To & 31 & 0.006200 \\
Engine Power (kW) & 28 & 0.005600 \\
Battery Energy (kWh) From & 3 & 0.000600 \\
Gross Combination Weight Rating From & 0 & 0.000000 \\
\bottomrule
\end{tabular}



In [1]:
vars_wanted

NameError: name 'vars_wanted' is not defined

## 1a. Define required functions

In [25]:
def create_valid_zip(zip):
    try:
        zip_str = str(zip)
        zip_str = zip_str.strip()
        has_dot = re.search(r"\.", zip_str)

        # Get rid of decimal places
        if has_dot:
            zip_str = zip_str[0:re.search(r"\.", zip_str).start()]

        split_zip = re.split("-", zip_str)
        
        if len(split_zip) == 2:
            return create_valid_zip(split_zip[0])
        else:
            # If length is less than 4, return na
            if len(zip_str) < 4:
                return np.NaN
                
            # If length is 4 or 5, check it
            elif((len(zip_str) == 5) | (len(zip_str) == 4)):
                matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", zip_str)
                if matched:
                    return matched[0].zfill(5)
                else:
                    return np.NaN
            # If the zip is between 5 and 8 (inclusive) long, we assume the first 4 are the first part
            # And the second 4 are the second part
            # There is no other way to do this... 
            elif((len(zip_str) > 5) & (len(zip_str)<9)):
                return create_valid_zip(zip_str[0:4])
            elif (len(zip_str) == 9):
                return create_valid_zip(zip_str[0:5])
            else:
                return np.NaN
        
    except Exception as e:
        print(e)
        return np.NaN

def ct_zip(zip):
    try:
        zip_str = str(int(zip)).zfill(5)
        if zip_str[0:2] == "06":
            return True
        else:
            return False
    except:
        return False

  matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", zip_str)


## 1b. CT Zips and State inconsistencies

### 1bi. Extract from dataset

In [33]:
recompiled_data = reset_recompiled_data("102323")

In [34]:
ct_zip_nonct_state = pd.DataFrame()
nonct_zip_ct_state = pd.DataFrame()

i = 0

for chunk in tqdm(recompiled_data):
    # Clean up
    chunk = chunk.reset_index()
    chunk = chunk.rename(columns = {"index" : "original_index"})
    
    # Check that the ZIPs and the states match
    chunk_corrected_zips = chunk
    chunk_corrected_zips["zip_corrected"] = chunk_corrected_zips["zip"].apply(lambda x: create_valid_zip(x))
    # get_valid_zips(chunk, "zip")
    # Join them
    #chunk_corrected_zips = chunk.join(valid_zips).reset_index(drop = True)
    
    # Get CT zip codes
    ct_zip_mask = chunk_corrected_zips["zip_corrected"].apply(lambda x: ct_zip(x))
    ct_state_mask = chunk_corrected_zips["state"] == "CT"
    ct_zip_nonct_state_mask = ~ct_state_mask & ct_zip_mask
    nonct_zip_ct_state_mask = ~ct_zip_mask & ct_state_mask
    
    # Get ct zip nonct state
    ct_zip_nonct_state_chunk = chunk_corrected_zips.loc[ct_zip_nonct_state_mask]
    ct_zip_nonct_state_chunk = ct_zip_nonct_state_chunk.dropna(axis = 0, subset = ["zip_corrected"])
    ct_zip_nonct_state = pd.concat([ct_zip_nonct_state, ct_zip_nonct_state_chunk])

    # Get non CT zip ct state
    nonct_zip_ct_state_chunk = chunk_corrected_zips.loc[nonct_zip_ct_state_mask]
    nonct_zip_ct_state_chunk = nonct_zip_ct_state_chunk.dropna(axis = 0, subset = ["zip_corrected"])
    nonct_zip_ct_state = pd.concat([nonct_zip_ct_state, nonct_zip_ct_state_chunk])

5788it [01:17, 74.90it/s]


In [37]:
ct_zip_nonct_state

Unnamed: 0,original_index,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
491,491,110_Plainville_MV_21.xlsx,APOLO-CEVALLOS JORGE E,234 EAST ST UNIT 20,PLAINVILE,PA,6062,2014,HYUND,SONATA H,1,KMHEC4A47EA113558,,,,,,06062
941,8941,110_Plainville_MV_21.xlsx,LAVIGNE JOSEPH J,35 WAYNE DR,PLAINVILLE,QC,6062,2011,SUBAR,IMPREZA,1,JF1GH6B69BH820812,,,,,,06062
107,9107,110_Plainville_MV_21.xlsx,LESANO CONSTRUCTION LLC,111 DIAMON AVE,PLAINVILLE,OT,6062,2005,DODGE,RAM 1500,3,1D7HU18D55S317908,,,,,,06062
12,23012,101_North_Haven_MV_21.xlsx,CRAFT BEER GUILD DISTRIBUTING OF CONNECT,352 SACKETT POINT RD,NORTH HAVEN,CA,6473,2013,HINO,HINO 268,2,5PVNJ8JT3D4554181,,55219.0,NORTH HAVEN,CT,6473,06473
430,33430,101_North_Haven_MV_21.xlsx,MULLEN ASHLEY L,9 MAIDEN LANE,NORTH HAVEN,WI,6473,2013,LINCO,MKS AWD,1,1LNHL9EK5DG603365,,65637.0,,CT,53597,06473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,5705023,105_Old_Lyme_MVData_2019.csv,BD HOLDINGS LLC,PO BOX 447,FISHERS ISLAND,NY,6390,2011,FORD,ECONOLIN,3,1FBSS3BL6BDB06822,,50478.0,,,,06390
24,5705024,105_Old_Lyme_MVData_2019.csv,BD HOLDINGS LLC,PO BOX 447,FISHERS ISLAND,NY,6390,2001,TOYOT,TUNDRA/S,3,5TBBT44121S216756,,50479.0,,,,06390
25,5705025,105_Old_Lyme_MVData_2019.csv,BD HOLDINGS LLC,PO BOX 447,FISHERS ISLAND,NY,6390,2018,MITSU,OUTLANDE,1,JA4AZ3A30JZ070576,,50480.0,,,,06390
26,5705026,105_Old_Lyme_MVData_2019.csv,BD HOLDINGS LLC,PO BOX 447,FISHERS ISLAND,NY,6390,2007,SAAB,9-3 2.0T,3,YS3FD49Y071126547,,50481.0,,,,06390


In [36]:
version = "102323"
ct_zip_nonct_state.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / f"ct_zip_nonct_state_{version}.csv")
nonct_zip_ct_state.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / f"nonct_zip_ct_state_{version}.csv")

### 1bii. Read and assess errors

In [53]:
ct_zip_nonct_state = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "ct_zip_nonct_state_102023.csv", index_col = 0)
nonct_zip_ct_state = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "nonct_zip_ct_state_102023.csv", index_col = 0)
old = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "non_matching_zips_states.csv")

  old = pd.read_csv(path.parent.parent / "data" / "analysis_outputs" / "non_matching_zips_states.csv")


* There appear to be a few kinds of errors:
    * **CT ZIP, Non CT Address:**
        * E.g. "31 SLIPTOWN RD" / "SHARON" / "NH" / 6010 (31 Sliptown Rd. is a real address in New Hampshire)
        * "12941 N Fox Hollow Dr" / "MARANA" / "AZ" (Real address, but wrong ZIP code"
        * Note that in both of the above instances from 017_Brisol_MV_21 - the underlying file provides no further information on how to parse this information.
        * 79 WOODLAND RD CARIBOU ME 6417 - Not a CT address, CT Zip code. The Deep River underlying file provides no way to fix this. 
    * **Erroneous CT Addresses**
        * E.g. 693 DOGWOOD DR CHESHIRE OT --> Probably means "CT," this is a real address
        * E.g. 152 DAISY LN DURHAM HI 6422 --> 152 Daisy Lane is a CT address. Perhaps "CT" was written wrong? 

### 1biii. Investigate CT ZIPs non CT state

In [57]:
print(f"Of the {len(ct_zip_nonct_state)} rows, {ct_zip_nonct_state["state"].isna().sum()} are NA for state")

Of the 380 rows, 0 are NA for state


In [59]:
ct_zip_nonct_state[ct_zip_nonct_state["state"].notna()].groupby("record_from").count()[["vehicle_id"]].sort_values("vehicle_id", ascending=False).head(5)

Unnamed: 0_level_0,vehicle_id
record_from,Unnamed: 1_level_1
135_Stamford_MV_21.xlsx,16
034_Danbury_MV_21.xlsx,14
105_Old_Lyme_MV_21.xlsx,13
093_New_Haven_MV_21.xlsx,12
135_Stamford_MVData_2020.csv,11


In [69]:
# Observe Stamford
ct_zip_nonct_state[(ct_zip_nonct_state["record_from"].str.contains("135_Stamford_MV_21")) & (ct_zip_nonct_state["state"].notna())].head(3)

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
108,135_Stamford_MV_21.xlsx,AGUILAR-CARRERA ELIAS,31 CAROLINA RD 2FL,STAMFORD,MD,6902,2001.0,CHEVR,4500 W45,2.0,J8BC4B14617002821,,,,,,6902
109,135_Stamford_MV_21.xlsx,AGUILAR-CARRERA ELIAS,31 CAROLINA RD 2FL,STAMFORD,MD,6902,2005.0,GMC,SIERRA K,2.0,1GDJK34215E234911,,,,,,6902
110,135_Stamford_MV_21.xlsx,AGUILAR-CARRERA ELIAS,31 CAROLINA RD 2FL,STAMFORD,MD,6902,2000.0,TOYOT,SIENNA L,1.0,4T3ZF13C1YU217680,,,,,,6902


**These are erroneous - attempt geocoding them without the state and see where they land**

In [68]:
# Observe Danbury
ct_zip_nonct_state[(ct_zip_nonct_state["record_from"].str.contains("Danbury")) & (ct_zip_nonct_state["state"].notna())].head(3)

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
254,034_Danbury_MV_21.xlsx,ADAMS JILL L,6 CEDAR CREST DR,DANBURY,CA,6811,2017.0,NISSA,MURANO S,1.0,5N1AZ2MH9HN125868,,,,,,6811
255,034_Danbury_MV_21.xlsx,ALVES PRISCILA M,8 TOWER PL APT A,DANBURY,MD,6810,2016.0,HONDA,CR-V SE,1.0,2HKRM4H46GH651740,,,,,,6810
256,034_Danbury_MV_21.xlsx,CAJAMARCA SEGUNDO,3 FIRST STREET EXT,DANBURY,NY,6810,2008.0,HONDA,CIVIC EX,1.0,2HGFA16818H501014,,,,,,6810


**These are erroneous**

In [73]:
# Observe Old Lyme
ct_zip_nonct_state[(ct_zip_nonct_state["record_from"].str.contains("Lyme")) & (ct_zip_nonct_state["state"].notna())].head(3)

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
225,105_Old_Lyme_MV_21.xlsx,BD HOLDINGS LLC,PO BOX 447,FISHERS ISLAND,NY,6390,2002.0,FORD,ECONOLIN,3.0,1FTNE24L62HB67748,,,,,,6390
226,105_Old_Lyme_MV_21.xlsx,BD HOLDINGS LLC,PO BOX 447,FISHERS ISLAND,NY,6390,2011.0,FORD,ECONOLIN,3.0,1FBSS3BL6BDB06822,,,,,,6390
227,105_Old_Lyme_MV_21.xlsx,BD HOLDINGS LLC,1420 THE GLOAMING #447,FISHERS ISLAND,NY,6390,2004.0,FORD,E450 SUP,2.0,1FDXE45P14HB09356,,,,,,6390


**These are correct**

In [76]:
# Observe NHV
ct_zip_nonct_state[(ct_zip_nonct_state["record_from"].str.contains("New_Haven")) & (ct_zip_nonct_state["state"].notna())].head(3)

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
20,093_New_Haven_MV_21.xlsx,A T AND T COMMUNICATIONS,909 CHESTNUT ST 34A07,ST LOUIS,MO,6492,1987.0,TOTEE,4896,10.0,050402,,50039.0,,,,6492
21,093_New_Haven_MV_21.xlsx,BRADLEY NICOLE L,116 SPRINGSIDE AVE,NEW HAVEN,MD,6515,2019.0,NISSA,PATHFIND,1.0,5N1DR2MM5KC652562,,55559.0,,,,6515
22,093_New_Haven_MV_21.xlsx,CEDAR ROOFING LLC,125 CEDAR ST,NEW HAVEN,OT,6519,1998.0,ACURA,INTEGRA,3.0,JH4DC4454WS003338,,58671.0,,,,6519


**Erroneous**

### 1biv. Look at Non-CT ZIP ct State

In [70]:
nonct_zip_ct_state

Unnamed: 0,original_index,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
242,54242,140_Thomaston_MV_21.xlsx,STARKS BRITTANY D,30 LEIGH AVE,THOMASTON,CT,36787,2012,FORD,ESCAPE X,1,1FMCU9C75CKA10106,,,,,,36787
175,105175,146_Vernon_MV_21.xlsx,LEASE PLAN U.S.A. INC.,1165 SANCTURARY PARKWAY,ALPHARETTA,CT,30009,2019,CHEVR,COLORADO,1,1GCHSBEA3K1227996,,,,,,30009
35,130035,103_Norwalk_MV_21.xlsx,HEIDEMANN BEVERLY ...,146 W ROCKS RD ...,NORWALK,CT,68512-232,2011,YAKIM,VINS,11,510SF1114BN009217,,,,,,68512
36,130036,103_Norwalk_MV_21.xlsx,MANDUJANO JAIME ...,58 N TAYLOR AVE ...,NORWALK,CT,68541-410,1977,HOLSC,2000,11,CTUNKNOWN95699011,,,,,,68541
38,130038,103_Norwalk_MV_21.xlsx,SON YOUNG SANG ...,198 ELY AVE ...,NORWALK,CT,68544-229,2011,CHANGZHOU NANXIASHU,TRAILER,11,LN2AD001XBJ000038,,,,,,68544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,5540908,77_Manchester_MVData_2019_ALTERED.csv,CHMIELEWSKI THOMAS D,105 ZENA RD,KINGSTON,CT,12401-,2014.0,SUBAR,XV CROSS,,JF2GPACC3E9244364,,,,,,12401
178,5618178,36_Deep_River_MVData_2019.csv,ORELLANA ANA C,185 ESTANCIA DR 237,SAN JOSE,CT,95134,2012,TOYOT,MATRIX,1,2T1KU4EE9CC766263,,,,,,95134
734,5650734,54_Glastonbury_MVData_2019.csv,WARD GEORGE D JR,118 PRINCETON AVE,FEEDING HILLS,CT,1030,2010,HONDA,ACCORD L,1,1HGCP2F36AA151587,,,,,,01030
726,5683726,26_Chester_MVData_2020.csv,FISHMAN HOWARD W,116 N 7TH ST APT 5,BROOKLYN NY,CT,11211,1999,MERCU,SABLE GS,1,1MEFM58UXXA622238,,,,,,11211


### Investigate Non CT ZIPS CT state

In [86]:
print(f"This error is present in {nonct_zip_ct_state["record_from"].nunique()} underlying files, suggesting it is generalized")
print(f"This error affects {len(nonct_zip_ct_state)} rows")

This error is present in 128 underlying files, suggesting it is generalized
This error affects 808 rows


In [88]:
nonct_zip_ct_state.groupby("record_from").count()[["vehicle_id"]].sort_values("vehicle_id", ascending=False)

Unnamed: 0_level_0,vehicle_id
record_from,Unnamed: 1_level_1
103_Norwalk_MV_21.xlsx,491
099_North_Branford_MV_21.xlsx,36
135_Stamford_MVData_2020.csv,13
111_Plymouth_MV_21.xlsx,12
135_Stamford_MV_21.xlsx,12
...,...
138_Stratford_MV_21.xlsx,1
056_Granby_MV_21.xlsx,1
131_Southington_MV_21.xlsx,1
131_Southbury_MVData_2019.csv,1


In [84]:
nonct_zip_ct_state[nonct_zip_ct_state["record_from"].str.contains("Norwalk")].head(3)

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
2,103_Norwalk_MV_21.xlsx,HEIDEMANN BEVERLY ...,146 W ROCKS RD ...,NORWALK,CT,68512-232,2011.0,YAKIM,VINS,11.0,510SF1114BN009217,,,,,,68512
3,103_Norwalk_MV_21.xlsx,MANDUJANO JAIME ...,58 N TAYLOR AVE ...,NORWALK,CT,68541-410,1977.0,HOLSC,2000,11.0,CTUNKNOWN95699011,,,,,,68541
4,103_Norwalk_MV_21.xlsx,SON YOUNG SANG ...,198 ELY AVE ...,NORWALK,CT,68544-229,2011.0,CHANGZHOU NANXIASHU,TRAILER,11.0,LN2AD001XBJ000038,,,,,,68544


In [85]:
nonct_zip_ct_state[nonct_zip_ct_state["record_from"].str.contains("Norwalk")]["zip"].unique()

array(['68512-232 ', '68541-410 ', '68544-229 ', '68552-003 ',
       '68501-832 ', '68511-221 ', '68516-038 ', '68501-616 ',
       '00000     ', '68501-733 ', '68531-132 ', '68541-067 ',
       '68513-605 ', '68514-466 ', '68512-827 ', '68504-430 ',
       '68512-214 ', '68511-215 ', '68515-328 ', '68512-431 ',
       '68543-216 ', '68502-730 ', '68501-835 ', '68552-103 ',
       '68513-420 ', '68502-722 ', '68515-534 ', '68543-305 ',
       '68543-627 ', '68543-422 ', '68513-008 ', '68552-235 ',
       '68544-714 ', '68502-840 ', '68544-716 ', '68504-320 ',
       '68515-940 ', '68513-108 ', '68511-715 ', '68541-613 ',
       '68513-221 ', '68542-107 ', '68502-408 ', '68543-512 ',
       '68511-042 ', '68516-027 ', '68514-316 ', '68502-728 ',
       '68541-561 ', '68543-548 ', '68516-141 ', '68551-615 ',
       '68544-304 ', '68512-645 ', '68511-536 ', '68544-727 ',
       '68544-308 ', '68543-735 ', '68542-514 ', '68501-704 ',
       '68513-115 ', '69024-108 ', '68552-703 ', '68515

In [32]:
nonct_zip_ct_state[nonct_zip_ct_state["record_from"].str.contains("Stamford")].head(5)

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,zip_corrected
634,135_Stamford_MV_21.xlsx,BIVIANO NICHOLAS A,2428 NAPA TER,LAKE WYLIE,CT,29710,2016.0,MERCE,C300 4 M,1.0,WDDWF4KB5GR145755,,,,,,29710
635,135_Stamford_MV_21.xlsx,DE-MARTINEZ MARIA R,43 CRESCENT ST APT 3,STAMFORD,CT,60906,2017.0,SUBAR,OUTBACK,1.0,4S4BSAAC4H3223438,,,,,,60906
636,135_Stamford_MV_21.xlsx,FUENTES-SOTO NOE A,131 W BROAD ST UNIT 1,STAMFORD,CT,98387,2008.0,GMC,CANYON,1.0,1GTDT13E488180313,,,,,,98387
637,135_Stamford_MV_21.xlsx,FUENTES-SOTO NOE A,131 W BROAD ST UNIT 1,STAMFORD,CT,98387,2008.0,CHEVR,SILVERAD,3.0,1GCHK23658F128932,,,,,,98387
638,135_Stamford_MV_21.xlsx,GODOY-PAREDES CARLOS A,148B EAST BOSTON POST ROAD,MAMARONECK,CT,10543,2005.0,NISSA,PATHFIND,1.0,5N1AR18W55C762007,,,,,,10543


## 1c. Missing Data and Other

In [39]:
recompiled_data = reset_recompiled_data("102323")

In [None]:
missing_state_or_zip = pd.DataFrame()
nonct_state_or_nonct_zip = pd.DataFrame()

i = 0

In [50]:
for chunk in tqdm(recompiled_data):
    # Clean up
    chunk = chunk.reset_index()
    chunk = chunk.rename(columns = {"index" : "original_index"})
    
    # Check that the ZIPs and the states match
    chunk_corrected_zips = chunk
    chunk_corrected_zips["zip_corrected"] = chunk_corrected_zips["zip"].apply(lambda x: create_valid_zip(x))
    
    # Create masks
    ct_zip_mask = chunk_corrected_zips["zip_corrected"].apply(lambda x: ct_zip(x))
    ct_state_mask = chunk_corrected_zips["state"] == "CT"

    missing_mask = chunk_corrected_zips["state"].isna() | chunk_corrected_zips["zip_corrected"].isna()
    nonct_state_or_nonct_zip_mask = ~ct_state_mask | ~ct_zip_mask
    
    # Get missing rows
    missing_rows = chunk_corrected_zips.loc[missing_mask]\

    # Get Non CT state or Non CT Zip
    nonct_state_or_nonct_zip_rows = chunk_corrected_zips.loc[nonct_state_or_nonct_zip_mask]
    nonct_state_or_nonct_zip_rows = nonct_state_or_nonct_zip_rows.dropna(axis = 0, subset = ["zip_corrected", "state"])

    # Concat
    missing_state_or_zip = pd.concat([missing_state_or_zip, missing_rows])
    nonct_state_or_nonct_zip = pd.concat([nonct_state_or_nonct_zip, nonct_state_or_nonct_zip_rows])

2761it [04:36, 10.00it/s]


In [69]:
print(f"Thus there are {len(nonct_state_or_nonct_zip)} rows where the state or zip are non-CT,\nand there are a further {len(missing_state_or_zip)} where either state zip is missing")

Thus there are 493698 rows where the state or zip are non-CT,
and there are a further 94499 where either state zip is missing


In [51]:
version = "102323"
missing_state_or_zip.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / f"missing_state_or_zip{version}.csv")
nonct_state_or_nonct_zip.reset_index(drop = True).to_csv(path.parent.parent / "data" / "analysis_outputs" / f"nonct_state_or_nonct_zip{version}.csv")

## Further checks

The original purpose of recompiling the data was to address issues with missing VINs, missing zip codes, and missing street addresses. below, I confirm that these issues have been addressed.

In [13]:
recompiled_data = reset_recompiled_data("102023")

In [14]:
i = 0

missing_df = pd.DataFrame([])

for chunk in recompiled_data:
    
    # Give progress
    if (i % 1000 ==0):
        print(f"Currently on chunk number {i}")
        # display(df)
        
    # Create DF
    # Get the total number for that record_from
    total = chunk.groupby("record_from").count()["Unnamed: 0"].reset_index(name="count")
    
    # Get the number with a missing ZIP
    missing_zips = chunk.groupby("record_from")["zip"].apply(lambda x: x.isna().sum()).reset_index(name = "missing zip")
    
    # Get the number with a missing VIN
    missing_vins = chunk.groupby("record_from")["vehicle_id"].apply(lambda x: x.isna().sum()).reset_index(name = "missing VIN")
    
    # Missing address
    missing_streets = chunk.groupby("record_from")["street"].apply(lambda x: x.isna().sum()).reset_index(name = "missing street")
    
    # Get the number with state not in CT
    state_not_CT = chunk.groupby("record_from")["state"].apply(lambda x: (x!="CT").sum()).reset_index(name ="not CT")
    
    # Merge
    all_columns  = missing_zips.merge(missing_vins,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(missing_streets,
                                   how = 'outer',
                                   left_on = 'record_from',
                                   right_on = 'record_from')
    
    all_columns  = all_columns.merge(state_not_CT,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(total,
                                    how = 'outer',
                                    left_on = 'record_from',
                                    right_on = 'record_from')
    
    
    # Assign this to a DF to save
    missing_df = pd.concat([missing_df, all_columns])
    
    # add to count
    i +=1

missing_df = missing_df.groupby("record_from").sum()
missing_df = missing_df.reset_index()
missing_df_new = missing_df.copy(deep=True)

Currently on chunk number 0
Currently on chunk number 1000
Currently on chunk number 2000
Currently on chunk number 3000
Currently on chunk number 4000
Currently on chunk number 5000


In [18]:
original_data = reset_original_data()

In [19]:
i = 0

missing_df = pd.DataFrame([])

for chunk in original_data:
    
    # Give progress
    if (i % 1000 ==0):
        print(f"Currently on chunk number {i}")
        # display(df)
        
    # Create DF
    # Get the total number for that record_from
    total = chunk.groupby("record_from").count()["Unnamed: 0"].reset_index(name="count")
    
    # Get the number with a missing ZIP
    missing_zips = chunk.groupby("record_from")["zip"].apply(lambda x: x.isna().sum()).reset_index(name = "missing zip")
    
    # Get the number with a missing VIN
    missing_vins = chunk.groupby("record_from")["vehicle_id"].apply(lambda x: x.isna().sum()).reset_index(name = "missing VIN")
    
    # Missing address
    missing_streets = chunk.groupby("record_from")["street"].apply(lambda x: x.isna().sum()).reset_index(name = "missing street")
    
    # Get the number with state not in CT
    state_not_CT = chunk.groupby("record_from")["state"].apply(lambda x: (x!="CT").sum()).reset_index(name ="not CT")
    
    # Merge
    all_columns  = missing_zips.merge(missing_vins,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(missing_streets,
                                   how = 'outer',
                                   left_on = 'record_from',
                                   right_on = 'record_from')
    
    all_columns  = all_columns.merge(state_not_CT,
                                  how='outer',
                                  left_on = 'record_from',
                                  right_on = 'record_from')
    
    all_columns = all_columns.merge(total,
                                    how = 'outer',
                                    left_on = 'record_from',
                                    right_on = 'record_from')
    
    
    # Assign this to a DF to save
    missing_df = pd.concat([missing_df, all_columns])
    
    # add to count
    i +=1

missing_df = missing_df.groupby("record_from").sum()
missing_df = missing_df.reset_index()
missing_df_original = missing_df.copy(deep=True)

Currently on chunk number 0
Currently on chunk number 1000
Currently on chunk number 2000
Currently on chunk number 3000
Currently on chunk number 4000
Currently on chunk number 5000


In [20]:
missing_df_original["missing zip rank"] = missing_df_original["missing zip"].rank(method = 'min', ascending = False)
missing_df_original["missing VIN rank"] = missing_df_original["missing VIN"].rank(method = 'min', ascending = False)
missing_df_original["missing street rank"] = missing_df_original["missing street"].rank(method = 'min', ascending = False)

In [21]:
missing_df_new["missing zip rank"] = missing_df_new["missing zip"].rank(method = 'min', ascending = False)
missing_df_new["missing VIN rank"] = missing_df_new["missing VIN"].rank(method = 'min', ascending = False)
missing_df_new["missing street rank"] = missing_df_new["missing street"].rank(method = 'min', ascending = False)

In [22]:
missing_df_original.sort_values("missing street rank", ascending = True).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count,missing zip rank,missing VIN rank,missing street rank
91,103_Norwalk_MV_21.xlsx,75044,75044,75044,75044,75044,4.0,1.0,1.0
90,103_Norwalk_MVData_2020.csv,71664,71664,71664,71664,71664,5.0,2.0,2.0
290,77_Manchester_MVData_2019.csv,48990,3203,48990,48990,48990,12.0,12.0,3.0
64,077_Manchester_MV_21.xls,44487,3,44487,44487,44487,18.0,37.0,4.0
124,11_Bloomfield_MVData_2019.csv,19643,19643,19643,1630,19643,71.0,6.0,5.0
8,009_Bethel_MV_21.xlsx,17855,1,17855,1678,17855,79.0,44.0,6.0
99,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522,119.0,61.0,7.0
235,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202,123.0,61.0,8.0
169,141_Thompson_MV_21.xlsx,10054,457,10054,10054,10054,151.0,14.0,9.0
28,037_Derby_MV_21.xls,9511,0,9511,9511,9511,161.0,61.0,10.0


In [23]:
missing_df_original_zips_formerge = missing_df_original[["record_from", "missing zip", "missing zip rank"]].sort_values("missing zip rank", ascending = True).head(15)
missing_df_original_VINS_formerge = missing_df_original[["record_from", "missing VIN", "missing VIN rank"]].sort_values("missing VIN rank", ascending = True).head(10)
missing_df_original_streets_formerge = missing_df_original[["record_from", "missing street", "missing street rank"]].sort_values("missing street rank", ascending = True).head(10)

In [24]:
missing_df_new_zips_formerge = missing_df_new[["record_from", "missing zip", "missing zip rank"]].sort_values("missing zip rank", ascending = True).head(15)
missing_df_new_VINS_formerge = missing_df_new[["record_from", "missing VIN", "missing VIN rank"]].sort_values("missing VIN rank", ascending = True).head(10)
missing_df_new_streets_formerge = missing_df_new[["record_from", "missing street", "missing street rank"]].sort_values("missing street rank", ascending = True).head(10)

In [25]:
compare_missing_zips = missing_df_original_zips_formerge.merge(missing_df_new_zips_formerge, how = 'left',
                                                              left_on = "missing zip rank",
                                                              right_on = "missing zip rank",
                                                              suffixes = ("_original", "_recompiled"))

compare_missing_zips = compare_missing_zips.set_index("missing zip rank")

compare_missing_zips["Difference"] = compare_missing_zips["missing zip_recompiled"] - compare_missing_zips["missing zip_original"]

compare_missing_zips = compare_missing_zips.reset_index()

In [30]:
cm = sns.light_palette("green", as_cmap=True, reverse = True)

In [31]:
compare_missing_zips = compare_missing_zips.style.background_gradient(subset = ["Difference"], cmap=cm)
# compare_missing_zips.data = compare_missing_zips.data.drop("Town_original", axis = 1)
compare_missing_zips.data = compare_missing_zips.data.drop("missing zip rank", axis = 1)

In [32]:
total_row_zips = pd.DataFrame(compare_missing_zips.data.sum(), columns=["Total"]).T
total_row_zips.loc["Total", "record_from_original"] = ""
total_row_zips.loc["Total", "record_from_recompiled"] =""

In [33]:
compare_missing_zips.data = pd.concat([compare_missing_zips.data, total_row_zips])

In [49]:
compare_missing_zips.set_caption("Comparison of Missing ZIP Codes: New and Recompiled Data").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
        ('text-align', 'center')]}])

Unnamed: 0,record_from_original,missing zip_original,record_from_recompiled,missing zip_recompiled,Difference
0,15_Bridgeport_MVData_2019.csv,82704,107_Orange_MV_21.xlsx,13522,-69182
1,015_Bridgeport_MV_21.xlsx,80383,27_Clinton_MVData_2020.csv,13202,-67181
2,135_Stamford_MV_21.xlsx,77594,131_Southington_MV_21.xlsx,10089,-67505
3,103_Norwalk_MV_21.xlsx,75044,037_Derby_MV_21.xls,9511,-65533
4,103_Norwalk_MVData_2020.csv,71664,117_Redding_MV_21.xlsx,8171,-63493
5,151_Waterbury_MV_21.xlsx,64647,016_Bridgewater_MV_21_ALTERED.csv,4886,-59761
6,93_NewHaven_MVData_2019.csv,60366,045_East_Lyme_MV_21_ALTERED.csv,4475,-55891
7,093_New_Haven_MV_21.xlsx,57470,44_East_Lyme_MVData_2019.csv,4301,-53169
8,057_Greenwich_MV_21.xlsx,51748,112_Pomfret_MVData_2020.csv,4130,-47618
9,57_Greenwich_MVData_2020.csv,51478,055_Goshen_MV_21.XLSX,4004,-47474


### Comparing missing VINs

In [35]:
compare_missing_VINS = missing_df_original_VINS_formerge.merge(missing_df_new_VINS_formerge, how = 'left',
                                                              left_on = "missing VIN rank",
                                                              right_on = "missing VIN rank",
                                                              suffixes = ("_original", "_recompiled"))

compare_missing_VINS = compare_missing_VINS.set_index("missing VIN rank")

compare_missing_VINS["Difference"] = compare_missing_VINS["missing VIN_recompiled"] - compare_missing_VINS["missing VIN_original"]

compare_missing_VINS = compare_missing_VINS.reset_index()

compare_missing_VINS = compare_missing_VINS.style.background_gradient(subset = ["Difference"], cmap=cm)

compare_missing_VINS.data = compare_missing_VINS.data.drop("missing VIN rank", axis = 1)

In [36]:
total_row = pd.DataFrame(compare_missing_VINS.data.sum(), columns=["Total"]).T

In [37]:
total_row.loc["Total", "record_from_original"] = ""
total_row.loc["Total", "record_from_recompiled"] =""

In [38]:
compare_missing_VINS.data = pd.concat([compare_missing_VINS.data, total_row])

In [39]:
compare_missing_VINS.set_caption("Comparison of Missing VINs: New and Recompiled Data").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
        ('text-align', 'center')]}])

Unnamed: 0,record_from_original,missing VIN_original,record_from_recompiled,missing VIN_recompiled,Difference
0,103_Norwalk_MV_21.xlsx,75044,11_Bloomfield_MVData_2019.csv,19643,-55401
1,103_Norwalk_MVData_2020.csv,71664,107_Orange_MVData_2020.csv,14650,-57014
2,126_Shelton_MV_21.xlsx,39086,124_Seymour_MV_21.csv,13860,-25226
3,143_Torrington_MV_21.xlsx,29674,016_Bridgewater_MV_21_ALTERED.csv,4886,-24788
4,143_Torrington_MVData_2020.csv,29674,016_Bridgewater_MV_21_ALTERED.csv,4886,-24788
5,11_Bloomfield_MVData_2019.csv,19643,121_Salem_MVData_2019.csv,218,-19425
6,107_Orange_MVData_2020.csv,14650,101_North_Haven_MV_21.xlsx,84,-14566
7,124_Seymour_MV_21.csv,13860,083_Middletown_MV_21.xlsx,82,-13778
8,016_Bridgewater_MV_21.xlsx,4886,136_Sterling_MVData_2019.csv,69,-4817
9,122_Salisbury_MV_21.xlsx,4390,163_Windham_MV_21.xlsx,37,-4353


### Comparing Missing Street Addresses

In [40]:
compare_missing_streets = missing_df_original_streets_formerge.merge(missing_df_new_streets_formerge, how = 'left',
                                                      left_on = "missing street rank",
                                                      right_on = "missing street rank",
                                                      suffixes = ("_original", "_recompiled"))

compare_missing_streets = compare_missing_streets.set_index("missing street rank")

compare_missing_streets["Difference"] = compare_missing_streets["missing street_recompiled"] - compare_missing_streets["missing street_original"]

compare_missing_streets = compare_missing_streets.reset_index()

compare_missing_streets = compare_missing_streets.style.background_gradient(subset = ["Difference"], cmap=cm)

compare_missing_streets.data = compare_missing_streets.data.drop("missing street rank", axis = 1)

In [42]:
total_row_streets = pd.DataFrame(compare_missing_streets.data.sum(), columns=["Total"]).T
total_row_streets.loc["Total", "record_from_original"] = ""
total_row_streets.loc["Total", "record_from_recompiled"] =""

In [43]:
compare_missing_streets.data = pd.concat([compare_missing_streets.data, total_row_streets])

In [44]:
compare_missing_streets.set_caption("Comparison of Missing Street Addresses: New and Recompiled Data").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '20px'),
        ('font-weight', 'bold'),
        ('text-align', 'center')]}])

Unnamed: 0,record_from_original,missing street_original,record_from_recompiled,missing street_recompiled,Difference
0,103_Norwalk_MV_21.xlsx,75044,107_Orange_MV_21.xlsx,13522,-61522
1,103_Norwalk_MVData_2020.csv,71664,045_East_Lyme_MV_21_ALTERED.csv,13325,-58339
2,77_Manchester_MVData_2019.csv,48990,27_Clinton_MVData_2020.csv,13202,-35788
3,077_Manchester_MV_21.xls,44487,037_Derby_MV_21.xls,9511,-34976
4,11_Bloomfield_MVData_2019.csv,19643,016_Bridgewater_MV_21_ALTERED.csv,4886,-14757
5,009_Bethel_MV_21.xlsx,17855,10_Bethlehem_MVData_2019.csv,4760,-13095
6,107_Orange_MV_21.xlsx,13522,112_Pomfret_MVData_2020.csv,4130,-9392
7,27_Clinton_MVData_2020.csv,13202,055_Goshen_MV_21.XLSX,4004,-9198
8,141_Thompson_MV_21.xlsx,10054,098_Norfolk_MV_21.xlsx,1945,-8109
9,037_Derby_MV_21.xls,9511,101_North_Haven_MV_21.xlsx,84,-9427


***

**Consider missing VINs**

In [50]:
missing_df_new.sort_values("missing VIN", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count,missing zip rank,missing VIN rank,missing street rank
134,11_Bloomfield_MVData_2019.csv,0,19643,0,1630,19643,41.0,1.0,33.0
108,107_Orange_MVData_2020.csv,0,14650,0,1939,14650,41.0,2.0,33.0
144,124_Seymour_MV_21.csv,0,13860,0,1119,15410,41.0,3.0,33.0
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254,6.0,4.0,5.0
180,141_Thompson_MV_21.xlsx,0,457,7,447,10054,41.0,5.0,21.0
137,121_Salem_MVData_2019.csv,0,218,0,126,4914,41.0,6.0,33.0
97,101_North_Haven_MV_21.xlsx,84,84,84,2978,24425,14.0,7.0,10.0
77,083_Middletown_MV_21.xlsx,82,82,82,3060,35302,15.0,8.0,11.0
168,136_Sterling_MVData_2019.csv,71,69,69,223,4249,17.0,9.0,12.0
224,163_Windham_MV_21.xlsx,37,37,37,848,16516,19.0,10.0,13.0


**MISSING VINs**
* Bloomfield file for 2019 does not contain VINs
* Orange file for 2019 does not contain VINs
* Seymour file has major issues for missing VINs
* Bridgewater file - has this many missing VINs in the underlying file.
* Thompson file - has 457 missing VINs in the underlying file anyway

That is - since the VINs are missing in the underlying file anyway, nothing can be done to fix these missing VINs. So we are good.

**Consider missing street addresses**

In [51]:
 missing_df_new.sort_values("missing street", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count,missing zip rank,missing VIN rank,missing street rank
109,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522,1.0,24.0,1.0
39,045_East_Lyme_MV_21_ALTERED.csv,4475,0,13325,5171,17616,7.0,24.0,2.0
248,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202,2.0,24.0,3.0
33,037_Derby_MV_21.xls,9511,0,9511,9511,9511,4.0,24.0,4.0
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254,6.0,4.0,5.0
114,10_Bethlehem_MVData_2019.csv,0,0,4760,206,4760,41.0,24.0,6.0
119,112_Pomfret_MVData_2020.csv,4130,0,4130,4130,4130,9.0,24.0,7.0
49,055_Goshen_MV_21.XLSX,4004,0,4004,4004,4004,10.0,24.0,8.0
92,098_Norfolk_MV_21.xlsx,1945,0,1945,1945,1945,11.0,24.0,9.0
97,101_North_Haven_MV_21.xlsx,84,84,84,2978,24425,14.0,7.0,10.0


Orange 2021, Clinton 2020, Derby 2021, Bridgewater 2021 - All missing this info anyway.
Bethlehem 2019, Pomfret 2020, Goshen 2021 - all the same
Norfolk 2019 - is missing 1945 addresses in the underlying file

**Consider missing ZIP codes**

In [48]:
missing_df_new.sort_values("missing zip", ascending = False).head(20)

Unnamed: 0,record_from,missing zip,missing VIN,missing street,not CT,count,missing zip rank,missing VIN rank,missing street rank
109,107_Orange_MV_21.xlsx,13522,0,13522,13522,13522,1.0,24.0,1.0
248,27_Clinton_MVData_2020.csv,13202,0,13202,13202,13202,2.0,24.0,3.0
160,131_Southington_MV_21.xlsx,10089,0,2,12764,42996,3.0,24.0,25.0
33,037_Derby_MV_21.xls,9511,0,9511,9511,9511,4.0,24.0,4.0
129,117_Redding_MV_21.xlsx,8171,0,0,8171,8171,5.0,24.0,33.0
15,016_Bridgewater_MV_21_ALTERED.csv,4886,4886,4886,5074,7254,6.0,4.0,5.0
39,045_East_Lyme_MV_21_ALTERED.csv,4475,0,13325,5171,17616,7.0,24.0,2.0
267,44_East_Lyme_MVData_2019.csv,4301,0,0,5215,17448,8.0,24.0,33.0
119,112_Pomfret_MVData_2020.csv,4130,0,4130,4130,4130,9.0,24.0,7.0
49,055_Goshen_MV_21.XLSX,4004,0,4004,4004,4004,10.0,24.0,8.0


* Orange 2021 - **not an error** - the ZIP is genuinely missing in the underlying file
* Clinton - is genuinely missing
* Southington - 10,089 ZIPs are genuinely missing
* Derby - is genuinely missing
* Redding - is genuinely missing in the underlying file
* Bridgewater 2021 - real
* East Lyme 21 - really 4475 are indeed missing.
* Pomfret - Real
* Goshen - Real
* Norfolk 2021 - genuinely missing