# Reconstructing municipal data

In [None]:
!pip install usaddress

In [26]:
import pathlib
import os
import pandas as pd

import re
from tqdm import tqdm
import usaddress

## Setup raw file paths

In [352]:
raw_files_20_19 = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "CSVFiles2019"
raw_files_21 = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" /"Vehicles_2022" / "Town files"

In [353]:
list_20_19 = [filename.name for filename in raw_files_20_19.iterdir() 
              if ((filename.suffix!=".dta") & (filename.is_file()) 
                  & ~( ("_old" in filename.name) | ("noVIN" in filename.name) | ("uncleaned" in filename.name)))]
list_21 = [filename.name for filename in raw_files_21.iterdir()]

In [354]:
list_20_19 = list_20_19[0:list_20_19.index('all_pace_missing_clean.csv')]
list_20_19.remove("102_NorthStonington_MVData_2020.xlsx")

## Confirm these correspond to the large CSV

In [34]:
large_csv = pd.read_csv(pathlib.Path().resolve().parent / "ignored-data" / "vehicles_2022" / "2019-21_data_compiled_simplified_zipcitymatched.csv", chunksize = 1000)

In [35]:
large_csv_sources = []

i = 0

for chunk in large_csv:
    if i%100 == 0:
        print(i)
    
    sources = list(chunk["record_from"].unique())
    
    large_csv_sources.append(sources)
    
    i+=1    

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700


In [36]:
large_csv_sources_flat = []

for sublist in large_csv_sources:
    for item in sublist:
        large_csv_sources_flat.append(item)

In [38]:
large_csv_sources_df = pd.DataFrame(large_csv_sources_flat, columns = ["sources"])
unique_sources_large_csv = large_csv_sources_df["sources"].unique()

In [119]:
all_raw_files = list_20_19 + list_21

### Identify and resolve differences

**Items that are in all_raw_files but not in the large CSV**

In [121]:
for item in all_raw_files:
    if item not in unique_sources_large_csv:
        print(item)

102_NorthStonington_MVData_2020.xlsx


Closer inspection reveals that these files are identical. Therefore we drop `102_NorthStonington_MVData_2020.xlsx` from the 2019 and 2020 list

In [122]:
list_20_19.remove("102_NorthStonington_MVData_2020.xlsx")
all_raw_files = list_20_19 + list_21

Now repeat

In [123]:
for item in all_raw_files:
    if item not in unique_sources_large_csv:
        print(item)

**Items that are in the large CSV but not in all raw files**

In [120]:
for item in unique_sources_large_csv:
    if item not in all_raw_files:
        print(item)

**So all differences are now resolved.**

# Re-run

## Confirm correspondence to the three renamer files

In [127]:
renamer_files_dir = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun"

In [129]:
source_files_renamers = []

for file in renamer_files_dir.iterdir():
    df = pd.read_csv(file)
    source_files_renamers.append(list(df["record_from"]))

In [131]:
source_files_renamers = [item for sublist in source_files_renamers for item in sublist]

**Confirm correspondence**

In [133]:
for item in source_files_renamers:
    if item not in all_raw_files:
        print(item)

In [135]:
for item in all_raw_files:
    if item not in source_files_renamers:
        print(item)

**So we have correspondence - we just need to ensure that the contents of the renamers are correct. We do this manually.**

# MANUAL ADJUSTMENTS 1 - RENAMING FILES

In [217]:
column_renamer_2021 = pd.read_csv(pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun" / "vehicle_2021_column_renamer_RN.csv") 

In [218]:
column_renamer_2020 = pd.read_csv(pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun" / "vehicle_2020_column_renamer_RN.csv") 

In [219]:
column_renamer_2019 = pd.read_csv(pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun" / "vehicle_2019_column_renamer_RN.csv") 

### Norwalk - 2021

In [220]:
# Norwalk
norwalk_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Norwalk")].index.tolist()[0]

In [221]:
# Manually set the values
nor_cols_21 = ["txcm prim name full","txcm prim addr1","txcm prim city",
            "txcm prim state","txcm prim zip","txcm veh year",
            "txcm make","txcm model","txcm class","txcm vin"]

for i in range(1,11):
        column_renamer_2021.iloc[norwalk_index_21,i] = nor_cols_21[i-1]

In [222]:
column_renamer_2021.iloc[140,: ]

record_from      103_Norwalk_MV_21.xlsx
name                txcm prim name full
street                  txcm prim addr1
city                     txcm prim city
state                   txcm prim state
zip                       txcm prim zip
vehicle_year              txcm veh year
vehicle_make                  txcm make
vehicle_model                txcm model
vehicle_class                txcm class
vehicle_id                     txcm vin
lease_street                        NaN
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_zip                           NaN
Name: 140, dtype: object

### Norwalk - 2020

In [223]:
# Norwalk
norwalk_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Norwalk")].index.tolist()[0]

In [224]:
column_renamer_2020.loc[norwalk_index_20]

record_from      103_Norwalk_MVData_2020.csv
name                                     NaN
street                                   NaN
vehicle_id                               NaN
vehicle_make                             NaN
vehicle_model                            NaN
vehicle_year                             NaN
city                                     NaN
state                                    NaN
vehicle_class                            NaN
zip                                      NaN
UID                                      NaN
lease_city                               NaN
lease_state                              NaN
lease_street                             NaN
lease_zip                                NaN
Name: 44, dtype: object

In [225]:
nor_cols_20 = ["txcm prim name full","txcm prim addr1","txcm vin",
               "txcm make","txcm model", "txcm veh year",
               "txcm prim city", "txcm prim state","txcm class", 
               "txcm prim zip"]

In [226]:
for i in range(1,11):
        column_renamer_2020.iloc[norwalk_index_20,i] = nor_cols_20[i-1]

In [227]:
column_renamer_2020.loc[norwalk_index_20]

record_from      103_Norwalk_MVData_2020.csv
name                     txcm prim name full
street                       txcm prim addr1
vehicle_id                          txcm vin
vehicle_make                       txcm make
vehicle_model                     txcm model
vehicle_year                   txcm veh year
city                          txcm prim city
state                        txcm prim state
vehicle_class                     txcm class
zip                            txcm prim zip
UID                                      NaN
lease_city                               NaN
lease_state                              NaN
lease_street                             NaN
lease_zip                                NaN
Name: 44, dtype: object

### Colchester - 2021

In [228]:
colchester_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Colchester")].index.tolist()[0]

In [229]:
column_renamer_2021.loc[colchester_index_21]

record_from      028_Colchester_MV_21.xlsx
name                                   NaN
street                             address
city                                   NaN
state                                  NaN
zip                                    NaN
vehicle_year                           NaN
vehicle_make                           NaN
vehicle_model                          NaN
vehicle_class                          NaN
vehicle_id                             NaN
lease_street                           NaN
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_zip                              NaN
Name: 149, dtype: object

**Manual adjustments required**

In [230]:
colchester_21 = pd.read_excel(raw_files_21 / "028_Colchester_MV_21.xlsx")

In [231]:
colchester_cities = []
colchester_zipcodes = []
colchester_states = []
for entry in list(colchester_21["city_state_zip"]):
    if not pd.isna(entry):
        split = entry.split(" ")
        zip_code = split[len(split) - 1]
        state = split[len(split)-2]
        city = ' '.join(split[0:len(split)-2])
    else:
        zip_code = 'nan'
        state = 'nan'
        city = 'nan'

    colchester_cities.append(city)
    colchester_zipcodes.append(zip_code)
    colchester_states.append(state)

In [232]:
colchester_21["city"] = colchester_cities
colchester_21["state"] = colchester_states
colchester_21["zip"] = colchester_zipcodes

In [233]:
colchester_21.head(10)

Unnamed: 0,id,name_1,name_2,address,city_state_zip,identification,city,state,zip
0,50001,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,5NHUCC42X8N055055,COLCHESTER,CT,06415-
1,50002,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,55NBE1213J1005012,COLCHESTER,CT,06415-
2,50003,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,1GTEC14C18Z901358,COLCHESTER,CT,06415-
3,50004,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,1C6RR7FT6JS214821,COLCHESTER,CT,06415-
4,50005,A BEST ROOFING LLC,,381 CABIN RD,COLCHESTER CT 06415-1522,5JWCA1228MP512126,COLCHESTER,CT,06415-1522
5,50006,A JOURNEY TO THE POTTERS HOUSE INC,,81 DEER RUN DR,COLCHESTER CT 06415-1806,5NMS2CAD8KH115177,COLCHESTER,CT,06415-1806
6,50007,A JOURNEY TO THE POTTERS HOUSE INC,,81 DEER RUN DR,COLCHESTER CT 06415-1806,1FTBR2CG3LKA67039,COLCHESTER,CT,06415-1806
7,50008,A JOURNEY TO THE POTTERS HOUSE INC,,81 DEER RUN DR,COLCHESTER CT 06415-1806,542BB1013DB005628,COLCHESTER,CT,06415-1806
8,50009,A.I. BOERENKO PLUMBING & PUMP LLC,,44 BULL HILL RD,COLCHESTER CT 06415-2628,1FDXE4FS5GDC41065,COLCHESTER,CT,06415-2628
9,50010,A.I. BOERENKO PLUMBING & PUMP LLC,,44 BULL HILL RD,COLCHESTER CT 06415-2628,3D6WZ4ET9BG616637,COLCHESTER,CT,06415-2628


In [234]:
colchester_cols_21 = ["name_1", "address", "city", "state", "zip","","","","", "identification"]

for i in [1,2,3,4,5,10]:
        column_renamer_2021.iloc[colchester_index_21, i] = colchester_cols_21[i-1]
        
column_renamer_2021.loc[colchester_index_21, "record_from"] = "028_Colchester_MV_21_ALTERED.csv"

In [235]:
 column_renamer_2021.iloc[colchester_index_21]

record_from      028_Colchester_MV_21_ALTERED.csv
name                                       name_1
street                                    address
city                                         city
state                                       state
zip                                           zip
vehicle_year                                  NaN
vehicle_make                                  NaN
vehicle_model                                 NaN
vehicle_class                                 NaN
vehicle_id                         identification
lease_street                                  NaN
UID                                           NaN
lease_city                                    NaN
lease_state                                   NaN
lease_zip                                     NaN
Name: 149, dtype: object

## Salisbury 2021

In [236]:
salisbury_21 = pd.read_excel(raw_files_21 / "122_Salisbury_MV_21.xlsx", header = None)
salisbury_21.columns = ["name", "street", "city", "state", "zip", "vehicle_year", "vehicle_make", "vehicle_model", "unknown", "vehicle_id"]
salisbury_21 = salisbury_21.drop("unknown", axis =1)

In [237]:
salisbury_21

Unnamed: 0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_id
0,ABENDROTH MAXWELL H,211 INDIAN MTN RD,LAKEVILLE,CT,6039,2015,VOLKS,PASSAT S,1VWAT7A36FC096475
1,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2019,CHEVR,BOLT EV,1G1FZ6S0XK4118867
2,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020,CHEVR,EQUINOX,3GNAXVEX6LS505290
3,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2021,GMC,CANYON D,1GTG6EEN1M1144011
4,ACAR LEASING LTD,4001 EMBARCADERO DR,ARLINGTON,TX,76014,2020,CHEVR,SILVERAD,3GCPYBEK3LG424222
...,...,...,...,...,...,...,...,...,...
4386,ZUCKER BETH L,PO BOX 1427,LAKEVILLE,CT,6039,2010,HONDA,PILOT TO,5FNYF4H87AB006447
4387,ZUCKER CLAIRE H,128 ROCKY LN,SALISBURY,CT,6068,2018,SUBAR,OUTBACK,4S4BSENC7J3394269
4388,ZUCKER JON R,PO BOX 52,TACONIC,CT,6079,2016,TOYOT,HIGHLAND,5TDDKRFH8GS239395
4389,ZUCKER SCOTT,PO BOX 1427,LAKEVILLE,CT,6039,1997,JEEP,WRANGLER,1J4FY19S2VP461478


In [238]:
salisbury_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Salisbury")].index.tolist()[0]

In [239]:
column_renamer_2021.loc[salisbury_index_21]

record_from      122_Salisbury_MV_21.xlsx
name                                  NaN
street                                NaN
city                                  NaN
state                                 NaN
zip                                   NaN
vehicle_year                          NaN
vehicle_make                          NaN
vehicle_model                         NaN
vehicle_class                         NaN
vehicle_id                            NaN
lease_street                          NaN
UID                                   NaN
lease_city                            NaN
lease_state                           NaN
lease_zip                             NaN
Name: 96, dtype: object

In [240]:
salisbury_cols_21 = ["name","street", "city", 
                     "state", "zip", "vehicle_year", "vehicle_make", 
                     "vehicle_model", "", "vehicle_id"]

for i in [1,2,3,4,5,6,7,8,10]:
    column_renamer_2021.iloc[salisbury_index_21, i] = salisbury_cols_21[i-1]
    
column_renamer_2021.loc[salisbury_index_21, "record_from"] = "122_Salisbury_MV_21_ALTERED.csv"

In [241]:
column_renamer_2021.loc[salisbury_index_21]

record_from      122_Salisbury_MV_21_ALTERED.csv
name                                        name
street                                    street
city                                        city
state                                      state
zip                                          zip
vehicle_year                        vehicle_year
vehicle_make                        vehicle_make
vehicle_model                      vehicle_model
vehicle_class                                NaN
vehicle_id                            vehicle_id
lease_street                                 NaN
UID                                          NaN
lease_city                                   NaN
lease_state                                  NaN
lease_zip                                    NaN
Name: 96, dtype: object

### Torrington 2021

In [242]:
torrington_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Torrington")].index.tolist()[0]
column_renamer_2021.iloc[torrington_index_21, column_renamer_2021.columns.isin(["vehicle_id"])] = "VEHICLE ID"
column_renamer_2021.loc[torrington_index_21]

record_from      143_Torrington_MV_21.xlsx
name                              TAXPAYER
street                              STREET
city                                  CITY
state                                STATE
zip                               ZIP CODE
vehicle_year                  VEHICLE YEAR
vehicle_make                           NaN
vehicle_model                VEHICLE MODEL
vehicle_class                          NaN
vehicle_id                      VEHICLE ID
lease_street                           NaN
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_zip                              NaN
Name: 115, dtype: object

### Torrington - 2021

In [243]:
shelton_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Shelton")].index.tolist()[0]
column_renamer_2021.iloc[shelton_index_21, column_renamer_2021.columns.isin(["vehicle_id"])] = "VIN #"
column_renamer_2021.loc[shelton_index_21]

record_from      126_Shelton_MV_21.xlsx
name                           TAXPAYER
street                           STREET
city                               CITY
state                             STATE
zip                                ZIP1
vehicle_year                       YEAR
vehicle_make                       MAKE
vehicle_model                     MODEL
vehicle_class                     CLASS
vehicle_id                        VIN #
lease_street                        NaN
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_zip                           NaN
Name: 100, dtype: object

### Bethel - 2021

In [244]:
bethel_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Bethel")].index.tolist()[0]
column_renamer_2021.iloc[bethel_index_21, column_renamer_2021.columns.isin(["street"])] = "Mailing Address"
column_renamer_2021.loc[bethel_index_21]

record_from      009_Bethel_MV_21.xlsx
name                          Taxpayer
street                 Mailing Address
city                              City
state                           State 
zip                                Zip
vehicle_year                      Year
vehicle_make                      Make
vehicle_model                      NaN
vehicle_class                    Class
vehicle_id                 Vehicle Vin
lease_street         Residence Address
UID                                NaN
lease_city                         NaN
lease_state                        NaN
lease_zip                          NaN
Name: 8, dtype: object

### Manchester - 2021

In [245]:
manchester_21 = pd.read_excel(raw_files_21 / "077_Manchester_MV_21.xls")

In [246]:
manchester_21 = manchester_21.rename(columns = {"Unnamed: 5": "city",
                                      "Unnamed: 6":"state",
                                      "Unnamed: 7":"zip"})
manchester_21

Unnamed: 0,#,LIST #,TAXPAYER,CARE OF,MAILING ADDRESS,city,state,zip,Unnamed: 8,Unnamed: 9,TAX DIST,YEAR,MAKE,MODEL,CLASS CODE,VIN NUMBER,STYLE,COLOR,ASMNT
0,1.0,50001.0,@URSVC LLC,,2 WILLOWBROOK RD,CROMWELL,CT,06416,-,2505,T,2013.0,CADIL,SRX PREM,1.0,3GYFNJE33DS595309,WAGON,BLA,10890.0
1,2.0,50002.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,2004.0,ISUZU,FRR,2.0,JALF5C13547700369,TILT C,WHI,6510.0
2,3.0,50003.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,2004.0,UDUMP,61214R0,10.0,43ZDN22B340000019,TRAILE,BLA,550.0
3,4.0,50004.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,1998.0,WELD,UTILITY,10.0,1J9HH2421W1118262,TRAILE,BLA,550.0
4,5.0,50005.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,2007.0,KENWO,CONSTRUC,2.0,2NKMHZ7X07M210675,HOIST,WHI,12580.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44482,44483.0,95117.0,ZYSK DONALD R,,197 GARDNER ST,MANCHESTER,CT,06040,-,6754,T,2009.0,FORD,F150,1.0,1FTRF12859KB23566,PICKUP,WHI,6860.0
44483,44484.0,95118.0,ZYSK DONALD R,,197 GARDNER ST,MANCHESTER,CT,06040,-,6754,T,2010.0,NISSA,ALTIMA 2,1.0,1N4AL2AP2AN549552,SEDAN,GRA,4220.0
44484,,,,,,,,,,,,,,,,,,,
44485,,,,,,,,,,,,,,,,,,,


In [247]:
manchester_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Manchester")].index.tolist()[0]
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["street"])] = "MAILING ADDRESS"
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["city"])] = "city"
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["state"])] = "state"
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["zip"])] = "zip"
column_renamer_2021.loc[manchester_index_21, "record_from"] = "077_Manchester_MV_21_ALTERED.csv"
column_renamer_2021.loc[manchester_index_21]

record_from      077_Manchester_MV_21_ALTERED.csv
name                                     TAXPAYER
street                            MAILING ADDRESS
city                                         city
state                                       state
zip                                           zip
vehicle_year                                 YEAR
vehicle_make                                 MAKE
vehicle_model                               MODEL
vehicle_class                          CLASS CODE
vehicle_id                             VIN NUMBER
lease_street                                  NaN
UID                                        LIST #
lease_city                                    NaN
lease_state                                   NaN
lease_zip                                     NaN
Name: 61, dtype: object

### Norfolk - 2021
**NOTE that I have placed `RESIDENCE_STREET` as the street address**

In [248]:
norfolk_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Norfolk")].index.tolist()[0]
column_renamer_2021.iloc[norfolk_index_21, column_renamer_2021.columns.isin(["street"])] = "RESIDENCE_STREET"
column_renamer_2021.iloc[norfolk_index_21, column_renamer_2021.columns.isin(["city"])] = "RESIDENCE_CITY"
column_renamer_2021.iloc[norfolk_index_21]

record_from      098_Norfolk_MV_21.xlsx
name                           TAXPAYER
street                 RESIDENCE_STREET
city                     RESIDENCE_CITY
state                               NaN
zip                                 NaN
vehicle_year               VEHICLE_YEAR
vehicle_make               VEHICLE_MAKE
vehicle_model             VEHICLE_MODEL
vehicle_class             VEHICLE_CLASS
vehicle_id                   VEHICLE_ID
lease_street           RESIDENCE_STREET
UID                                 NaN
lease_city               RESIDENCE_CITY
lease_state                         NaN
lease_zip                           NaN
Name: 77, dtype: object

### Thompson - 2021

In [249]:
thompson_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Thompson")].index.tolist()[0]
column_renamer_2021.iloc[thompson_index_21, column_renamer_2021.columns.isin(["street"])] = "street"
column_renamer_2021.iloc[thompson_index_21, column_renamer_2021.columns.isin(["state"])] = "state"
column_renamer_2021.iloc[thompson_index_21, column_renamer_2021.columns.isin(["zip"])] = "zip"
column_renamer_2021.iloc[thompson_index_21]

record_from      141_Thompson_MV_21.xlsx
name                       taxpayer_name
street                            street
city                                 NaN
state                              state
zip                                  zip
vehicle_year                         NaN
vehicle_make                         NaN
vehicle_model                        NaN
vehicle_class                        NaN
vehicle_id                           vin
lease_street                         NaN
UID                                  NaN
lease_city                           NaN
lease_state                          NaN
lease_zip                            NaN
Name: 113, dtype: object

### Bridgewater - 2021

For bridgewater, we need to make manual edits to the file

In [250]:
bridgewater_21 = pd.read_excel(raw_files_21 / "016_Bridgewater_MV_21.xlsx")

First fix the mailing addresses

In [251]:
bridge_mail_address_states  = []
bridge_mail_address_zips    = []
bridge_mail_address_streets = []
bridge_mail_address_cities  = []


for address in tqdm(list(bridgewater_21["mail_address"])):
    # Fields needed
    fields_needed = ["USPSBoxType", "USPSBoxID", "AddressNumber", "StreetName", "StreetNamePostType",
                 "PlaceName", "StateName", "ZipCode"]
    
    # Create the street+city, state, and zip
    try:
        # Parse the address, and turn this into a list of lists
        parsed_address = usaddress.parse(address)
        parsed_address_list = [list(tup) for tup in parsed_address]
        
        # The parsed address includes duplicate fields, so we deduplicate the fields
        fields = [sublist[1] for sublist in parsed_address_list]
        fields = list(dict.fromkeys(fields))
        
        # We then create a dictionary based on these deduplicated fields
        fields_dict = dict.fromkeys(fields)
        
        # For each key in the dictionary, we set the value by appending all values for that field
        for field in fields:
            field_value = ' '.join([sublist[0] for sublist in parsed_address_list if sublist[1]==field])
            fields_dict[field] = field_value
        
        # Standardize the dictionary - we ensure that all required fields are in the dictionary
        # Where there is no value for that key, we set the value as an empty string
        for field_needed in fields_needed:
            if not field_needed in fields_dict.keys():
                 fields_dict[field_needed] = ""
    
        # We set the street, city, zip code, and state
        street = " ".join([fields_dict["USPSBoxType"],
                           fields_dict["USPSBoxID"],
                           fields_dict["AddressNumber"],
                           fields_dict["StreetName"],
                           fields_dict["StreetNamePostType"]]).strip()
        city = fields_dict["PlaceName"]
        zip_code = fields_dict["ZipCode"]
        state = fields_dict["StateName"]
    except:
        street = ""
        city = ""
        zip_code = ""
        state = ""
    
    # We append these to the master lists, that will be set as columns
    bridge_mail_address_states.append(state)
    bridge_mail_address_zips.append(zip_code)
    bridge_mail_address_streets.append(street)
    bridge_mail_address_cities.append(city)

100%|██████████| 7254/7254 [00:00<00:00, 8952.38it/s] 


In [252]:
bridgewater_21["mail_address_street"] = bridge_mail_address_streets
bridgewater_21["mail_address_city"] = bridge_mail_address_cities
bridgewater_21["mail_address_zip"] = bridge_mail_address_zips
bridgewater_21["mail_address_state"] = bridge_mail_address_states
bridgewater_21

Unnamed: 0,id,taxpayer_name,2nd_name,mail_address,vin,address_2,mail_address_street,mail_address_city,mail_address_zip,mail_address_state
0,50001.0,ABBETT LINDA J,ABBETT MARILYN S,PO BOX 357 BRIDGEWATER CT 06752-0357,2T3DFREV1GW496859,121 CLATTER VALLEY RD BRIDGEWATER CT 06752-,PO BOX 357,BRIDGEWATER,06752-0357,CT
1,50002.0,ABBETT LINDA J,,PO BOX 357 BRIDGEWATER CT 06752-0357,JTMBD33V476023102,121 CLATTER VLY RD BRIDGEWATER CT -,PO BOX 357,BRIDGEWATER,06752-0357,CT
2,50003.0,ABBETT MARILYN S,ABBETT LINDA J,PO BOX 126 BRIDGEWATER CT 06752-0126,JTMDJREV4GD005535,168 HENRY SANFORD RD BRIDGEWATER CT 06752-,PO BOX 126,BRIDGEWATER,06752-0126,CT
3,50004.0,ABBEY RESTAURANT LLC,,11 CEDAR HILL RD BRIDGEWATER CT 06752-1001,1GNWGEFGXH1246624,,11 CEDAR HILL RD,BRIDGEWATER,06752-1001,CT
4,50005.0,ABBOTT CHARLES D,,113 HENRY SANFORD RD BRIDGEWATER CT 06752-1214,186RDB1C5JH000357,,113 HENRY SANFORD RD,BRIDGEWATER,06752-1214,CT
...,...,...,...,...,...,...,...,...,...,...
7249,52001.0,YOUNG SHANNON B,,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,NS1DR8PS8FC543002,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT
7250,52002.0,YOUNG SHANNON B,,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,4WAAUER1641000333,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT
7251,52003.0,YOUNG SHANNON B,YOUNG DANIELLE,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,1GNSKJKC0HR271476,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT
7252,52004.0,YOUNG SHANNON B,,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,5NRHF12288S008667,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT


Now fix the second address - probably the lease address

In [253]:
bridge_secondary_address_states  = []
bridge_secondary_address_zips    = []
bridge_secondary_address_streets = []
bridge_secondary_address_cities  = []


for address in tqdm(list(bridgewater_21["address_2"])):
    # Fields needed
    fields_needed = ["USPSBoxType", "USPSBoxID", "AddressNumber", "StreetName", "StreetNamePostType",
                 "PlaceName", "StateName", "ZipCode"]
    
    # Create the street+city, state, and zip
    try:
        # Parse the address, and turn this into a list of lists
        parsed_address = usaddress.parse(address)
        parsed_address_list = [list(tup) for tup in parsed_address]
        
        # The parsed address includes duplicate fields, so we deduplicate the fields
        fields = [sublist[1] for sublist in parsed_address_list]
        fields = list(dict.fromkeys(fields))
        
        # We then create a dictionary based on these deduplicated fields
        fields_dict = dict.fromkeys(fields)
        
        # For each key in the dictionary, we set the value by appending all values for that field
        for field in fields:
            field_value = ' '.join([sublist[0] for sublist in parsed_address_list if sublist[1]==field])
            fields_dict[field] = field_value
        
        # Standardize the dictionary - we ensure that all required fields are in the dictionary
        # Where there is no value for that key, we set the value as an empty string
        for field_needed in fields_needed:
            if not field_needed in fields_dict.keys():
                 fields_dict[field_needed] = ""
    
        # We set the street, city, zip code, and state
        street = " ".join([fields_dict["USPSBoxType"],
                           fields_dict["USPSBoxID"],
                           fields_dict["AddressNumber"],
                           fields_dict["StreetName"],
                           fields_dict["StreetNamePostType"]]).strip()
        city = fields_dict["PlaceName"]
        zip_code = fields_dict["ZipCode"]
        state = fields_dict["StateName"]
    except:
        street = ""
        city = ""
        zip_code = ""
        state = ""
    
    # We append these to the master lists, that will be set as columns
    bridge_secondary_address_states.append(state)
    bridge_secondary_address_zips.append(zip_code)
    bridge_secondary_address_streets.append(street)
    bridge_secondary_address_cities.append(city)

bridgewater_21["secondary_address_street"] = bridge_secondary_address_streets
bridgewater_21["secondary_address_city"] =   bridge_secondary_address_cities
bridgewater_21["secondary_address_zip"] =    bridge_secondary_address_zips
bridgewater_21["secondary_address_state"] =  bridge_secondary_address_states
bridgewater_21

100%|██████████| 7254/7254 [00:00<00:00, 44812.95it/s]


Unnamed: 0,id,taxpayer_name,2nd_name,mail_address,vin,address_2,mail_address_street,mail_address_city,mail_address_zip,mail_address_state,secondary_address_street,secondary_address_city,secondary_address_zip,secondary_address_state
0,50001.0,ABBETT LINDA J,ABBETT MARILYN S,PO BOX 357 BRIDGEWATER CT 06752-0357,2T3DFREV1GW496859,121 CLATTER VALLEY RD BRIDGEWATER CT 06752-,PO BOX 357,BRIDGEWATER,06752-0357,CT,121 CLATTER VALLEY RD,BRIDGEWATER,06752-,CT
1,50002.0,ABBETT LINDA J,,PO BOX 357 BRIDGEWATER CT 06752-0357,JTMBD33V476023102,121 CLATTER VLY RD BRIDGEWATER CT -,PO BOX 357,BRIDGEWATER,06752-0357,CT,121 CLATTER VLY RD,BRIDGEWATER,,CT
2,50003.0,ABBETT MARILYN S,ABBETT LINDA J,PO BOX 126 BRIDGEWATER CT 06752-0126,JTMDJREV4GD005535,168 HENRY SANFORD RD BRIDGEWATER CT 06752-,PO BOX 126,BRIDGEWATER,06752-0126,CT,168 HENRY SANFORD RD,BRIDGEWATER,06752-,CT
3,50004.0,ABBEY RESTAURANT LLC,,11 CEDAR HILL RD BRIDGEWATER CT 06752-1001,1GNWGEFGXH1246624,,11 CEDAR HILL RD,BRIDGEWATER,06752-1001,CT,,,,
4,50005.0,ABBOTT CHARLES D,,113 HENRY SANFORD RD BRIDGEWATER CT 06752-1214,186RDB1C5JH000357,,113 HENRY SANFORD RD,BRIDGEWATER,06752-1214,CT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7249,52001.0,YOUNG SHANNON B,,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,NS1DR8PS8FC543002,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT,,,,
7250,52002.0,YOUNG SHANNON B,,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,4WAAUER1641000333,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT,,,,
7251,52003.0,YOUNG SHANNON B,YOUNG DANIELLE,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,1GNSKJKC0HR271476,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT,,,,
7252,52004.0,YOUNG SHANNON B,,221 CHRISTIAN ST BRIDGEWATER CT 06752-1505,5NRHF12288S008667,,221 CHRISTIAN ST,BRIDGEWATER,06752-1505,CT,,,,


**Now fix the column renamer**

In [254]:
bridgewater_21.head(2)

Unnamed: 0,id,taxpayer_name,2nd_name,mail_address,vin,address_2,mail_address_street,mail_address_city,mail_address_zip,mail_address_state,secondary_address_street,secondary_address_city,secondary_address_zip,secondary_address_state
0,50001.0,ABBETT LINDA J,ABBETT MARILYN S,PO BOX 357 BRIDGEWATER CT 06752-0357,2T3DFREV1GW496859,121 CLATTER VALLEY RD BRIDGEWATER CT 06752-,PO BOX 357,BRIDGEWATER,06752-0357,CT,121 CLATTER VALLEY RD,BRIDGEWATER,06752-,CT
1,50002.0,ABBETT LINDA J,,PO BOX 357 BRIDGEWATER CT 06752-0357,JTMBD33V476023102,121 CLATTER VLY RD BRIDGEWATER CT -,PO BOX 357,BRIDGEWATER,06752-0357,CT,121 CLATTER VLY RD,BRIDGEWATER,,CT


In [255]:
bridgewater_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Bridgewater")].index.tolist()[0]

In [328]:
df_rename.loc["016_Bridgewater_MV_ALTERED.csv"]

name                        taxpayer_name
street                mail_address_street
city                    mail_address_city
state                  mail_address_state
zip                      mail_address_zip
vehicle_year                         None
vehicle_make                         None
vehicle_model                        None
vehicle_class                        None
vehicle_id                            vin
lease_street     secondary_address_street
UID                                  None
lease_city         secondary_address_city
lease_state       secondary_address_state
lease_zip           secondary_address_zip
Name: 016_Bridgewater_MV_ALTERED.csv, dtype: object

In [325]:
df_rename.iloc[135]

name                        taxpayer_name
street                mail_address_street
city                    mail_address_city
state                  mail_address_state
zip                      mail_address_zip
vehicle_year                         None
vehicle_make                         None
vehicle_model                        None
vehicle_class                        None
vehicle_id                            vin
lease_street     secondary_address_street
UID                                  None
lease_city         secondary_address_city
lease_state       secondary_address_state
lease_zip           secondary_address_zip
Name: 016_Bridgewater_MV_ALTERED.csv, dtype: object

In [320]:
column_renamer_2021.loc[bridgewater_index_21]

record_from      016_Bridgewater_MV_ALTERED.csv
name                              taxpayer_name
street                      mail_address_street
city                          mail_address_city
state                        mail_address_state
zip                            mail_address_zip
vehicle_year                               None
vehicle_make                               None
vehicle_model                              None
vehicle_class                              None
vehicle_id                                  vin
lease_street           secondary_address_street
UID                                        None
lease_city               secondary_address_city
lease_state             secondary_address_state
lease_zip                 secondary_address_zip
Name: 135, dtype: object

In [257]:
renamer_fields_bridgewater = dict.fromkeys(column_renamer_2021.columns[1:])

In [258]:
renamer_fields_bridgewater["name"] = "taxpayer_name"
renamer_fields_bridgewater["street"] = "mail_address_street"
renamer_fields_bridgewater["city"] = "mail_address_city"
renamer_fields_bridgewater["state"]=  "mail_address_state"
renamer_fields_bridgewater["zip"] = "mail_address_zip"
# renamer_fields_bridgewater["vehicle_year"] 
# renamer_fields_bridgewater["vehicle_make"]
# renamer_fields_bridgewater["vehicle_model"]
# renamer_fields_bridgewater["vehicle_class"]
renamer_fields_bridgewater["vehicle_id"] = "vin"
renamer_fields_bridgewater["lease_street"] = "secondary_address_street"
# renamer_fields_bridgewater["UID"]
renamer_fields_bridgewater["lease_city"] = "secondary_address_city"
renamer_fields_bridgewater["lease_state"] = "secondary_address_state"
renamer_fields_bridgewater["lease_zip"] = "secondary_address_zip"

In [259]:
for key in renamer_fields_bridgewater.keys():
    column_renamer_2021.iloc[bridgewater_index_21, column_renamer_2021.columns.isin([key])] = renamer_fields_bridgewater[key]

In [345]:
# Set the record_from column appropriately
column_renamer_2021.loc[bridgewater_index_21, "record_from"] = "016_Bridgewater_MV_21_ALTERED.csv"

In [346]:
column_renamer_2021.loc[bridgewater_index_21]

record_from      016_Bridgewater_MV_21_ALTERED.csv
name                                 taxpayer_name
street                         mail_address_street
city                             mail_address_city
state                           mail_address_state
zip                               mail_address_zip
vehicle_year                                  None
vehicle_make                                  None
vehicle_model                                 None
vehicle_class                                 None
vehicle_id                                     vin
lease_street              secondary_address_street
UID                                           None
lease_city                  secondary_address_city
lease_state                secondary_address_state
lease_zip                    secondary_address_zip
Name: 135, dtype: object

### Southington - 2021

In [262]:
southington_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Southington")].index.tolist()[0]

In [263]:
column_renamer_2021.loc[southington_index_21]

record_from      131_Southington_MV_21.xlsx
name                                   Name
street                                  NaN
city                                   City
state                                 State
zip                                     Zip
vehicle_year                           Year
vehicle_make                           Make
vehicle_model                         Model
vehicle_class                         Class
vehicle_id                              VIN
lease_street                            NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_zip                               NaN
Name: 155, dtype: object

In [264]:
renamer_fields_southington = dict.fromkeys(column_renamer_2021.columns[1:])

In [265]:
#renamer_fields_southington["name"] = "taxpayer_name"
renamer_fields_southington["street"] = "Street Address"
# renamer_fields_southington["city"] = "Street Address"
# renamer_fields_southington["state"]=  "mail_address_state"
# renamer_fields_southington["zip"] = "mail_address_zip"
# renamer_fields_southington["vehicle_year"] 
# renamer_fields_southington["vehicle_make"]
# renamer_fields_southington["vehicle_model"]
# renamer_fields_southington["vehicle_class"]
# renamer_fields_southington["vehicle_id"] = "vin"
# renamer_fields_southington["lease_street"] = "secondary_address_street"
# renamer_fields_southington["UID"]
# renamer_fields_southington["lease_city"] = "secondary_address_city"
# renamer_fields_southington["lease_state"] = "secondary_address_state"
# renamer_fields_southington["lease_zip"] = "secondary_address_zip"

In [266]:
for key in renamer_fields_southington.keys():
    if renamer_fields_southington[key]:
        column_renamer_2021.iloc[southington_index_21, column_renamer_2021.columns.isin([key])] = renamer_fields_southington[key]

In [267]:
column_renamer_2021.loc[southington_index_21]

record_from      131_Southington_MV_21.xlsx
name                                   Name
street                       Street Address
city                                   City
state                                 State
zip                                     Zip
vehicle_year                           Year
vehicle_make                           Make
vehicle_model                         Model
vehicle_class                         Class
vehicle_id                              VIN
lease_street                            NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_zip                               NaN
Name: 155, dtype: object

## Check 2021 renamer

**Missing street address**

In [268]:
column_renamer_2021[column_renamer_2021["street"].isna()]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
27,037_Derby_MV_21.xls,,,,,,YEAR,MAKE,MODEL,,VIN,,,,,
42,055_Goshen_MV_21.XLSX,TAXPAYER,,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,
84,107_Orange_MV_21.xlsx,,,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,


These three files do not contain street addresses anyway - no way to fix this.

**Missing VIN**

In [269]:
column_renamer_2021[column_renamer_2021["vehicle_id"].isna()]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip


So there are no places where we don't have a name for the VIN column.

# 2020 Renamer

### Andover - 2020

In [270]:
andover_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Andover")].index.tolist()[0]

In [271]:
column_renamer_2020.loc[andover_index_20]

record_from      1_Andover_MVData_2020.csv
name                              TAXPAYER
street                                 NaN
vehicle_id                             NaN
vehicle_make                           NaN
vehicle_model                          NaN
vehicle_year                           NaN
city                                   NaN
state                                  NaN
vehicle_class                          NaN
zip                                    NaN
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_street                           NaN
lease_zip                              NaN
Name: 0, dtype: object

In [272]:
renamer_fields_andover = dict.fromkeys(column_renamer_2020.columns[1:])

In [273]:
# renamer_fields_andover["name"] = "taxpayer_name"
renamer_fields_andover["street"] = "street"
renamer_fields_andover["city"] = "city"
renamer_fields_andover["state"]=  "state"
renamer_fields_andover["zip"] = "zip1"
renamer_fields_andover["vehicle_year"] = "vehicle_year" 
renamer_fields_andover["vehicle_make"] = "vehicle_make"
renamer_fields_andover["vehicle_model"] = "vehicle_model"
renamer_fields_andover["vehicle_class"] = "vehicle_class"
renamer_fields_andover["vehicle_id"] = "vehicle_id"
# renamer_fields_andover["lease_street"] = "secondary_address_street"
# renamer_fields_andover["UID"]
# renamer_fields_andover["lease_city"] = "secondary_address_city"
# renamer_fields_andover["lease_state"] = "secondary_address_state"
# renamer_fields_andover["lease_zip"] = "secondary_address_zip"

In [274]:
for key in renamer_fields_andover.keys():
    if renamer_fields_andover[key]:
        column_renamer_2020.iloc[andover_index_20, column_renamer_2020.columns.isin([key])] = renamer_fields_andover[key]

In [275]:
column_renamer_2020.loc[andover_index_20]

record_from      1_Andover_MVData_2020.csv
name                              TAXPAYER
street                              street
vehicle_id                      vehicle_id
vehicle_make                  vehicle_make
vehicle_model                vehicle_model
vehicle_year                  vehicle_year
city                                  city
state                                state
vehicle_class                vehicle_class
zip                                   zip1
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_street                           NaN
lease_zip                              NaN
Name: 0, dtype: object

### Chaplin - 2020

In [276]:
chaplin_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Chaplin")].index.tolist()[0]
column_renamer_2020.loc[chaplin_index_20]

record_from      24_Chaplin_MVData_2020.csv
name                               taxpayer
street                                  NaN
vehicle_id                              NaN
vehicle_make                            NaN
vehicle_model                           NaN
vehicle_year                            NaN
city                                    NaN
state                                   NaN
vehicle_class                           NaN
zip                                     NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_street                            NaN
lease_zip                               NaN
Name: 6, dtype: object

In [277]:
renamer_fields_chaplin = dict.fromkeys(column_renamer_2020.columns[1:])

In [278]:
renamer_fields_chaplin["name"] = "taxpayer"
renamer_fields_chaplin["street"] = "street"
renamer_fields_chaplin["city"] = "city"
renamer_fields_chaplin["state"]=  "state"
renamer_fields_chaplin["zip"] = "zip1"
renamer_fields_chaplin["vehicle_year"] = "vehicle_year" 
renamer_fields_chaplin["vehicle_make"] = "vehicle_make"
renamer_fields_chaplin["vehicle_model"] = "vehicle_model"
renamer_fields_chaplin["vehicle_class"] = "vehicle_class"
renamer_fields_chaplin["vehicle_id"] = "vehicle_id"
# renamer_fields_chaplin["lease_street"] = "secondary_address_street"
# renamer_fields_chaplin["UID"]
# renamer_fields_chaplin["lease_city"] = "secondary_address_city"
# renamer_fields_chaplin["lease_state"] = "secondary_address_state"
# renamer_fields_chaplin["lease_zip"] = "secondary_address_zip"

In [279]:
for key in renamer_fields_chaplin.keys():
    if renamer_fields_chaplin[key]:
        column_renamer_2020.iloc[chaplin_index_20, column_renamer_2020.columns.isin([key])] = renamer_fields_chaplin[key]

In [280]:
column_renamer_2020.loc[chaplin_index_20]

record_from      24_Chaplin_MVData_2020.csv
name                               taxpayer
street                               street
vehicle_id                       vehicle_id
vehicle_make                   vehicle_make
vehicle_model                 vehicle_model
vehicle_year                   vehicle_year
city                                   city
state                                 state
vehicle_class                 vehicle_class
zip                                    zip1
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_street                            NaN
lease_zip                               NaN
Name: 6, dtype: object

### Torrington - 2020

In [281]:
torrington_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Torrington")].index.tolist()[0]

In [282]:
column_renamer_2020.loc[torrington_index_20]

record_from      143_Torrington_MVData_2020.csv
name                                   TAXPAYER
street                                   STREET
vehicle_id                                  NaN
vehicle_make                                NaN
vehicle_model                     VEHICLE MODEL
vehicle_year                       VEHICLE YEAR
city                                       CITY
state                                     STATE
vehicle_class                               NaN
zip                                    ZIP CODE
UID                                         NaN
lease_city                                  NaN
lease_state                                 NaN
lease_street                                NaN
lease_zip                                   NaN
Name: 69, dtype: object

In [283]:
column_renamer_2020.loc[torrington_index_20, "vehicle_id"] = "VEHICLE ID"
column_renamer_2020.loc[torrington_index_20, "vehicle_make"] = "VEHICLE MAKE"
column_renamer_2020.loc[torrington_index_20, "vehicle_class"] = "VEHICLE CLASS"

In [284]:
column_renamer_2020.loc[torrington_index_20]

record_from      143_Torrington_MVData_2020.csv
name                                   TAXPAYER
street                                   STREET
vehicle_id                           VEHICLE ID
vehicle_make                       VEHICLE MAKE
vehicle_model                     VEHICLE MODEL
vehicle_year                       VEHICLE YEAR
city                                       CITY
state                                     STATE
vehicle_class                     VEHICLE CLASS
zip                                    ZIP CODE
UID                                         NaN
lease_city                                  NaN
lease_state                                 NaN
lease_street                                NaN
lease_zip                                   NaN
Name: 69, dtype: object

### Orange - 2020

In [285]:
orange_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Orange")].index.tolist()[0]
column_renamer_2020.loc[orange_index_20]

record_from      107_Orange_MVData_2020.csv
name                               TAXPAYER
street                               STREET
vehicle_id                              NaN
vehicle_make                   VEHICLE_MAKE
vehicle_model                 VEHICLE_MODEL
vehicle_year                   VEHICLE_YEAR
city                                   CITY
state                                 STATE
vehicle_class                 VEHICLE_CLASS
zip                                    ZIP1
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_street                            NaN
lease_zip                               NaN
Name: 47, dtype: object

Cannot change this - the underlying file does not contain VINs

## Check 2020 renamer

In [286]:
column_renamer_2020[column_renamer_2020["vehicle_id"].isna()]

Unnamed: 0,record_from,name,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip,UID,lease_city,lease_state,lease_street,lease_zip
47,107_Orange_MVData_2020.csv,TAXPAYER,STREET,,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,CITY,STATE,VEHICLE_CLASS,ZIP1,,,,,


So it is just Orange, for which there are no VINs in the underlying file.

In [287]:
column_renamer_2020[column_renamer_2020["street"].isna()]

Unnamed: 0,record_from,name,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip,UID,lease_city,lease_state,lease_street,lease_zip
8,27_Clinton_MVData_2020.csv,,,VIN,Make,Model,Year,,,Class,,,,,,
49,112_Pomfret_MVData_2020.csv,TAXPAYER,,VEHICLE_ID,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,,,VEHICLE_CLASS,,,,,,


It is Clinton and Pomfret - neither of which contain street data in the underlying file.

## 2019 Renamer

### Bloomfield - 2019

In [288]:
bloomfield_index_19 = column_renamer_2019[column_renamer_2019["record_from"].str.contains("Bloomfield")].index.tolist()[0]
column_renamer_2019.loc[bloomfield_index_19]

record_from      11_Bloomfield_MVData_2019.csv
UID                                    LIST NO
name                                  TAXPAYER
street                                     NaN
city                                       NaN
state                                    STATE
zip                                        NaN
vehicle_year                              YEAR
vehicle_make                               NaN
vehicle_model                            MODEL
vehicle_class                            CLASS
vehicle_id                                 NaN
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 6, dtype: object

In [384]:
column_renamer_2019.loc[bloomfield_index_19, "street"] = "ADDRESS LINE 1"
column_renamer_2019.loc[bloomfield_index_19, "city"] = "CITY/TOWN"
column_renamer_2019.loc[bloomfield_index_19, "zip"] = "ZIP"
column_renamer_2019.loc[bloomfield_index_19, "vehicle_make"] = "MAKE "
column_renamer_2019.loc[bloomfield_index_19, "vehicle_year"] = "YEAR"
column_renamer_2019.loc[bloomfield_index_19]

record_from      11_Bloomfield_MVData_2019.csv
UID                                    LIST NO
name                                  TAXPAYER
street                          ADDRESS LINE 1
city                                 CITY/TOWN
state                                    STATE
zip                                        ZIP
vehicle_year                              YEAR
vehicle_make                             MAKE 
vehicle_model                            MODEL
vehicle_class                            CLASS
vehicle_id                                 NaN
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 6, dtype: object

### Manchester -  2019

In [290]:
manchester_19 = pd.read_csv(raw_files_20_19 / "77_Manchester_MVData_2019.csv")

In [291]:
manchester_19 = manchester_19[0:45785]

In [292]:
manchester_19 = manchester_19.rename(columns = {"Unnamed: 4" : "city",
                                                "Unnamed: 5" : "state"})
manchester_19["zip"] = manchester_19["Unnamed: 6"].astype('Int64').astype(str) + manchester_19["Unnamed: 7"] + manchester_19["Unnamed: 8"]

In [293]:
manchester_19

Unnamed: 0,#,TAXPAYER,CARE OF,MAILING ADDRESS,city,state,Unnamed: 6,Unnamed: 7,Unnamed: 8,DIST,...,MODEL,CODE,VIN NUMBER,STYLE,COLOR,ASMNT,Unnamed: 18,Unnamed: 19,Unnamed: 20,zip
0,50001.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 183,NORTH WINDHAM,CT,6256.0,-,0183,E,...,FRR,2.0,JALF5C13547700369,TILT C,WHI,"$7,540",,,,6256-0183
1,50002.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 183,NORTH WINDHAM,CT,6256.0,-,0183,E,...,61214R0,10.0,43ZDN22B340000019,TRAILE,BLA,$550,,,,6256-0183
2,50003.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 183,NORTH WINDHAM,CT,6256.0,-,0183,E,...,CONSTRUC,2.0,2NKMHZ7X07M210675,HOIST,WHI,"$16,230",,,,6256-0183
3,50004.0,A & A INTEGRATED PEST MAN &,,457 CENTER ST,MANCHESTER,CT,6040.0,-,3937,T,...,SILVERAD,3.0,1GCVKREC4FZ374901,4 DOOR,SIL,"$17,500",,,,6040-3937
4,50005.0,A & A INTEGRATED PEST MANAGEMENT COMPANY,,457 CENTER ST,MANCHESTER,CT,6040.0,-,3937,T,...,CITY EXP,3.0,3N63M0ZN8HK697113,VAN,WHI,"$10,970",,,,6040-3937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45780,96539.0,ZYLBERMAN SOL,,87 CONCORD RD,MANCHESTER,CT,6042.0,-,1723,T,...,GENESIS,1.0,KMHGN4JE6GU143308,SEDAN,SIL,"$14,960",,,,6042-1723
45781,96540.0,ZYREK ANDREW R,ZYREK GEORGE P,37 ABBE RD,MANCHESTER,CT,6040.0,-,6867,T,...,SPIDER G,25.0,ZARBA5649H1046161,CONVER,BLA,$500,,,,6040-6867
45782,96541.0,ZYREK GEORGE P,,37 ABBE RD,MANCHESTER,CT,6040.0,-,6867,T,...,E3504M A,1.0,WDDHF8JB3CA537403,SEDAN,BLU,"$9,800",,,,6040-6867
45783,96542.0,ZYSK DONALD R,,197 GARDNER ST,MANCHESTER,CT,6040.0,-,6754,T,...,F150,1.0,1FTRF12859KB23566,PICKUP,WHI,"$6,530",,,,6040-6754


In [294]:
manchester_index_19 = column_renamer_2019[column_renamer_2019["record_from"].str.contains('Manchester')].index.tolist()[0]
column_renamer_2019.loc[manchester_index_19]

record_from      77_Manchester_MVData_2019.csv
UID                                        NaN
name                                  TAXPAYER
street                                     NaN
city                                       NaN
state                                      NaN
zip                                        NaN
vehicle_year                              YEAR
vehicle_make                              MAKE
vehicle_model                            MODEL
vehicle_class                              NaN
vehicle_id                          VIN NUMBER
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 48, dtype: object

In [295]:
column_renamer_2019.loc[manchester_index_19, "street"] = "MAILING ADDRESS"
column_renamer_2019.loc[manchester_index_19, "state"] = "state"
column_renamer_2019.loc[manchester_index_19, "city"] = "city"
column_renamer_2019.loc[manchester_index_19, "zip"] = "zip"
column_renamer_2019.loc[manchester_index_19, "record_from"] = "77_Manchester_MVData_2019_ALTERED.csv"
column_renamer_2019.loc[manchester_index_19]

record_from      77_Manchester_MVData_2019_ALTERED.csv
UID                                                NaN
name                                          TAXPAYER
street                                 MAILING ADDRESS
city                                              city
state                                            state
zip                                                zip
vehicle_year                                      YEAR
vehicle_make                                      MAKE
vehicle_model                                    MODEL
vehicle_class                                      NaN
vehicle_id                                  VIN NUMBER
lease_city                                         NaN
lease_state                                        NaN
lease_street                                       NaN
lease_zip                                          NaN
Name: 48, dtype: object

## Check 2019

In [296]:
column_renamer_2019[column_renamer_2019["street"].isna()]

Unnamed: 0,record_from,UID,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_city,lease_state,lease_street,lease_zip
5,10_Bethlehem_MVData_2019.csv,,TAXPAYER,,CITY,STATE,ZIP1,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,


Bethlehem does not contain street addresses in the underlying file, anyway.

In [297]:
column_renamer_2019[column_renamer_2019["vehicle_id"].isna()]

Unnamed: 0,record_from,UID,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_city,lease_state,lease_street,lease_zip
6,11_Bloomfield_MVData_2019.csv,LIST NO,TAXPAYER,ADDRESS LINE 1,CITY/TOWN,STATE,ZIP,YEAR,MAKE,MODEL,CLASS,,,,,


Bloomfield does not contain VINs in the underlying file, anyway.

# --- REMAKE FILE ---

In [298]:
import time
import warnings

warnings.filterwarnings("ignore")

In [299]:
sorted(column_renamer_2021.columns) == sorted(column_renamer_2020.columns)== sorted(column_renamer_2019.columns)

True

In [330]:
# Files that will be updated. However, recall that some of the files, we've made edits to. 
# Therefore we save them
colchester_21.to_csv(raw_files_21 / "028_Colchester_MV_21_ALTERED.csv")
salisbury_21.to_csv(raw_files_21  /  "122_Salisbury_MV_21_ALTERED.csv")
manchester_21.to_csv(raw_files_21 / "077_Manchester_MV_21_ALTERED.csv")
bridgewater_21.to_csv(raw_files_21 / "016_Bridgewater_MV_21_ALTERED.csv")
manchester_19.to_csv(raw_files_20_19 / "77_Manchester_MVData_2019_ALTERED.csv")

# We then update the file lists
# First remove the files that we have changed
list_21 = [item for item in list_21 if not item in ["028_Colchester_MV_21.xlsx",
                                                    "122_Salisbury_MV_21.xlsx",
                                                    "077_Manchester_MV_21.xls",
                                                    "016_Bridgewater_MV_21.xlsx"]]

list_20_19.remove("77_Manchester_MVData_2019.csv")

# Then append the new items - don't need to as they're aready in there
# list_21.append("028_Colchester_MV_21_ALTERED.csv")
# list_21.append("122_Salisbury_MV_21_ALTERED.csv")
# list_21.append("077_Manchester_MV_21_ALTERED.csv")
# list_21.append("016_Bridgewater_MV_21_ALTERED.csv")
# list_20_19.append("77_Manchester_MVData_2019_ALTERED.csv")

ValueError: list.remove(x): x not in list

In [348]:
# Create master output dataframe
df_compiled_update = pd.DataFrame(columns = column_renamer_2021.columns)

In [350]:
updating_files = list_21 + list_20_19

In [351]:
df_rename

Unnamed: 0_level_0,name,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip,UID,lease_city,lease_state,lease_street,lease_zip
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1_Andover_MVData_2020.csv,TAXPAYER,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip1,,,,,
4_Avon_MVData_2020.csv,NAME,STREET,IDENT#,MAKE,MODEL,YR,,,,,,,,,
6_BeaconFalls_MVData_2020.csv,Taxpayer,Street,Vehicle ID,Vehicle Make,Vehicle Model,,City,State,Vehicle Class,Zip Code,,,,,
9_Bethel_MVData_2020.csv,TAXPAYER,STREET,VEHICLE_ID,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,CITY,STATE,VEHICLE_CLASS,ZIP1,,,,,
13_Bozrah_MVData_2020.csv,Taxpayer,Street,Vin#,Make,Model,Year,City,State,Class,Zip1,Unique ID,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155_WestHaven_MVData_2020.csv,TAXPAYER,STREET,VEHICLE_ID,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,CITY,STATE,VEHICLE_CLASS,ZIP1,,,,,
157_Weston_MVData_2020.csv,TAXPAYER,STREET,VEHICLE_ID,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,CITY,STATE,VEHICLE_CLASS,ZIP1,,,,,
162_Winchester_MVData_2020.csv,NAME,ADDRESS 1,VIN NO,MAKE,MODEL,YEAR,CITY,STATE,CLASS,ZIP CODE,,,,,
163_Windham_MVData_2020.csv,Taxpayer,Street,Vin#,Make,Model,Year,City,State,Class,Zip1,,,,,


In [396]:
updating_files.index(f)

186

In [397]:
updating_files[186:]

['124_Seymour_MVData_2020.csv',
 '125_Sharon_MVData_2020.csv',
 '126_Shelton_MVData_2020.csv',
 '127_Sherman_MVData_2020.csv',
 '128_Simsbury_MVData_2019.csv',
 '129_Somers_MVData_2020.csv',
 '12_Bolton_MVData_2019.csv',
 '130_SouthWindsor_MVData_2020.csv',
 '130_South_Windsor_MVData_2019.csv',
 '131_Southbury_MVData_2019.csv',
 '132_Southington_MVData_2020.csv',
 '133_Sprague_MVData_2020.csv',
 '134_Stafford_MVData_2020.csv',
 '135_Stamford_MVData_2020.csv',
 '136_Sterling_MVData_2019.csv',
 '137_Stonington_MVData_2019.csv',
 '138_Stratford_MVData_2020.csv',
 '139_Suffield_MVData_2019.csv',
 '13_Bozrah_MVData_2020.csv',
 '140_Thomaston_MVData_2019.csv',
 '141_Thompson_MVData_2020.csv',
 '142_Tolland_MVData_2020.csv',
 '143_Torrington_MVData_2020.csv',
 '144_Trumbull_MV Data_2020.csv',
 '144_Trumbull_MVData_2020.csv',
 '145_Union_MVData_2019.csv',
 '146_Vernon_MVData_2020.csv',
 '147_Voluntown_MVData_2020.csv',
 '148_Wallingford_MVData_2019.csv',
 '149_Warren_MVData_2019.csv',
 '14_Bra

In [None]:
count = 0
t_start = time.time()

In [399]:
for f in updating_files[186:]:
    year = 0
    
    # Set paths and renaming file depending on the year
    # Set renaming dictionary depending on the year
    if "_21" in f:
        year = 2021
        f_path = raw_files_21 / f
        df_rename = column_renamer_2021
    elif "_2019" in f:
        year = 2019
        f_path = raw_files_20_19 / f
        df_rename = column_renamer_2019
    elif "_2020" in f:
        year = 2020
        f_path = raw_files_20_19 / f
        df_rename = column_renamer_2020
    
    df_rename = df_rename.set_index("record_from")
    
    # Open the file
    try:
        if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
            df_f = pd.read_csv(f_path)
        elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
            df_f = pd.read_excel(f_path)
        else:
            continue
    except:
        if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
            df_f = pd.read_csv(f_path, encoding = 'unicode_escape')
        elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
            df_f = pd.read_excel(f_path, encoding= 'unicode_escape')
        else:
            continue

    # get the renaming dict for f and keep valid columns
    rename_dict_f = {v: k for k, v in df_rename.loc[f][df_rename.loc[f].notnull()].to_dict().items()}
    valid_col_f = [k for k in rename_dict_f]
    df_f = df_f[valid_col_f].rename(columns=rename_dict_f)
    df_f['record_from'] = f        # add a column for path
    df_compiled_update = df_compiled_update.append(df_f, ignore_index=True)

    count = count + 1
    if count % 10 == 0:
        t_end = time.time()
        print('finished the {}th file; time used for the last ten files: {:2f} sec'.format(count, t_end - t_start))
        t_start = t_end

finished the 190th file; time used for the last ten files: 343.137590 sec
finished the 200th file; time used for the last ten files: 5.970299 sec
finished the 210th file; time used for the last ten files: 7.532127 sec
finished the 220th file; time used for the last ten files: 16.892902 sec
finished the 230th file; time used for the last ten files: 11.507635 sec
finished the 240th file; time used for the last ten files: 9.159696 sec
finished the 250th file; time used for the last ten files: 7.934847 sec
finished the 260th file; time used for the last ten files: 8.020593 sec
finished the 270th file; time used for the last ten files: 9.277568 sec
finished the 280th file; time used for the last ten files: 7.985374 sec
finished the 290th file; time used for the last ten files: 10.166022 sec
finished the 300th file; time used for the last ten files: 16.274727 sec
finished the 310th file; time used for the last ten files: 15.664137 sec
finished the 320th file; time used for the last ten files

In [403]:
df_compiled_update.to_csv(raw_files_21.parent / "Compiled" / "2019-21_data_compiled_RN_092923.csv")

In [404]:
len(df_compiled_update)

5787151