# Reconstructing municipal data

In [3]:
# Path Management
import pathlib
import os
import sys

# DataFrame Management
import pandas as pd

# Regex
import re

# Progress tracking
from tqdm import tqdm

In [4]:
# Address parsing
import usaddress

# Set up

## Setup raw file paths

These files are in the Dropbox master folder `2019 MV Data by Town` under the respective files shown below

In [8]:
sys.platform

'linux'

In [5]:
if sys.platform == "linux":
    path = pathlib.Path().resolve()
    raw_files_20_19 = path.parent.parent / "data" / "municipal_compilation" / "town_files_2019_2020"
    raw_files_21 =    path.parent.parent / "data" / "municipal_compilation" / "town_files_2021"
else:
    raw_files_20_19 = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "CSVFiles2019"
    raw_files_21 = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" /"Vehicles_2022" / "Town files"

In the list comprehension, we drop files that are not required. **Note:** It would be simpler to move these unneeded files to a separate directory, but I don't do this, for replicability.

In [6]:
# Create lists
list_20_19 = [filename.name for filename in raw_files_20_19.iterdir() 
              if ((filename.suffix!=".dta") & (filename.is_file()) 
                  & ~( ("_old" in filename.name) | ("noVIN" in filename.name) | ("uncleaned" in filename.name) | ("towns" in filename.name)))]
list_20_19 = [x for x in list_20_19 if x not in ['CT_2019_vindecoded.csv','all_pace_missing_clean.csv']]
list_20_19.remove("102_NorthStonington_MVData_2020.xlsx")

# Create 2021 lists
list_21 = [filename.name for filename in raw_files_21.iterdir()]

  & ~( ("_old" in filename.name) | ("noVIN" in filename.name) | ("uncleaned" in filename.name) | ("towns" in filename.name)))]


We do some more manual editing of the file lists. Everything after the file `all_pace_missing_clean.csv` is not needed.

In [22]:
# list_20_19 = list_20_19[0:list_20_19.index('all_pace_missing_clean.csv')]
# list_20_19.remove("102_NorthStonington_MVData_2020.xlsx")

In [7]:
# all_raw_files is the list of files I am going to use
all_raw_files = list_20_19 + list_21

## Confirm these correspond to the large CSV

To confirm I am using the correct list of files, I compare the list of files I am using to the previous version of the dataset (large_csv). 

In [8]:
if sys.platform == "linux":
    large_csv = pd.read_csv(path.parent.parent / "data" / "municipal_compilation"/ "2019-21_data_compiled_simplified_zipcitymatched.csv", chunksize = 1000)
else:
    large_csv = pd.read_csv(pathlib.Path().resolve().parent / "ignored-data" / "vehicles_2022" / "2019-21_data_compiled_simplified_zipcitymatched.csv", chunksize = 1000)

In [9]:
large_csv_sources = []

for chunk in tqdm(large_csv):
    sources = list(chunk["record_from"].unique())

    for item in sources:
        if item not in large_csv_sources:
            large_csv_sources.append(item)

5791it [00:26, 215.45it/s]


In [10]:
# Rename
unique_sources_large_csv = large_csv_sources

### Identify and resolve differences

**Compare items that are in all_raw_files but not in the large CSV**

In [11]:
for item in all_raw_files:
    if item not in unique_sources_large_csv:
        print(item)

77_Manchester_MVData_2019_ALTERED.csv
016_Bridgewater_MV_21_ALTERED.csv
028_Colchester_MV_21_ALTERED.csv
122_Salisbury_MV_21_ALTERED.csv
072_Ledyard_MV_21_ALTERED.csv
077_Manchester_MV_21_ALTERED.csv
045_East_Lyme_MV_21_ALTERED.csv


Thus we find that files that RN has altered, are the only files that differ between them - that makes sense.

**Items that are in the large CSV but not in all raw files**

In [12]:
for item in unique_sources_large_csv:
    if item not in all_raw_files:
        print(item)

There are no items that are in the large CSV but not in all_raw_files. That is, all_raw_files (the file list that will be used in the remainder of this `.ipynb`) is a larger set.

**So all differences are now resolved.**

# Fix Renamer Files and Underlying Files

In this section, we **make manual changes to the renamer files.** That is, rather than rebuilding the run controller files that are used to create the renamer files, we manually change the renamer files.

The renamer files are used to rename column headers in the underlying csv and xlsx files provided by the municipalities. The renamer files are created by a separate script written by Jiarong Qi (jiarong.qi@yale.edu) - we take these as a fixed input and make changes to them.

A superior approach would be to make direct edits to the code that creates the renamer files. This code is to be found under `SEEDS3_SolarEVs/Jiarong/0_Codes/vehicle_registration_compile.ipynb`. We *do not make edits to that file* but instead treat its outputs as given, and directly alter them to address issues.

## Confirm correspondence to the three renamer files

First, we *again* confirm that we are using the correct list of files. 

To do this, I place all the renamer files in a separate directory, go through them, and pull out all the filenames in the renamers (the filenames are in the column "record_from", that is present in each renamer file). I then confirm correspondence to `all_raw_files`.

In [18]:
if sys.platform == 'linux':
    renamer_files_dir = path.parent.parent / "data" / "municipal_compilation" / "renamer_files"
else:
    renamer_files_dir = pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun"

In [19]:
source_files_renamers = []

for file in renamer_files_dir.iterdir():
    df = pd.read_csv(file)
    source_files_renamers.append(list(df["record_from"]))

In [20]:
source_files_renamers = [item for sublist in source_files_renamers for item in sublist]

**Confirm correspondence**

In [21]:
for item in source_files_renamers:
    if item not in all_raw_files:
        print(item)

In [22]:
for item in all_raw_files:
    if item not in source_files_renamers:
        print(item)

77_Manchester_MVData_2019_ALTERED.csv
016_Bridgewater_MV_21_ALTERED.csv
028_Colchester_MV_21_ALTERED.csv
122_Salisbury_MV_21_ALTERED.csv
072_Ledyard_MV_21_ALTERED.csv
077_Manchester_MV_21_ALTERED.csv
045_East_Lyme_MV_21_ALTERED.csv


This confirms that we are using the correct list of files. The ALTERED files are produced below.

# Make Manual Adjustments to renamer files and underlying files.

In this section, we fix two types of errors:
* Errors in the renamer files (that are produced by the script `SEEDS3_SolarEVs/Jiarong/0_Codes/vehicle_registration_compile.ipynb`) which we do not alter. In particular, in many cases these renamer files do NOT accurately record the correct column name in the underlying file. This means that data from the underlying file is not transferred to the compiled file. We fix these errors.
* Erros in the UNDERLYING files provided by the municipalities. For example, in some cases the Zip code and the street address are in the same column, meaning we cannot use them. The current `.ipynb` addresses these issues. Whenever we make updates to an underlying file, we re-save it with `_ALTERED.csv` at the end. See the list of altered files above.

## Load column renamers
These are the raw column renamers produced by `SEEDS3_SolarEVs/Jiarong/0_Codes/vehicle_registration_compile.ipynb.` We copy and paste them into a separate directory for ease of use, but their original locations are under:
* `SEEDS3_SolarEVs/1_rawdata/Vehicles_2022/compiled/2019` or `/2020` or `/2021` as `vehicle_\{year}_column_renamer.csv`

In [23]:
if sys.platform == "linux":
    column_renamer_2021 = pd.read_csv(path.parent.parent / "data" / "municipal_compilation"/ "renamer_files"/ "vehicle_2021_column_renamer_RN.csv")
    column_renamer_2020 = pd.read_csv(path.parent.parent / "data" / "municipal_compilation"/ "renamer_files"/ "vehicle_2020_column_renamer_RN.csv")
    column_renamer_2019 = pd.read_csv(path.parent.parent / "data" / "municipal_compilation"/ "renamer_files"/ "vehicle_2019_column_renamer_RN.csv") 
else:
    column_renamer_2021 = pd.read_csv(pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun" / "vehicle_2021_column_renamer_RN.csv")
    column_renamer_2020 = pd.read_csv(pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun" / "vehicle_2020_column_renamer_RN.csv")
    column_renamer_2019 = pd.read_csv(pathlib.Path().resolve().parent.parent / "Dropbox" / "2019 MV Data by Town" / "RN_0923_rerun" / "vehicle_2019_column_renamer_RN.csv") 

### Function to alter renamers

In [24]:
def fix_renamer(year, name, change_dict):
    renamer_dict = {"2021": column_renamer_2021, "2020" : column_renamer_2020, "2019":column_renamer_2019}
    renamer_fields = {'name': None,
                         'street': None,
                         'city': None,
                         'state': None,
                         'zip': None,
                         'vehicle_year': None,
                         'vehicle_make': None,
                         'vehicle_model': None,
                         'vehicle_class': None,
                         'vehicle_id': None,
                         'lease_street': None,
                         'UID': None,
                         'lease_city': None,
                         'lease_state': None,
                         'lease_zip': None}
    renamer = renamer_dict[year]
    renamer_old = renamer.copy(deep = True)
    town_index = renamer[renamer["record_from"].str.contains(name)].index.tolist()[0]
    print("-----OLD-----")
    print(renamer.loc[town_index])
    for key in change_dict.keys():
        try:
            renamer.iloc[town_index, renamer.columns.isin([key])] = change_dict[key]
        except:
            print(f"{key} not in renamer columns")

    print("-----NEW-----")
    print(renamer.loc[town_index])

### (Renamer Edits) Norwalk - 2021

We now commence making direct edits to the renamer files. **Note:** As I wrote this code, I discovered improved ways to make these edits, so the methodology is not identical for every town. However, it should be similar.

In [25]:
# Extract the index (row number) of the relevant town in the relevant renamer file
norwalk_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Norwalk")].index.tolist()[0]

Note: To work out what the column names should be, I needed to go into the underlying file. For 2021, this is to be found under `vehicles_2022/town_files` 

In [26]:
# Manually change the values
nor_cols_21 = ["txcm prim name full","txcm prim addr1","txcm prim city",
            "txcm prim state","txcm prim zip","txcm veh year",
            "txcm make","txcm model","txcm class","txcm vin"]

for i in range(1,11):
        column_renamer_2021.iloc[norwalk_index_21,i] = nor_cols_21[i-1]

In [27]:
column_renamer_2021.iloc[140,: ]

record_from      103_Norwalk_MV_21.xlsx
name                txcm prim name full
street                  txcm prim addr1
city                     txcm prim city
state                   txcm prim state
zip                       txcm prim zip
vehicle_year              txcm veh year
vehicle_make                  txcm make
vehicle_model                txcm model
vehicle_class                txcm class
vehicle_id                     txcm vin
lease_street                        NaN
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_zip                           NaN
Name: 140, dtype: object

### (Renamer Edits) Norwalk - 2020

In [28]:
# Norwalk
norwalk_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Norwalk")].index.tolist()[0]

In [29]:
column_renamer_2020.loc[norwalk_index_20]

record_from      103_Norwalk_MVData_2020.csv
name                                     NaN
street                                   NaN
vehicle_id                               NaN
vehicle_make                             NaN
vehicle_model                            NaN
vehicle_year                             NaN
city                                     NaN
state                                    NaN
vehicle_class                            NaN
zip                                      NaN
UID                                      NaN
lease_city                               NaN
lease_state                              NaN
lease_street                             NaN
lease_zip                                NaN
Name: 44, dtype: object

In [30]:
nor_cols_20 = ["txcm prim name full","txcm prim addr1","txcm vin",
               "txcm make","txcm model", "txcm veh year",
               "txcm prim city", "txcm prim state","txcm class", 
               "txcm prim zip"]

In [31]:
for i in range(1,11):
        column_renamer_2020.iloc[norwalk_index_20,i] = nor_cols_20[i-1]

In [32]:
column_renamer_2020.loc[norwalk_index_20]

record_from      103_Norwalk_MVData_2020.csv
name                     txcm prim name full
street                       txcm prim addr1
vehicle_id                          txcm vin
vehicle_make                       txcm make
vehicle_model                     txcm model
vehicle_year                   txcm veh year
city                          txcm prim city
state                        txcm prim state
vehicle_class                     txcm class
zip                            txcm prim zip
UID                                      NaN
lease_city                               NaN
lease_state                              NaN
lease_street                             NaN
lease_zip                                NaN
Name: 44, dtype: object

### (Renamer Edits & Underlying Edits) Colchester - 2021

As can be seen below, for Colchester, it is necessary to make **direct edits to the underlying file provided by the municipality.** Therefore for this town the code comes in two sections. First, make the underlying changes, then, update the column renamer

In [33]:
colchester_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Colchester")].index.tolist()[0]

In [34]:
column_renamer_2021.loc[colchester_index_21]

record_from      028_Colchester_MV_21.xlsx
name                                   NaN
street                             address
city                                   NaN
state                                  NaN
zip                                    NaN
vehicle_year                           NaN
vehicle_make                           NaN
vehicle_model                          NaN
vehicle_class                          NaN
vehicle_id                             NaN
lease_street                           NaN
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_zip                              NaN
Name: 149, dtype: object

**Manual adjustments required**

In [37]:
colchester_21 = pd.read_excel(raw_files_21 / "028_Colchester_MV_21.xlsx")

As we can see below, the issue with this file is that the city, state, and zip are all in the same column (NOTE: If you see a city, state, and zip column separately at the end - that is because you've already run the edits below). We need to make manual edits to the file to separate these out.

In [38]:
colchester_21.head(3)

Unnamed: 0,id,name_1,name_2,address,city_state_zip,identification
0,50001,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,5NHUCC42X8N055055
1,50002,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,55NBE1213J1005012
2,50003,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,1GTEC14C18Z901358


This code splits out the city, state, and zip code from the one column, and turns them into three separate columns.

In [39]:
colchester_cities = []
colchester_zipcodes = []
colchester_states = []
for entry in list(colchester_21["city_state_zip"]):
    if not pd.isna(entry):
        split = entry.split(" ")
        zip_code = split[len(split) - 1]
        state = split[len(split)-2]
        city = ' '.join(split[0:len(split)-2])
    else:
        zip_code = 'nan'
        state = 'nan'
        city = 'nan'

    colchester_cities.append(city)
    colchester_zipcodes.append(zip_code)
    colchester_states.append(state)

In [40]:
colchester_21["city"] = colchester_cities
colchester_21["state"] = colchester_states
colchester_21["zip"] = colchester_zipcodes

In [41]:
colchester_21.head(3)

Unnamed: 0,id,name_1,name_2,address,city_state_zip,identification,city,state,zip
0,50001,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,5NHUCC42X8N055055,COLCHESTER,CT,06415-
1,50002,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,55NBE1213J1005012,COLCHESTER,CT,06415-
2,50003,A BEST GUTTERS LLC,,381 CABIN RD,COLCHESTER CT 06415-,1GTEC14C18Z901358,COLCHESTER,CT,06415-


**Edit the renamer**
Now that we have fixed the issues in the underlying file, we make edits to the renamer file.

In [42]:
colchester_cols_21 = ["name_1", "address", "city", "state", "zip","","","","", "identification"]

for i in [1,2,3,4,5,10]:
        column_renamer_2021.iloc[colchester_index_21, i] = colchester_cols_21[i-1]

Because we have made manual edits to the underlying file and will re-save the file later on with a new name, containing `_ALTERED.csv` as a suffix, we update the `record_from` column.

In [43]:
column_renamer_2021.loc[colchester_index_21, "record_from"] = "028_Colchester_MV_21_ALTERED.csv"

In [44]:
 column_renamer_2021.iloc[colchester_index_21]

record_from      028_Colchester_MV_21_ALTERED.csv
name                                       name_1
street                                    address
city                                         city
state                                       state
zip                                           zip
vehicle_year                                  NaN
vehicle_make                                  NaN
vehicle_model                                 NaN
vehicle_class                                 NaN
vehicle_id                         identification
lease_street                                  NaN
UID                                           NaN
lease_city                                    NaN
lease_state                                   NaN
lease_zip                                     NaN
Name: 149, dtype: object

## Salisbury 2021

**Manual edits required**

It doesn't seem like it, but this file required manual edits to the underlying file as well - this is because the underlying file did not have any column headers at all. Consequently, I add them back in.

In [45]:
salisbury_21 = pd.read_excel(raw_files_21 / "122_Salisbury_MV_21.xlsx", header = None)
salisbury_21.columns = ["name", "street", "city", "state", "zip", "vehicle_year", "vehicle_make", "vehicle_model", "unknown", "vehicle_id"]
salisbury_21 = salisbury_21.drop("unknown", axis =1)

In [46]:
salisbury_21.head(1)

Unnamed: 0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_id
0,ABENDROTH MAXWELL H,211 INDIAN MTN RD,LAKEVILLE,CT,6039,2015,VOLKS,PASSAT S,1VWAT7A36FC096475


**Make edits to the renamer**

Having rectified the issues in the underlying file, I make edits to the renamer file.

### Salisbury 2021

In [163]:
salisbury_rename_dict = {"record_from":"122_Salisbury_MV_21_ALTERED.csv",
                        "name" : "name",
                          "street" : "street",
                          "city" : "city", 
                          "state" :"state",
                          "zip":"zip",
                          "vehicle_year" :"vehicle_year",
                          "vehicle_make" :"vehicle_make", 
                          "vehicle_model" : "vehicle_model",
                          "vehicle_id":"vehicle_id"}

In [164]:
fix_renamer("2021", "Salisbury", salisbury_rename_dict)

-----OLD-----
record_from      122_Salisbury_MV_21_ALTERED.xlsx
name                                         name
street                                     street
city                                         city
state                                       state
zip                                           zip
vehicle_year                         vehicle_year
vehicle_make                         vehicle_make
vehicle_model                       vehicle_model
vehicle_class                                 NaN
vehicle_id                             vehicle_id
lease_street                                  NaN
UID                                           NaN
lease_city                                    NaN
lease_state                                   NaN
lease_zip                                     NaN
Name: 96, dtype: object
-----NEW-----
record_from      122_Salisbury_MV_21_ALTERED.csv
name                                        name
street                                    street
c

### Torrington 2021

In [49]:
torrington_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Torrington")].index.tolist()[0]
column_renamer_2021.iloc[torrington_index_21, column_renamer_2021.columns.isin(["vehicle_id"])] = "VEHICLE ID"
column_renamer_2021.loc[torrington_index_21]

record_from      143_Torrington_MV_21.xlsx
name                              TAXPAYER
street                              STREET
city                                  CITY
state                                STATE
zip                               ZIP CODE
vehicle_year                  VEHICLE YEAR
vehicle_make                           NaN
vehicle_model                VEHICLE MODEL
vehicle_class                          NaN
vehicle_id                      VEHICLE ID
lease_street                           NaN
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_zip                              NaN
Name: 115, dtype: object

### Shelton - 2021

In [50]:
shelton_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Shelton")].index.tolist()[0]
column_renamer_2021.iloc[shelton_index_21, column_renamer_2021.columns.isin(["vehicle_id"])] = "VIN #"
column_renamer_2021.loc[shelton_index_21]

record_from      126_Shelton_MV_21.xlsx
name                           TAXPAYER
street                           STREET
city                               CITY
state                             STATE
zip                                ZIP1
vehicle_year                       YEAR
vehicle_make                       MAKE
vehicle_model                     MODEL
vehicle_class                     CLASS
vehicle_id                        VIN #
lease_street                        NaN
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_zip                           NaN
Name: 100, dtype: object

### Bethel - 2021

In [51]:
bethel_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Bethel")].index.tolist()[0]
column_renamer_2021.iloc[bethel_index_21, column_renamer_2021.columns.isin(["street"])] = "Mailing Address"
column_renamer_2021.loc[bethel_index_21]

record_from      009_Bethel_MV_21.xlsx
name                          Taxpayer
street                 Mailing Address
city                              City
state                           State 
zip                                Zip
vehicle_year                      Year
vehicle_make                      Make
vehicle_model                      NaN
vehicle_class                    Class
vehicle_id                 Vehicle Vin
lease_street         Residence Address
UID                                NaN
lease_city                         NaN
lease_state                        NaN
lease_zip                          NaN
Name: 8, dtype: object

### Manchester - 2021

**Manual edits required**

In this case, some of the critical columns we require were unnamed in the underlying file, so we give them column names.

In [53]:
!pip install xlrd

Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1


In [54]:
manchester_21 = pd.read_excel(raw_files_21 / "077_Manchester_MV_21.xls")

In [55]:
manchester_21 = manchester_21.rename(columns = {"Unnamed: 5": "city",
                                      "Unnamed: 6":"state",
                                      "Unnamed: 7":"zip"})
manchester_21

Unnamed: 0,#,LIST #,TAXPAYER,CARE OF,MAILING ADDRESS,city,state,zip,Unnamed: 8,Unnamed: 9,TAX DIST,YEAR,MAKE,MODEL,CLASS CODE,VIN NUMBER,STYLE,COLOR,ASMNT
0,1.0,50001.0,@URSVC LLC,,2 WILLOWBROOK RD,CROMWELL,CT,06416,-,2505,T,2013.0,CADIL,SRX PREM,1.0,3GYFNJE33DS595309,WAGON,BLA,10890.0
1,2.0,50002.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,2004.0,ISUZU,FRR,2.0,JALF5C13547700369,TILT C,WHI,6510.0
2,3.0,50003.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,2004.0,UDUMP,61214R0,10.0,43ZDN22B340000019,TRAILE,BLA,550.0
3,4.0,50004.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,1998.0,WELD,UTILITY,10.0,1J9HH2421W1118262,TRAILE,BLA,550.0
4,5.0,50005.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 8231,MANCHESTER,CT,06040,-,0231,E,2007.0,KENWO,CONSTRUC,2.0,2NKMHZ7X07M210675,HOIST,WHI,12580.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44482,44483.0,95117.0,ZYSK DONALD R,,197 GARDNER ST,MANCHESTER,CT,06040,-,6754,T,2009.0,FORD,F150,1.0,1FTRF12859KB23566,PICKUP,WHI,6860.0
44483,44484.0,95118.0,ZYSK DONALD R,,197 GARDNER ST,MANCHESTER,CT,06040,-,6754,T,2010.0,NISSA,ALTIMA 2,1.0,1N4AL2AP2AN549552,SEDAN,GRA,4220.0
44484,,,,,,,,,,,,,,,,,,,
44485,,,,,,,,,,,,,,,,,,,


**Make edits to the column renamer**

A different technique here is used to update the column renamer. I use the `.loc[]` method on the DataFrame object. 

In [56]:
manchester_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Manchester")].index.tolist()[0]
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["street"])] = "MAILING ADDRESS"
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["city"])] = "city"
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["state"])] = "state"
column_renamer_2021.iloc[manchester_index_21, column_renamer_2021.columns.isin(["zip"])] = "zip"

This part is important - when we alter the file, we re-save it with `_ALTERED` at the end (this happens later). Therefore we must update the `record_from` column with the new filename (again, there are probably better ways to do this)

In [57]:
column_renamer_2021.loc[manchester_index_21, "record_from"] = "077_Manchester_MV_21_ALTERED.csv"
column_renamer_2021.loc[manchester_index_21]

record_from      077_Manchester_MV_21_ALTERED.csv
name                                     TAXPAYER
street                            MAILING ADDRESS
city                                         city
state                                       state
zip                                           zip
vehicle_year                                 YEAR
vehicle_make                                 MAKE
vehicle_model                               MODEL
vehicle_class                          CLASS CODE
vehicle_id                             VIN NUMBER
lease_street                                  NaN
UID                                        LIST #
lease_city                                    NaN
lease_state                                   NaN
lease_zip                                     NaN
Name: 61, dtype: object

### Norfolk - 2021
**NOTE that I have placed `RESIDENCE_STREET` as the street address**

In [58]:
norfolk_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Norfolk")].index.tolist()[0]
column_renamer_2021.iloc[norfolk_index_21, column_renamer_2021.columns.isin(["street"])] = "RESIDENCE_STREET"
column_renamer_2021.iloc[norfolk_index_21, column_renamer_2021.columns.isin(["city"])] = "RESIDENCE_CITY"
column_renamer_2021.iloc[norfolk_index_21]

record_from      098_Norfolk_MV_21.xlsx
name                           TAXPAYER
street                 RESIDENCE_STREET
city                     RESIDENCE_CITY
state                               NaN
zip                                 NaN
vehicle_year               VEHICLE_YEAR
vehicle_make               VEHICLE_MAKE
vehicle_model             VEHICLE_MODEL
vehicle_class             VEHICLE_CLASS
vehicle_id                   VEHICLE_ID
lease_street           RESIDENCE_STREET
UID                                 NaN
lease_city               RESIDENCE_CITY
lease_state                         NaN
lease_zip                           NaN
Name: 77, dtype: object

### Thompson - 2021

In [59]:
thompson_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Thompson")].index.tolist()[0]
column_renamer_2021.iloc[thompson_index_21, column_renamer_2021.columns.isin(["street"])] = "street"
column_renamer_2021.iloc[thompson_index_21, column_renamer_2021.columns.isin(["state"])] = "state"
column_renamer_2021.iloc[thompson_index_21, column_renamer_2021.columns.isin(["zip"])] = "zip"
column_renamer_2021.iloc[thompson_index_21]

record_from      141_Thompson_MV_21.xlsx
name                       taxpayer_name
street                            street
city                                 NaN
state                              state
zip                                  zip
vehicle_year                         NaN
vehicle_make                         NaN
vehicle_model                        NaN
vehicle_class                        NaN
vehicle_id                           vin
lease_street                         NaN
UID                                  NaN
lease_city                           NaN
lease_state                          NaN
lease_zip                            NaN
Name: 113, dtype: object

### Bridgewater - 2021

**Manual edits required**

For bridgewater, we need to make manual edits to the file. These are quite substantial edits. The Mail address contains the full address with the town and state and ZIP. Parsing this is very difficult, so we use the `usaddress` package to parse the address. `usaddress` uses a pre-trained NLP model to distinguish between the different elements of the address.

In [60]:
bridgewater_21 = pd.read_excel(raw_files_21 / "016_Bridgewater_MV_21.xlsx")

In [61]:
bridgewater_21.head(1)

Unnamed: 0,id,taxpayer_name,2nd_name,mail_address,vin,address_2
0,50001.0,ABBETT LINDA J,ABBETT MARILYN S,PO BOX 357 BRIDGEWATER CT 06752-0357,2T3DFREV1GW496859,121 CLATTER VALLEY RD BRIDGEWATER CT 06752-


First fix the mailing addresses, by using `usaddress` package to parse the different parts of the column.

In [62]:
bridge_mail_address_states  = []
bridge_mail_address_zips    = []
bridge_mail_address_streets = []
bridge_mail_address_cities  = []


for address in tqdm(list(bridgewater_21["mail_address"])):
    # Fields needed
    fields_needed = ["USPSBoxType", "USPSBoxID", "AddressNumber", "StreetName", "StreetNamePostType",
                 "PlaceName", "StateName", "ZipCode"]
    
    # Create the street+city, state, and zip
    try:
        # Parse the address, and turn this into a list of lists
        parsed_address = usaddress.parse(address)
        parsed_address_list = [list(tup) for tup in parsed_address]
        
        # The parsed address includes duplicate fields, so we deduplicate the fields
        fields = [sublist[1] for sublist in parsed_address_list]
        fields = list(dict.fromkeys(fields))
        
        # We then create a dictionary based on these deduplicated fields
        fields_dict = dict.fromkeys(fields)
        
        # For each key in the dictionary, we set the value by appending all values for that field
        for field in fields:
            field_value = ' '.join([sublist[0] for sublist in parsed_address_list if sublist[1]==field])
            fields_dict[field] = field_value
        
        # Standardize the dictionary - we ensure that all required fields are in the dictionary
        # Where there is no value for that key, we set the value as an empty string
        for field_needed in fields_needed:
            if not field_needed in fields_dict.keys():
                 fields_dict[field_needed] = ""
    
        # We set the street, city, zip code, and state
        street = " ".join([fields_dict["USPSBoxType"],
                           fields_dict["USPSBoxID"],
                           fields_dict["AddressNumber"],
                           fields_dict["StreetName"],
                           fields_dict["StreetNamePostType"]]).strip()
        city = fields_dict["PlaceName"]
        zip_code = fields_dict["ZipCode"]
        state = fields_dict["StateName"]
    except:
        street = ""
        city = ""
        zip_code = ""
        state = ""
    
    # We append these to the master lists, that will be set as columns
    bridge_mail_address_states.append(state)
    bridge_mail_address_zips.append(zip_code)
    bridge_mail_address_streets.append(street)
    bridge_mail_address_cities.append(city)

100%|██████████| 7254/7254 [00:00<00:00, 16448.14it/s]


In [63]:
bridgewater_21["mail_address_street"] = bridge_mail_address_streets
bridgewater_21["mail_address_city"] = bridge_mail_address_cities
bridgewater_21["mail_address_zip"] = bridge_mail_address_zips
bridgewater_21["mail_address_state"] = bridge_mail_address_states

In [64]:
bridgewater_21.head(5)

Unnamed: 0,id,taxpayer_name,2nd_name,mail_address,vin,address_2,mail_address_street,mail_address_city,mail_address_zip,mail_address_state
0,50001.0,ABBETT LINDA J,ABBETT MARILYN S,PO BOX 357 BRIDGEWATER CT 06752-0357,2T3DFREV1GW496859,121 CLATTER VALLEY RD BRIDGEWATER CT 06752-,PO BOX 357,BRIDGEWATER,06752-0357,CT
1,50002.0,ABBETT LINDA J,,PO BOX 357 BRIDGEWATER CT 06752-0357,JTMBD33V476023102,121 CLATTER VLY RD BRIDGEWATER CT -,PO BOX 357,BRIDGEWATER,06752-0357,CT
2,50003.0,ABBETT MARILYN S,ABBETT LINDA J,PO BOX 126 BRIDGEWATER CT 06752-0126,JTMDJREV4GD005535,168 HENRY SANFORD RD BRIDGEWATER CT 06752-,PO BOX 126,BRIDGEWATER,06752-0126,CT
3,50004.0,ABBEY RESTAURANT LLC,,11 CEDAR HILL RD BRIDGEWATER CT 06752-1001,1GNWGEFGXH1246624,,11 CEDAR HILL RD,BRIDGEWATER,06752-1001,CT
4,50005.0,ABBOTT CHARLES D,,113 HENRY SANFORD RD BRIDGEWATER CT 06752-1214,186RDB1C5JH000357,,113 HENRY SANFORD RD,BRIDGEWATER,06752-1214,CT


Now fix the second address - probably the lease address

In [65]:
bridge_secondary_address_states  = []
bridge_secondary_address_zips    = []
bridge_secondary_address_streets = []
bridge_secondary_address_cities  = []


for address in tqdm(list(bridgewater_21["address_2"])):
    # Fields needed
    fields_needed = ["USPSBoxType", "USPSBoxID", "AddressNumber", "StreetName", "StreetNamePostType",
                 "PlaceName", "StateName", "ZipCode"]
    
    # Create the street+city, state, and zip
    try:
        # Parse the address, and turn this into a list of lists
        parsed_address = usaddress.parse(address)
        parsed_address_list = [list(tup) for tup in parsed_address]
        
        # The parsed address includes duplicate fields, so we deduplicate the fields
        fields = [sublist[1] for sublist in parsed_address_list]
        fields = list(dict.fromkeys(fields))
        
        # We then create a dictionary based on these deduplicated fields
        fields_dict = dict.fromkeys(fields)
        
        # For each key in the dictionary, we set the value by appending all values for that field
        for field in fields:
            field_value = ' '.join([sublist[0] for sublist in parsed_address_list if sublist[1]==field])
            fields_dict[field] = field_value
        
        # Standardize the dictionary - we ensure that all required fields are in the dictionary
        # Where there is no value for that key, we set the value as an empty string
        for field_needed in fields_needed:
            if not field_needed in fields_dict.keys():
                 fields_dict[field_needed] = ""
    
        # We set the street, city, zip code, and state
        street = " ".join([fields_dict["USPSBoxType"],
                           fields_dict["USPSBoxID"],
                           fields_dict["AddressNumber"],
                           fields_dict["StreetName"],
                           fields_dict["StreetNamePostType"]]).strip()
        city = fields_dict["PlaceName"]
        zip_code = fields_dict["ZipCode"]
        state = fields_dict["StateName"]
    except:
        street = ""
        city = ""
        zip_code = ""
        state = ""
    
    # We append these to the master lists, that will be set as columns
    bridge_secondary_address_states.append(state)
    bridge_secondary_address_zips.append(zip_code)
    bridge_secondary_address_streets.append(street)
    bridge_secondary_address_cities.append(city)

100%|██████████| 7254/7254 [00:00<00:00, 70580.48it/s]


In [66]:
bridgewater_21["secondary_address_street"] = bridge_secondary_address_streets
bridgewater_21["secondary_address_city"] =   bridge_secondary_address_cities
bridgewater_21["secondary_address_zip"] =    bridge_secondary_address_zips
bridgewater_21["secondary_address_state"] =  bridge_secondary_address_states

In [67]:
bridgewater_21.head(5)

Unnamed: 0,id,taxpayer_name,2nd_name,mail_address,vin,address_2,mail_address_street,mail_address_city,mail_address_zip,mail_address_state,secondary_address_street,secondary_address_city,secondary_address_zip,secondary_address_state
0,50001.0,ABBETT LINDA J,ABBETT MARILYN S,PO BOX 357 BRIDGEWATER CT 06752-0357,2T3DFREV1GW496859,121 CLATTER VALLEY RD BRIDGEWATER CT 06752-,PO BOX 357,BRIDGEWATER,06752-0357,CT,121 CLATTER VALLEY RD,BRIDGEWATER,06752-,CT
1,50002.0,ABBETT LINDA J,,PO BOX 357 BRIDGEWATER CT 06752-0357,JTMBD33V476023102,121 CLATTER VLY RD BRIDGEWATER CT -,PO BOX 357,BRIDGEWATER,06752-0357,CT,121 CLATTER VLY RD,BRIDGEWATER,,CT
2,50003.0,ABBETT MARILYN S,ABBETT LINDA J,PO BOX 126 BRIDGEWATER CT 06752-0126,JTMDJREV4GD005535,168 HENRY SANFORD RD BRIDGEWATER CT 06752-,PO BOX 126,BRIDGEWATER,06752-0126,CT,168 HENRY SANFORD RD,BRIDGEWATER,06752-,CT
3,50004.0,ABBEY RESTAURANT LLC,,11 CEDAR HILL RD BRIDGEWATER CT 06752-1001,1GNWGEFGXH1246624,,11 CEDAR HILL RD,BRIDGEWATER,06752-1001,CT,,,,
4,50005.0,ABBOTT CHARLES D,,113 HENRY SANFORD RD BRIDGEWATER CT 06752-1214,186RDB1C5JH000357,,113 HENRY SANFORD RD,BRIDGEWATER,06752-1214,CT,,,,


**Now fix the column renamer**

In [68]:
bridgewater_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Bridgewater")].index.tolist()[0]

In [69]:
column_renamer_2021.loc[bridgewater_index_21]

record_from      016_Bridgewater_MV_21.xlsx
name                          taxpayer_name
street                                  NaN
city                                    NaN
state                                   NaN
zip                                     NaN
vehicle_year                            NaN
vehicle_make                            NaN
vehicle_model                           NaN
vehicle_class                           NaN
vehicle_id                              vin
lease_street                            NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_zip                               NaN
Name: 135, dtype: object

In [70]:
renamer_fields_bridgewater = dict.fromkeys(column_renamer_2021.columns[1:])

In [71]:
renamer_fields_bridgewater["name"] = "taxpayer_name"
renamer_fields_bridgewater["street"] = "mail_address_street"
renamer_fields_bridgewater["city"] = "mail_address_city"
renamer_fields_bridgewater["state"]=  "mail_address_state"
renamer_fields_bridgewater["zip"] = "mail_address_zip"
# renamer_fields_bridgewater["vehicle_year"] 
# renamer_fields_bridgewater["vehicle_make"]
# renamer_fields_bridgewater["vehicle_model"]
# renamer_fields_bridgewater["vehicle_class"]
renamer_fields_bridgewater["vehicle_id"] = "vin"
renamer_fields_bridgewater["lease_street"] = "secondary_address_street"
# renamer_fields_bridgewater["UID"]
renamer_fields_bridgewater["lease_city"] = "secondary_address_city"
renamer_fields_bridgewater["lease_state"] = "secondary_address_state"
renamer_fields_bridgewater["lease_zip"] = "secondary_address_zip"

In [72]:
for key in renamer_fields_bridgewater.keys():
    column_renamer_2021.iloc[bridgewater_index_21, column_renamer_2021.columns.isin([key])] = renamer_fields_bridgewater[key]

In [73]:
# Set the record_from column appropriately
column_renamer_2021.loc[bridgewater_index_21, "record_from"] = "016_Bridgewater_MV_21_ALTERED.csv"

In [74]:
column_renamer_2021.loc[bridgewater_index_21]

record_from      016_Bridgewater_MV_21_ALTERED.csv
name                                 taxpayer_name
street                         mail_address_street
city                             mail_address_city
state                           mail_address_state
zip                               mail_address_zip
vehicle_year                                  None
vehicle_make                                  None
vehicle_model                                 None
vehicle_class                                 None
vehicle_id                                     vin
lease_street              secondary_address_street
UID                                           None
lease_city                  secondary_address_city
lease_state                secondary_address_state
lease_zip                    secondary_address_zip
Name: 135, dtype: object

### Southington - 2021

In [75]:
renamer_fields_southington = dict.fromkeys(column_renamer_2021.columns[1:])
renamer_fields_southington

{'name': None,
 'street': None,
 'city': None,
 'state': None,
 'zip': None,
 'vehicle_year': None,
 'vehicle_make': None,
 'vehicle_model': None,
 'vehicle_class': None,
 'vehicle_id': None,
 'lease_street': None,
 'UID': None,
 'lease_city': None,
 'lease_state': None,
 'lease_zip': None}

In [76]:
southington_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Southington")].index.tolist()[0]

In [77]:
column_renamer_2021.loc[southington_index_21]

record_from      131_Southington_MV_21.xlsx
name                                   Name
street                                  NaN
city                                   City
state                                 State
zip                                     Zip
vehicle_year                           Year
vehicle_make                           Make
vehicle_model                         Model
vehicle_class                         Class
vehicle_id                              VIN
lease_street                            NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_zip                               NaN
Name: 155, dtype: object

In [78]:
renamer_fields_southington = dict.fromkeys(column_renamer_2021.columns[1:])

In [79]:
renamer_fields_southington["street"] = "Street Address"

In [80]:
for key in renamer_fields_southington.keys():
    if renamer_fields_southington[key]:
        column_renamer_2021.iloc[southington_index_21, column_renamer_2021.columns.isin([key])] = renamer_fields_southington[key]

In [81]:
column_renamer_2021.loc[southington_index_21]

record_from      131_Southington_MV_21.xlsx
name                                   Name
street                       Street Address
city                                   City
state                                 State
zip                                     Zip
vehicle_year                           Year
vehicle_make                           Make
vehicle_model                         Model
vehicle_class                         Class
vehicle_id                              VIN
lease_street                            NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_zip                               NaN
Name: 155, dtype: object

### North Branford 2021

In [82]:
fix_renamer("2021", "North_Branford", {"street":"Address 1",
                                       "state":"ST",
                                       "lease_state":"Res ST"})

-----OLD-----
record_from      099_North_Branford_MV_21.xlsx
name                                      Name
street                                      ST
city                                      City
state                                      NaN
zip                                        Zip
vehicle_year                              YEAR
vehicle_make                              Make
vehicle_model                            Model
vehicle_class                            CLASS
vehicle_id                                 Vin
lease_street             Residential Address 1
UID                                        NaN
lease_city                    Residential City
lease_state                                NaN
lease_zip                              Res Zip
Name: 78, dtype: object
-----NEW-----
record_from      099_North_Branford_MV_21.xlsx
name                                      Name
street                               Address 1
city                                      City
state   

In [83]:
newington_20_rename_dict = {"zip" : "ZIP"}
fix_renamer("2020", "Newington", newington_20_rename_dict)

-----OLD-----
record_from      96_Newington_MVData_2020.csv
name                                 TAXPAYER
street                                     ST
vehicle_id                                VIN
vehicle_make                             MAKE
vehicle_model                           MODEL
vehicle_year                             YEAR
city                                     CITY
state                                     NaN
vehicle_class                             NaN
zip                                       NaN
UID                                       UID
lease_city                                NaN
lease_state                               NaN
lease_street                              NaN
lease_zip                                 NaN
Name: 39, dtype: object
-----NEW-----
record_from      96_Newington_MVData_2020.csv
name                                 TAXPAYER
street                                     ST
vehicle_id                                VIN
vehicle_make                

### Sherman 2021

In [84]:
fix_renamer("2021", "Sherman", {"state":"st",
                                "city": "town",
                               "street":"address",
                               "zip": "zip"})

-----OLD-----
record_from      127_Sherman_MV_21.xlsx
name                              owner
street                               st
city                                NaN
state                               NaN
zip                                 NaN
vehicle_year                       year
vehicle_make                       make
vehicle_model                   MODEL X
vehicle_class                     class
vehicle_id                          vin
lease_street                        NaN
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_zip                           NaN
Name: 101, dtype: object
-----NEW-----
record_from      127_Sherman_MV_21.xlsx
name                              owner
street                          address
city                               town
state                                st
zip                                 zip
vehicle_year                       year
vehicle_make               

#### Address those that can be done automatically

In [85]:
missing_zips_2021_dict = {
 "004_Avon_MV_21.xlsx" : "ZIP",
 "031_Cornwall_MV_21.xlsx" : "ZIP",
}

for source_file in missing_zips_2021_dict.keys():
    source_file_index = column_renamer_2021[column_renamer_2021["record_from"]==source_file].index.tolist()[0]
    column_renamer_2021.loc[source_file_index, "zip"] = missing_zips_2021_dict[source_file]

In [86]:
column_renamer_2021[column_renamer_2021["record_from"]=="127_Sherman_MV_21.xlsx"]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
101,127_Sherman_MV_21.xlsx,owner,address,town,st,zip,year,make,MODEL X,class,vin,,,,,


#### Address those requiring manual edits

In [87]:
east_lyme_21 = pd.read_csv(raw_files_21 / "045_East_Lyme_MV_21.csv")
east_lyme_21

Unnamed: 0,Act #,Address,Name,Unnamed: 3,Address cont,Style,Make,Yr,C,Color,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
0,10000,11 WEST SOCIETY RD,205-207 MONTAUK AVE LLC,16480,"EAST LYME, CT 06333",SPORTV,MERCE,2016,4,SIL,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
1,10001,PO BOX 297,2CDP LLC,76300,"WATERFORD, CT 06385-0297",WAGON4,AUDI,2021,8,GRY,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
2,20000,BOX 881,5 STAR SEALCOATING LLC,6030,"EAST LYME, CT 06333-0881",CABAND,GMC,2001,8,GRN,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
3,30000,335 MAIN ST,A BEAUTIFUL COMPANY GENERAL CONTRAC,18160,"NIANTIC, CT 06357-3128",CREWPI,FORD,2014,6,GRN,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
4,30001,335 MAIN ST,A BEAUTIFUL COMPANY GENERAL CONTRAC,21770,"NIANTIC, CT 06357-3128",Truck,RAM,2019,6,BLK,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17611,300149,13 ROXBURY CT,ZURZUSKI DAVID THOMAS 3rd,25880,"NIANTIC, CT 06357-1313",SUPERC,FORD,2018,6,BLK,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
17612,300150,ZURZUSKI SHEILA L,ZURZUSKI DAVID THOMAS 3rd,16970,13 ROXBURY CT,SEDAN4,SUBAR,2019,4,SIL,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
17613,300151,13 ROXBURY CT,ZURZUSKI DAVID THOMAS 3rd,2905,"NIANTIC, CT 06357-1313",WAGON4,JEEP,1999,6,GRY,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1
17614,300152,13 ROXBURY CT,ZURZUSKI ZACHARY H,18410,"NIANTIC, CT 06357-1313",SEDAN4,SUBAR,2018,4,BLU,...,Page: Exemptions,2962222,Gross:,206501863,Net:,203539641,ZUR,-,300153,Page -1 of 1


In [88]:
def address_splitter(addresses):
    address_list = list(addresses)
    
    address_states  = []
    address_zips    = []
    address_streets = []
    address_cities  = []
    
    # Fields needed
    fields_needed = ["USPSBoxType", 
                     "USPSBoxID", 
                     "AddressNumber", 
                     "StreetName", 
                     "StreetNamePostType",
                     "PlaceName", 
                     "StateName", 
                     "ZipCode"]
    
    
    # Go through each address and parse it
    for address in tqdm(addresses):
        empty_address_dict = dict.fromkeys(fields_needed)
        
        # Create the street+city, state, and zip
        try:
            parsed_address = usaddress.parse(address)
            parsed_address_list = [list(tup) for tup in parsed_address]
            
            for field in empty_address_dict.keys():
                field_values = [sublist[0] for sublist in parsed_address_list if sublist[1]==field]
                num_notempty = len(field_values)
                
                if num_notempty > 0:
                    empty_address_dict[field] = ' '.join([sublist[0].replace(',','') for sublist in parsed_address_list if sublist[1]==field])
                else:
                    empty_address_dict[field] = ""
            
            # We set the street, city, zip code, and state
            street = " ".join([empty_address_dict["USPSBoxType"],
                               empty_address_dict["USPSBoxID"],
                               empty_address_dict["AddressNumber"],
                               empty_address_dict["StreetName"],
                               empty_address_dict["StreetNamePostType"]]).strip()
            city =     empty_address_dict["PlaceName"]
            zip_code = empty_address_dict["ZipCode"]
            state =    empty_address_dict["StateName"]
            
        except:
            street = ""
            city = ""
            zip_code = ""
            state = ""
                
        # We append these to the master lists, that will be set as columns
        address_states.append(state)
        address_zips.append(zip_code)
        address_streets.append(street)
        address_cities.append(city)
        
    output_df = pd.DataFrame({"state": address_states,
                              "zip": address_zips,
                              "street":address_streets,
                              "city": address_cities})
    return output_df

In [89]:
df = address_splitter(east_lyme_21["Address cont"])

100%|██████████| 17616/17616 [00:01<00:00, 12273.97it/s]


In [90]:
east_lyme_21 = east_lyme_21.join(df)

In [91]:
east_lyme_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Lyme")].index.tolist()[0]
for column in ["state", "zip", "street", "city"]:
    column_renamer_2021.loc[east_lyme_index_21, column] = column

In [92]:
column_renamer_2021.loc[east_lyme_index_21]

record_from      045_East_Lyme_MV_21.csv
name                                Name
street                            street
city                                city
state                              state
zip                                  zip
vehicle_year                         NaN
vehicle_make                        Make
vehicle_model                      Model
vehicle_class                        NaN
vehicle_id                           Vin
lease_street                         NaN
UID                                  NaN
lease_city                           NaN
lease_state                          NaN
lease_zip                            NaN
Name: 33, dtype: object

In [93]:
column_renamer_2021.loc[east_lyme_index_21, "record_from"] = "045_East_Lyme_MV_21_ALTERED.csv"

### Ledyard 2021

In [94]:
ledyard_21 = pd.read_excel(raw_files_21 / "072_Ledyard_MV_21.xlsx")

In [95]:
ledyard_parse_address_1 = address_splitter(ledyard_21["ADDRESS"])
ledyard_parse_address_2 = address_splitter(ledyard_21["SECOND NAME/ADDRESS"])

ledyard_full_parsed_address = ledyard_parse_address_1.join(ledyard_parse_address_2, rsuffix = "_2")

100%|██████████| 13858/13858 [00:01<00:00, 11990.34it/s]
100%|██████████| 13858/13858 [00:01<00:00, 12700.06it/s]


In [96]:
ledyard_parse_address_3 = address_splitter(ledyard_21["ADDRESS.1"])

100%|██████████| 13858/13858 [00:00<00:00, 44812.14it/s]


In [97]:
mask_1 = ~(ledyard_full_parsed_address["street"] =="") #use street
mask_2 = ~(ledyard_full_parsed_address["street_2"] =="") #When there is NOT a name in the second address column
mask_3 = (ledyard_full_parsed_address["street_2"] =="") #When there is a name in the second address column
ledyard_full_parsed_address.loc[mask_1, "street_final"] = ledyard_full_parsed_address.loc[mask_1, "street"]
ledyard_full_parsed_address.loc[mask_2, "street_final"] = ledyard_full_parsed_address.loc[mask_2, "street_2"]
ledyard_full_parsed_address.loc[mask_3, "city"] = ledyard_parse_address_3.loc[mask_3, "city"]
ledyard_full_parsed_address.loc[mask_3, "zip"] = ledyard_parse_address_3.loc[mask_3, "zip"]
ledyard_full_parsed_address.loc[mask_3, "state"] = ledyard_parse_address_3.loc[mask_3, "state"]

In [98]:
ledyard_21 = ledyard_21.join(ledyard_full_parsed_address[["state", "city", "zip", "street_final"]])
ledyard_21 = ledyard_21.rename(columns = {"street_final":"street"})

In [99]:
ledyard_index_21 = column_renamer_2021[column_renamer_2021["record_from"].str.contains("Ledyard")].index.tolist()[0]
for column in ["state", "zip", "street", "city"]:
    column_renamer_2021.loc[ledyard_index_21, column] = column

In [100]:
column_renamer_2021.loc[ledyard_index_21, "record_from"] = "072_Ledyard_MV_21_ALTERED.csv"

## Check 2021 renamer

**Missing street address**

In [101]:
column_renamer_2021[column_renamer_2021["street"].isna()]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
27,037_Derby_MV_21.xls,,,,,,YEAR,MAKE,MODEL,,VIN,,,,,
42,055_Goshen_MV_21.XLSX,TAXPAYER,,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,
84,107_Orange_MV_21.xlsx,,,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,


These three files do not contain street addresses anyway - no way to fix this.

**Missing VIN**

In [102]:
column_renamer_2021[column_renamer_2021["vehicle_id"].isna()]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip


So there are no places where we don't have a name for the VIN column.

**Missing ZIP**

In [103]:
column_renamer_2021[column_renamer_2021["zip"].isna()]

Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
27,037_Derby_MV_21.xls,,,,,,YEAR,MAKE,MODEL,,VIN,,,,,
42,055_Goshen_MV_21.XLSX,TAXPAYER,,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,
77,098_Norfolk_MV_21.xlsx,TAXPAYER,RESIDENCE_STREET,RESIDENCE_CITY,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,RESIDENCE_STREET,,RESIDENCE_CITY,,
84,107_Orange_MV_21.xlsx,,,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,
143,117_Redding_MV_21.xlsx,,STREET,,,,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,,


The ZIP is genuinely missing in these underlying files - so nothing to be done here.

# 2020 Renamer

### Avon 2020

In [169]:
avon_2020 = pd.read_csv(path.parent.parent / "data" / "municipal_compilation" /"town_files_2019_2020" / "4_Avon_MVData_2020.csv")

In [186]:
avon_2020["state"] =  avon_2020["CITYST"].str[-2:].str.strip()
avon_2020["city"] = avon_2020["CITYST"].str[:-2].str.strip()
avon_2020.to_csv(path.parent.parent / "data" / "municipal_compilation" /"town_files_2019_2020" / "4_Avon_MVData_2020_ALTERED.csv")

In [190]:
avon_20_rename_dict = {"zip" : "ZIP",
                       "state" : "state",
                       "city" : "city",
                       "record_from" : "4_Avon_MVData_2020_ALTERED.csv"}
                       
fix_renamer("2020", "Avon", avon_20_rename_dict)

-----OLD-----
record_from      4_Avon_MVData_2020.csv
name                               NAME
street                           STREET
vehicle_id                       IDENT#
vehicle_make                       MAKE
vehicle_model                     MODEL
vehicle_year                         YR
city                               city
state                             state
vehicle_class                       NaN
zip                                 ZIP
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_street                        NaN
lease_zip                           NaN
Name: 1, dtype: object
-----NEW-----
record_from      4_Avon_MVData_2020_ALTERED.csv
name                                       NAME
street                                   STREET
vehicle_id                               IDENT#
vehicle_make                               MAKE
vehicle_model                             MODEL
vehicle_year         

### Newington 2020

In [193]:
fix_renamer("2020", "Newington", {"street":"MAILING ADDRESS","state":"ST", "vehicle_class":"DMV CLASS"})

-----OLD-----
record_from      96_Newington_MVData_2020.csv
name                                 TAXPAYER
street                        MAILING ADDRESS
vehicle_id                                VIN
vehicle_make                             MAKE
vehicle_model                           MODEL
vehicle_year                             YEAR
city                                     CITY
state                                      ST
vehicle_class                       DMV CLASS
zip                                       ZIP
UID                                       UID
lease_city                                NaN
lease_state                               NaN
lease_street                              NaN
lease_zip                                 NaN
Name: 39, dtype: object
-----NEW-----
record_from      96_Newington_MVData_2020.csv
name                                 TAXPAYER
street                        MAILING ADDRESS
vehicle_id                                VIN
vehicle_make                

### Andover - 2020

In [105]:
renamer_fields_andover = dict.fromkeys(column_renamer_2020.columns[1:])

In [106]:
renamer_fields_andover["street"] = "street"
renamer_fields_andover["city"] = "city"
renamer_fields_andover["state"]=  "state"
renamer_fields_andover["zip"] = "zip1"
renamer_fields_andover["vehicle_year"] = "vehicle_year" 
renamer_fields_andover["vehicle_make"] = "vehicle_make"
renamer_fields_andover["vehicle_model"] = "vehicle_model"
renamer_fields_andover["vehicle_class"] = "vehicle_class"
renamer_fields_andover["vehicle_id"] = "vehicle_id"

In [107]:
fix_renamer("2020", "Andover", renamer_fields_andover)

-----OLD-----
record_from      1_Andover_MVData_2020.csv
name                              TAXPAYER
street                                 NaN
vehicle_id                             NaN
vehicle_make                           NaN
vehicle_model                          NaN
vehicle_year                           NaN
city                                   NaN
state                                  NaN
vehicle_class                          NaN
zip                                    NaN
UID                                    NaN
lease_city                             NaN
lease_state                            NaN
lease_street                           NaN
lease_zip                              NaN
Name: 0, dtype: object
-----NEW-----
record_from      1_Andover_MVData_2020.csv
name                                  None
street                              street
vehicle_id                      vehicle_id
vehicle_make                  vehicle_make
vehicle_model                vehicle_model
veh

### Chaplin - 2020

In [108]:
renamer_fields_chaplin = dict.fromkeys(column_renamer_2020.columns[1:])

In [109]:
renamer_fields_chaplin["name"] = "taxpayer"
renamer_fields_chaplin["street"] = "street"
renamer_fields_chaplin["city"] = "city"
renamer_fields_chaplin["state"]=  "state"
renamer_fields_chaplin["zip"] = "zip1"
renamer_fields_chaplin["vehicle_year"] = "vehicle_year" 
renamer_fields_chaplin["vehicle_make"] = "vehicle_make"
renamer_fields_chaplin["vehicle_model"] = "vehicle_model"
renamer_fields_chaplin["vehicle_class"] = "vehicle_class"
renamer_fields_chaplin["vehicle_id"] = "vehicle_id"

In [110]:
fix_renamer("2020", "Chaplin", renamer_fields_chaplin)

-----OLD-----
record_from      24_Chaplin_MVData_2020.csv
name                               taxpayer
street                                  NaN
vehicle_id                              NaN
vehicle_make                            NaN
vehicle_model                           NaN
vehicle_year                            NaN
city                                    NaN
state                                   NaN
vehicle_class                           NaN
zip                                     NaN
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_street                            NaN
lease_zip                               NaN
Name: 6, dtype: object
-----NEW-----
record_from      24_Chaplin_MVData_2020.csv
name                               taxpayer
street                               street
vehicle_id                       vehicle_id
vehicle_make                   vehicle_make
vehicle_model            

### Torrington - 2020

In [111]:
torrington_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Torrington")].index.tolist()[0]

In [112]:
column_renamer_2020.loc[torrington_index_20]

record_from      143_Torrington_MVData_2020.csv
name                                   TAXPAYER
street                                   STREET
vehicle_id                                  NaN
vehicle_make                                NaN
vehicle_model                     VEHICLE MODEL
vehicle_year                       VEHICLE YEAR
city                                       CITY
state                                     STATE
vehicle_class                               NaN
zip                                    ZIP CODE
UID                                         NaN
lease_city                                  NaN
lease_state                                 NaN
lease_street                                NaN
lease_zip                                   NaN
Name: 69, dtype: object

In [113]:
column_renamer_2020.loc[torrington_index_20, "vehicle_id"] = "VEHICLE ID"
column_renamer_2020.loc[torrington_index_20, "vehicle_make"] = "VEHICLE MAKE"
column_renamer_2020.loc[torrington_index_20, "vehicle_class"] = "VEHICLE CLASS"

In [114]:
column_renamer_2020.loc[torrington_index_20]

record_from      143_Torrington_MVData_2020.csv
name                                   TAXPAYER
street                                   STREET
vehicle_id                           VEHICLE ID
vehicle_make                       VEHICLE MAKE
vehicle_model                     VEHICLE MODEL
vehicle_year                       VEHICLE YEAR
city                                       CITY
state                                     STATE
vehicle_class                     VEHICLE CLASS
zip                                    ZIP CODE
UID                                         NaN
lease_city                                  NaN
lease_state                                 NaN
lease_street                                NaN
lease_zip                                   NaN
Name: 69, dtype: object

### Orange - 2020

Cannot change this - the underlying file does not contain VINs

In [115]:
orange_index_20 = column_renamer_2020[column_renamer_2020["record_from"].str.contains("Orange")].index.tolist()[0]
column_renamer_2020.loc[orange_index_20]

record_from      107_Orange_MVData_2020.csv
name                               TAXPAYER
street                               STREET
vehicle_id                              NaN
vehicle_make                   VEHICLE_MAKE
vehicle_model                 VEHICLE_MODEL
vehicle_year                   VEHICLE_YEAR
city                                   CITY
state                                 STATE
vehicle_class                 VEHICLE_CLASS
zip                                    ZIP1
UID                                     NaN
lease_city                              NaN
lease_state                             NaN
lease_street                            NaN
lease_zip                               NaN
Name: 47, dtype: object

In [116]:
fix_renamer("2020", "Sherman", {"street":"ADDRESS", "state":"ST", "zip":"ZIP", "vehicle_model":"MODEL 3"})

-----OLD-----
record_from      127_Sherman_MVData_2020.csv
name                                   OWNER
street                                    ST
vehicle_id                               VIN
vehicle_make                            MAKE
vehicle_model                            NaN
vehicle_year                            YEAR
city                                    CITY
state                                    NaN
vehicle_class                          CLASS
zip                                      NaN
UID                                      NaN
lease_city                               NaN
lease_state                              NaN
lease_street                             NaN
lease_zip                                NaN
Name: 59, dtype: object
-----NEW-----
record_from      127_Sherman_MVData_2020.csv
name                                   OWNER
street                               ADDRESS
vehicle_id                               VIN
vehicle_make                            MAKE
veh

## Address missing ZIPs in 2020 renamer

In [117]:
missing_zips_2020_dict = {
 "96_Newington_MVData_2020.csv" : "ZIP",
 "127_Sherman_MVData_2020.csv" : "ZIP",
 "4_Avon_MVData_2020.csv" : "ZIP",
}

for source_file in missing_zips_2020_dict.keys():
    source_file_index = column_renamer_2020[column_renamer_2020["record_from"]==source_file].index.tolist()[0]
    column_renamer_2020.loc[source_file_index, "zip"] = missing_zips_2020_dict[source_file]

In [118]:
column_renamer_2020.loc[source_file_index]

record_from      4_Avon_MVData_2020.csv
name                               NAME
street                           STREET
vehicle_id                       IDENT#
vehicle_make                       MAKE
vehicle_model                     MODEL
vehicle_year                         YR
city                                NaN
state                               NaN
vehicle_class                       NaN
zip                                 ZIP
UID                                 NaN
lease_city                          NaN
lease_state                         NaN
lease_street                        NaN
lease_zip                           NaN
Name: 1, dtype: object

## Check 2020 renamer

In [119]:
column_renamer_2020[column_renamer_2020["vehicle_id"].isna()]

Unnamed: 0,record_from,name,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip,UID,lease_city,lease_state,lease_street,lease_zip
47,107_Orange_MVData_2020.csv,TAXPAYER,STREET,,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,CITY,STATE,VEHICLE_CLASS,ZIP1,,,,,


So it is just Orange, for which there are no VINs in the underlying file.

In [120]:
column_renamer_2020[column_renamer_2020["street"].isna()]

Unnamed: 0,record_from,name,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip,UID,lease_city,lease_state,lease_street,lease_zip
8,27_Clinton_MVData_2020.csv,,,VIN,Make,Model,Year,,,Class,,,,,,
49,112_Pomfret_MVData_2020.csv,TAXPAYER,,VEHICLE_ID,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,,,VEHICLE_CLASS,,,,,,


It is Clinton and Pomfret - neither of which contain street data in the underlying file.

In [121]:
column_renamer_2020[column_renamer_2020["zip"].isna()]

Unnamed: 0,record_from,name,street,vehicle_id,vehicle_make,vehicle_model,vehicle_year,city,state,vehicle_class,zip,UID,lease_city,lease_state,lease_street,lease_zip
8,27_Clinton_MVData_2020.csv,,,VIN,Make,Model,Year,,,Class,,,,,,
49,112_Pomfret_MVData_2020.csv,TAXPAYER,,VEHICLE_ID,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_YEAR,,,VEHICLE_CLASS,,,,,,


This data is missing in the underlying file anyway

## 2019 Renamer

### Canterbury 2019

In [209]:
fix_renamer("2019", "Canterbury", {"name":"Owner",
                                   "street":"Address",
                                  "state":"ST"})

-----OLD-----
record_from      22_Canterbury_MVData_2019.csv
UID                                        NaN
name                                     Owner
street                                 Address
city                                      City
state                                       ST
zip                                        ZIP
vehicle_year                              Year
vehicle_make                              Make
vehicle_model                            Model
vehicle_class                            Class
vehicle_id                                 VIN
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 15, dtype: object
-----NEW-----
record_from      22_Canterbury_MVData_2019.csv
UID                                        NaN
name                                     Owner
street                                 Address
city    

### Bloomfield - 2019

In [122]:
bloomfield_index_19 = column_renamer_2019[column_renamer_2019["record_from"].str.contains("Bloomfield")].index.tolist()[0]
column_renamer_2019.loc[bloomfield_index_19]

record_from      11_Bloomfield_MVData_2019.csv
UID                                    LIST NO
name                                  TAXPAYER
street                                     NaN
city                                       NaN
state                                    STATE
zip                                        NaN
vehicle_year                              YEAR
vehicle_make                               NaN
vehicle_model                            MODEL
vehicle_class                            CLASS
vehicle_id                                 NaN
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 6, dtype: object

In [123]:
column_renamer_2019.loc[bloomfield_index_19, "street"] = "ADDRESS LINE 1"
column_renamer_2019.loc[bloomfield_index_19, "city"] = "CITY/TOWN"
column_renamer_2019.loc[bloomfield_index_19, "zip"] = "ZIP"
column_renamer_2019.loc[bloomfield_index_19, "vehicle_make"] = "MAKE "
column_renamer_2019.loc[bloomfield_index_19, "vehicle_year"] = "YEAR"
column_renamer_2019.loc[bloomfield_index_19]

record_from      11_Bloomfield_MVData_2019.csv
UID                                    LIST NO
name                                  TAXPAYER
street                          ADDRESS LINE 1
city                                 CITY/TOWN
state                                    STATE
zip                                        ZIP
vehicle_year                              YEAR
vehicle_make                             MAKE 
vehicle_model                            MODEL
vehicle_class                            CLASS
vehicle_id                                 NaN
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 6, dtype: object

### Manchester -  2019

In [124]:
manchester_19 = pd.read_csv(raw_files_20_19 / "77_Manchester_MVData_2019.csv")

In [125]:
manchester_19 = manchester_19[0:45785]

In [126]:
manchester_19 = manchester_19.rename(columns = {"Unnamed: 4" : "city",
                                                "Unnamed: 5" : "state"})
manchester_19["zip"] = manchester_19["Unnamed: 6"].astype('Int64').astype(str) + manchester_19["Unnamed: 7"] + manchester_19["Unnamed: 8"]

In [127]:
manchester_19

Unnamed: 0,#,TAXPAYER,CARE OF,MAILING ADDRESS,city,state,Unnamed: 6,Unnamed: 7,Unnamed: 8,DIST,...,MODEL,CODE,VIN NUMBER,STYLE,COLOR,ASMNT,Unnamed: 18,Unnamed: 19,Unnamed: 20,zip
0,50001.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 183,NORTH WINDHAM,CT,6256.0,-,0183,E,...,FRR,2.0,JALF5C13547700369,TILT C,WHI,"$7,540",,,,6256-0183
1,50002.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 183,NORTH WINDHAM,CT,6256.0,-,0183,E,...,61214R0,10.0,43ZDN22B340000019,TRAILE,BLA,$550,,,,6256-0183
2,50003.0,1 800 DUMP RUNS OF HARTFORD LLC,,PO BOX 183,NORTH WINDHAM,CT,6256.0,-,0183,E,...,CONSTRUC,2.0,2NKMHZ7X07M210675,HOIST,WHI,"$16,230",,,,6256-0183
3,50004.0,A & A INTEGRATED PEST MAN &,,457 CENTER ST,MANCHESTER,CT,6040.0,-,3937,T,...,SILVERAD,3.0,1GCVKREC4FZ374901,4 DOOR,SIL,"$17,500",,,,6040-3937
4,50005.0,A & A INTEGRATED PEST MANAGEMENT COMPANY,,457 CENTER ST,MANCHESTER,CT,6040.0,-,3937,T,...,CITY EXP,3.0,3N63M0ZN8HK697113,VAN,WHI,"$10,970",,,,6040-3937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45780,96539.0,ZYLBERMAN SOL,,87 CONCORD RD,MANCHESTER,CT,6042.0,-,1723,T,...,GENESIS,1.0,KMHGN4JE6GU143308,SEDAN,SIL,"$14,960",,,,6042-1723
45781,96540.0,ZYREK ANDREW R,ZYREK GEORGE P,37 ABBE RD,MANCHESTER,CT,6040.0,-,6867,T,...,SPIDER G,25.0,ZARBA5649H1046161,CONVER,BLA,$500,,,,6040-6867
45782,96541.0,ZYREK GEORGE P,,37 ABBE RD,MANCHESTER,CT,6040.0,-,6867,T,...,E3504M A,1.0,WDDHF8JB3CA537403,SEDAN,BLU,"$9,800",,,,6040-6867
45783,96542.0,ZYSK DONALD R,,197 GARDNER ST,MANCHESTER,CT,6040.0,-,6754,T,...,F150,1.0,1FTRF12859KB23566,PICKUP,WHI,"$6,530",,,,6040-6754


In [128]:
manchester_index_19 = column_renamer_2019[column_renamer_2019["record_from"].str.contains('Manchester')].index.tolist()[0]
column_renamer_2019.loc[manchester_index_19]

record_from      77_Manchester_MVData_2019.csv
UID                                        NaN
name                                  TAXPAYER
street                                     NaN
city                                       NaN
state                                      NaN
zip                                        NaN
vehicle_year                              YEAR
vehicle_make                              MAKE
vehicle_model                            MODEL
vehicle_class                              NaN
vehicle_id                          VIN NUMBER
lease_city                                 NaN
lease_state                                NaN
lease_street                               NaN
lease_zip                                  NaN
Name: 48, dtype: object

In [129]:
column_renamer_2019.loc[manchester_index_19, "street"] = "MAILING ADDRESS"
column_renamer_2019.loc[manchester_index_19, "state"] = "state"
column_renamer_2019.loc[manchester_index_19, "city"] = "city"
column_renamer_2019.loc[manchester_index_19, "zip"] = "zip"
column_renamer_2019.loc[manchester_index_19, "record_from"] = "77_Manchester_MVData_2019_ALTERED.csv"
column_renamer_2019.loc[manchester_index_19]

record_from      77_Manchester_MVData_2019_ALTERED.csv
UID                                                NaN
name                                          TAXPAYER
street                                 MAILING ADDRESS
city                                              city
state                                            state
zip                                                zip
vehicle_year                                      YEAR
vehicle_make                                      MAKE
vehicle_model                                    MODEL
vehicle_class                                      NaN
vehicle_id                                  VIN NUMBER
lease_city                                         NaN
lease_state                                        NaN
lease_street                                       NaN
lease_zip                                          NaN
Name: 48, dtype: object

### Address missing ZIPs in 2019 renamer

In [130]:
missing_zips_2019_dict = {
 "22_Canterbury_MVData_2019.csv" : "ZIP",
 "44_East_Lyme_MVData_2019.csv" : "Zip1"
}

for source_file in missing_zips_2019_dict.keys():
    source_file_index = column_renamer_2019[column_renamer_2019["record_from"]==source_file].index.tolist()[0]
    column_renamer_2019.loc[source_file_index, "zip"] = missing_zips_2019_dict[source_file]

## Check 2019

In [131]:
column_renamer_2019[column_renamer_2019["street"].isna()]

Unnamed: 0,record_from,UID,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_city,lease_state,lease_street,lease_zip
5,10_Bethlehem_MVData_2019.csv,,TAXPAYER,,CITY,STATE,ZIP1,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,,,,


Bethlehem does not contain street addresses in the underlying file, anyway.

In [132]:
column_renamer_2019[column_renamer_2019["vehicle_id"].isna()]

Unnamed: 0,record_from,UID,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_city,lease_state,lease_street,lease_zip
6,11_Bloomfield_MVData_2019.csv,LIST NO,TAXPAYER,ADDRESS LINE 1,CITY/TOWN,STATE,ZIP,YEAR,MAKE,MODEL,CLASS,,,,,


Bloomfield does not contain VINs in the underlying file, anyway.

In [133]:
column_renamer_2019[column_renamer_2019["zip"].isna()]

Unnamed: 0,record_from,UID,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_city,lease_state,lease_street,lease_zip


So no issues with missing ZIPs here

# --- REMAKE FILE ---

In this section we finally recompile the file.

In [134]:
import time
import warnings

warnings.filterwarnings("ignore")

In [135]:
sorted(column_renamer_2021.columns) == sorted(column_renamer_2020.columns)== sorted(column_renamer_2019.columns)

True

First we save all of the underlying files we edited, with `_ALTERED` at the end. Note that we have updated the `record_from` column of the renamers before doing this.

In [408]:
# Files that will be updated. However, recall that some of the files, we've made edits to. 
# Therefore we save them
colchester_21.to_csv(raw_files_21 / "028_Colchester_MV_21_ALTERED.csv")
salisbury_21.to_csv(raw_files_21  /  "122_Salisbury_MV_21_ALTERED.csv")
manchester_21.to_csv(raw_files_21 / "077_Manchester_MV_21_ALTERED.csv")
bridgewater_21.to_csv(raw_files_21 / "016_Bridgewater_MV_21_ALTERED.csv")
east_lyme_21.to_csv(raw_files_21 / "045_East_Lyme_MV_21_ALTERED.csv")
ledyard_21.to_csv(raw_files_21 / "072_Ledyard_MV_21_ALTERED.csv")
manchester_19.to_csv(raw_files_20_19 / "77_Manchester_MVData_2019_ALTERED.csv")
# Add AVON

We must update the raw file lists to ensure that the old files are removed

In [None]:
# We then update the file lists
# First remove the files that we have changed
list_21 = [item for item in list_21 if not item in ["028_Colchester_MV_21.xlsx",
                                                    "122_Salisbury_MV_21.xlsx",
                                                    "077_Manchester_MV_21.xls",
                                                    "016_Bridgewater_MV_21.xlsx",
                                                    "045_East_Lyme_MV_21.csv",
                                                    "072_Ledyard_MV_21.xlsx"]]

list_20_19.remove("77_Manchester_MVData_2019.csv")
list_20_19.remove("4_Avon_MVData_2020.csv")

In [218]:
# List of altered files
altered_files = ["028_Colchester_MV_21_ALTERED.csv", "122_Salisbury_MV_21_ALTERED.csv", "077_Manchester_MV_21_ALTERED.csv",
                 "016_Bridgewater_MV_21_ALTERED.csv", "77_Manchester_MVData_2019_ALTERED.csv","045_East_Lyme_MV_21_ALTERED.csv",
                 "072_Ledyard_MV_21_ALTERED.csv", "4_Avon_MVData_2020_ALTERED.csv"]

# Now append the altered files to the lists 
for altered_file in altered_files:
    if "_MV_21" in altered_file:
        if not altered_file in list_21:
            list_21.append(altered_file)
    else:
        if not altered_file in list_20_19:
            list_20_19.append(altered_file)

Now change the renamers so that the "record_from" column is right

In [150]:
# Create master output dataframe
df_compiled_update = pd.DataFrame(columns = column_renamer_2021.columns)

In [151]:
updating_files = list_21 + list_20_19

In [152]:
count = 0
t_start = time.time()
rows = 0

In [165]:
for f in tqdm(updating_files[66:]):
    year = 0
    
    # Set paths and renaming file depending on the year
    # Set renaming dictionary depending on the year
    if "_21" in f:
        year = 2021
        f_path = raw_files_21 / f
        df_rename = column_renamer_2021
    elif "_2019" in f:
        year = 2019
        f_path = raw_files_20_19 / f
        df_rename = column_renamer_2019
    elif "_2020" in f:
        year = 2020
        f_path = raw_files_20_19 / f
        df_rename = column_renamer_2020
    
    df_rename = df_rename.set_index("record_from")
    
    # Open the file
    try:
        if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
            df_f = pd.read_csv(f_path)
        elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
            df_f = pd.read_excel(f_path)
        else:
            continue
    except:
        if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
            df_f = pd.read_csv(f_path, encoding = 'unicode_escape')
        elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
            df_f = pd.read_excel(f_path, encoding= 'unicode_escape')
        else:
            continue

    # get the renaming dict for f and keep valid columns
    rename_dict_f = {v: k for k, v in df_rename.loc[f][df_rename.loc[f].notnull()].to_dict().items()}
    valid_col_f = [k for k in rename_dict_f]
    df_f = df_f[valid_col_f].rename(columns=rename_dict_f)
    df_f['record_from'] = f        # add a column for path

    # Get length
    add_rows = len(df_f)
    rows +=add_rows

    # Concat
    df_compiled_update = pd.concat([df_compiled_update, df_f], ignore_index=True)

    count = count + 1
    if count % 10 == 0:
        t_end = time.time()
        print('finished the {}th file; time used for the last ten files: {:2f} sec'.format(count, t_end - t_start))
        t_start = t_end

  2%|▏         | 4/262 [00:08<11:47,  2.74s/it]

finished the 70th file; time used for the last ten files: 311.326070 sec


  5%|▌         | 14/262 [00:27<07:11,  1.74s/it]

finished the 80th file; time used for the last ten files: 19.264122 sec


  9%|▉         | 24/262 [00:53<07:52,  1.99s/it]

finished the 90th file; time used for the last ten files: 25.886863 sec


 13%|█▎        | 34/262 [01:12<12:41,  3.34s/it]

finished the 100th file; time used for the last ten files: 19.111655 sec


 17%|█▋        | 44/262 [01:42<07:35,  2.09s/it]

finished the 110th file; time used for the last ten files: 30.153198 sec


 21%|██        | 54/262 [02:06<08:03,  2.32s/it]

finished the 120th file; time used for the last ten files: 23.168924 sec


 24%|██▍       | 64/262 [02:28<07:20,  2.22s/it]

finished the 130th file; time used for the last ten files: 22.710615 sec


 28%|██▊       | 74/262 [02:57<08:57,  2.86s/it]

finished the 140th file; time used for the last ten files: 29.078697 sec


 32%|███▏      | 84/262 [03:21<07:05,  2.39s/it]

finished the 150th file; time used for the last ten files: 23.224851 sec


 36%|███▌      | 94/262 [03:47<08:21,  2.98s/it]

finished the 160th file; time used for the last ten files: 26.786367 sec


 40%|███▉      | 104/262 [03:53<01:44,  1.52it/s]

finished the 170th file; time used for the last ten files: 5.923675 sec


 44%|████▎     | 114/262 [03:59<01:27,  1.68it/s]

finished the 180th file; time used for the last ten files: 5.966673 sec


 47%|████▋     | 124/262 [04:06<01:39,  1.38it/s]

finished the 190th file; time used for the last ten files: 6.832741 sec


 51%|█████     | 134/262 [04:13<01:20,  1.59it/s]

finished the 200th file; time used for the last ten files: 6.438436 sec


 55%|█████▍    | 144/262 [04:19<01:25,  1.38it/s]

finished the 210th file; time used for the last ten files: 6.863443 sec


 59%|█████▉    | 154/262 [04:27<01:25,  1.26it/s]

finished the 220th file; time used for the last ten files: 7.685555 sec


 63%|██████▎   | 164/262 [04:35<01:15,  1.29it/s]

finished the 230th file; time used for the last ten files: 7.672926 sec


 66%|██████▋   | 174/262 [04:43<01:17,  1.13it/s]

finished the 240th file; time used for the last ten files: 8.402695 sec


 70%|███████   | 184/262 [04:51<01:00,  1.28it/s]

finished the 250th file; time used for the last ten files: 7.797152 sec


 74%|███████▍  | 194/262 [04:59<00:56,  1.20it/s]

finished the 260th file; time used for the last ten files: 8.412917 sec


 78%|███████▊  | 204/262 [05:08<00:49,  1.17it/s]

finished the 270th file; time used for the last ten files: 8.647869 sec


 82%|████████▏ | 214/262 [05:17<00:41,  1.16it/s]

finished the 280th file; time used for the last ten files: 8.766355 sec


 85%|████████▌ | 224/262 [05:27<00:38,  1.01s/it]

finished the 290th file; time used for the last ten files: 10.034203 sec


 89%|████████▉ | 234/262 [05:36<00:26,  1.06it/s]

finished the 300th file; time used for the last ten files: 9.383341 sec


 93%|█████████▎| 244/262 [05:46<00:17,  1.01it/s]

finished the 310th file; time used for the last ten files: 9.956859 sec


 97%|█████████▋| 254/262 [05:56<00:08,  1.00s/it]

finished the 320th file; time used for the last ten files: 10.045131 sec


100%|██████████| 262/262 [06:04<00:00,  1.39s/it]


In [216]:
version = "102023"
if sys.platform == 'linux':
    df_compiled_update.to_csv(path.parent.parent/ "data" / "municipal_dataset_latest" / f"2019-21_data_compiled_RN_{version}.csv")
else:
    df_compiled_update.to_csv(raw_files_21.parent / "Compiled" / "2019-21_data_compiled_RN_100323.csv")

In [168]:
len(df_compiled_update)

5787151

In [686]:
len(df_compiled_update)

5787151

### Hack add

In [210]:
df_compiled_update_2 = df_compiled_update.copy(deep=True)
df_compiled_update = pd.DataFrame([])

In [211]:
for f in tqdm(["22_Canterbury_MVData_2019.csv"]):
    year = 0
    
    # Set paths and renaming file depending on the year
    # Set renaming dictionary depending on the year
    if "_21" in f:
        year = 2021
        f_path = raw_files_21 / f
        df_rename = column_renamer_2021
    elif "_2019" in f:
        year = 2019
        f_path = raw_files_20_19 / f
        df_rename = column_renamer_2019
    elif "_2020" in f:
        year = 2020
        f_path = raw_files_20_19 / f
        df_rename = column_renamer_2020
    
    df_rename = df_rename.set_index("record_from")
    
    # Open the file
    try:
        if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
            df_f = pd.read_csv(f_path)
        elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
            df_f = pd.read_excel(f_path)
        else:
            continue
    except:
        if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
            df_f = pd.read_csv(f_path, encoding = 'unicode_escape')
        elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
            df_f = pd.read_excel(f_path, encoding= 'unicode_escape')
        else:
            continue

    # get the renaming dict for f and keep valid columns
    rename_dict_f = {v: k for k, v in df_rename.loc[f][df_rename.loc[f].notnull()].to_dict().items()}
    valid_col_f = [k for k in rename_dict_f]
    df_f = df_f[valid_col_f].rename(columns=rename_dict_f)
    df_f['record_from'] = f        # add a column for path

    # Get length
    add_rows = len(df_f)
    rows +=add_rows

    # Concat
    df_compiled_update = pd.concat([df_compiled_update, df_f], ignore_index=True)

    count = count + 1
    if count % 10 == 0:
        t_end = time.time()
        print('finished the {}th file; time used for the last ten files: {:2f} sec'.format(count, t_end - t_start))
        t_start = t_end

100%|██████████| 1/1 [00:00<00:00, 22.47it/s]


In [212]:
print(len(df_compiled_update_2))
df_compiled_update_2 = df_compiled_update_2.loc[~df_compiled_update_2["record_from"].isin(["22_Canterbury_MVData_2019.csv"]), :]
print(len(df_compiled_update_2))

5787151
5780817


In [214]:
5787151-5780817

6334

In [215]:
df_compiled_update = pd.concat([df_compiled_update_2, df_compiled_update])