Title: vehicle registration compile <br/>
Inputs: raw vehicle registration data from CT towns <br/>
Output: (1) runnersingle flat file <br/>
Author: Jiarong Qi (jiarong.qi@yale.edu) <br/>
Date: Oct 2022 <br/>
Note: please also refer to the R codes Compile vehicle data from raw files by Asa Watten (asa.watten@yale.edu) <br/>

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import datetime

## create/update run controller

### what files to look at

change txt to csv manually

In [10]:
# which file types we are lookin at
targeting_type = ['xlsx','xls','csv']

In [11]:
# move files to skipping_path
# could be based on file types (e.g., txt & pdf)
# or file structure (e.g., Salem)
skipping_path = '../../1_rawdata/Vehicles_2022/Town files -- need additional treating'
files_skip = os.listdir(skipping_path)
files_skip

[]

In [12]:
path = "../../1_rawdata/Vehicles_2022/Town files/"

files = os.listdir(path)
included_files = [f for f in files if f not in files_skip]

files_by_type = {}
for t in targeting_type:
    files_t = [f for f in included_files if f.capitalize().endswith(t)]
    files_by_type.update({t:files_t})

{t:len(files_by_type[t]) for t in files_by_type}

{'xlsx': 147, 'xls': 3, 'csv': 9}

In [13]:
len(included_files)

159

In [14]:
# see if all files has been renamed
# if not, run file_renamer.ipynb first
[i for i in included_files if not i[:3].isdigit()]

[]

### check & update column renaming dictionary

In [15]:
# update after running the codes below
column_matching_dict = {
    'UID': ['uid', 'Unique_ID', 'UID', 'LIST_NO', 'UNIQUE_ID', 'LIST NO', 'Unique.ID', 'LIST #', 'List#', 
        'Unique ID'],
    'name': ['TAXPAYER', 'Name', 'owner', 'NAME', 'taxpayer', 'Taxpayer', 'taxpayer_name', 'name', 'OWNER',
        'PrimaryOwnerFirstName'], 
    'street': ['STREET', 'Street', 'ADDRESS 1', 'ADDRESS', 'address', 'Address', 'st', 'ST', 
        'Address 1', 'PrimaryOwnerAddressLine1'], 
    'city': ['CITY', 'City', 'PrimaryOwnerCity', 'CITY/TOWN'], 
    'state': ['STATE', 'State', 'State ', 'PrimaryOwnerState'], 
    'zip': ['ZIP1', 'Zip1', 'ZIP CODE', 'Zip', 'Zip Code', 'PrimaryOwnerZip'],
    'lease_street': ['STREET_MAILING_ADDR', 'Residence Address', 'Residential Address 1', 'Lessee1ResidencyAddressLine1',
        'RESIDENCE_STREET_ADDR', 'RESIDENTIAL ADDRESS 1', 'RESIDENCE_STREET', 'PrimaryOwnerResidencyAddressLine1', 
        'Street.1'],
    'lease_city': ['CITY_MAILING_ADDR', 'Residential City', 'City.1', 'RESIDENTIAL CITY', 'RESIDENCE_CITY', 
        'PrimaryOwnerResidencyCity'],
    'lease_state': ['STATE_MAILING_ADDR', 'Residential State', 'State.1', 'RESIDENTIAL STATE', 
        'Lessee1ResidencyState', 'PrimaryOwnerResidencyState'],
    'lease_zip': ['ZIP1_MAILING_ADDR', 'Zip1.1', 'Res Zip', 'RESIDENTIAL ZIP CODE', 'Lessee1ResidencyZip',
        'PrimaryOwnerResidencyZip'],
    'vehicle_year': ['VEHICLE_YEAR', 'YEAR', 'Year', 'VEHICLE YEAR', 'year', 'YR', 'MV YEAR'], 
    'vehicle_make': ['VEHICLE_MAKE', 'Make', 'MAKE', 'make', 'Vehicle Make', 'MV MAKE'], 
    'vehicle_model': ['VEHICLE_MODEL', 'MODEL', 'Model', 'MODEL X', 'VEHICLE MODEL', 'Vehicle Model', 'MV MODEL'], 
    'vehicle_class': ['VEHICLE_CLASS', 'CLASS', 'Class', 'class', 'CLASS CODE', 'Vehicle Class', 'MV CLASS'],
    'vehicle_id': ['VEHICLE_ID', 'VIN', 'Vin#', 'VIN NO', 'VIN ID', 'Vin', 'Vin.', 'Vehicle Vin', 
        'vin', 'IDENT#', 'vin', 'Vehicle ID', 'VehicleRegistrationID', 'VehicleID', 'VIN NUMBER', 'Vehicle Vin']
}
# pd.DataFrame.from_dict(column_renaming_dict, orient='index')

In [16]:
try:
    df_townfile_columns = pd.read_csv('../../1_rawdata/Vehicles_2022/Compiled/2021/townfile_columns.csv').set_index('file')
    updating_files = [f for f in included_files if f not in df_townfile_columns.index.tolist()]
except FileNotFoundError:
    print('vehicle_townfile_columns.csv not exist')
    updating_files = included_files
    df_townfile_columns = pd.DataFrame()

In [17]:
# see what are the columns
all_col = set()
col_dict = {}
double_check_list = []

for f in updating_files:
    f_path = path + f
    if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
        df_f = pd.read_csv(f_path, nrows=1)
    elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
        df_f = pd.read_excel(f_path, nrows=1)
    else:
        continue

    f_col = df_f.columns.tolist()

    # print files that needs to be double checked; skip the files for now
    # (1) with too many columns: check when more than 35
    if len(f_col) > 35:
        print('too many columns: {}, {}'.format(f, len(f_col)))
        double_check_list.append(f)
    # (2) too few columns: less or equal to 3
    elif len(f_col) <= 3:
        print('too few columns: {}, {}'.format(f, len(f_col)))
        double_check_list.append(f)
    # (3) first few columns unnamned
    elif 'unnamed' in f_col[0].lower() or 'unnamed' in f_col[1].lower() or 'unnamed' in f_col[2].lower():
        print('first few columns unnamned: {}'.format(f))
        double_check_list.append(f)
    # update the dict & column collection otherwise
    else:       
        col_dict.update({f:f_col})
        all_col = all_col.union(set(f_col))

In [18]:
# check the files printed above
# add to dict manually if the record looks fine
# or after editing the file
for f in double_check_list:
    f_path = path + f
    if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
        df_f = pd.read_csv(f_path, nrows=1)
    elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
        df_f = pd.read_excel(f_path, nrows=1)
    else:
        continue

    f_col = df_f.columns.tolist()
    col_dict.update({f:f_col})
    all_col = all_col.union(set(f_col))

In [19]:
df_townfile_columns_update = pd.DataFrame.from_dict(col_dict,orient='index')
df_townfile_columns_update.columns = df_townfile_columns_update.columns.astype('str')
df_townfile_columns = df_townfile_columns.append(df_townfile_columns_update)

  df_townfile_columns = df_townfile_columns.append(df_townfile_columns_update)


In [20]:
df_townfile_columns_update.index.name = 'file'
df_townfile_columns.index.name = 'file'

In [21]:
df_townfile_columns_update.to_csv('../1_temperary result/vehicle_townfile_columns_update_{}.csv'.format(datetime.date.today().strftime("%Y%m%d")))
df_townfile_columns_update.to_csv('../../1_rawdata/Vehicles_2022/Compiled/2021/vehicle_townfile_columns_update_{}.csv'.format(datetime.date.today().strftime("%Y%m%d")))
df_townfile_columns.to_csv('../1_temperary result/vehicle_townfile_columns.csv')
df_townfile_columns.to_csv('../../1_rawdata/Vehicles_2022/Compiled/2021/townfile_columns.csv')

counting columns

In [22]:
try:
    df_col_count = pd.read_csv('../1_temperary result/column_counting.csv')
except FileNotFoundError:
    print('column_counting.csv not exist')
    df_col_count = pd.DataFrame()

In [23]:
all_col = list(all_col)

In [24]:
col_counting = {all_col[i]: 0 for i in range(0, len(all_col))}
for f in col_dict:
    for c in col_dict[f]:
        col_counting[c] = col_counting[c] + 1

# sorting
col_counting = dict(sorted(col_counting.items(), key=lambda x: x[1], reverse=True))
# store
df_col_count_update = pd.DataFrame.from_dict(col_counting, orient='index')
df_col_count_update.columns = df_col_count_update.columns.astype('str')
df_col_count = df_col_count.append(df_col_count_update)

  df_col_count = df_col_count.append(df_col_count_update)


In [25]:
df_col_count.to_csv('../1_temperary result/column_counting.csv')
df_col_count_update.to_csv('../1_temperary result/column_counting_update.csv')

In [26]:
len(col_counting)

31

In [27]:
pd.Series([i for i in col_counting]).unique()

array(['STATE', 'CITY', 'TAXPAYER', 'STREET', 'VEHICLE_MAKE', 'STREET2',
       'VEHICLE_CLASS', 'ZIP1', 'VEHICLE_MODEL', 'VEHICLE_YEAR',
       'VEHICLE_ID', 'IN_CO_MAILING_ADDR2', 'GL_EXMPT', 'GL_VALUE',
       'VEHICLE_P_COLOR', 'ZIP1_MAILING_ADDR', 'ZIP2_MAILING_ADDR',
       'ZIP2', 'CITY_MAILING_ADDR', 'STREET_MAILING_ADDR',
       'STREET_MAILING_ADDR2', 'UNIQUE_ID', 'VEHICLE_BODY_STYLE',
       'RECORD_YEAR', 'PRIOR_TAXPAYER', 'RECORD_TYPE',
       'VEHICLE_LIGHT_WEIGHT', 'VEHICLE_GROSS_WEIGHT', 'LIST_NO',
       'IN_CARE_OF', 'STATE_MAILING_ADDR'], dtype=object)

In [28]:
df_townfile_columns_update

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
090_New_Canaan_MV_21.xlsx,LIST_NO,RECORD_YEAR,RECORD_TYPE,UNIQUE_ID,TAXPAYER,PRIOR_TAXPAYER,IN_CARE_OF,STREET,STREET2,CITY,...,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,VEHICLE_ID,VEHICLE_BODY_STYLE,VEHICLE_P_COLOR,VEHICLE_LIGHT_WEIGHT,VEHICLE_GROSS_WEIGHT,GL_VALUE,GL_EXMPT
147_Voluntown_MV_21.xlsx,TAXPAYER,STREET,STREET2,CITY,STATE,ZIP1,VEHICLE_YEAR,VEHICLE_MAKE,VEHICLE_MODEL,VEHICLE_CLASS,...,,,,,,,,,,


columns we want: Name, Street, City, State, Zip, ResStreet, ResCity, ResState, ResZip, Year, Make, Model, VIN </br>
check column_counting_update.csv to see if need to edit

### create run controller

In [29]:
# update after running the codes below
column_matching_dict = {
    'UID': ['uid', 'Unique_ID', 'UID', 'LIST_NO', 'UNIQUE_ID', 'LIST NO', 'Unique.ID', 'LIST #', 'List#', 
        'Unique ID'],
    'name': ['TAXPAYER', 'Name', 'owner', 'NAME', 'taxpayer', 'Taxpayer', 'taxpayer_name', 'name', 'OWNER',
        'PrimaryOwnerFirstName'], 
    'street': ['STREET', 'Street', 'ADDRESS 1', 'ADDRESS', 'address', 'Address', 'st', 'ST', 
        'Address 1', 'PrimaryOwnerAddressLine1'], 
    'city': ['CITY', 'City', 'PrimaryOwnerCity', 'CITY/TOWN'], 
    'state': ['STATE', 'State', 'State ', 'PrimaryOwnerState'], 
    'zip': ['ZIP1', 'Zip1', 'ZIP CODE', 'Zip', 'Zip Code', 'PrimaryOwnerZip'],
    'lease_street': ['STREET_MAILING_ADDR', 'Residence Address', 'Residential Address 1', 'Lessee1ResidencyAddressLine1',
        'RESIDENCE_STREET_ADDR', 'RESIDENTIAL ADDRESS 1', 'RESIDENCE_STREET', 'PrimaryOwnerResidencyAddressLine1', 
        'Street.1'],
    'lease_city': ['CITY_MAILING_ADDR', 'Residential City', 'City.1', 'RESIDENTIAL CITY', 'RESIDENCE_CITY', 
        'PrimaryOwnerResidencyCity'],
    'lease_state': ['STATE_MAILING_ADDR', 'Residential State', 'State.1', 'RESIDENTIAL STATE', 
        'Lessee1ResidencyState', 'PrimaryOwnerResidencyState'],
    'lease_zip': ['ZIP1_MAILING_ADDR', 'Zip1.1', 'Res Zip', 'RESIDENTIAL ZIP CODE', 'Lessee1ResidencyZip',
        'PrimaryOwnerResidencyZip'],
    'vehicle_year': ['VEHICLE_YEAR', 'YEAR', 'Year', 'VEHICLE YEAR', 'year', 'YR', 'MV YEAR'], 
    'vehicle_make': ['VEHICLE_MAKE', 'Make', 'MAKE', 'make', 'Vehicle Make', 'MV MAKE'], 
    'vehicle_model': ['VEHICLE_MODEL', 'MODEL', 'Model', 'MODEL X', 'VEHICLE MODEL', 'Vehicle Model', 'MV MODEL'], 
    'vehicle_class': ['VEHICLE_CLASS', 'CLASS', 'Class', 'class', 'CLASS CODE', 'Vehicle Class', 'MV CLASS'],
    'vehicle_id': ['VEHICLE_ID', 'VIN', 'Vin#', 'VIN NO', 'VIN ID', 'Vin', 'Vin.', 'Vehicle Vin', 
        'vin', 'IDENT#', 'vin', 'Vehicle ID', 'VehicleRegistrationID', 'VehicleID', 'VIN NUMBER', 'Vehicle Vin']
}
# pd.DataFrame.from_dict(column_renaming_dict, orient='index')

In [30]:
# swap key & value
column_renaming_dict = dict()
for key in column_matching_dict:
    for value in column_matching_dict[key]:
        column_renaming_dict.update({value: key})

In [31]:
# where the run controller shall be installed
# rc - run controller - the index of column in the original file matching the target
# renamer - what each column in the original file matching the target
sav_path_rc = '../../1_rawdata\Vehicles_2022/Compiled/2021/vehicle_2021_run_controller.csv'
sav_path_renamer = '../../1_rawdata\Vehicles_2022\Compiled/2021/vehicle_2021_column_renamer.csv'
col = ['record_from'].append(i for i in [column_matching_dict])

# access the file if exist; create if not
try:
    df_rc = pd.read_csv(sav_path_rc)
    df_rename = pd.read_csv(sav_path_renamer)

except FileNotFoundError: 
    print('files not exist')
    df_rc = pd.DataFrame(columns=col)
    df_rename = pd.DataFrame(columns=col)

df_rc_update = pd.DataFrame(columns=col)
df_rename_update = pd.DataFrame(columns=col)

In [32]:
df_townfile_columns = pd.read_csv('../../1_rawdata/Vehicles_2022/Compiled/2021/townfile_columns.csv').set_index('file')

In [33]:
for f in updating_files:
    f_col = df_townfile_columns.loc[f]
    f_dict_rc = {'record_from': f}
    f_dict_rename = {'record_from': f}
    
    for i in range(0, len(f_col)):
        c = f_col[i]s
        if c in column_renaming_dict:
            target_c = column_renaming_dict[c]
            f_dict_rc.update({target_c: i+1})
            f_dict_rename.update({target_c: c})
    
    df_rc_update = df_rc_update.append(f_dict_rc, ignore_index=True)
    df_rename_update = df_rename_update.append(f_dict_rename, ignore_index=True)

  df_rc_update = df_rc_update.append(f_dict_rc, ignore_index=True)
  df_rename_update = df_rename_update.append(f_dict_rename, ignore_index=True)
  df_rc_update = df_rc_update.append(f_dict_rc, ignore_index=True)
  df_rename_update = df_rename_update.append(f_dict_rename, ignore_index=True)


In [34]:
df_rc = df_rc.append(df_rc_update, ignore_index=True).set_index('record_from')
df_rename = df_rename.append(df_rename_update, ignore_index=True).set_index('record_from')

  df_rc = df_rc.append(df_rc_update, ignore_index=True).set_index('record_from')
  df_rename = df_rename.append(df_rename_update, ignore_index=True).set_index('record_from')


In [35]:
df_rc_update.set_index('record_from').to_csv('../1_temperary result/run_controller_update.csv')
df_rename_update.set_index('record_from').to_csv('../1_temperary result/column_renamer_update.csv')

In [36]:
df_rc.to_csv(sav_path_rc)
df_rename.to_csv(sav_path_renamer)

## compile

In [37]:
# load runner/renamer; use renamer here
df_rename = pd.read_csv('../../1_rawdata/Vehicles_2022/Compiled/2021/vehicle_2021_column_renamer.csv')
# set columns we want to look at
col = df_rename.columns.tolist()
# set path as index
df_rename = df_rename.set_index('record_from')

In [38]:
# path of compiled file
sav_path_compiled = '../../1_rawdata/Vehicles_2022/Compiled/2021/vehicle_2021_compiled.csv'
sav_path_compiled_update = '../../1_rawdata/Vehicles_2022/Compiled/2021/vehicle_2021_compiled_updated_{}.csv'.format(datetime.date.today().strftime("%Y%m%d")[2:])

In [39]:
# if a bug is discovered, run the line below
# updating_files = included_files

In [40]:
import time
import warnings

warnings.filterwarnings("ignore")

df_compiled_update = pd.DataFrame(columns=col)
count = 0
t_start = time.time()

for f in updating_files:
    f_path = path + f
    if f.capitalize().endswith('csv') or f.capitalize().endswith('txt'):
        df_f = pd.read_csv(f_path)
    elif f.capitalize().endswith('xlsx') or f.capitalize().endswith('xls'):
        df_f = pd.read_excel(f_path)
    else:
        continue

    # get the renaming dict for f and keep valid columns
    rename_dict_f = {v: k for k, v in df_rename.loc[f][df_rename.loc[f].notnull()].to_dict().items()}
    valid_col_f = [k for k in rename_dict_f]
    df_f = df_f[valid_col_f].rename(columns=rename_dict_f)
    df_f['record_from'] = f        # add a column for path
    df_compiled_update = df_compiled_update.append(df_f, ignore_index=True)

    count = count + 1
    if count % 10 == 0:
        t_end = time.time()
        print('finished the {}th file; time used for the last ten files: {:2f} sec'.format(count, t_end - t_start))
        t_start = t_end

In [41]:
# strip out whitespaces
for c in df_compiled_update.columns:
    df_compiled_update[c] = df_compiled_update[c].astype(str).str.strip()

# set index to path
df_compiled_update = df_compiled_update.set_index('record_from')

In [42]:
# create/update compiled file
try:
    df_compiled = pd.read_csv(sav_path_compiled)
except FileNotFoundError:
    print('compiled file does not exist')
    df_compiled = pd.DataFrame(columns=col)
    
df_compiled = df_compiled.append(df_compiled_update.reset_index(), ignore_index = True).set_index('record_from')

In [56]:
len(df_compiled)

2779306

## harmonization

### zip

In [57]:
def test_int(s):
    if s != s:
        return True
    if type(s) == str:
        return s.isnumeric()
    elif type(s) == float:
        return s.is_integer()

df_compiled['test'] = df_compiled.zip.map(test_int)
df_compiled[df_compiled.test == 0].head()

Unnamed: 0_level_0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,test
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [58]:
# convert zip with more than 5 digits to 5 digits
# then drop ones cannot be converted to int
df_compiled.zip = df_compiled.zip.apply(lambda x: str(x).zfill(5)[:5] if x == x else np.nan)
df_compiled['test'] = df_compiled.zip.map(test_int)
df_compiled.loc[df_compiled.test == 0, 'zip'] = np.nan

### vehicle_year

In [59]:
df_compiled['test'] = df_compiled.vehicle_year.map(test_int)
df_compiled.loc[df_compiled.test == 0].head()

Unnamed: 0_level_0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,test
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [60]:
# convert zip with more than 5 digits to 5 digits
# then drop ones cannot be converted to int
df_compiled.vehicle_year = df_compiled.vehicle_year.apply(lambda x: str(x).zfill(4)[:4] if x == x else np.nan)
df_compiled['test'] = df_compiled.vehicle_year.map(test_int)
df_compiled.loc[df_compiled.test == 0, 'vehicle_year'] = np.nan

In [61]:
df_compiled['test'] = df_compiled.vehicle_year.map(test_int)
df_compiled.loc[df_compiled.test == 0].head()

Unnamed: 0_level_0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,test
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


### lease_zip

In [62]:
df_compiled['test'] = df_compiled.lease_zip.map(test_int)
df_compiled.loc[df_compiled.test == 0].head()

Unnamed: 0_level_0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,test
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
090_New_Canaan_MV_21.xlsx,1ST RIDE & GO CORP,58 PINE ST,NEW CANAAN,CT,6840,2016,FORD,TRANSIT,8,1FBZX2CM2GKA71622,,50002,,,,False
090_New_Canaan_MV_21.xlsx,909 WEST ROAD REVOCABLE LIVING TRUST,909 WEST RD,NEW CANAAN,CT,6840,1996,JAGUA,XJR,25,SAJPX1143TC781554,,50003,,,,False
090_New_Canaan_MV_21.xlsx,A & N PLUMBING AND HEATING LLC,22 DOWN RIVER RD,NEW CANAAN,CT,6840,2007,FORD,ECONOLIN,3,1FTNE24L07DA36751,,50004,,,,False
090_New_Canaan_MV_21.xlsx,A & N PLUMBING AND HEATING LLC,22 DOWN RIVER RD,NEW CANAAN,CT,6840,2018,RAM,PROMASTE,3,3C6TRVAG6JE110384,,50005,,,,False
090_New_Canaan_MV_21.xlsx,A M SANTELLA COMPANY INC,635 CHEESEPRING RD,NEW CANAAN,CT,6840,2015,RAM,RAM TRUC,3,3C6MR5AJ2FG524435,,50006,,,,False


In [63]:
# convert zip with more than 5 digits to 5 digits
# then drop ones cannot be converted to int
df_compiled.lease_zip = df_compiled.lease_zip.apply(lambda x: str(x).zfill(5)[:5] if x == x else np.nan)
df_compiled['test'] = df_compiled.lease_zip.map(test_int)
df_compiled.loc[df_compiled.test == 0, 'lease_zip'] = np.nan

In [64]:
# check
df_compiled['test'] = df_compiled.lease_zip.map(test_int)
df_compiled.loc[df_compiled.test == 0].head()

Unnamed: 0_level_0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip,test
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


### harmonization

In [65]:
df_compiled = df_compiled.drop(columns=['test'])

In [66]:
df_compiled = df_compiled.replace('nan', np.nan)
df_compiled = df_compiled.replace('Nan', np.nan)
df_compiled = df_compiled.replace('NaN', np.nan)
df_compiled = df_compiled.replace('', np.nan)

In [67]:
# harmonization
df_compiled.name = df_compiled.name.apply(lambda x: x.title() if x == x else np.nan)
df_compiled.street = df_compiled.street.apply(lambda x: x.title() if x == x else np.nan)
df_compiled.city = df_compiled.city.apply(lambda x: x.title() if x == x else np.nan)
df_compiled.zip = df_compiled.zip.apply(lambda x: str(int(float(x))).zfill(5) if x == x else np.nan)
df_compiled.lease_street = df_compiled.lease_street.apply(lambda x: x.title() if x == x else np.nan)
df_compiled.lease_city = df_compiled.lease_city.apply(lambda x: x.title() if x == x else np.nan)
df_compiled.lease_zip = df_compiled.lease_zip.apply(lambda x: str(int(float(x))).zfill(5) if x == x else np.nan)
df_compiled.vehicle_year = df_compiled.vehicle_year.apply(lambda x: str(int(float(x))) if x == x else np.nan)
df_compiled.vehicle_class = df_compiled.vehicle_class.apply(lambda x: str(int(float(x))) if x == x else np.nan)

In [68]:
df_compiled = df_compiled.replace(np.nan,'')

In [69]:
df_compiled

Unnamed: 0_level_0,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,lease_state,lease_zip
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001_Andover_MV_21.csv,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2018,GMC,SIERRA K,3,1GT12UEY2JF286372,,,,,
001_Andover_MV_21.csv,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2021,MERCE,SPRINTER,2,W1Y4DCHY9MT049778,,,,,
001_Andover_MV_21.csv,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2017,MERCE,SPRINTER,3,WD3PE8CD9HP536293,,,,,
001_Andover_MV_21.csv,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2016,FORD,TRANSIT,3,NM0LS7E78G1281925,,,,,
001_Andover_MV_21.csv,A And R Environmental Llc,35 Riverside Dr,Andover,CT,,2014,FREIG,SPRINTER,3,WDYPE8CC9E5824341,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147_Voluntown_MV_21.xlsx,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,1999,YAMAH,YZFR6L,12,JYARJ04E8XA004047,,,,,
147_Voluntown_MV_21.xlsx,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,2012,HOMEM,TRAILER,11,CTTRL23049,,,,,
147_Voluntown_MV_21.xlsx,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,2013,CADIL,ATS AWD,1,1G6AG5RX6D0160860,,,,,
147_Voluntown_MV_21.xlsx,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,2009,FORD,F350 SUP,3,1FTWW31R09EA59881,,,,,


### UID
    drop the original uid; add uid according to row number; format: {last two number of year}_{row num}

In [95]:
try:
    df_compiled = df_compiled.drop(columns='UID')
except KeyError:
    df_compiled

In [100]:
df_compiled = df_compiled.assign(UID=range(1,1+len(df_compiled)))
df_compiled.UID = df_compiled.UID.apply(lambda x: '21_{}'.format(str(x).zfill(7)))

In [101]:
# move UID to beginning
temp_cols=df_compiled.columns.tolist()
index=df_compiled.columns.get_loc('UID')
new_cols=temp_cols[index:index+1] + temp_cols[0:index] + temp_cols[index+1:]
df_compiled=df_compiled[new_cols]
df_compiled

Unnamed: 0_level_0,UID,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,lease_city,lease_state,lease_zip
record_from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001_Andover_MV_21.csv,21_0000001,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2018,GMC,SIERRA K,3,1GT12UEY2JF286372,,,,
001_Andover_MV_21.csv,21_0000002,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2021,MERCE,SPRINTER,2,W1Y4DCHY9MT049778,,,,
001_Andover_MV_21.csv,21_0000003,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2017,MERCE,SPRINTER,3,WD3PE8CD9HP536293,,,,
001_Andover_MV_21.csv,21_0000004,A & R Environmental Llc,35 Riverside Dr,Andover,CT,,2016,FORD,TRANSIT,3,NM0LS7E78G1281925,,,,
001_Andover_MV_21.csv,21_0000005,A And R Environmental Llc,35 Riverside Dr,Andover,CT,,2014,FREIG,SPRINTER,3,WDYPE8CC9E5824341,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147_Voluntown_MV_21.xlsx,21_2779302,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,1999,YAMAH,YZFR6L,12,JYARJ04E8XA004047,,,,
147_Voluntown_MV_21.xlsx,21_2779303,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,2012,HOMEM,TRAILER,11,CTTRL23049,,,,
147_Voluntown_MV_21.xlsx,21_2779304,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,2013,CADIL,ATS AWD,1,1G6AG5RX6D0160860,,,,
147_Voluntown_MV_21.xlsx,21_2779305,Zonfrilli Joseph V,173 Preston City Rd,Voluntown,CT,06384,2009,FORD,F350 SUP,3,1FTWW31R09EA59881,,,,


## store

In [102]:
# store
df_compiled.to_csv(sav_path_compiled)
df_compiled_update.to_csv(sav_path_compiled_update)

In [88]:
df_test = pd.read_csv(sav_path_compiled, dtype=str)