In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pathlib

from tqdm import tqdm

import re

import usaddress

from difflib import get_close_matches as clmatch

import geopandas as gpd

from shapely import Point

import math

import logging

path = pathlib.Path().resolve()

data_path = path.parent / "Dropbox" / "2019 MV Data by Town" / "Vehicles_2022" / "Compiled"

raw_data = pd.read_csv(data_path / "2019-21_data_compiled_RN_100323.csv", chunksize = 1000)

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
logging.basicConfig(filename = 'municipal_data_prep.log', level = logging.INFO,  format='%(levelname)s: %(message)s')

# Full processing

## Prepare data and import matching sets

In [708]:
# raw_data_1000.to_csv(path / "raw_data_1000.csv")

In [3]:
raw_data_1000 = pd.read_csv(path / "data" / "raw_data_1000.csv")

  raw_data_1000 = pd.read_csv(path / "data" / "raw_data_1000.csv")


In [4]:
# Load NHTSA data
nhtsa_cleaned = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")

# Simplify the cleaned file
nhtsa_cleaned_simple = nhtsa_cleaned[["VIN", "Manufacturer", "Model", "ModelYear", "FuelTypePrimary", "ElectrificationLevel"]]
nhtsa_cleaned_simple = nhtsa_cleaned_simple.rename(columns = {"VIN":"vin_corrected",
                                                              "Manufacturer" : "Manufacturer Name",
                                                              "ModelYear" : "Model Year",
                                                              "FuelTypePrimary" : "Fuel Type - Primary",
                                                              "ElectrificationLevel" : "Electrification Level"})

# Remove unneeded
raw_data_1000 = raw_data_1000.loc[:, ~raw_data_1000.columns.isin(["Unnamed: 0.1", "Unnamed: 0"])]

  nhtsa_cleaned = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")


## Required functions

In [65]:
valid_vins_list = valid_vins(raw_data_1000[["vehicle_id"]][0:100])

In [34]:
matched_vins, updated_list = match_vins(valid_vins, nhtsa_cleaned_simple)

 11%|█▏        | 7/61 [00:02<00:14,  3.80it/s]

'float' object has no attribute 'strip'


100%|██████████| 61/61 [00:21<00:00,  2.86it/s]


In [66]:
valid_vins_list

Unnamed: 0,vehicle_id,vin_corrected
0,1GT12UEY2JF286372,1GT12UEY*JF
1,W1Y4DCHY9MT049778,W1Y4DCHY*MT
2,WD3PE8CD9HP536293,WD3PE8CD*HP
3,NM0LS7E78G1281925,NM0LS7E7*G1
4,WDYPE8CC9E5824341,WDYPE8CC*E5
...,...,...
95,4X4TRPT17HL017673,4X4TRPT1*HL
96,1HD1LL327EC428713,1HD1LL32*EC
97,JM1DKFB77G0135284,JM1DKFB7*G0
98,JT3HP10V7X7135241,JT3HP10V*X7


In [104]:
nhtsa_cleaned_simple

Unnamed: 0,vin_corrected,Manufacturer,Model,ModelYear,FuelTypePrimary,ElectrificationLevel
0,19UYA416*3A,HONDA,CL,2003,Gasoline,
1,19UYA417*3A,HONDA,CL,2003,Gasoline,
2,19UYA424*3A,HONDA,CL,2003,Gasoline,
3,19UYA425*3A,HONDA,CL,2003,Gasoline,
4,19UYA426*3A,HONDA,CL,2003,Gasoline,
...,...,...,...,...,...,...
72131,4S4WX9KD*D4,SUBARU,B 9 Tribeca 7 PASS LIMITED W NAVI&DVD,2013,,
72132,WV3AB470*4H,VOLKSWAGEN,EuroVan,2004,,
72133,WV2KB470*34,VOLKSWAGEN,EuroVan,2003,Gasoline,
72134,WV2MB470*38,VOLKSWAGEN,EuroVan,2003,Gasoline,


In [12]:
def match_vins(valid_vin_list, matching_list):
    """
    Input: A df of valid vins (that could be a valid vin or NA) and match it
    Returns: matched vins, updated matching list    
    """
    match = valid_vin_list.merge(matching_list,
                                 left_on = "vin_corrected",
                                 right_on = "vin_corrected",
                                 how = "left")
    # Get unique unmatched vins
    unmatched_vins = list(match[match["Manufacturer"].isna()]["vin_corrected"].unique())
    
    # Print how many
    logging.info(f"VIN matching: a total of {len(unmatched_vins)} VINs were not matched")
    
    # Variables to download
    variables = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]
    
    # Go get them
    for vin in tqdm(unmatched_vins):
        try:
            
            
            url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{vin.strip()}?format=csv")

            # Download response
            resp_df = pd.read_csv(url)
            
            # Extract needed
            resp_df = resp_df.loc[resp_df["variable"].isin(variables), ["variable", "value"]].T
            resp_df.columns = resp_df.iloc[0]
            resp_df = resp_df.drop("variable", axis = 0)
            valid_response = not(resp_df["Fuel Type - Primary"].isna()[0])
            
            # Add back to the DF
            if valid_response:
                for i in range(len(variables)):
                    match.loc[match[match["vin_corrected"]==vin].index, variables[i]] = resp_df[variables[i]][0]
                match.loc[match[match["vin_corrected"]==vin].index, "VIN"] = vin
            else:
                for i in range(len(variables)):
                    match.loc[match[match["vin_corrected"]==vin].index, variables[i]] = np.NaN
        except BaseException as e:
            print(e)
            pass
            
    remaining_unmatched = list(match[match["Manufacturer"].isna()]["vin_corrected"].unique())
    
    logging.info(f"VIN Matching: this number of unmatched VINs was reduced by {len(unmatched_vins) - len(remaining_unmatched)}")
    logging.info(f"VIN Matching: remaining unmatched VINs is {len(remaining_unmatched)}")
    
    matching_list_updated = pd.concat([matching_list, match])
    
    return match, matching_list_updated

In [64]:
def valid_vins(vin_list):
    """Take in a DF of VINs and determine when they are possibly valid"""
    # Check all alphanumerics
    vin_list.loc[:, "vin_alnum_check"] = vin_list["vehicle_id"].str.strip().str.isalnum()
    vin_list.loc[:, "vin_len_check"] = vin_list["vehicle_id"].str.len() >= 11
    vin_list.loc[:, "vin_check"] = vin_list["vin_alnum_check"] & vin_list["vin_len_check"]
    
    # Strip the vins
    vin_list["vin_stripped"] = vin_list["vehicle_id"].str.strip()
    
    # Create 11-long vins
    vin_list.loc[:, "prepared_vins"] = vin_list["vehicle_id"].str[0:8]+"*"+vin_list["vehicle_id"].str[9:11]
    vin_list.loc[vin_list[vin_list["vin_check"]==False].index, "vin_corrected"] = np.NaN
    vin_list.loc[vin_list[vin_list["vin_check"]==True].index, "vin_corrected"] = vin_list.loc[vin_list[vin_list["vin_check"]==True].index, "prepared_vins"]
    
    # Clean up
    vin_list = vin_list.drop("prepared_vins", axis = 1)
    vin_list = vin_list[["vehicle_id", "vin_corrected"]]
    
    # Return
    return vin_list

In [61]:
def process_chunk(chunk, chunk_number, matching_list = None):
    logging.info(f"Chunk Number {chunk_number}: commencing processing")
    logging.info(f"Chunk Number {chunk_number}: chunk length is {len(chunk)}")
    
    # Reduce reduce the number of columns
    chunk_simplified = chunk[['record_from', 'name', 'street', 'city',
       'state', 'zip', 'vehicle_year', 'vehicle_make', 'vehicle_model',
       'vehicle_class', 'vehicle_id']]
    
    # Correct the zip codes
    logging.info(f"Chunk Number {chunk_number}: correcting zip codes")
    chunk_simplified.loc[:, "zip_corrected"] = get_valid_zips(chunk_simplified[["zip"]])["zip_corrected"]
    chunk_simplified = chunk_simplified.drop("zip", axis = 1)
    
    # Match the VINs
    logging.info(f"Chunk Number {chunk_number}: matching VIN codes")
    chunk_simplified.loc[:, "vin_corrected"] = valid_vins(chunk_simplified[["vehicle_id"]])["vin_corrected"]
    matched_vins, updated_matching_list = match_vins(chunk_simplified[["vin_corrected"]], matching_list)
    chunk_simplified = chunk_simplified.merge(matched_vins[["vin_corrected", "Manufacturer", "Model", "ModelYear", "FuelTypePrimary", "ElectrificationLevel"]],
                                                            left_on = "vin_corrected",
                                                            right_on = "vin_corrected",
                                                            how = "left")

    return chunk_simplified, updated_matching_list

In [149]:
zp = ZIPProcessor()
vm = VINMatcher(nhtsa_cleaned_simple)

cp = ChunkProcessor(zp, vm, "zip", "vehicle_id")

In [150]:
cp.process_chunk(raw_data_1000[25:50])

100%|██████████| 14/14 [00:04<00:00,  3.47it/s]


Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,zip_corrected,vin_corrected,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level
0,001_Andover_MV_21.csv,ADAMS DANIEL R,738 EAST ST,ANDOVER,CT,6232,2003.0,FORD,RANGER,3.0,1FTZR15E53TA27141,6232,1FTZR15E*3T,FORD,Ranger,2003.0,Gasoline,
1,001_Andover_MV_21.csv,ADAMS KARA J,61 HUTCHINSON RD,ANDOVER,CT,6232,2019.0,TOYOT,COROLLA,1.0,JTNK4RBE8K3043671,6232,JTNK4RBE*K3,TOYOTA MOTOR CORPORATION,Corolla,2019.0,Gasoline,
2,001_Andover_MV_21.csv,ADAMS PAMELA D,37 SKINNER HILL RD,ANDOVER,CT,6232,1999.0,MERCE,E320,1.0,WDBJF65H7XA945891,6232,WDBJF65H*XA,MERCEDES-BENZ CARS,E-Class,1999.0,Gasoline,
3,001_Andover_MV_21.csv,ADAPTIVE PROSTHETICS & ORTHOTICS LLC,52 NATIONAL DR,GLASTONBURY,CT,6033,2021.0,VOLVO,XC60 T6,1.0,YV4A22RK2M1743058,6033,YV4A22RK*M1,VOLVO,XC 60,2021.0,Gasoline,
4,001_Andover_MV_21.csv,ADEE ARIA H,11 CENTER ST,ANDOVER,CT,6232,2013.0,KIA,SOUL +/!,1.0,KNDJT2A60D7493220,6232,KNDJT2A6*D7,KIA CORPORATION,Soul,2013.0,Gasoline,
5,001_Andover_MV_21.csv,ADEE JEANNEMARIE,11 CENTER ST,ANDOVER,CT,6232,2019.0,KIA,SOUL +,1.0,KNDJP3A5XK7671469,6232,KNDJP3A5*K7,KIA,Soul,2019.0,Gasoline,
6,001_Andover_MV_21.csv,ADLAM KHENIEL N,1 TIMES FARM RD,ANDOVER,CT,6232,2015.0,NISSA,ALTIMA 2,1.0,1N4AL3AP5FC575989,6232,1N4AL3AP*FC,"NISSAN NORTH AMERICA, INC.",Altima,2015.0,Gasoline,
7,001_Andover_MV_21.csv,ADU-POKU SAMPSON,2 SHADBLOW LN,ANDOVER,CT,6232,2011.0,NISSA,ALTIMA 2,1.0,1N4AL2APXBN503548,6232,1N4AL2AP*BN,"NISSAN NORTH AMERICA, INC.",Altima,2011.0,Gasoline,
8,001_Andover_MV_21.csv,AGUIRRE EVAN M,43 JUROVATY RD,ANDOVER,CT,6232,2011.0,HONDA,CIVIC LX,1.0,2HGFA1F53BH524873,6232,2HGFA1F5*BH,"HONDA OF CANADA MFG., INC.",Civic,2011.0,Gasoline,
9,001_Andover_MV_21.csv,AGUIRRE EVAN M,43 JUROVATY RD,ANDOVER,CT,6232,2014.0,FORD,F150,3.0,1FTMF1EM3EKF11148,6232,1FTMF1EM*EK,FORD MOTOR COMPANY,F-150,2014.0,Gasoline,


In [73]:
class ChunkProcessor():
    def __init__(self, zip_processor, vin_matcher, zip_column_name, vin_column_name):
        self.chunk_number = 0
        self.zip_processor = zip_processor
        self.vin_matcher = vin_matcher
        self.zip_column_name = zip_column_name
        self.vin_column_name = vin_column_name
    
    def process_chunk(self, chunk):
        # Log progress
        logging.info(f"Chunk Number {self.chunk_number}: commencing processing")
        logging.info(f"Chunk Number {self.chunk_number}: chunk length is {len(chunk)}")
        
        # Reduce reduce the number of columns
        chunk_simplified = chunk[['record_from', 'name', 'street', 'city', 
                                  'state', 'zip', 'vehicle_year', 'vehicle_make', 'vehicle_model',
                                  'vehicle_class', 'vehicle_id']].reset_index(drop = True)
        
        # Correct the zip codes
        corrected_zip_codes = self.zip_processor.get_valid_zips(chunk_simplified, self.zip_column_name).reset_index(drop = True)
       
        # Get VIN codes
        matched_vins = self.vin_matcher.match_vins(chunk, self.vin_column_name).reset_index(drop=True)
        
        # Merge
        chunk_processed = chunk_simplified.join(corrected_zip_codes).reset_index(drop=True)
        chunk_processed = chunk_processed.join(matched_vins)
        return chunk_processed

In [67]:
class ZIPProcessor():
    def __init__(self):
        pass
    
    def check_valid_zip(self, zip_code):
        zip_str = str(zip_code)
        split_zip = re.split("-", zip_str)
        if len(split_zip) == 2:
            if self.check_valid_zip(split_zip[0]) & self.check_valid_zip(split_zip[1]):
                return 2
            elif self.check_valid_zip(split_zip[0]):
                return 3
            else:
                return 0
        elif len(split_zip) == 1:
            # MUST ADDRESS STARTING "Os"
            # starting_o = re.match('^O', split_zip[0])
            matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])
            if matched:
                return 1
            else:
                return 0
        else:
            return 0
        
    def get_valid_zips(self, zip_df, zip_column_name):
        """
        Inputs: A DataFrame with a column called "zip"
        Returns: A DataFrame of the same length as the input, with three columns: zip, zip_valid_code, and zip_corrected
        """
        # Prepare the list to be used
        zip_list = zip_df[[zip_column_name]].rename(columns = {zip_column_name : "zip"}).reset_index(drop = True)

        # Get validity code
        zip_list.loc[:, "zip_valid_code"] = zip_list.loc[:, "zip"].apply(lambda x: self.check_valid_zip(x))

        # Get indices
        correct_zips_indices = zip_list[zip_list["zip_valid_code"]==1].index
        invalid_zips_indices = zip_list[zip_list["zip_valid_code"]==0].index
        two_part_zips_indices = zip_list[zip_list["zip_valid_code"]>1].index

        zip_list.loc[correct_zips_indices, "zip_corrected"] = zip_list.loc[correct_zips_indices, "zip"]
        zip_list.loc[invalid_zips_indices, "zip_corrected"] = np.NaN
        zip_list.loc[two_part_zips_indices, "zip_corrected"] = zip_list.loc[two_part_zips_indices, "zip"].astype(str).str[0:5]

        return zip_list[["zip_corrected"]]

In [142]:
vm = VINMatcher(nhtsa_cleaned_simple)

In [143]:
vm.match_vins(raw_data_1000[10:20], "vehicle_id")

100%|██████████| 6/6 [00:01<00:00,  3.73it/s]


Unnamed: 0,vin_corrected,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level
0,3GTU9DED*KG,GENERAL MOTORS LLC,Sierra,2019.0,Gasoline,
1,1GYKNDRS*KZ,GM,XT 5,2019.0,Gasoline,
2,1GNEVHKW*LJ,GM,Traverse,2020.0,Gasoline,
3,1GNSKPKD*MR,GM,Tahoe,2021.0,Gasoline,
4,4X4FRLD2*F1,"FOREST RIVER, INC.",Real-lite / Rockwood Lite Weight Trailers,2015.0,Not Applicable,Not Applicable
5,3C6TR5DT*GG,FCA,RAM 2500,2016.0,Gasoline,
6,CTUNKNOW*27,,,,,
7,,,,,,
8,1GCDT19Z*K2,GENERAL MOTORS LLC,S-10 Pickup,1989.0,Gasoline,
9,456A2201*KI,WOOD-MIZER PRODUCTS INC,"Wood-Mizer Products, Inc.",1989.0,Not Applicable,Not Applicable


In [135]:
resp_df_dict

Unnamed: 0,vin_corrected,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level
0,3GTU9DED*KG,GENERAL MOTORS LLC,Sierra,2019,Gasoline,


In [123]:
vm = VINMatcher(nhtsa_cleaned_simple)

In [124]:
vm.match_vins(raw_data_1000[0:20], "vehicle_id")

 10%|█         | 1/10 [00:00<00:04,  1.97it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


 20%|██        | 2/10 [00:00<00:03,  2.14it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


 30%|███       | 3/10 [00:01<00:02,  2.52it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


 40%|████      | 4/10 [00:01<00:02,  2.53it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


 50%|█████     | 5/10 [00:02<00:01,  2.55it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


 60%|██████    | 6/10 [00:02<00:01,  2.65it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


 70%|███████   | 7/10 [00:02<00:01,  2.82it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices
'float' object has no attribute 'strip'


 90%|█████████ | 9/10 [00:03<00:00,  3.68it/s]

only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices


100%|██████████| 10/10 [00:03<00:00,  2.87it/s]







Unnamed: 0,vin_corrected,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level
0,1GT12UEY*JF,,,,,
1,W1Y4DCHY*MT,MERCEDES-BENZ,Sprinter,2021.0,Diesel,
2,WD3PE8CD*HP,,,,,
3,NM0LS7E7*G1,FORD,Transit Connect,2016.0,Gasoline,
4,WDYPE8CC*E5,,,,,
5,WD3PE8CD*GP,MERCEDES-BENZ,Sprinter,2016.0,Diesel,
6,1GC1KUEY*KF,GM,Silverado HD,2019.0,Diesel,
7,1GTR2VE3*BZ,,,,,
8,3GKALTEV*ML,GM,Terrain,2021.0,Gasoline,
9,3GKALTEV*LL,GM,Terrain,2020.0,Gasoline,


In [139]:
class VINMatcher():
    def __init__(self, initial_matching_list):
        self.matching_list = initial_matching_list
        
    def valid_vins(self, vins_to_correct_df, vin_column):
        """
        Input: A DataFrame containing a "vehicle_id" column of VINs to be corrected
        Output: A DataFrame with two columns: "vehicle_id" and "vin_corrected"
        """
        # Set up
        vin_list = vins_to_correct_df
        
        # Check all alphanumerics
        vin_list.loc[:, "vin_alnum_check"] = vin_list[vin_column].str.strip().str.isalnum()
        vin_list.loc[:, "vin_len_check"] = vin_list[vin_column].str.len() >= 11
        vin_list.loc[:, "vin_check"] = vin_list["vin_alnum_check"] & vin_list["vin_len_check"]

        # Strip the vins
        vin_list["vin_stripped"] = vin_list[vin_column].str.strip()

        # Create 11-long vins
        vin_list.loc[:, "prepared_vins"] = vin_list[vin_column].str[0:8]+"*"+vin_list[vin_column].str[9:11]
        vin_list.loc[vin_list[vin_list["vin_check"]==False].index, "vin_corrected"] = np.NaN
        vin_list.loc[vin_list[vin_list["vin_check"]==True].index, "vin_corrected"] = vin_list.loc[vin_list[vin_list["vin_check"]==True].index, "prepared_vins"]

        # Clean up
        vin_list = vin_list.drop("prepared_vins", axis = 1)
        vin_list = vin_list[["vin_corrected"]]

        # Return
        return vin_list
        
    def match_vins(self, df, vin_column):
        """
        Input: A df containing vin columns, that are then corrected, and matched
        Returns: matched vins, updated matching list    
        """
        # Get a list of valid VINs
        valid_vin_list = self.valid_vins(df[[vin_column]], vin_column)
        
        # Attempt a match
        match = valid_vin_list.merge(self.matching_list,
                                     left_on = "vin_corrected",
                                     right_on = "vin_corrected",
                                     how = 'left')
        # Get unique unmatched vins
        unmatched_vins = list(match[match["Manufacturer Name"].isna()]["vin_corrected"].unique())

        # Print how many
        logging.info(f"VIN matching: a total of {len(unmatched_vins)} VINs were not matched")

        # Variables to download
        variables = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]
        
        # Go get them
        for vin in tqdm(unmatched_vins):
            try:
                # Try to fetch the unmatched vin
                resp_df = self.fetch_unmatched_vin(vin).reset_index(drop=True)
                # print(resp_df.to_dict())
                # match = pd.concat([match, resp_df]).reset_index(drop=True)
                # match = match[match[match["vin_corrected"]==vin].index[0]]
                for variable in variables:
                    match.loc[match[match["vin_corrected"]==vin].index, match.columns.isin([variable])] = resp_df[variable][0]
                
            except BaseException as e:
                # print(e)
                logging.info(e)
                pass

        remaining_unmatched = list(match[match["Manufacturer Name"].isna()]["vin_corrected"].unique())
        
        logging.info(f"VIN Matching: this number of unmatched VINs was reduced by {len(unmatched_vins) - len(remaining_unmatched)}")
        logging.info(f"VIN Matching: remaining unmatched VINs is {len(remaining_unmatched)}")

        return match
    
    def save_matching_list(self):
        self.matching_list.to_csv(path / "matching_list.csv")
    
    def fetch_unmatched_vin(self, unmatched_vin):
        """
        Input: An unmatched, but corrected VIN
        Output: A matched VIN or NA
        
        """
        variables = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]
        
        url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{unmatched_vin.strip()}?format=csv")

        # Download response
        resp_df = pd.read_csv(url)

        # Extract needed
        resp_df = resp_df.loc[resp_df["variable"].isin(variables), ["variable", "value"]].T
        resp_df.columns = resp_df.iloc[0]
        resp_df = resp_df.drop("variable", axis = 0)
        resp_df["vin_corrected"] = unmatched_vin
        valid_response = not(resp_df["Fuel Type - Primary"].isna()[0])
        
        self.matching_list = pd.concat([self.matching_list, resp_df]).reset_index(drop = True)
        
        return resp_df
        

In [None]:
class ChunkProcessor(initial_matching_list):
    def __init__():
        self.matching_list = initial_matching_list
    
    def process_chunk(chunk):
        
        

# Test Function

In [57]:
chunks = pd.read_csv(path / "data" / "raw_data_1000.csv", chunksize = 100)

In [58]:
matching_list = nhtsa_cleaned_simple

chunk_number = 0

processed_data = pd.DataFrame([])

for chunk in chunks:
    if chunk_number <= 10: 

        logging.info(f"Commencing processing of chunk number {chunk_number}")

        processed_chunk, matching_list = process_chunk(chunk, chunk_number, matching_list = matching_list)
        matching_list.to_csv(path / "Updated matching list.csv")

        logging.info(f"Completed processing of chunk number {chunk_number}")

        processed_data = pd.concat([processed_data, processed_chunk])

        if chunk_number % 5 == 0:
            filename = f"processed_data_chunk_{chunk_number}.csv"
            logging.info(f"Saving to csv {filename}")
            processed_data.to_csv(path / filename)

        chunk_number += 1
    else:
        logging.info(f"Finishing processing on chunk {chunk_number}")
        break

100%|██████████| 4/4 [00:01<00:00,  3.33it/s]
 50%|█████     | 3/6 [00:00<00:00,  3.84it/s]

'float' object has no attribute 'strip'


100%|██████████| 6/6 [00:01<00:00,  3.83it/s]
100%|██████████| 6/6 [00:01<00:00,  4.49it/s]
100%|██████████| 7/7 [00:01<00:00,  4.15it/s]
100%|██████████| 4/4 [00:00<00:00,  4.22it/s]
100%|██████████| 6/6 [00:01<00:00,  4.08it/s]
100%|██████████| 3/3 [00:00<00:00,  4.52it/s]
 29%|██▊       | 2/7 [00:00<00:00, 11.38it/s]

'float' object has no attribute 'strip'


100%|██████████| 7/7 [00:01<00:00,  4.89it/s]
100%|██████████| 10/10 [00:02<00:00,  4.67it/s]
100%|██████████| 9/9 [00:02<00:00,  3.77it/s]
100%|██████████| 3/3 [00:00<00:00,  3.34it/s]


In [8]:
def get_valid_zips(zip_list):
    """
    Inputs: A DataFrame with a column called "zip"
    Returns: A DataFrame of the same length as the input, with three columns: zip, zip_valid_code, and zip_corrected
    """
    
    # Get validity code
    zip_list.loc[:, "zip_valid_code"] = zip_list.loc[:, "zip"].apply(lambda x: check_valid_zip(x))
    
    # Get indices
    correct_zips_indices = zip_list[zip_list["zip_valid_code"]==1].index
    invalid_zips_indices = zip_list[zip_list["zip_valid_code"]==0].index
    two_part_zips_indices = zip_list[zip_list["zip_valid_code"]>1].index
    
    zip_list.loc[correct_zips_indices, "zip_corrected"] = zip_list.loc[correct_zips_indices, "zip"]
    zip_list.loc[invalid_zips_indices, "zip_corrected"] = np.NaN
    zip_list.loc[two_part_zips_indices, "zip_corrected"] = zip_list.loc[two_part_zips_indices, "zip"].astype(str).str[0:5]
    
    return zip_list

In [10]:
def check_valid_zip(zip_code):
    zip_str = str(zip_code)
    split_zip = re.split("-", zip_str)
    if len(split_zip) == 2:
        if check_valid_zip(split_zip[0]) & check_valid_zip(split_zip[1]):
            return 2
        elif check_valid_zip(split_zip[0]):
            return 3
        else:
            return 0
    elif len(split_zip) == 1:
        # MUST ADDRESS STARTING "Os"
        # starting_o = re.match('^O', split_zip[0])
        matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])
        if matched:
            return 1
        else:
            return 0
    else:
        return 0