In [225]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pathlib

from tqdm import tqdm

import re

import usaddress

from difflib import get_close_matches as clmatch

import geopandas as gpd

from shapely import Point

import math

import logging

# Every location below is resolved relative to the directory the notebook is run from.
path = pathlib.Path().resolve()

# The Dropbox folder sits next to the working directory; both data trees live inside it.
_dropbox_root = path.parent / "Dropbox"
_collaboration_data = _dropbox_root / "DOT_Tobin_Collaboration" / "data"

# Compiled registration data (input).
data_path = _dropbox_root / "2019 MV Data by Town" / "Vehicles_2022" / "Compiled"

# Destination for the batched matched / unmatched / tally CSV files (output).
processed_chunks_path = _collaboration_data / "processed_chunks_2"

# The VIN matching list and the log file are both kept in the collaboration data folder.
matching_list_path = _collaboration_data
logging_path = _collaboration_data

# Lazy reader over the compiled registrations, yielding DataFrames of up to 1000 rows.
raw_data = pd.read_csv(data_path / "2019-21_data_compiled_RN_100323.csv", chunksize=1000)

# Silence SettingWithCopyWarning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

In [226]:
# Route INFO-and-above records from the root logger to a log file in logging_path.
# NOTE: logging.basicConfig does nothing when the root logger already has handlers,
# so re-running this cell in the same kernel with a new file name has no effect.
logging.basicConfig(filename = logging_path / "municipal_data_prep_1200_test.log", level = logging.INFO,  format='%(levelname)s: %(message)s')

# Full processing

## Prepare data and import matching sets

## Required functions

In [224]:
class ChunkProcessor():
    """Run a stream of registration chunks through ZIP cleaning and VIN matching.

    Each chunk is reduced to ``KEPT_COLUMNS``, given a ``zip_corrected`` column,
    and split by ``vin_matcher.match_vins_simple`` into the rows whose VIN was
    matched and the rows whose VIN was not. Both row sets and a one-row tally
    per chunk are buffered in memory and written to ``processed_chunk_path`` as
    CSV files every ``save_every`` chunks; ``run`` writes whatever is still
    buffered when it stops.

    Parameters
    ----------
    chunks : iterable of pandas.DataFrame
        Source of chunks, e.g. ``pd.read_csv(..., chunksize=...)``.
    processed_chunk_path : pathlib.Path
        Directory the CSV files are written to (joined with ``/``).
    number_to_run : int
        ``run`` stops once ``chunk_number`` reaches this value.
    vin_matcher : object
        Must provide ``match_vins_simple(df, vin_column, chunk_number)``
        returning ``(matched, unmatched, tally)`` indexed like ``df``.
    zip_column_name, vin_column_name : str
        Columns holding the raw ZIP code and the raw VIN. Both are looked up
        on the reduced chunk, so they must be members of ``KEPT_COLUMNS``.
    save_every : int, default 100
        Number of chunks buffered between two saves.
    """

    # Columns kept from every raw chunk; everything else is dropped up front.
    KEPT_COLUMNS = ['record_from', 'name', 'street', 'city',
                    'state', 'zip', 'vehicle_year', 'vehicle_make', 'vehicle_model',
                    'vehicle_class', 'vehicle_id']

    # One ZIP component: optional surrounding whitespace, at least four digits,
    # then an optional "." and an optional "0" (presumably the ".0" left behind
    # when a ZIP was parsed as a float - TODO confirm).
    _ZIP_PATTERN = re.compile(r"^\s*[0-9]*[0-9]{4}\.?0?\s*$")

    def __init__(self, chunks,
                 processed_chunk_path,
                 number_to_run,
                 vin_matcher, zip_column_name, vin_column_name,
                 save_every=100):
        if save_every < 1:
            raise ValueError("save_every must be a positive integer")

        self.chunks = chunks
        self.processed_chunks = pd.DataFrame([])
        self.processed_chunks_path = processed_chunk_path

        # In-memory buffers, emptied every time they are written to disk
        self.matched_chunks = pd.DataFrame([])
        self.unmatched_chunks = pd.DataFrame([])
        self.tally = pd.DataFrame([])

        self.number_to_run = number_to_run
        self.save_every = save_every
        self.chunk_number = 0
        self.vin_matcher = vin_matcher
        self.zip_column_name = zip_column_name
        self.vin_column_name = vin_column_name

    def run(self):
        """Process chunks until the source is exhausted or ``number_to_run`` is reached.

        The limit is checked before the next chunk is pulled, so no chunk is
        read and then discarded. Buffered results are always flushed on the
        way out (including after an interrupt), so chunks processed since the
        last periodic save are not lost.
        """
        chunk_iter = iter(self.chunks)
        try:
            while self.chunk_number < self.number_to_run:
                chunk = next(chunk_iter, None)
                if chunk is None:
                    # Source exhausted before the limit was reached
                    break
                self.process_chunk(chunk)
            else:
                logging.info(f"Concluding on chunk number {self.chunk_number}")
        finally:
            self.flush()

    def process_chunk(self, chunk):
        """Clean, match and buffer a single chunk.

        Any ``Exception`` raised while handling the chunk is logged and
        printed, and the chunk is skipped: none of its rows reach the output.
        ``chunk_number`` is advanced in both cases.
        """
        try:
            # Display Progress
            if self.chunk_number % 10 == 0:
                print(f"Currently processing chunk number {self.chunk_number}")

            # Log progress
            logging.info(f"Chunk Number {self.chunk_number}: commencing processing")
            logging.info(f"Chunk Number {self.chunk_number}: chunk length is {len(chunk)}")

            # Reduce the number of columns and switch to a 0..n-1 index
            chunk_simplified = chunk[self.KEPT_COLUMNS].reset_index(drop = True)

            # Get the corrected zip codes, and add them as a column
            corrected_zip_codes = self.get_valid_zips(chunk_simplified, self.zip_column_name)
            chunk_simplified = chunk_simplified.join(corrected_zip_codes)

            # Two frames (matched VINs, unmatched VINs) indexed like
            # chunk_simplified, plus a one-row tally
            matched_vins, unmatched_vins, tally = self.vin_matcher.match_vins_simple(chunk_simplified,
                                                                                     self.vin_column_name,
                                                                                     self.chunk_number)

            # Attach the original columns to each subset. The join must be an
            # inner join: the default left join would return every row of the
            # chunk in BOTH frames, padded with NaN.
            matched_vins = chunk_simplified.join(matched_vins, how = "inner")
            unmatched_vins = chunk_simplified.join(unmatched_vins, how = "inner")

            # Confirm tally. The two numbers should add up to the chunk length
            tally["confirm_matched"] = len(matched_vins)
            tally["confirm_unmatched"] = len(unmatched_vins)

            # Aggregate
            self.aggregate_save_matched_unmatched(matched_vins,
                                                  unmatched_vins,
                                                  tally)

            # Update chunk number
            self.chunk_number +=1

        except Exception as e:
            # Best effort: a failing chunk is reported and dropped so that the
            # remaining chunks can still be processed.
            message = f"Error encountered on chunk {self.chunk_number}, this means the last chunk to be run was chunk {self.chunk_number-1}"
            logging.error(message)
            logging.exception(e)  # same record as before, plus the traceback
            print(message)
            print(e)
            self.chunk_number +=1

    def aggregate_save_chunk(self, processed_chunk):
        """Legacy aggregation into a single frame, saved every 10 chunks.

        Not called by ``process_chunk``; kept for backward compatibility.
        Appends ``processed_chunk`` to ``self.processed_chunks`` and, on every
        tenth chunk, writes the buffer to a time-stamped CSV and empties it.
        """
        from datetime import datetime

        # Add it to the master DF
        self.processed_chunks = pd.concat([self.processed_chunks, processed_chunk])

        if self.chunk_number % 10 == 9:
            logging.info(f"Saving aggregated chunks numbers {self.chunk_number-9} - {self.chunk_number}")
            dt_string = datetime.now().strftime("%d%m%y_%H%M")
            self.processed_chunks.to_csv(self.processed_chunks_path / f"chunk_number_{self.chunk_number}_{dt_string}.csv")

            # Only reset once the buffer has actually been saved
            self.processed_chunks = pd.DataFrame([])

    def aggregate_save_matched_unmatched(self, matched_chunk, unmatched_chunk, tally):
        """Buffer one chunk's results and save the buffers every ``save_every`` chunks.

        Input: a DataFrame of rows with matched VINs, a DataFrame of rows with
        unmatched VINs, and the chunk's tally. Files are named after the
        number of the last chunk they contain.
        """
        # Append data to three separate master DFs
        self.matched_chunks = pd.concat([self.matched_chunks, matched_chunk])
        self.unmatched_chunks = pd.concat([self.unmatched_chunks, unmatched_chunk])
        self.tally = pd.concat([self.tally, tally])

        # Save on the last chunk of every block of save_every chunks
        if self.chunk_number % self.save_every == self.save_every - 1:
            self._write_buffers(self.chunk_number)

    def flush(self):
        """Write any buffered results that have not been saved yet.

        Does nothing when all three buffers are empty. The files are named
        after the last chunk that was handled (``chunk_number - 1``).
        """
        if self.matched_chunks.empty and self.unmatched_chunks.empty and self.tally.empty:
            return
        self._write_buffers(self.chunk_number - 1)

    def _write_buffers(self, label):
        """Save the three buffers as CSV files tagged with ``label``, then empty them."""
        # Log
        logging.info(f"Saving chunk {label}")

        # Save the three DataFrames
        self.matched_chunks.to_csv(self.processed_chunks_path / f"matched_chunk_number_{label}.csv")
        self.unmatched_chunks.to_csv(self.processed_chunks_path / f"unmatched_chunk_number_{label}.csv")
        self.tally.to_csv(self.processed_chunks_path / f"tally_chunk_number_{label}.csv")

        # Once saved, reset the DFs
        self.matched_chunks = pd.DataFrame([])
        self.unmatched_chunks = pd.DataFrame([])
        self.tally = pd.DataFrame([])

    def check_valid_zip(self, zip_code):
        """Classify a raw ZIP value.

        Returns
        -------
        int
            0 - invalid;
            1 - a single valid component (no hyphen);
            2 - two hyphen-separated components, both valid;
            3 - two hyphen-separated components, only the first valid.
        """
        parts = str(zip_code).split("-")

        if len(parts) == 1:
            # MUST ADDRESS STARTING "Os" (letter O typed instead of zero)
            return 1 if self._ZIP_PATTERN.match(parts[0]) else 0

        if len(parts) == 2:
            # Each half contains no hyphen, so the recursive call returns 0 or 1
            first_ok = self.check_valid_zip(parts[0]) == 1
            second_ok = self.check_valid_zip(parts[1]) == 1
            if first_ok and second_ok:
                return 2
            if first_ok:
                return 3
            return 0

        # More than one hyphen
        return 0

    def get_valid_zips(self, zip_df, zip_column_name):
        """Build the ``zip_corrected`` column for a chunk.

        Inputs: a DataFrame and the name of its raw ZIP column.
        Returns: a DataFrame of the same length as the input, indexed 0..n-1,
        with the single column ``zip_corrected``: the raw value for plain
        valid ZIPs, the part before the hyphen (at most five characters) for
        two-part ZIPs, and NaN for invalid ones.
        """
        # Prepare the list to be used
        zip_list = zip_df[[zip_column_name]].rename(columns = {zip_column_name : "zip"}).reset_index(drop = True)

        # Get validity code
        zip_list.loc[:, "zip_valid_code"] = zip_list.loc[:, "zip"].apply(self.check_valid_zip)

        # Row selectors for the three cases
        is_plain = zip_list["zip_valid_code"] == 1
        is_invalid = zip_list["zip_valid_code"] == 0
        is_two_part = zip_list["zip_valid_code"] > 1

        zip_list.loc[is_plain, "zip_corrected"] = zip_list.loc[is_plain, "zip"]
        zip_list.loc[is_invalid, "zip_corrected"] = np.nan
        # Cut at the hyphen before truncating, so a four-digit first part
        # ("6511-1234") yields "6511" rather than "6511-"
        zip_list.loc[is_two_part, "zip_corrected"] = (zip_list.loc[is_two_part, "zip"]
                                                      .astype(str).str.split("-").str[0].str[0:5])

        return zip_list[["zip_corrected"]]
    
    
    

In [246]:
class VINMatcher():
    """Normalise VINs to an 11-character lookup key and match them against a reference list.

    The lookup key is the first eight characters of the VIN, a ``*`` in place
    of the ninth character, and characters ten and eleven. The reference list
    (``matching_list``) is a DataFrame keyed by ``vin_corrected``; a row counts
    as matched when its ``"Manufacturer Name"`` is not null.

    Parameters
    ----------
    initial_matching_list : pandas.DataFrame
        Must contain ``vin_corrected`` and ``"Manufacturer Name"``. Rows with
        a missing or duplicated ``vin_corrected`` are discarded.
    """

    # Variables copied from a vPIC DecodeVin response into the matching list
    VARIABLES = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]

    def __init__(self, initial_matching_list):
        # pandas merges NaN keys with NaN keys, so a reference row without a
        # key would be attached to every invalid VIN; drop such rows up front.
        self.matching_list = (initial_matching_list
                              .dropna(subset = ["vin_corrected"])
                              .drop_duplicates("vin_corrected"))
        self.queries = 0

    def valid_vins(self, vins_to_correct_df, vin_column):
        """
        Input: A DataFrame containing the VINs to be corrected in ``vin_column``
        Output: A new DataFrame, indexed 0..n-1, with the single column
        "vin_corrected": the 11-character lookup key, or NaN when the VIN is
        missing, is not purely alphanumeric, or is shorter than 11 characters
        once surrounding whitespace is removed. The input is not modified.
        """
        raw_vins = vins_to_correct_df[vin_column]
        present = raw_vins.notna()

        # Replace missing values by "" so that string operations are safe,
        # then strip once and use the stripped value for every check below.
        cleaned = raw_vins.astype(object).where(present, "").astype(str).str.strip()

        is_valid = present & cleaned.str.isalnum() & (cleaned.str.len() >= 11)

        # Create 11-long vins: characters 1-8, "*" for character 9, characters 10-11
        lookup_key = cleaned.str[0:8] + "*" + cleaned.str[9:11]
        vin_corrected = lookup_key.where(is_valid, np.nan)

        return pd.DataFrame({"vin_corrected": vin_corrected}).reset_index(drop = True)

    def _merge_with_matching_list(self, df, vin_column):
        """Left-merge the corrected VINs of ``df`` with the matching list.

        The matching list holds one row per key, so the result has one row per
        input row, in input order, indexed 0..n-1.
        """
        valid_vin_list = self.valid_vins(df[[vin_column]], vin_column)
        return valid_vin_list.merge(self.matching_list, on = "vin_corrected", how = 'left')

    def match_vins_simple(self, df, vin_column, chunk_number):
        """Split the rows of ``df`` into matched and unmatched using the local list only.

        Returns a list ``[matched, unmatched, tally]``. ``matched`` and
        ``unmatched`` hold the merged matching-list columns for the respective
        rows and keep the 0..n-1 row positions of ``df`` as their index;
        ``tally`` is a one-row DataFrame with "Chunk Number", "Matched" and
        "Unmatched".
        """
        match = self._merge_with_matching_list(df, vin_column)

        # Get rows of DF where VINS matched
        is_matched = match["Manufacturer Name"].notna()
        df_vins_matched = match.loc[is_matched, :]
        df_vins_unmatched = match.loc[~is_matched, :]

        # Create df
        tally_dict = {"Chunk Number": [chunk_number],
                      "Matched" : [len(df_vins_matched)],
                      "Unmatched" : [len(df_vins_unmatched)]}

        match_unmatched_tally = pd.DataFrame(tally_dict)

        return [df_vins_matched, df_vins_unmatched, match_unmatched_tally]

    def match_vins(self, df, vin_column):
        """
        Input: A df containing a VIN column; the VINs are corrected, matched
        against the matching list, and every unmatched key is looked up online
        via ``fetch_unmatched_vin`` (which also extends the matching list).
        Returns: the merged DataFrame, one row per input row.
        """
        match = self._merge_with_matching_list(df, vin_column)

        # Get unique unmatched vins. Invalid VINs have no key (NaN) and cannot
        # be looked up, so they are left out.
        unmatched_vins = list(match.loc[match["Manufacturer Name"].isna(), "vin_corrected"].dropna().unique())

        # Print how many
        logging.info(f"VIN matching: a total of {len(unmatched_vins)} VINs were not matched")

        # Go get them
        for vin in tqdm(unmatched_vins):
            try:
                # Try to fetch the unmatched vin
                resp_df = self.fetch_unmatched_vin(vin)
                rows = match["vin_corrected"] == vin
                for variable in self.VARIABLES:
                    if variable in match.columns:
                        match.loc[rows, variable] = resp_df[variable].iloc[0]

            except Exception as e:
                # Best effort: a failed lookup leaves the VIN unmatched.
                # KeyboardInterrupt is deliberately NOT caught here.
                logging.warning(f"VIN matching: lookup failed for {vin}: {e}")

        remaining_unmatched = list(match.loc[match["Manufacturer Name"].isna(), "vin_corrected"].dropna().unique())

        logging.info(f"VIN Matching: this number of unmatched VINs was reduced by {len(unmatched_vins) - len(remaining_unmatched)}")
        logging.info(f"VIN Matching: remaining unmatched VINs is {len(remaining_unmatched)}")

        return match

    def fetch_unmatched_vin(self, unmatched_vin):
        """
        Input: An unmatched, but corrected VIN
        Output: A one-row DataFrame holding the ``VARIABLES`` returned by the
        NHTSA vPIC DecodeVin service plus "vin_corrected"

        The row is also appended to ``self.matching_list``, which is written to
        ``matching_list_path / "matching_list.csv"`` every 10 queries.
        Raises KeyError when the response lacks one of the variables.
        """
        # Increment the number of times queried
        self.queries +=1

        url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{unmatched_vin.strip()}?format=csv")

        # Download response
        resp_df = pd.read_csv(url)

        # Extract needed: one column per variable, one row of values
        resp_df = resp_df.loc[resp_df["variable"].isin(self.VARIABLES), ["variable", "value"]].T
        resp_df.columns = resp_df.iloc[0]
        resp_df = resp_df.drop("variable", axis = 0)

        # Refuse incomplete responses before they reach the matching list
        missing = [variable for variable in self.VARIABLES if variable not in resp_df.columns]
        if missing:
            raise KeyError(f"vPIC response for {unmatched_vin} is missing {missing}")

        resp_df["vin_corrected"] = unmatched_vin

        # Update the matching list, keeping one row per key so that later
        # merges stay aligned with their input rows
        self.matching_list = (pd.concat([self.matching_list, resp_df])
                              .drop_duplicates("vin_corrected")
                              .reset_index(drop = True))

        # If the number of queries is a multiple of 10, save the matching list
        if self.queries % 10 == 0:
            logging.info(f"Saving matching list after {self.queries} queries")
            # matching_list_path is a module-level name defined in the configuration cell
            self.matching_list.to_csv(matching_list_path / "matching_list.csv", index = False)

        return resp_df

# Test Function

In [213]:
# Upper bound on the number of chunks a single ChunkProcessor.run() call processes.
# (The bare `processed_chunks_path` expression that preceded this assignment was
# not the last line of the cell, so it displayed nothing and has been removed.)
number_to_run = 5900

In [149]:
# Load NHTSA data
nhtsa_cleaned = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")

# Simplify the cleaned file
nhtsa_cleaned_simple = nhtsa_cleaned[["VIN", "Manufacturer", "Model", "ModelYear", "FuelTypePrimary", "ElectrificationLevel"]]
nhtsa_cleaned_simple = nhtsa_cleaned_simple.rename(columns = {"VIN":"vin_corrected",
                                                              "Manufacturer" : "Manufacturer Name",
                                                              "ModelYear" : "Model Year",
                                                              "FuelTypePrimary" : "Fuel Type - Primary",
                                                              "ElectrificationLevel" : "Electrification Level"})
matching_list = pd.read_csv(matching_list_path / "matching_list.csv")

  nhtsa_cleaned = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")


In [227]:
# Lazy reader over the compiled registration file: iterating it yields
# DataFrames of up to 1000 rows, with the first line of the file as the header.
chunks = pd.read_csv(data_path / "2019-21_data_compiled_RN_100323.csv", header=0, chunksize = 1000)

In [228]:
# VIN matcher backed by the simplified NHTSA table.
vm = VINMatcher(nhtsa_cleaned_simple)
# Processor over the full file: raw ZIPs are read from "zip", raw VINs from
# "vehicle_id", and at most number_to_run chunks are processed.
cp = ChunkProcessor(chunks, processed_chunks_path, number_to_run, vm, "zip", "vehicle_id")

In [247]:
# Second reader over the same file that starts part-way through:
# skiprows=range(1, 2930000) drops file lines 1 through 2,929,999 (0-indexed;
# line 0, the header, is kept), so reading resumes after the first 2,929,999 data rows.
chunks_2 = pd.read_csv(data_path / "2019-21_data_compiled_RN_100323.csv", header=0, chunksize = 1000, skiprows=range(1,2930000))

In [248]:
# Fresh matcher and processor for the resumed reader.
# NOTE(review): cp_2 counts its chunks from 0 again and writes to the same
# processed_chunks_path as cp, and output file names depend only on the chunk
# number, so files written by cp_2 share names with files written by cp.
# Confirm this is intended, or point cp_2 at a separate directory.
vm_2 = VINMatcher(nhtsa_cleaned_simple)
cp_2 = ChunkProcessor(chunks_2, processed_chunks_path, number_to_run, vm_2, "zip", "vehicle_id")

In [249]:
# Process the resumed reader. The captured output below stops at a KeyboardInterrupt.
cp_2.run()

Currently processing chunk number 0
Currently processing chunk number 10
Currently processing chunk number 20
Currently processing chunk number 30
Currently processing chunk number 40
Currently processing chunk number 50
Currently processing chunk number 60
Currently processing chunk number 70
Currently processing chunk number 80
Currently processing chunk number 90
Currently processing chunk number 100
Currently processing chunk number 110
Currently processing chunk number 120
Currently processing chunk number 130
Currently processing chunk number 140
Currently processing chunk number 150
Currently processing chunk number 160
Currently processing chunk number 170
Currently processing chunk number 180
Currently processing chunk number 190
Currently processing chunk number 200
Currently processing chunk number 210
Currently processing chunk number 220
Currently processing chunk number 230
Currently processing chunk number 240
Currently processing chunk number 250
Currently processing ch

KeyboardInterrupt: 

In [218]:
# Process the full file from its first chunk; progress is printed every 10 chunks.
cp.run()

Currently processing chunk number 0
Currently processing chunk number 10
Currently processing chunk number 20
Currently processing chunk number 30
Currently processing chunk number 40
Currently processing chunk number 50
Currently processing chunk number 60
Currently processing chunk number 70
Currently processing chunk number 80
Currently processing chunk number 90
Currently processing chunk number 100
Currently processing chunk number 110
Currently processing chunk number 120
Currently processing chunk number 130
Currently processing chunk number 140
Currently processing chunk number 150
Currently processing chunk number 160
Currently processing chunk number 170
Currently processing chunk number 180
Currently processing chunk number 190
Currently processing chunk number 200
Currently processing chunk number 210
Currently processing chunk number 220
Currently processing chunk number 230
Currently processing chunk number 240
Currently processing chunk number 250
Currently processing ch

Currently processing chunk number 2130
Currently processing chunk number 2140
Currently processing chunk number 2150
Currently processing chunk number 2160
Currently processing chunk number 2170
Currently processing chunk number 2180
Currently processing chunk number 2190
Currently processing chunk number 2200
Currently processing chunk number 2210
Currently processing chunk number 2220
Currently processing chunk number 2230
Currently processing chunk number 2240
Currently processing chunk number 2250
Currently processing chunk number 2260
Currently processing chunk number 2270
Currently processing chunk number 2280
Currently processing chunk number 2290
Currently processing chunk number 2300
Currently processing chunk number 2310
Currently processing chunk number 2320
Currently processing chunk number 2330
Currently processing chunk number 2340
Currently processing chunk number 2350
Currently processing chunk number 2360
Currently processing chunk number 2370
Currently processing chun

Currently processing chunk number 3160
Currently processing chunk number 3170
Currently processing chunk number 3180
Currently processing chunk number 3190
Currently processing chunk number 3200
Currently processing chunk number 3210
Currently processing chunk number 3220
Currently processing chunk number 3230
Currently processing chunk number 3240
Currently processing chunk number 3250
Currently processing chunk number 3260
Currently processing chunk number 3270
Currently processing chunk number 3280
Currently processing chunk number 3290
Currently processing chunk number 3300
Currently processing chunk number 3310
Currently processing chunk number 3320
Currently processing chunk number 3330
Currently processing chunk number 3340
Currently processing chunk number 3350
Currently processing chunk number 3360
Currently processing chunk number 3370
Currently processing chunk number 3380
Currently processing chunk number 3390
Currently processing chunk number 3400
Currently processing chun

Currently processing chunk number 5270
Currently processing chunk number 5280
Currently processing chunk number 5290
Currently processing chunk number 5300
Currently processing chunk number 5310
Currently processing chunk number 5320
Currently processing chunk number 5330
Currently processing chunk number 5340
Currently processing chunk number 5350
Currently processing chunk number 5360
Currently processing chunk number 5370
Currently processing chunk number 5380
Currently processing chunk number 5390
Currently processing chunk number 5400
Currently processing chunk number 5410
Currently processing chunk number 5420
Currently processing chunk number 5430
Currently processing chunk number 5440
Currently processing chunk number 5450
Currently processing chunk number 5460
Currently processing chunk number 5470
Currently processing chunk number 5480
Currently processing chunk number 5490
Currently processing chunk number 5500
Currently processing chunk number 5510
Currently processing chun