In [124]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pathlib

from tqdm import tqdm

import re

# import usaddress

from difflib import get_close_matches as clmatch

import geopandas as gpd

from shapely import Point

import math

import logging

# Absolute form of the current working directory; the data locations below are built from it.
path = pathlib.Path(".").resolve()

In [116]:
# All inputs and outputs live in (or directly under) the parent of the working directory.
data_path = path.parent

# Output folder handed to the first ChunkProcessor runs.
processed_chunks_path = path.parent / "chunks_new"

# Output folder handed to the re-run that uses the enlarged matching list.
processed_chunks_save_path = path.parent / "processed_chunks_new_save"

# Where matching-list CSVs are read from and periodically written to.
matching_list_path = path.parent

# Folder that receives the log file.
logging_path = path.parent

# Lazy reader over the raw registration data, 1000 rows per chunk.
raw_data = pd.read_csv(
    data_path / "2019-21_data_compiled_RN_100323.csv",
    chunksize=1000,
)

# Turn off pandas' chained-assignment warning (default='warn').
pd.options.mode.chained_assignment = None

In [115]:
# Send INFO-and-above records to a log file, formatted as "<LEVEL>: <message>".
logging.basicConfig(
    filename=logging_path / "new_chunk_match.log",
    level=logging.INFO,
    format='%(levelname)s: %(message)s',
)

# Full processing

## Prepare data and import matching sets

## Required functions

In [119]:
class ChunkProcessor():
    """Run raw registration chunks through zip-code cleaning and VIN matching.

    Each chunk is reduced to ``KEPT_COLUMNS``, gains a ``zip_corrected``
    column, and is handed to ``vin_matcher.match_vins_simple``. Results are
    buffered in memory and written as CSV files under ``processed_chunk_path``
    every ``MATCHED_SAVE_EVERY`` chunks, and once more when ``run`` finishes
    so the final partial batch is not lost.

    Parameters
    ----------
    chunks : iterable of pandas.DataFrame
        Typically the reader returned by ``pd.read_csv(..., chunksize=...)``.
    processed_chunk_path : path-like supporting the ``/`` operator
        Directory the CSV files are written to.
    number_to_run : int
        ``run`` stops once this many chunks have been attempted.
    vin_matcher : object
        Must expose ``match_vins_simple(df, vin_column, chunk_number)``
        returning ``(matched_df, unmatched_df, tally_df)``.
    zip_column_name, vin_column_name : str
        Names of the zip and VIN columns; both must be in ``KEPT_COLUMNS``.
    """

    # Columns retained from every raw chunk.
    KEPT_COLUMNS = ['record_from', 'name', 'street', 'city',
                    'state', 'zip', 'vehicle_year', 'vehicle_make', 'vehicle_model',
                    'vehicle_class', 'vehicle_id']

    # Number of chunks aggregated into one saved file.
    MATCHED_SAVE_EVERY = 100
    LEGACY_SAVE_EVERY = 10

    # A single zip "part": optional whitespace, at least four digits, and an
    # optional trailing ".0" / "." / "0" (zips that were parsed as floats).
    ZIP_PATTERN = re.compile(r"^\s*[0-9]*[0-9]{4}\.?0?\s*$")

    def __init__(self, chunks, 
                 processed_chunk_path, 
                 number_to_run, 
                 vin_matcher, zip_column_name, vin_column_name):
        self.chunks = chunks
        self.processed_chunks = pd.DataFrame([])
        self.processed_chunks_path = processed_chunk_path

        # In-memory buffers, emptied every time they are written to disk.
        self.matched_chunks = pd.DataFrame([])
        self.unmatched_chunks = pd.DataFrame([])
        self.tally = pd.DataFrame([])

        self.number_to_run = number_to_run
        self.chunk_number = 0
        self.vin_matcher = vin_matcher
        self.zip_column_name = zip_column_name
        self.vin_column_name = vin_column_name

    def run(self):
        """Process chunks until the input is exhausted or ``number_to_run`` is reached.

        Whatever is still buffered when the loop ends is written out, so the
        chunks after the last periodic save are not silently dropped.
        """
        for chunk in self.chunks:
            if self.chunk_number < self.number_to_run:
                self.process_chunk(chunk)
            else:
                logging.info(f"Concluding on chunk number {self.chunk_number}")
                break

        self._flush_matched_unmatched()

    def process_chunk(self, chunk):
        """Clean, match and buffer one chunk.

        Any exception is logged (with traceback) and printed, and the chunk is
        skipped entirely; ``chunk_number`` advances either way so later file
        names still reflect the chunk's position in the input.
        """
        try:
            # Display progress
            if self.chunk_number % 10 == 0:
                print(f"Currently processing chunk number {self.chunk_number}")

            # Log progress
            logging.info(f"Chunk Number {self.chunk_number}: commencing processing")
            logging.info(f"Chunk Number {self.chunk_number}: chunk length is {len(chunk)}")

            # Reduce the number of columns and use a clean 0..n-1 index
            chunk_simplified = chunk[list(self.KEPT_COLUMNS)].reset_index(drop = True)

            # Get the corrected zip codes, and add them as a column
            corrected_zip_codes = self.get_valid_zips(chunk_simplified, self.zip_column_name).reset_index(drop = True)
            chunk_simplified = chunk_simplified.join(corrected_zip_codes).reset_index(drop=True)

            # Two frames (matched VINs, unmatched VINs) indexed like chunk_simplified, plus a tally
            matched_vins, unmatched_vins, tally = self.vin_matcher.match_vins_simple(chunk_simplified,
                                                                                    self.vin_column_name,
                                                                                    self.chunk_number)

            # Left joins: BOTH results keep every row of the chunk. Rows that do
            # not belong to a frame simply carry NaN in the VIN/NHTSA columns.
            matched_vins = chunk_simplified.join(matched_vins)
            unmatched_vins = chunk_simplified.join(unmatched_vins)

            # Because of the left joins, both numbers should equal the chunk length
            tally["confirm_matched"] = len(matched_vins)
            tally["confirm_unmatched"] = len(unmatched_vins)

            # Aggregate (and periodically save)
            self.aggregate_save_matched_unmatched(matched_vins,
                                                  unmatched_vins,
                                                  tally)

            # Update chunk number
            self.chunk_number +=1

        except Exception as e:
            # If anything above fails, the whole chunk is skipped.
            logging.error(f"Error encountered on chunk {self.chunk_number}, this means the last chunk to be run was chunk {self.chunk_number-1}")
            logging.exception(e)
            print(f"Error encountered on chunk {self.chunk_number}, this means the last chunk to be run was chunk {self.chunk_number-1}")
            print(e)
            self.chunk_number +=1

    def aggregate_save_chunk(self, processed_chunk):
        """Legacy aggregation: buffer processed chunks and save every 10th chunk.

        The buffer is reset only after a successful save; the file name carries
        the current chunk number and a ddmmyy_HHMM timestamp.
        """
        # Local import: the notebook's import cell does not provide datetime.
        from datetime import datetime

        # Add it to the master DF
        self.processed_chunks = pd.concat([self.processed_chunks, processed_chunk])

        if ((self.chunk_number % self.LEGACY_SAVE_EVERY) + 1 == self.LEGACY_SAVE_EVERY) and (self.chunk_number > 0):
            first_chunk = self.chunk_number - self.LEGACY_SAVE_EVERY + 1
            logging.info(f"Saving aggregated chunks numbers {first_chunk} - {self.chunk_number}")
            dt_string = datetime.now().strftime("%d%m%y_%H%M")
            self.processed_chunks.to_csv(self.processed_chunks_path / f"chunk_number_{self.chunk_number}_{dt_string}.csv")

            # Once saved, reset the DF
            self.processed_chunks = pd.DataFrame([])

    def aggregate_save_matched_unmatched(self, matched_chunk, unmatched_chunk, tally): 
        """Buffer one chunk's matched/unmatched/tally frames; save every 100th chunk.

        Input: a DataFrame of data for matched VINs, one for unmatched VINs, and a tally.
        Files are labelled with the number of the last chunk they contain.
        """
        # Append data to three separate master DFs
        self.matched_chunks = pd.concat([self.matched_chunks, matched_chunk])
        self.unmatched_chunks = pd.concat([self.unmatched_chunks, unmatched_chunk])
        self.tally = pd.concat([self.tally, tally])

        # Save on chunk numbers 99, 199, 299, ...
        if ((self.chunk_number % self.MATCHED_SAVE_EVERY) + 1 == self.MATCHED_SAVE_EVERY) and (self.chunk_number > 0):
            self._save_matched_unmatched(self.chunk_number)

    def _save_matched_unmatched(self, last_chunk_number):
        """Write the three buffers to CSV, labelled ``last_chunk_number``, then empty them."""
        logging.info(f"Saving chunk {last_chunk_number}")

        self.matched_chunks.to_csv(self.processed_chunks_path / f"matched_chunk_number_{last_chunk_number}.csv")
        self.unmatched_chunks.to_csv(self.processed_chunks_path / f"unmatched_chunk_number_{last_chunk_number}.csv")
        self.tally.to_csv(self.processed_chunks_path / f"tally_chunk_number_{last_chunk_number}.csv")

        # Once saved, reset the DFs
        self.matched_chunks = pd.DataFrame([])
        self.unmatched_chunks = pd.DataFrame([])
        self.tally = pd.DataFrame([])

    def _flush_matched_unmatched(self):
        """Save any chunks buffered since the last periodic save (no-op when empty)."""
        if len(self.matched_chunks) or len(self.unmatched_chunks) or len(self.tally):
            # chunk_number was already advanced past the last attempted chunk.
            self._save_matched_unmatched(self.chunk_number - 1)

    def check_valid_zip(self, zip_code):
        """Classify a zip code.

        Returns
        -------
        int
            1 - single-part zip matching ``ZIP_PATTERN``
            2 - "a-b" where both parts are valid
            3 - "a-b" where only the first part is valid
            0 - anything else (including three or more hyphen-separated parts)
        """
        parts = str(zip_code).split("-")

        if len(parts) == 1:
            # NOTE: zips starting with the letter "O" (instead of zero) are not handled yet.
            return 1 if self.ZIP_PATTERN.match(parts[0]) else 0

        if len(parts) == 2:
            if not self.check_valid_zip(parts[0]):
                return 0
            return 2 if self.check_valid_zip(parts[1]) else 3

        return 0

    def get_valid_zips(self, zip_df, zip_column_name):
        """Build the ``zip_corrected`` column for a frame.

        Inputs: a DataFrame and the name of its zip column.
        Returns: a one-column DataFrame (``zip_corrected``) of the same length
        as the input, indexed 0..n-1. Valid single-part zips are kept as they
        are, invalid zips become NaN, and two-part zips are reduced to (at most
        the first five characters of) the part before the hyphen.
        """
        zips = zip_df[zip_column_name].reset_index(drop = True)

        # Validity code per row (see check_valid_zip)
        codes = zips.apply(self.check_valid_zip)
        two_part = codes > 1

        # Keep valid single-part zips, blank out everything else
        corrected = zips.where(codes == 1)

        if two_part.any():
            # Splitting before truncating avoids results such as "6062-" when
            # the prefix has only four digits.
            corrected = corrected.astype(object)
            corrected[two_part] = zips[two_part].astype(str).str.split("-").str[0].str[0:5]

        return corrected.to_frame("zip_corrected")
    
    
    

  matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])


In [120]:
class VINMatcher():
    """Normalise VINs to an 11-character wildcard key and look them up.

    The key is characters 1-8 of the VIN, a ``*`` in place of the 9th
    character, then characters 10-11 (e.g. ``19UUA66276A056130`` becomes
    ``19UUA662*6A``). Keys are matched against ``matching_list``; VINs with no
    match can optionally be fetched from the NHTSA vPIC ``DecodeVin`` endpoint,
    which also appends the response to ``matching_list``.

    Parameters
    ----------
    initial_matching_list : pandas.DataFrame
        Lookup table with a ``vin_corrected`` column (duplicates are dropped)
        and, for matching, a ``Manufacturer Name`` column.
    """

    # Variables extracted from an NHTSA response.
    NHTSA_VARIABLES = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]

    def __init__(self, initial_matching_list):
        self.matching_list = initial_matching_list.drop_duplicates("vin_corrected")
        self.queries = 0

    def valid_vins(self, vins_to_correct_df, vin_column):
        """Validate VINs and build their wildcard keys.

        Input: a DataFrame containing ``vin_column``.
        Output: a new one-column DataFrame (``vin_corrected``) indexed 0..n-1.
        A VIN is valid when it is present and, after stripping surrounding
        whitespace, is alphanumeric and at least 11 characters long; invalid
        VINs yield NaN. The input frame is not modified.
        """
        raw_vins = vins_to_correct_df[vin_column]

        # Strip once and use the stripped text for every check and slice, so
        # padded VINs neither pass on padding alone nor get a shifted key.
        # Missing values become the string "nan" here, hence the notna() guard.
        cleaned = raw_vins.astype(str).str.strip()
        is_valid = raw_vins.notna() & cleaned.str.isalnum() & (cleaned.str.len() >= 11)

        # Characters 1-8, wildcard for the 9th, then characters 10-11
        keys = cleaned.str[0:8] + "*" + cleaned.str[9:11]

        # Keep the column object-typed even when every VIN is invalid;
        # an all-NaN float column cannot be merged against string keys.
        corrected = keys.where(is_valid, np.nan).astype(object)

        return pd.DataFrame({"vin_corrected": corrected}).reset_index(drop = True)

    def match_vins_simple(self, df, vin_column, chunk_number):
        """Match a frame's VINs against the matching list without any downloads.

        Returns ``[matched, unmatched, tally]``: the merge rows with / without a
        ``Manufacturer Name`` (both keep their 0..n-1 row position as index),
        and a one-row tally frame with the chunk number and the two counts.
        """
        # Get the wildcard keys of valid VINs
        valid_vin_list = self.valid_vins(df[[vin_column]], vin_column)

        # Attempt a match (left join keeps one row per input row, in input order)
        match = valid_vin_list.merge(self.matching_list,
                                     on = "vin_corrected",
                                     how = 'left')

        # Split on whether the lookup supplied a manufacturer
        has_manufacturer = match["Manufacturer Name"].notna()
        df_vins_matched = match.loc[has_manufacturer, :]
        df_vins_unmatched = match.loc[~has_manufacturer, :]

        match_unmatched_tally = pd.DataFrame({"Chunk Number": [chunk_number],
                                              "Matched" : [len(df_vins_matched)],
                                              "Unmatched" : [len(df_vins_unmatched)]})

        return [df_vins_matched, df_vins_unmatched, match_unmatched_tally]

    def match_vins(self, df, vin_column):
        """Match a frame's VINs, downloading any that the matching list lacks.

        Input: a df containing ``vin_column``.
        Returns: the merged frame, with NHTSA columns filled in for every VIN
        that could be fetched. ``matching_list`` grows as a side effect.
        """
        # Get the wildcard keys of valid VINs
        valid_vin_list = self.valid_vins(df[[vin_column]], vin_column)

        # Attempt a match
        match = valid_vin_list.merge(self.matching_list,
                                     on = "vin_corrected",
                                     how = 'left')

        # Unique unmatched keys; invalid VINs (NaN keys) are not worth a request
        unmatched_vins = list(match.loc[match["Manufacturer Name"].isna(), "vin_corrected"].dropna().unique())

        logging.info(f"VIN matching: a total of {len(unmatched_vins)} VINs were not matched")

        # Go get them
        for vin in tqdm(unmatched_vins):
            try:
                resp_df = self.fetch_unmatched_vin(vin)
                if resp_df is None:
                    continue

                resp_df = resp_df.reset_index(drop=True)
                rows = match[match["vin_corrected"]==vin].index
                for variable in self.NHTSA_VARIABLES:
                    # Column mask: only fills columns that already exist in the merge
                    match.loc[rows, match.columns.isin([variable])] = resp_df[variable][0]

            except Exception as e:
                # Best effort: one bad response must not stop the remaining VINs
                logging.info(e)

        remaining_unmatched = list(match.loc[match["Manufacturer Name"].isna(), "vin_corrected"].dropna().unique())

        logging.info(f"VIN Matching: this number of unmatched VINs was reduced by {len(unmatched_vins) - len(remaining_unmatched)}")
        logging.info(f"VIN Matching: remaining unmatched VINs is {len(remaining_unmatched)}")

        return match

    def fetch_unmatched_vin(self, unmatched_vin):
        """Decode one wildcard VIN key via the NHTSA vPIC API.

        Input: an unmatched, but corrected, VIN key.
        Output: a one-row DataFrame with the NHTSA variables plus
        ``vin_corrected``, or None when the download or parsing failed.

        Side effects: increments ``queries``; appends the response to
        ``matching_list`` (even when NHTSA returned no fuel type); every 100
        queries writes ``matching_list`` to the module-level
        ``matching_list_path``.
        """
        # Count first so every log line for this VIN carries the same number
        self.queries +=1
        logging.info(f"Now matching unmatched vin number {self.queries}: {unmatched_vin}")

        result = None

        try:
            url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{unmatched_vin}?format=csv")
            resp_df = pd.read_csv(url)

            # Keep the required variables and pivot them into a single row
            resp_df = resp_df.loc[resp_df["variable"].isin(self.NHTSA_VARIABLES), ["variable", "value"]].T
            resp_df.columns = resp_df.iloc[0]
            resp_df = resp_df.drop("variable", axis = 0)
            resp_df["vin_corrected"] = unmatched_vin
            valid_response = not(resp_df["Fuel Type - Primary"].isna().iloc[0])

            # Log whether or not the response is valid
            if valid_response:
                logging.info(f"Successfully matched vin number {self.queries}: {unmatched_vin}")
            else:
                logging.info(f"Failed to match vin number {self.queries}: {unmatched_vin}")

            # Update the matching list
            self.matching_list = pd.concat([self.matching_list, resp_df]).reset_index(drop = True)
            result = resp_df

        except Exception as e:
            # Exception (not BaseException) so Ctrl-C still interrupts long runs
            msg = f"Failed to download csv for in number {self.queries}: {unmatched_vin}"

            logging.info(msg)
            logging.info(e)

            print(msg)
            print(e)

        # If the number of queries is a multiple of 100, save the matching list
        if self.queries % 100 == 0:
            logging.info(f"Saving matching list after {self.queries} queries")
            self.matching_list.to_csv(matching_list_path / f"matching_list_{self.queries}.csv", index = False)

        return result

# Test Function

In [213]:
# Bare name: evaluated but not displayed, because it is not the last line of the cell.
processed_chunks_path
# Upper bound on the number of chunks a ChunkProcessor.run() call will attempt.
number_to_run = 5900

In [149]:
# Load NHTSA data
# Cleaned NHTSA table, read from the "ignored-data" folder under the working directory.
nhtsa_cleaned = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")

# Keep the six lookup columns and rename them to the labels VINMatcher works with.
nhtsa_cleaned_simple = nhtsa_cleaned[
    ["VIN", "Manufacturer", "Model", "ModelYear", "FuelTypePrimary", "ElectrificationLevel"]
].rename(
    columns={
        "VIN": "vin_corrected",
        "Manufacturer": "Manufacturer Name",
        "ModelYear": "Model Year",
        "FuelTypePrimary": "Fuel Type - Primary",
        "ElectrificationLevel": "Electrification Level",
    }
)

# Previously saved matching list.
matching_list = pd.read_csv(matching_list_path / "matching_list.csv")

  nhtsa_cleaned = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")


In [227]:
# Fresh lazy reader over the raw data: first line is the header, 1000 rows per chunk.
chunks = pd.read_csv(
    data_path / "2019-21_data_compiled_RN_100323.csv",
    header=0,
    chunksize=1000,
)

In [228]:
# Matcher backed by the simplified NHTSA table, and a processor that writes to processed_chunks_path.
vm = VINMatcher(nhtsa_cleaned_simple)
cp = ChunkProcessor(
    chunks=chunks,
    processed_chunk_path=processed_chunks_path,
    number_to_run=number_to_run,
    vin_matcher=vm,
    zip_column_name="zip",
    vin_column_name="vehicle_id",
)

In [247]:
# Second reader that keeps the header (line 0) but skips file lines 1..2,929,999,
# so iteration starts part-way through the file.
chunks_2 = pd.read_csv(
    data_path / "2019-21_data_compiled_RN_100323.csv",
    header=0,
    chunksize=1000,
    skiprows=range(1, 2930000),
)

In [248]:
# Independent matcher/processor pair for the reader that starts part-way through the file.
# Chunk numbering restarts at 0, so file names are relative to the skipped offset.
vm_2 = VINMatcher(nhtsa_cleaned_simple)
cp_2 = ChunkProcessor(
    chunks=chunks_2,
    processed_chunk_path=processed_chunks_path,
    number_to_run=number_to_run,
    vin_matcher=vm_2,
    zip_column_name="zip",
    vin_column_name="vehicle_id",
)

In [249]:
# Process chunks from chunks_2 until the reader is exhausted or number_to_run is reached.
cp_2.run()

Currently processing chunk number 0
Currently processing chunk number 10
Currently processing chunk number 20
Currently processing chunk number 30
Currently processing chunk number 40
Currently processing chunk number 50
Currently processing chunk number 60
Currently processing chunk number 70
Currently processing chunk number 80
Currently processing chunk number 90
Currently processing chunk number 100
Currently processing chunk number 110
Currently processing chunk number 120
Currently processing chunk number 130
Currently processing chunk number 140
Currently processing chunk number 150
Currently processing chunk number 160
Currently processing chunk number 170
Currently processing chunk number 180
Currently processing chunk number 190
Currently processing chunk number 200
Currently processing chunk number 210
Currently processing chunk number 220
Currently processing chunk number 230
Currently processing chunk number 240
Currently processing chunk number 250
Currently processing ch

KeyboardInterrupt: 

# Get VINS

In [25]:
# Names of the output files whose name begins with "un" (the unmatched_chunk_number_* files).
unmatched_filepaths = [
    file.name
    for file in processed_chunks_path.iterdir()
    if file.name.startswith("un")
]

In [None]:
vin_lookup_table_new = pd.DataFrame()

# Build one small frame per file and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row each time.
unmatched_vin_frames = []

for file_name in tqdm(unmatched_filepaths):
    # Only the VIN key column is needed, so skip parsing the other columns.
    file_vins = pd.read_csv(processed_chunks_path / file_name,
                            usecols=["vin_corrected"])["vin_corrected"].unique()
    unmatched_vin_frames.append(pd.DataFrame({"file": [file_name] * len(file_vins),
                                              "vin_corrected": file_vins}))

if unmatched_vin_frames:
    all_unmatched_vins = pd.concat(unmatched_vin_frames)
else:
    # No files found: keep the expected columns so later cells still work.
    all_unmatched_vins = pd.DataFrame(columns=["file", "vin_corrected"])

# Keep the first file each VIN key was seen in.
all_unmatched_vins = all_unmatched_vins.drop_duplicates(subset = "vin_corrected")

In [42]:
# Persist the de-duplicated VIN keys. The index is written too, so the file gains an
# unnamed first column that shows up as "Unnamed: 0" when it is read back.
all_unmatched_vins.to_csv(path.parent / "all_unmatched_vins.csv")

In [35]:
# Reload the saved VIN keys from disk.
all_unmatched_vins = pd.read_csv(path.parent / "all_unmatched_vins.csv")

In [36]:
# Drop the saved index column and any row with a missing value.
# errors="ignore" keeps the cell re-runnable: on a second run the column is already gone.
all_unmatched_vins = all_unmatched_vins.drop(columns="Unnamed: 0", errors="ignore").dropna()

In [81]:
# Plain Python list of the VIN keys to look up.
all_unmatched_vins_list = all_unmatched_vins["vin_corrected"].tolist()

In [None]:
# Number of VIN keys that will be requested one at a time below.
len(all_unmatched_vins_list)

In [93]:
# Matcher with an empty matching list, so its matching_list ends up holding only fetched responses.
vm = VINMatcher(pd.DataFrame([]))

In [None]:
# One NHTSA request per VIN key; each response is appended to vm.matching_list, and the
# list is written to disk every 100 queries (see VINMatcher.fetch_unmatched_vin).
for unmatched_vin in tqdm(all_unmatched_vins_list):
    # Keys read back from CSV may carry surrounding whitespace.
    vm.fetch_unmatched_vin(unmatched_vin.strip())

# Merge with NHTSA data

In [96]:
# Reload the cleaned NHTSA table, this time from data_path
# (the earlier load read it from path / "ignored-data").
nhtsa_cleaned = pd.read_csv(data_path / "NHTSA_cleaned.csv")

  nhtsa_cleaned = pd.read_csv(data_path / "NHTSA_cleaned.csv")


In [98]:
# Simplify the cleaned file
# Keep the six lookup columns and rename them to the labels used in the matching list.
nhtsa_cleaned_simple = nhtsa_cleaned[
    ["VIN", "Manufacturer", "Model", "ModelYear", "FuelTypePrimary", "ElectrificationLevel"]
].rename(
    columns={
        "VIN": "vin_corrected",
        "Manufacturer": "Manufacturer Name",
        "ModelYear": "Model Year",
        "FuelTypePrimary": "Fuel Type - Primary",
        "ElectrificationLevel": "Electrification Level",
    }
)

In [100]:
# Downloaded VIN lookups. NOTE(review): presumably the snapshot written by
# VINMatcher.fetch_unmatched_vin after 197,200 queries (the file name matches its pattern) — confirm.
queried_vins = pd.read_csv(data_path / "matching_list_197200.csv")

In [102]:
# Check that the column labels line up with nhtsa_cleaned_simple before concatenating.
queried_vins.columns

Index(['Manufacturer Name', 'Model', 'Model Year', 'Fuel Type - Primary',
       'Electrification Level', 'vin_corrected'],
      dtype='object')

In [104]:
# Stack the NHTSA table and the downloaded lookups into one matching list.
# Duplicate vin_corrected keys are dropped later, in VINMatcher.__init__.
all_vins = pd.concat([nhtsa_cleaned_simple, queried_vins])

In [106]:
# Discard lookups without a manufacturer; match_vins_simple treats those rows as unmatched anyway.
all_vins = all_vins.dropna(subset=["Manufacturer Name"])

In [108]:
# Save the combined matching list (the index is written as an unnamed first column).
all_vins.to_csv(data_path / "all_vins_nhtsa.csv")

# Run again with new matching list

In [125]:
# New reader from the start of the raw data for the second pass.
chunks = pd.read_csv(
    data_path / "2019-21_data_compiled_RN_100323.csv",
    header=0,
    chunksize=1000,
)

In [126]:
# Second pass: match against the combined list and write to the separate "save" folder.
number_to_run = 5900
vm = VINMatcher(all_vins)
cp = ChunkProcessor(
    chunks=chunks,
    processed_chunk_path=processed_chunks_save_path,
    number_to_run=number_to_run,
    vin_matcher=vm,
    zip_column_name="zip",
    vin_column_name="vehicle_id",
)

# Check the saved output

In [134]:
# Load the matched and unmatched files saved for the same batch.
matched = pd.read_csv(processed_chunks_save_path / "matched_chunk_number_3099.csv")
unmatched = pd.read_csv(processed_chunks_save_path / "unmatched_chunk_number_3099.csv")

# Rows with no corrected VIN in the matched file are overwritten with the
# same-index rows from the unmatched file.
join = matched.copy()
no_vin = join["vin_corrected"].isna()
join.loc[no_vin] = unmatched.loc[no_vin]
join

FileNotFoundError: [Errno 2] No such file or directory: '/gpfs/gibbs/project/gillingham/rrn22/processed_chunks_new_save/matched_chunk_number_3099.csv'

In [133]:
# Display the matched frame. NOTE(review): the rendered rows contain names and street
# addresses; consider clearing this output before sharing the notebook.
matched

Unnamed: 0.1,Unnamed: 0,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,zip_corrected,vin_corrected,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level
0,0,110_Plainville_MVData_2019.csv,VELOCCIA ELIZABETH H,140 PICKNEY AVE,PLAINVILLE,CT,6062.0,,ACURA,3.2 TL,1.0,19UUA66276A056130,6062.0,19UUA662*6A,HONDA,TL,2006.0,Gasoline,
1,1,110_Plainville_MVData_2019.csv,VELOCCIA ELIZABETH H,140 PICKNEY AVE,PLAINVILLE,CT,6062.0,,INFIN,Q50/PREM,1.0,JN1BV7AR6FM400066,6062.0,JN1BV7AR*FM,"NISSAN MOTOR COMPANY, LTD",Q50,2015.0,Gasoline,
2,2,110_Plainville_MVData_2019.csv,VELOCCIA VINCENT,140 PICKNEY AVE,PLAINVILLE,CT,6062.0,,KAWAS,VN800,12.0,JKBVNCA17SA009586,6062.0,JKBVNCA1*SA,"KAWASAKI MOTORS, LTD",Vulcan 800,1995.0,,
3,3,110_Plainville_MVData_2019.csv,VELOCCIA VINCENZO,140 PICKNEY AVE,PLAINVILLE,CT,6062.0,,TOYOT,TACOMA A,3.0,5TFUU4EN0CX036008,6062.0,5TFUU4EN*CX,TOYOTA,Tacoma,2012.0,Gasoline,
4,4,110_Plainville_MVData_2019.csv,VELODOTA DONALD E JR,121 CAMP ST,PLAINVILLE,CT,6062.0,,CHEVR,CAMARO S,1.0,2G1FK1EJ7A9148805,6062.0,2G1FK1EJ*A9,GENERAL MOTORS LLC,Camaro,2010.0,Gasoline,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181995,995,128_Simsbury_MVData_2019.csv,KILLE BENJAMIN D,94 GREAT POND RD,SIMSBURY,CT,06070,2015.0,CHEVR,SILVERAD,3,1GCVKREC2FZ403022,6070.0,1GCVKREC*FZ,GENERAL MOTORS LLC,Silverado,2015.0,Gasoline,
181996,996,128_Simsbury_MVData_2019.csv,KIM CHI H,12 JOSHUA DR,WEST SIMSBURY,CT,06092,2017.0,PORSC,MACAN GT,1,WP1AG2A56HLB54792,6092.0,WP1AG2A5*HL,PORSCHE,Macan,2017.0,Gasoline,
181997,997,128_Simsbury_MVData_2019.csv,KIM EDWARD C,145 COOPER AVE UNIT 8,WEATOGUE,CT,06089,2013.0,MERCE,GLK350 4,1,WDCGG8JBXDG114944,6089.0,WDCGG8JB*DG,MERCEDES-BENZ,GLK Class,2013.0,Gasoline,
181998,998,128_Simsbury_MVData_2019.csv,KIM JIHYUN,712D HOPMEADOW ST,SIMSBURY,CT,06070,2010.0,NISSA,ROGUE S/,1,JN8AS5MV7AW109608,6070.0,JN8AS5MV*AW,NISSAN,Rogue,2010.0,Gasoline,
