# Set up

In [5]:
# Warning Management
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrames
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Maths
import numpy as np
import math

# Path management
import pathlib

# Progress tracking
from tqdm import tqdm

# Regular Expressions
import re

# Logging
import logging

## Paths and files

In [6]:
path = pathlib.Path().resolve()

In [7]:
vin_matching_list = pd.read_csv(path.parent / "all_vins_nhtsa.csv").drop_duplicates("vin_corrected")

In [8]:
chunks = pd.read_csv(path.parent / "2019-21_data_compiled_RN_100323.csv", header=0, chunksize = 10000)

## Functions used to process dataset

In [9]:
def check_valid_zip(zip_code):
    zip_str = str(zip_code)
    split_zip = re.split("-", zip_str)
    if len(split_zip) == 2:
        if check_valid_zip(split_zip[0]) & check_valid_zip(split_zip[1]):
            return 2
        elif check_valid_zip(split_zip[0]):
            return 3
        else:
            return 0
    elif len(split_zip) == 1:
        # MUST ADDRESS STARTING "Os"
        # starting_o = re.match('^O', split_zip[0])
        matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])
        if matched:
            return 1
        else:
            return 0
    else:
        return 0
    
    
def get_valid_zips(zip_df, zip_column_name):
    """
    Inputs: A DataFrame with a column called "zip"
    Returns: A DataFrame of the same length as the input, with three columns: zip, zip_valid_code, and zip_corrected
    """
    # Prepare the list to be used
    zip_list = zip_df[[zip_column_name]].rename(columns = {zip_column_name : "zip"}).reset_index(drop = True)

    # Get validity code
    zip_list.loc[:, "zip_valid_code"] = zip_list.loc[:, "zip"].apply(lambda x: check_valid_zip(x))

    # Get indices
    correct_zips_indices = zip_list[zip_list["zip_valid_code"]==1].index
    invalid_zips_indices = zip_list[zip_list["zip_valid_code"]==0].index
    two_part_zips_indices = zip_list[zip_list["zip_valid_code"]>1].index

    zip_list.loc[correct_zips_indices, "zip_corrected"] = zip_list.loc[correct_zips_indices, "zip"]
    zip_list.loc[invalid_zips_indices, "zip_corrected"] = np.NaN
    zip_list.loc[two_part_zips_indices, "zip_corrected"] = zip_list.loc[two_part_zips_indices, "zip"].astype(str).str[0:5]

    return zip_list[["zip_corrected"]]

def convert_vin_valid(vin):
    try:
        vin_str = str(vin)
        if len(vin_str) < 11:
            return "NA"
        if " " in vin_str[0:11]:
            return "NA"
        else:
            return vin_str[0:8]+"*"+vin_str[9:11]
    except:
        return "NA"

def return_matched_vins(chunk_number, df, vin_column, matching_list):
    match = df.merge(matching_list,
                    left_on = vin_column,
                    right_on = vin_column,
                    how = 'left')
    
    # Get rows of DF where VINS matched
    df_vins_matched = match.loc[match["Manufacturer Name"].notna(), :]
    df_vins_unmatched = match.loc[match["Manufacturer Name"].isna(), :]
    
    # Get length
    len_matched = len(df_vins_matched)
    len_unmatched = len(df_vins_unmatched)
    len_all = len(match)
    
    # Create df
    tally_dict = {"Chunk Number": [chunk_number],
                  "Matched" : [len_matched],
                  "Unmatched" : [len_unmatched],
                  "All" : [len_all]}
    
    match_unmatched_tally = pd.DataFrame(tally_dict)

    return [match, match_unmatched_tally]

  matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])


# Process dataset

In [10]:
output_df = pd.DataFrame([])
output_tally_df = pd.DataFrame([])
i = 0
num_to_run = 590

for chunk in chunks:
    # Check that we haven't gone too far
    if i >= num_to_run:
        break
    
    # Display Progress
    if (i % 10 == 0):
        print(f"Currently processing chunk number {i}")
    
    # Log progress
    # logging.info(f"Chunk Number {self.chunk_number}: commencing processing")
    # logging.info(f"Chunk Number {self.chunk_number}: chunk length is {len(chunk)}")

    # Reduce reduce the number of columns
    chunk_simplified = chunk[['record_from', 'name', 'street', 'city', 
                          'state', 'zip', 'vehicle_year', 'vehicle_make', 'vehicle_model',
                          'vehicle_class', 'vehicle_id']].reset_index(drop = True)

    # Get corrected zips
    corrected_zip_codes = get_valid_zips(chunk_simplified, "zip").reset_index(drop = True)
    chunk_simplified = chunk_simplified.join(corrected_zip_codes).reset_index(drop=True)

    # Get the vin codes, and add them as a column
    chunk_simplified["vin_corrected"] = chunk_simplified["vehicle_id"].apply(lambda x: convert_vin_valid(x))
    chunk_simplified = chunk_simplified.reset_index(drop=True)

    # Get the match and tally
    matches, tally = return_matched_vins(i, chunk_simplified, "vin_corrected", vin_matching_list)

    # Concatenate
    output_df = pd.concat([output_df, matches])
    output_tally_df = pd.concat([output_tally_df, tally])
    
    # Save every 100
    if ((i % 100)+1 ==100) & (i>0):
        output_df.to_csv(path.parent / "processed_chunks_new_new" / f"matching_output{i}.csv")
        output_tally_df.to_csv(path.parent / "processed_chunks_new_new" / f"tally_output{i}.csv")
        output_df = pd.DataFrame([])
        output_tally_df = pd.DataFrame([])

    i += 1

# Final save
output_df.to_csv(path.parent / "processed_chunks_new_new" / f"matching_output{i}.csv")
output_tally_df.to_csv(path.parent / "processed_chunks_new_new" / f"tally_output{i}.csv")

Currently processing chunk number 0
Currently processing chunk number 10
Currently processing chunk number 20
Currently processing chunk number 30
Currently processing chunk number 40
Currently processing chunk number 50
Currently processing chunk number 60
Currently processing chunk number 70
Currently processing chunk number 80
Currently processing chunk number 90
Currently processing chunk number 100
Currently processing chunk number 110
Currently processing chunk number 120
Currently processing chunk number 130
Currently processing chunk number 140
Currently processing chunk number 150
Currently processing chunk number 160
Currently processing chunk number 170
Currently processing chunk number 180
Currently processing chunk number 190
Currently processing chunk number 200
Currently processing chunk number 210
Currently processing chunk number 220
Currently processing chunk number 230
Currently processing chunk number 240
Currently processing chunk number 250
Currently processing ch

# Concatenate all the joined datasets

In [67]:
# Check length - this is the same as the legnth of
length = 0
for file in processed_chunks_path.iterdir():
    if file.name[0:8] == "matching":
        df = pd.read_csv(file, usecols = ["record_from"])
        length += len(df)

print(length)

5787151


In [18]:
processed_chunks_path = path.parent / "processed_chunks_new_new" 
concat_grouped = pd.DataFrame([])
cols = 0

for file in processed_chunks_path.iterdir():
    if file.name[0:8] == "matching":
        # Track progress
        print(file.name)

        # Extract the file and create a counter
        df = pd.read_csv(file, usecols = ["record_from", "zip_corrected", "Fuel Type - Primary"])
        df["count"] = 1
        grouped = df.groupby(by=["record_from", "zip_corrected", "Fuel Type - Primary"]).sum().reset_index()[["record_from", "zip_corrected", "Fuel Type - Primary", "count"]]
        grouped = grouped.astype({'zip_corrected': 'str',
                                  'Fuel Type - Primary' : 'str',
                                  'count': 'int'})
        concat_grouped = pd.concat([concat_grouped, grouped])

matching_output299.csv
matching_output199.csv
matching_output499.csv
matching_output579.csv


  df = pd.read_csv(file, usecols = ["record_from", "zip_corrected", "Fuel Type - Primary"])


matching_output399.csv
matching_output99.csv


In [23]:
concat_grouped.to_csv(path.parent / "mun_nv_reg_zip_type.csv")

In [69]:
concat_grouped["Fuel Type - Primary"].unique()

array(['Gasoline', 'Diesel', 'Electric', 'Flexible Fuel Vehicle (FFV)',
       'Not Applicable', 'Ethanol (E85)', 'Compressed Natural Gas (CNG)',
       'Liquefied Petroleum Gas (propane or LPG)', 'Natural Gas',
       'Fuel Cell'], dtype=object)

# Observe Data

In [42]:
ct_zip_mask = (concat_grouped["zip_corrected"].str[:-2].str.zfill(5).str[0:2])=="06"

In [43]:
ct_entries = concat_grouped[ct_zip_mask]

In [64]:
ct_entries["count"].sum()

4571268

In [55]:
year_2021_mask = ct_entries["record_from"].str.contains("_21")

In [62]:
ct_by_type_zip_21 = ct_entries[year_2021_mask].groupby(["zip_corrected", "Fuel Type - Primary"]).sum().reset_index()
evs_21 = ct_by_type_zip_21[ct_by_type_zip_21["Fuel Type - Primary"] == "Electric"]["count"].sum()

13674