# Set up

In [5]:
# Warning Management
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrames
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Maths
import numpy as np
import math

# Path management
import pathlib
import os
import sys

# Progress tracking
from tqdm import tqdm

# Regular Expressions
import re

# Logging
import logging

## Paths and files

In [28]:
# Paths 
path = pathlib.Path().resolve()
if sys.platform == 'linux':
    data_path = path.parent.parent / "data" 
    vin_matching_list = pd.read_csv(data_path / "vin_matching" / "all_vins_nhtsa.csv").drop_duplicates("vin_corrected")
    chunks = pd.read_csv(data_path /  "municipal_dataset_latest"/ f"2019-21_data_compiled_RN_{version}.csv", header=0, chunksize = 10000)
else:
    data_path = path.parent / "data"
    print("YET TO FIX THIS")

# Version of raw data to get
version = "102323"

# Load Vin Matching list and raw data
# vin_matching_list = pd.read_csv(data_path / "vin_matching" / "all_vins_nhtsa.csv").drop_duplicates("vin_corrected")
# chunks = pd.read_csv(data_path /  "municipal_dataset"/ f"2019-21_data_compiled_RN_{version}.csv", header=0, chunksize = 10000)

## Functions used to process dataset

In [13]:
def create_valid_zip(zip):
    try:
        zip_str = str(zip)
        zip_str = zip_str.strip()
        has_dot = re.search(r"\.", zip_str)

        # Get rid of decimal places
        if has_dot:
            zip_str = zip_str[0:re.search(r"\.", zip_str).start()]

        split_zip = re.split("-", zip_str)
        
        if len(split_zip) == 2:
            return create_valid_zip(split_zip[0])
        else:
            # If length is less than 4, return na
            if len(zip_str) < 4:
                return np.NaN
                
            # If length is 4 or 5, check it
            elif((len(zip_str) == 5) | (len(zip_str) == 4)):
                matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", zip_str)
                if matched:
                    return matched[0].zfill(5)
                else:
                    return np.NaN
            # If the zip is between 5 and 8 (inclusive) long, we assume the first 4 are the first part
            # And the second 4 are the second part
            # There is no other way to do this... 
            elif((len(zip_str) > 5) & (len(zip_str)<9)):
                return create_valid_zip(zip_str[0:4])
            elif (len(zip_str) == 9):
                return create_valid_zip(zip_str[0:5])
            else:
                return np.NaN
        
    except Exception as e:
        print(e)
        return np.NaN


  matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", zip_str)


In [17]:
def convert_vin_valid(vin):
    try:
        vin_str = str(vin)
        if len(vin_str) < 11:
            return "NA"
        if " " in vin_str[0:11]:
            return "NA"
        else:
            return vin_str[0:8]+"*"+vin_str[9:11]
    except:
        return "NA"

def return_matched_vins(chunk_number, df, vin_column, matching_list):
    match = df.merge(matching_list,
                    left_on = vin_column,
                    right_on = vin_column,
                    how = 'left')
    
    # Get rows of DF where VINS matched
    df_vins_matched = match.loc[match["Manufacturer Name"].notna(), :]
    df_vins_unmatched = match.loc[match["Manufacturer Name"].isna(), :]
    
    # Get length
    len_matched = len(df_vins_matched)
    len_unmatched = len(df_vins_unmatched)
    len_all = len(match)
    
    # Create df
    tally_dict = {"Chunk Number": [chunk_number],
                  "Matched" : [len_matched],
                  "Unmatched" : [len_unmatched],
                  "All" : [len_all]}
    
    match_unmatched_tally = pd.DataFrame(tally_dict)

    return [match, match_unmatched_tally]

  matched = re.match("^\s*[0-9]*[0-9]{4}\.?0?\s*$", split_zip[0])


# Process dataset

In [14]:
version = "102323"

In [None]:
output_df = pd.DataFrame([])
output_tally_df = pd.DataFrame([])
i = 0
num_to_run = 590

for chunk in tqdm(chunks):
    # Check that we haven't gone too far
    if i >= num_to_run:
        break

    chunk = chunk.reset_index()
    chunk = chunk.rename(columns = {"index" : "original_index"})

    # Reduce reduce the number of columns
    chunk_simplified = chunk[['original_index', 'record_from', 'name', 'street', 'city', 
                          'state', 'zip', 'vehicle_year', 'vehicle_make', 'vehicle_model',
                          'vehicle_class', 'vehicle_id']].reset_index(drop = True)

    # Get corrected zips
    chunk_simplified["zip_corrected"] = chunk_simplified["zip"].apply(lambda x: create_valid_zip(x)).reset_index(drop = True)

    # Get the vin codes, and add them as a column
    chunk_simplified["vin_corrected"] = chunk_simplified["vehicle_id"].apply(lambda x: convert_vin_valid(x))
    chunk_simplified = chunk_simplified.reset_index(drop=True)

    # Get the match and tally
    matches, tally = return_matched_vins(i, chunk_simplified, "vin_corrected", vin_matching_list)

    # Concatenate
    output_df = pd.concat([output_df, matches])
    output_tally_df = pd.concat([output_tally_df, tally])
    
    # Save every 100
    if ((i % 100)+1 ==100) & (i>0):
        dir = path.parent / f"processed_chunks_{version}"
        if not dir.is_dir():
            os.mkdir(dir)
        output_df.to_csv(dir / f"matching_output{i}.csv")
        output_tally_df.to_csv( dir / f"tally_output{i}.csv")
        output_df = pd.DataFrame([])
        output_tally_df = pd.DataFrame([])

    i += 1

In [20]:
# Final save
output_df.to_csv(dir / f"matching_output{i}.csv")
output_tally_df.to_csv(dir / f"tally_output{i}.csv")

# Further checks

# Concatenate all the joined datasets

### Check length

In [23]:
# Check length - this is the same as the legnth of
length = 0
for file in dir.iterdir():
    if file.name[0:8] == "matching":
        print(file.name)
        df = pd.read_csv(file, usecols = ["record_from"])
        length += len(df)

print(length)

matching_output299.csv
matching_output578.csv
matching_output199.csv
matching_output499.csv
matching_output399.csv
matching_output99.csv
5777151


### Check number of VINs missing and not

In [24]:
processed_chunks_path = dir
tally_all = pd.DataFrame([])
cols = 0

for file in processed_chunks_path.iterdir():
    if file.name[0:5] == "tally":
        # Track progress
        print(file.name)

        # Extract the file and create a counter
        df = pd.read_csv(file, usecols = ["Matched", "Unmatched", "All"])
        df["count"] = 1

        # Add to concatenated file
        tally_all = pd.concat([tally_all, df]).reset_index(drop = True)

tally_output99.csv
tally_output578.csv
tally_output299.csv
tally_output399.csv
tally_output199.csv
tally_output499.csv


In [25]:
summary_tally = tally_all.agg(['sum'], axis = 0).rename(columns = {"Matched" : "Matched VINs",
                                                                   "Unmatched": "Unmatched VINs",
                                                                   "All" : "All Entries"})[["Matched VINs", "Unmatched VINs", "All Entries"]]

In [26]:
summary_tally.loc["percentage"] = summary_tally.agg(lambda x: x/5787151 * 100, axis = 0).loc["sum"]
summary_tally.loc["percentage"] = summary_tally.loc["percentage"].map('{:.2f}%'.format)

In [27]:
summary_tally

Unnamed: 0,Matched VINs,Unmatched VINs,All Entries
sum,5523873.0,253278.0,5777151.0
percentage,95.45%,4.38%,99.83%


### Import all all vehicle details

In [30]:
processed_chunks_path = dir
concat_grouped = pd.DataFrame([])
cols = 0

for file in processed_chunks_path.iterdir():
    if file.name[0:8] == "matching":
        # Track progress
        print(file.name)

        # Extract the file and create a counter
        df = pd.read_csv(file, usecols = ["record_from", "zip_corrected", "Fuel Type - Primary"])
        df["count"] = 1

        # Group - note that this drops NAs 
        grouped = df.groupby(by=["record_from", "zip_corrected", "Fuel Type - Primary"]).sum().reset_index()[["record_from", "zip_corrected", "Fuel Type - Primary", "count"]]
        grouped = grouped.astype({'zip_corrected': 'str',
                                  'Fuel Type - Primary' : 'str',
                                  'count': 'int'})

        # Add this to the concatenated file
        concat_grouped = pd.concat([concat_grouped, grouped])

matching_output299.csv
matching_output578.csv
matching_output199.csv
matching_output499.csv
matching_output399.csv
matching_output99.csv


In [31]:
concat_grouped.to_csv(data_path / "municipal_dataset_extracts"  / "mun_nv_reg_zip_type.csv")

In [32]:
concat_grouped

Unnamed: 0,record_from,zip_corrected,Fuel Type - Primary,count
0,001_Andover_MV_21.csv,1040.0,Gasoline,11
1,001_Andover_MV_21.csv,1810.0,Gasoline,20
2,001_Andover_MV_21.csv,2199.0,Gasoline,1
3,001_Andover_MV_21.csv,2860.0,Gasoline,1
4,001_Andover_MV_21.csv,6029.0,Gasoline,1
...,...,...,...,...
13432,169_Woodstock_MV_21.xlsx,90501.0,Gasoline,20
13433,169_Woodstock_MV_21.xlsx,92123.0,Gasoline,1
13434,169_Woodstock_MV_21.xlsx,94538.0,Electric,1
13435,169_Woodstock_MV_21.xlsx,95865.0,Gasoline,13


# Observe Output Data

In [33]:
ct_zip_mask = (concat_grouped["zip_corrected"].str[:-2].str.zfill(5).str[0:2])=="06"

In [34]:
ct_entries = concat_grouped[ct_zip_mask]

In [35]:
ct_entries["count"].sum()

4630466

In [43]:
year_2021_mask = ct_entries["record_from"].str.contains("_21")

In [53]:
ct_by_type_zip_21 = ct_entries.loc[year_2021_mask, ["zip_corrected", "Fuel Type - Primary", "count"]].groupby(["zip_corrected", "Fuel Type - Primary"]).sum().reset_index()
evs_21 = ct_by_type_zip_21[ct_by_type_zip_21["Fuel Type - Primary"] == "Electric"]["count"].sum()

In [72]:
rlp_processed = pd.read_csv(data_path / "rlpolk_data" / "new_vehicle_sales_month_year_zip.csv", index_col = [0])
rlp_evs_21 = rlp_processed.loc[(rlp_processed["year"] == 2021) & (rlp_processed["FuelTypePrimary"] == "Electric")]
rlp_evs_21_yr = rlp_evs_21.groupby("ZIP_CODE").sum()[["VEH_COUNT"]].reset_index()
rlp_evs_21_yr["zip_corrected"] = rlp_evs_21_yr["ZIP_CODE"].astype(str)

In [95]:
municipal_evs_21 = ct_by_type_zip_21[ct_by_type_zip_21["Fuel Type - Primary"] == "Electric"]
municipal_evs_21["zip_corrected"] = municipal_evs_21["zip_corrected"].astype(str).str[:-2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  municipal_evs_21["zip_corrected"] = municipal_evs_21["zip_corrected"].astype(str).str[:-2]


In [102]:
compare = municipal_evs_21.merge(rlp_evs_21_yr,
                                 how = 'left',
                                 right_on = "zip_corrected",
                                 left_on = "zip_corrected").rename(columns = {"count" : "count_municipal",
                                                                              "VEH_COUNT":"count_rlpolk"})

In [107]:
def try_divide(x,y):
    try:
        return x/y
    except:
        return np.NaN

In [108]:
compare["pct"] = compare.apply(lambda x: try_divide(x.count_municipal, x.count_rlpolk), axis = 1)

In [112]:
compare.sort_values("pct", ascending = False).head(30)

Unnamed: 0,zip_corrected,Fuel Type - Primary,count_municipal,ZIP_CODE,count_rlpolk,pct
101,6281,Electric,48,6281.0,6.0,8.0
115,6354,Electric,15,6354.0,2.0,7.5
28,6052,Electric,21,6052.0,3.0,7.0
171,6469,Electric,6,6469.0,1.0,6.0
9,6021,Electric,6,6021.0,1.0,6.0
222,6751,Electric,23,6751.0,4.0,5.75
111,6339,Electric,37,6339.0,7.0,5.285714
126,6375,Electric,20,6375.0,4.0,5.0
161,6447,Electric,44,6447.0,10.0,4.4
138,6401,Electric,29,6401.0,7.0,4.142857


In [113]:
ct_entries[ct_entries["zip_corrected"] == 6281]

Unnamed: 0,record_from,zip_corrected,Fuel Type - Primary,count


In [125]:
# Check
processed_chunks_path = dir
test = pd.DataFrame([])
cols = 0

In [126]:
for file in processed_chunks_path.iterdir():
    if file.name[0:8] == "matching":
        # Track progress
        print(file.name)

        # Extract the file and create a counter
        df = pd.read_csv(file, usecols = ["record_from", "name", "street", "vehicle_make", "vehicle_model", "zip_corrected", "Fuel Type - Primary", "Electrification Level"])
        out = df[df["zip_corrected"] == 6281]
        out = out[out["Fuel Type - Primary"] == "Electric"]
        
        # Add this to the concatenated file
        test = pd.concat([test, out])

matching_output299.csv
matching_output578.csv
matching_output199.csv
matching_output499.csv
matching_output399.csv
matching_output99.csv


In [133]:
test

Unnamed: 0,record_from,name,street,vehicle_make,vehicle_model,zip_corrected,Fuel Type - Primary,Electrification Level
683785,112_Pomfret_MV_21.csv,DELUCA JOHN E,236 W QUASSET RD,CHEVR,BOLT EV,6281.0,Electric,BEV (Battery Electric Vehicle)
970082,169_Woodstock_MVData_2019.csv,AMBERG ANNE H,1041 ROUTE 169,TOYOT,PRIUS PL,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
971530,169_Woodstock_MVData_2019.csv,CHANG ROBERT A,1051 ROUTE 169,MINI,COOPER S,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
971978,169_Woodstock_MVData_2019.csv,COYLE ANDREA T,401 NEW SWEDEN RD,TOYOT,PRIUS PR,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
971989,169_Woodstock_MVData_2019.csv,CRAIG KRISTI L,195 COUNTY RD,CHEVR,VOLT,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
...,...,...,...,...,...,...,...,...
324544,169_Woodstock_MV_21.xlsx,SMITH AARON M,420 BARLOW CEMETERY RD,TOYOT,PRIUS PR,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
324602,169_Woodstock_MV_21.xlsx,SMITH TODD R,976 ROUTE 171,CHEVR,VOLT LT,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
324942,169_Woodstock_MV_21.xlsx,THAYER DEVIN R,296 ROUTE 171 APT 16,TOYOT,PRIUS PR,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
325260,169_Woodstock_MV_21.xlsx,VAZQUEZ-SOTO KARISSA,577 ROUTE 197,NISSA,LEAF S P,6281.0,Electric,BEV (Battery Electric Vehicle)


In [137]:
test[(test["record_from"].str.contains("MV_21"))]

Unnamed: 0,record_from,name,street,vehicle_make,vehicle_model,zip_corrected,Fuel Type - Primary,Electrification Level
683785,112_Pomfret_MV_21.csv,DELUCA JOHN E,236 W QUASSET RD,CHEVR,BOLT EV,6281.0,Electric,BEV (Battery Electric Vehicle)
316814,169_Woodstock_MV_21.xlsx,ALHAQ ZAMEER,32 GREY FOX LNDG,TESLA,MODEL 3,6281.0,Electric,BEV (Battery Electric Vehicle)
316841,169_Woodstock_MV_21.xlsx,AMBERG ANNE H,1041 ROUTE 169,TOYOT,PRIUS PL,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
316842,169_Woodstock_MV_21.xlsx,AMBERG ANNE H,1041 ROUTE 169,TESLA,MODEL 3,6281.0,Electric,BEV (Battery Electric Vehicle)
316994,169_Woodstock_MV_21.xlsx,AUGER RAYMOND W,50 BULL HILL RD,TESLA,MODEL S,6281.0,Electric,BEV (Battery Electric Vehicle)
317127,169_Woodstock_MV_21.xlsx,BARTELS MICHAEL W,31 GREY FOX LNDG,TESLA,MODEL 3,6281.0,Electric,BEV (Battery Electric Vehicle)
317303,169_Woodstock_MV_21.xlsx,BENKHART BRUCE S,45 WOODSTOCK RD,TESLA,MODEL S,6281.0,Electric,BEV (Battery Electric Vehicle)
317433,169_Woodstock_MV_21.xlsx,BERNARD ELIZABETH A,680 RTE 171,CHEVR,VOLT LT,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
318162,169_Woodstock_MV_21.xlsx,CANEDY THOMAS E JR,435 BRICKYARD RD,TOYOT,PRIUS PR,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
318343,169_Woodstock_MV_21.xlsx,CHANG ROBERT A,1051 RTE 169,MINI,COOPER S,6281.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle)
