In [47]:
# --- Import required packages
import pathlib
import pandas as pd
import numpy as np
from itertools import combinations
import os
from tqdm import tqdm
import requests
from datetime import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import warnings
import platform
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 500)

In [33]:
# Required functions
def get_default_vin_variables():
     variables = ["Make", "Manufacturer Name",
                  "Model", "Model Year", "Body Class",
                  "Trim", "Trim2", "Drive Type", "Base Price ($)",
                  "Fuel Type - Primary",
                  "Electrification Level"]

def fetch_vin_data(vin, variables = None):
        """
        Input: An unmatched, but corrected VIN
        Output: A matched VIN or NA
        
        """
        if not variables:
            variables = ["Manufacturer Name", "Model", "Model Year", "Fuel Type - Primary", "Electrification Level"]
        else:
            variables = variables
        
        url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{vin.strip()}?format=csv")

        # Download response
        resp_df = pd.read_csv(url)

        # Extract needed
        resp_df = resp_df.loc[:, ["variable", "value"]].T
        # resp_df = resp_df.loc[resp_df["variable"].isin(variables), ["variable", "value"]].T
        resp_df.columns = resp_df.iloc[0]
        resp_df = resp_df.drop("variable", axis = 0)
        resp_df["vin_corrected"] = vin
        
        return resp_df

In [34]:
# --- Set paths ----------------
if platform.platform()[0:5] == 'macOS':
    cd = pathlib.Path().resolve().parent
    str_project = cd / "Documents" 
    data_path = cd / "Documents" / "tobin_working_data"
    rlpolk_data_path = data_path / "rlpolk_data"
    vin_matching_pathpackages= data_path / "vin_matching"
else:
    data_path = pathlib.Path().resolve().parent.parent / "rn_home" / "data"
    rlpolk_data_path = data_path / "rlpolk_data"
    vin_matching_pathpackages= data_path / "vin_matching"

In [35]:
# --- Import RLPolk Data (partially matched) ------
rlp_raw = pd.read_csv(rlpolk_data_path / "rlpolk_data_matched.csv", index_col = [0])
evs_only = rlp_raw[rlp_raw["Fuel Type - Primary"] == "Electric"].reset_index(drop = True)
ev_vins = evs_only["vin_corrected"].unique().tolist()

In [None]:
# --- Import Experian Data
# exp_data = pd.read_csv(data_path / "intermediate" / "US_VIN_data_common.csv")
# exp_ct = exp_data[exp_data["state"]=="CONNECTICUT"]
# exp_ct_ev = exp_ct[exp_ct["fueltype"]=="L"]

In [10]:
# Download some characteristics and show what we get - mostly NA
if (True):
    ev_chars = pd.DataFrame([])

    for vin in tqdm(ev_vins):
        # vin = row["vin_corrected"]
        try:
            chars = fetch_vin_data(vin)
            # print(len(chars.columns))
        except:
            pass
        try:   
            ev_chars = pd.concat([ev_chars, chars], axis = 0).reset_index(drop = True)
        except:
            pass


    ev_chars.to_csv(rlpolk_data_path / "rlp_ev_characteristics_012624.csv")

# Check EV sales per model year
rlp_evs_summary = evs_only[["year", "VEH_COUNT"]].groupby("year").sum().reset_index()
# exp_evs_summary = exp_ct_ev[["year", "agg_count"]].groupby("year").sum().reset_index()
print(rlp_evs_summary.head(10))
# print(exp_evs_summary.head(10))
print(rlp_evs_summary.iloc[0:6, 0].sum())

100%|██████████| 712/712 [02:23<00:00,  4.95it/s]


   year  VEH_COUNT
0  2018       4571
1  2019       4042
2  2020       3660
3  2021       8461
4  2022      10046
10100


# Read in characteristics and see what we can get

In [36]:
ev_chars = pd.read_csv(rlpolk_data_path / "rlp_ev_characteristics_012624.csv", index_col = [0])

In [None]:
ev_chars.columns

In [70]:
evs_only_tomerge = evs_only.loc[:, ~evs_only.columns.isin(ev_chars.columns[0:len(ev_chars.columns)-1])]

In [71]:
evs_only_chars = evs_only_tomerge.merge(ev_chars, how = 'left', on='vin_corrected')

In [72]:
ev_notna_summary = evs_only_chars.loc[:, evs_only_chars.columns.isin(ev_chars.columns)]
ev_notna_summary = pd.DataFrame(ev_notna_summary.notna().sum()).reset_index().rename({0: "Count"}, axis = 1)
ev_notna_summary["Percentage"] = ev_notna_summary["Count"]/len(evs_only)

In [73]:
ev_notna_summary.sort_values("Count", ascending = False).head(70)

Unnamed: 0,index,Count,Percentage
0,vin_corrected,29592,1.0
122,Custom Motorcycle Type,27885,0.942315
124,Motorcycle Chassis Type,27885,0.942315
123,Motorcycle Suspension Type,27885,0.942315
9,Manufacturer Name,27885,0.942315
11,Model Year,27885,0.942315
15,Vehicle Type,27885,0.942315
8,Make,27885,0.942315
6,Vehicle Descriptor,27885,0.942315
40,Trailer Body Type,27885,0.942315


In [84]:
vars_of_interest = ["vin_corrected", "Make", "Vehicle Type", "Fuel Type - Primary",
                    "Gross Vehicle Weight From", "Doors", "Number of Seats", "Curb Weight (pounds)",
                    "Transmission Style", "Wheel Base (inches) From", "Engine Brake (hp) From", "Displacement (CI)"]

ev_notna_keyvars = ev_notna_summary.loc[ev_notna_summary["index"].isin(vars_of_interest), :].reset_index(drop = True).sort_values("Percentage", ascending = False)

In [85]:
ev_notna_keyvars

Unnamed: 0,index,Count,Percentage
0,vin_corrected,29592,1.0
1,Make,27885,0.942315
2,Vehicle Type,27885,0.942315
9,Fuel Type - Primary,27800,0.939443
3,Doors,26415,0.89264
6,Number of Seats,21723,0.734084
7,Transmission Style,20834,0.704042
5,Wheel Base (inches) From,13680,0.462287
10,Engine Brake (hp) From,10550,0.356515
8,Displacement (CI),8882,0.300149


In [87]:
print(ev_notna_keyvars.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
index & Count & Percentage \\
\midrule
vin_corrected & 29592 & 1.000000 \\
Make & 27885 & 0.942315 \\
Vehicle Type & 27885 & 0.942315 \\
Fuel Type - Primary & 27800 & 0.939443 \\
Doors & 26415 & 0.892640 \\
Number of Seats & 21723 & 0.734084 \\
Transmission Style & 20834 & 0.704042 \\
Wheel Base (inches) From & 13680 & 0.462287 \\
Engine Brake (hp) From & 10550 & 0.356515 \\
Displacement (CI) & 8882 & 0.300149 \\
Curb Weight (pounds) & 366 & 0.012368 \\
\bottomrule
\end{tabular}



In [None]:
ev_notna_summary.sort_values("XX", ascending = False)