# process_EV_registration_activity.ipynb
**This notebook extracts EV registration counts by ZIP code and by month from the EV title and registration activity data file (scraped by the `scrape_veh_registration_data.ipynb` file) and stores the resulting EV counts in `data/vehicles/ev_counts.csv`.**
+ For each month and for each ZIP code, counts the number of "Original Registration" and "Registration Renewal" entries within the past year (365 days).
+ Likewise, for each month, counts the number of unique EV models (make + model) that appear among those registration entries, i.e. within the same trailing 365-day window, across all of WA; the same value is stored for every ZIP code. This is taken as an estimate for the EV model variety at each point in time.
+ Stores resulting dataframe (panel data by month (January 2017 to December 2021) and by ZIP code (all ZIP codes in WA)) in `data/vehicles/ev_counts.csv`.

In [26]:
import pandas as pd
import numpy as np
import datetime as dt
from uszipcode import SearchEngine
# ZIP code lookup engine; used further down to map each ZIP code to its county
search = SearchEngine()

# project-local helpers, plus the ZIP code and county lists this notebook iterates over
import utils as u
from config.counties_zips import zip_codes,counties

In [27]:
# raw title/registration activity export, and the key file holding the column names used in this notebook
df_raw = pd.read_csv("data/vehicles/Electric_Vehicle_Title_and_Registration_Activity.csv")
df_key = pd.read_csv("config/Electric_Vehicle_Title_and_Registration_Activity_key.csv")

# replace the raw headers with the names from the key file; the assignment is positional,
# so the key file has to list exactly one name per raw column, in the same order
df_raw.columns = df_key["name"]

In [28]:
# show the first rows to sanity-check the renamed columns
df_raw.head()

name,ev_type,vin_1-10,model_year,make,model,new_used,sale_price,trans_date,trans_type,trans_year,...,dol_veh_id,legislative_district,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7
0,Plug-in Hybrid Electric Vehicle (PHEV),1G1RB6S55J,2018,CHEVROLET,Volt,Used,0.0,2019-07-01T00:00:00.000,Registration Renewal,2019,...,290959301,22.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,No,No
1,Plug-in Hybrid Electric Vehicle (PHEV),1G1RA6S50H,2017,CHEVROLET,Volt,New,0.0,2016-06-30T00:00:00.000,Original Registration,2016,...,215759750,1.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,6,Actual Mileage,,
2,Plug-in Hybrid Electric Vehicle (PHEV),1G1RA6S50H,2017,CHEVROLET,Volt,New,0.0,2016-06-30T00:00:00.000,Original Title,2016,...,215759750,1.0,TRANSACTION NOT ELIGIBLE: Sale before August 2...,True,False,False,6,Actual Mileage,,
3,Battery Electric Vehicle (BEV),1N4AZ0CP9D,2013,NISSAN,Leaf,Used,0.0,2015-05-06T00:00:00.000,Registration Renewal,2015,...,227913206,5.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,15,Actual Mileage,,
4,Plug-in Hybrid Electric Vehicle (PHEV),1G1RA6S54H,2017,CHEVROLET,Volt,Used,0.0,2019-04-18T00:00:00.000,Registration Renewal,2019,...,190099227,45.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,No,No


In [29]:
# convert transaction date column to date format
# (.dt.date leaves plain datetime.date objects, which are later compared against the entries of `times`)
df_raw["trans_date"] = pd.to_datetime(df_raw["trans_date"]).dt.date

# create make-model column
# (a missing make or model propagates, i.e. such rows get a missing make_model as well)
df_raw["make_model"] = df_raw["make"] + " " + df_raw["model"]

In [30]:
# work on a copy so the loaded raw frame stays untouched
df = df_raw.copy()
# debugging aid: restrict the data to a handful of ZIP codes for quick test runs
# df = df[df["zip"].isin([1262,98119,98144,99362,99403,99603])]

In [31]:
# one evaluation date per month, January 2017 through December 2021.
# day=20 is just some day inside the month that is handed to the helper; judging by the
# dates printed by the counting loop below, the helper returns that month's last day.
times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2022) for m in range(1,13)]
# commented-out variants: short range for test runs, dropping the last month, ZIP codes taken from the data
# times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2018) for m in range(1,3)]
# times = times[:-1] #because Electric_Vehicle_Population_Size_History_By_County is currently only available until Nov 30, 2021
# zip_codes = df["zip"].unique()
# print(zip_codes, len(zip_codes))

In [32]:
# row index of the panel: every (time, zip) combination, time-major
multiindex = pd.MultiIndex.from_product([times, zip_codes], names=["time", "zip"])

In [33]:
# result columns: n_* = registration counts, m_* = number of distinct make/model values,
# each for all EVs, for BEVs only and for PHEVs only
columns = ["n_ev", "n_bev", "n_phev", "m_ev", "m_bev", "m_phev"]

In [34]:
# create final dataframe and add county column to it
final_df = pd.DataFrame(index=multiindex, columns=["county"]+columns)

# Look up the county of every ZIP code exactly once ...
county_by_zip = {
    zip_code: search.by_zipcode(zip_code).county.replace(" County", "")
    for zip_code in zip_codes
}
# ... and fill the whole column in one go by walking the "zip" index level.
# This replaces one MultiIndex slice assignment per ZIP code (each of which
# scans the full index) with a single positional column assignment.
final_df["county"] = [county_by_zip[zip_code] for zip_code in final_df.index.get_level_values("zip")]
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,n_ev,n_bev,n_phev,m_ev,m_bev,m_phev
time,zip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-31,98001,King,,,,,,
2017-01-31,98002,King,,,,,,
2017-01-31,98003,King,,,,,,
2017-01-31,98004,King,,,,,,
2017-01-31,98005,King,,,,,,
...,...,...,...,...,...,...,...,...
2021-12-31,99363,Walla Walla,,,,,,
2021-12-31,99371,Adams,,,,,,
2021-12-31,99401,Asotin,,,,,,
2021-12-31,99402,Asotin,,,,,,


In [35]:
# Method: Count "Original Registration"s and "Registration Renewal"s within the past year for each date
#
# For every date in `times`:
#   n_ev / n_bev / n_phev : per ZIP code, the number of matching transactions with
#                           time - 365 days < trans_date < time (both bounds exclusive);
#                           ZIP codes without any such transaction are left untouched
#   m_ev / m_bev / m_phev : number of distinct make_model values among those transactions
#                           over all ZIP codes; the same value is written to every ZIP code
#                           row of that date
bev_label = "Battery Electric Vehicle (BEV)"
phev_label = "Plug-in Hybrid Electric Vehicle (PHEV)"

dff = df.loc[df["trans_type"].isin(["Original Registration", "Registration Renewal"])]
for time in times:
    print(time, "", end="")

    dfff = dff.loc[(dff["trans_date"] > time-pd.Timedelta(days=365)) & (dff["trans_date"] < time)]
    # build the vehicle-type masks from the windowed frame itself rather than from the
    # larger dff (which only worked through implicit index alignment)
    dfff_bev = dfff.loc[dfff["ev_type"] == bev_label]
    dfff_phev = dfff.loc[dfff["ev_type"] == phev_label]

    counts_by_zip = {
        "n_ev": dfff.groupby("zip").size(),
        "n_bev": dfff_bev.groupby("zip").size(),
        "n_phev": dfff_phev.groupby("zip").size(),
    }
    model_variety = {
        "m_ev": len(dfff["make_model"].unique()),
        "m_bev": len(dfff_bev["make_model"].unique()),
        "m_phev": len(dfff_phev["make_model"].unique()),
    }

    for zip_code in zip_codes:
        # Address row and column in ONE .loc call. The previous chained form
        # final_df.loc[time, zip_code]["n_ev"] = ... writes into an intermediate row object
        # and only reaches final_df when pandas happens to return a view; with
        # Copy-on-Write it silently does nothing.
        row = (time, zip_code)
        for column, per_zip in counts_by_zip.items():
            if zip_code in per_zip.index:
                final_df.loc[row, column] = per_zip[zip_code]
        for column, n_models in model_variety.items():
            final_df.loc[row, column] = n_models
print("done")

2017-01-31 2017-02-28 2017-03-31 2017-04-30 2017-05-31 2017-06-30 2017-07-31 2017-08-31 2017-09-30 2017-10-31 2017-11-30 2017-12-31 2018-01-31 2018-02-28 2018-03-31 2018-04-30 2018-05-31 2018-06-30 2018-07-31 2018-08-31 2018-09-30 2018-10-31 2018-11-30 2018-12-31 2019-01-31 2019-02-28 2019-03-31 2019-04-30 2019-05-31 2019-06-30 2019-07-31 2019-08-31 2019-09-30 2019-10-31 2019-11-30 2019-12-31 2020-01-31 2020-02-29 2020-03-31 2020-04-30 2020-05-31 2020-06-30 2020-07-31 2020-08-31 2020-09-30 2020-10-31 2020-11-30 2020-12-31 2021-01-31 2021-02-28 2021-03-31 2021-04-30 2021-05-31 2021-06-30 2021-07-31 2021-08-31 2021-09-30 2021-10-31 2021-11-30 2021-12-31 done


In [36]:
# inspect the filled panel (pandas truncates the display)
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,n_ev,n_bev,n_phev,m_ev,m_bev,m_phev
time,zip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-31,98001,King,73,34,39,43,24,20
2017-01-31,98002,King,32,17,15,43,24,20
2017-01-31,98003,King,54,22,32,43,24,20
2017-01-31,98004,King,397,306,91,43,24,20
2017-01-31,98005,King,194,138,56,43,24,20
...,...,...,...,...,...,...,...,...
2021-12-31,99363,Walla Walla,,,,107,46,65
2021-12-31,99371,Adams,,,,107,46,65
2021-12-31,99401,Asotin,,,,107,46,65
2021-12-31,99402,Asotin,8,8,,107,46,65


In [19]:
# save resulting dataframe to file
# (the (time, zip) index is written out as the first two CSV columns)
final_df.to_csv("data/vehicles/ev_counts.csv")

This concludes the main functionality of this notebook.

From here, this notebook contains supplemental code to study the resulting EV counts
+ e.g.: aggregate ZIP code-level counts by county and compare with records in `Electric_Vehicle_Population_Size_History_By_County.csv`

In [323]:
# calculate county-level counts
multiindex_c = pd.MultiIndex.from_product([times, counties], names=["time", "county"])
final_df_c = pd.DataFrame(index=multiindex_c, columns=columns)

for time in times:
    # all ZIP-level rows of this date, summed per county
    a = final_df.loc[time].groupby("county").sum()
    for county in counties:
        # a.loc[county] raises KeyError if a county from `counties` has no ZIP code in final_df
        final_df_c.loc[time, county] = a.loc[county]
# keep only the registration counts: the m_* values are statewide numbers repeated on every
# ZIP code row, so their per-county sums carry no meaning
final_df_c = final_df_c[["n_ev", "n_bev", "n_phev"]]

In [324]:
# inspect the county-level counts derived from the registration data
final_df_c

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,King,12563,8531,4032
2017-01-31,Snohomish,2248,1408,840
2017-01-31,Kittitas,30,16,14
2017-01-31,Kitsap,773,514,259
2017-01-31,Whatcom,497,353,144
...,...,...,...,...
2021-11-30,Benton,1061,650,411
2021-11-30,Walla Walla,240,154,86
2021-11-30,Columbia,10,8,2
2021-11-30,Garfield,3,0,3


In [326]:
# write the county-level counts derived from the registration data
# (the commented-out "_m1" path is presumably the output name for another counting method - TODO confirm)
# final_df_c.to_csv("data/vehicles/ev_counts_by_county_from_registrations_m1.csv")
final_df_c.to_csv("data/vehicles/ev_counts_by_county_from_registrations.csv")

In [276]:
# compare with Electric_Vehicle_Population_Size_History_By_County
df_by_county_raw = pd.read_csv("data/vehicles/Electric_Vehicle_Population_Size_History_By_County.csv")
df_by_county_raw_key = pd.read_csv("config/Electric_Vehicle_Population_Size_History_By_County_key.csv")

# replace the raw headers with the names from the key file (positional assignment)
df_by_county_raw.columns = df_by_county_raw_key["name"]

# keep only the Washington rows; take an explicit copy so that the column assignment in the
# following cell modifies an independent frame instead of a slice of the loaded one
# (avoids pandas' SettingWithCopyWarning)
df_by_county_raw = df_by_county_raw[df_by_county_raw["state"] == "WA"].copy()

In [277]:
# convert date column to date format
# (kept as pandas datetime values here, whereas trans_date above is reduced to plain dates)
df_by_county_raw["time"] = pd.to_datetime(df_by_county_raw["time"])

In [278]:
# drop rows that carry no county name
df_by_county_raw = df_by_county_raw.dropna(subset=["county"])

In [279]:
# index by (time, county) so a whole date can be selected with .loc[time] below
# NOTE: not re-runnable on its own - after this cell "time" and "county" are index levels,
# so running it a second time without reloading the data raises a KeyError
df_by_county_raw = df_by_county_raw.set_index(["time", "county"])

In [280]:
# show the first rows of the indexed county history
df_by_county_raw.head()

Unnamed: 0_level_0,name,state,primary_use,n_bev,n_phev,n_ev,n_icev,n_total,p_ev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-31,Adams,WA,Passenger,2,1,3,13240,13243,0.022653
2017-01-31,Adams,WA,Truck,0,0,0,5556,5556,0.0
2017-01-31,Asotin,WA,Passenger,6,5,11,13825,13836,0.079503
2017-01-31,Asotin,WA,Truck,0,0,0,7204,7204,0.0
2017-01-31,Benton,WA,Passenger,131,134,265,131266,131531,0.201473


In [283]:
#adding up passenger cars and trucks
df_by_county = pd.DataFrame(index=multiindex_c, columns=columns)

# NOTE(review): every entry of `times` has to be present in the county history file, otherwise
# .loc[time] raises KeyError; a commented-out line where `times` is defined mentions that the
# file was only available until Nov 30, 2021 - confirm before running with the full range
for time in times:
    # rows of this date (one per county and primary_use), summed per county
    a = df_by_county_raw.loc[time].groupby("county").sum()
    for county in counties:
        # row assignment aligns on column names, so only the shared n_* columns get filled
        df_by_county.loc[time, county] = a.loc[county]
# keep the three count columns and store them as integers
df_by_county = df_by_county[["n_ev", "n_bev", "n_phev"]].astype(int)

In [284]:
# inspect the county-level counts taken from the population-size history file
df_by_county

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,King,12850,8841,4009
2017-01-31,Snohomish,2451,1558,893
2017-01-31,Kittitas,35,16,19
2017-01-31,Kitsap,842,563,279
2017-01-31,Whatcom,551,392,159
...,...,...,...,...
2021-11-30,Benton,1085,669,416
2021-11-30,Walla Walla,229,145,84
2021-11-30,Columbia,8,7,1
2021-11-30,Garfield,4,1,3


In [286]:
# save the county-level counts taken from the population-size history file
df_by_county.to_csv("data/vehicles/ev_counts_by_county.csv")

In [327]:
# ratio of the registration-based counts to the counts from the county history file;
# zeros in the denominator are replaced by NaN first, so those cells come out as NaN
# instead of being divided by zero
df_by_county_ratio = final_df_c.divide(df_by_county.replace({0: np.nan }))
df_by_county_ratio

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,King,0.977665,0.964936,1.005737
2017-01-31,Snohomish,0.917177,0.903723,0.940649
2017-01-31,Kittitas,0.857143,1.0,0.736842
2017-01-31,Kitsap,0.918052,0.912966,0.928315
2017-01-31,Whatcom,0.901996,0.90051,0.90566
...,...,...,...,...
2021-11-30,Benton,0.97788,0.971599,0.987981
2021-11-30,Walla Walla,1.048035,1.062069,1.02381
2021-11-30,Columbia,1.25,1.142857,2.0
2021-11-30,Garfield,0.75,0.0,1.0


In [328]:
# save the per-county ratios
# (the commented-out "_m1" path is presumably the output name for another counting method - TODO confirm)
# df_by_county_ratio.to_csv("data/vehicles/ev_counts_by_county_ratio_m1.csv")
df_by_county_ratio.to_csv("data/vehicles/ev_counts_by_county_ratio.csv")

In [329]:
# statewide totals per date from both sources (sum over all counties), and their ratio
final_df_WA = final_df_c.groupby("time").sum()
df_WA       = df_by_county.groupby("time").sum()
df_WA_ratio = final_df_WA.divide(df_WA)

In [330]:
# save the statewide ratios
# (the commented-out "_m1" path is presumably the output name for another counting method - TODO confirm)
# df_WA_ratio.to_csv("data/vehicles/ev_counts_WA_ratio_m1.csv")
df_WA_ratio.to_csv("data/vehicles/ev_counts_WA_ratio.csv")

In [1]:
# Alternative Method: Count "Original Registration"s for each month since Jan 2017
# Problem: Significant overcounting (that also increases over time) of registered EVs since EVs can also leave the WA system (by being sold out of state or scrapped)
#
# NOTE: this cell needs df, times, zip_codes and final_df from the cells above, and it
# overwrites the n_ev / n_bev / n_phev values already stored in final_df. For every date,
# all "Original Registration" transactions with trans_date < time are counted per ZIP code.
bev_label = "Battery Electric Vehicle (BEV)"
phev_label = "Plug-in Hybrid Electric Vehicle (PHEV)"

dff = df[df["trans_type"] == "Original Registration"]
for time in times:
    print(time)

    dfff = dff[dff["trans_date"] < time]

    # vehicle-type masks are built from the filtered frame itself, not from the larger dff
    counts_by_zip = {
        "n_ev": dfff.groupby("zip").size(),
        "n_bev": dfff.loc[dfff["ev_type"] == bev_label].groupby("zip").size(),
        "n_phev": dfff.loc[dfff["ev_type"] == phev_label].groupby("zip").size(),
    }

#     m_ev = len(df["make_model"].unique())

    for zip_code in zip_codes:
        # single .loc call with row and column; the chained form
        # final_df.loc[time, zip_code]["n_ev"] = ... may write to a temporary copy only
        row = (time, zip_code)
        for column, per_zip in counts_by_zip.items():
            if zip_code in per_zip.index:
                final_df.loc[row, column] = per_zip[zip_code]

NameError: name 'df' is not defined