# process_EV_registration_activity.ipynb
**This notebook extracts EV registration counts by ZIP code and by month from the EV title and registration activity data file (scraped by the `scrape_veh_registration_data.ipynb` file) and stores the resulting EV counts in `data/vehicles/ev_counts.csv`.**
+ For each month and for each ZIP code, counts the number of "Original Registration" and "Registration Renewal" entries within the past year (365 days).
+ Likewise, for each month and for each ZIP code, counts the number of unique EV models on the market in WA until that date (month). This is taken as an estimate for the EV model variety at each point in time.
+ Stores resulting dataframe (panel data by month (January 2017 to December 2021) and by ZIP code (all ZIP codes in WA)) in `data/vehicles/ev_counts.csv`.

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
from uszipcode import SearchEngine
search = SearchEngine()

import utils as u
from config.GLOBAL import *



In [3]:
df_raw = pd.read_csv("data/vehicles/Electric_Vehicle_Title_and_Registration_Activity.csv")
df_key = pd.read_csv("config/Electric_Vehicle_Title_and_Registration_Activity_key.csv")

df_raw.columns = df_key["name"]
df_raw

name,ev_type,vin_1-10,model_year,make,model,new_used,sale_price,trans_date,trans_type,trans_year,...,legislative_district,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,census_tract_2020
0,Battery Electric Vehicle (BEV),5YJ3E1EA3J,2018,TESLA,Model 3,Used,0.00,2020-01-02T00:00:00.000,Registration Renewal,2020,...,5.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,Yes,No,5.303303e+10
1,Battery Electric Vehicle (BEV),5YJ3E1EA3J,2018,TESLA,Model 3,Used,0.00,2020-12-17T00:00:00.000,Registration Renewal,2020,...,5.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,Yes,No,5.303303e+10
2,Battery Electric Vehicle (BEV),5YJ3E1EA3J,2018,TESLA,Model 3,Used,0.00,2021-12-29T00:00:00.000,Registration Renewal,2021,...,5.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,Yes,No,5.303303e+10
3,Battery Electric Vehicle (BEV),5YJ3E1EA3J,2018,TESLA,Model 3,New,53000.00,2019-01-29T00:00:00.000,Original Title,2019,...,5.0,TRANSACTION NOT ELIGIBLE: Sale before August 2...,True,False,False,50,Actual Mileage,Not Applicable,Not Applicable,5.303303e+10
4,Battery Electric Vehicle (BEV),1FTFW1ED3M,2021,FORD,F-150,New,69497.57,2021-09-08T00:00:00.000,Original Title,2021,...,41.0,ERROR: No battery range; TRANSACTION NOT ELIGI...,False,True,False,24,Actual Mileage,Not Applicable,Not Applicable,5.303302e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620980,Plug-in Hybrid Electric Vehicle (PHEV),1G1RG6E49C,2012,CHEVROLET,Volt,Used,0.00,2021-03-17T00:00:00.000,Registration Renewal,2021,...,24.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,Yes,No,5.302700e+10
620981,Plug-in Hybrid Electric Vehicle (PHEV),1G1RG6E49C,2012,CHEVROLET,Volt,Used,0.00,2022-03-21T00:00:00.000,Registration Renewal,2022,...,24.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,Yes,No,5.302700e+10
620982,Plug-in Hybrid Electric Vehicle (PHEV),1G1RG6E49C,2012,CHEVROLET,Volt,Used,11000.00,2017-04-20T00:00:00.000,Original Title,2017,...,24.0,TRANSACTION NOT ELIGIBLE: Sale before August 2019,True,False,True,28277,Actual Mileage,Not Applicable,Not Applicable,5.302700e+10
620983,Plug-in Hybrid Electric Vehicle (PHEV),1G1RG6E49C,2012,CHEVROLET,Volt,Used,0.00,2017-04-20T00:00:00.000,Original Registration,2017,...,24.0,"TRANSACTION NOT ELIGIBLE: Non-sale, registrati...",True,False,False,0,Odometer reading is not collected at time of r...,No,No,5.302700e+10


In [4]:
# convert transaction date column to date format
df_raw["trans_date"] = pd.to_datetime(df_raw["trans_date"]).dt.date

# create make-model column
df_raw["make_model"] = df_raw["make"] + " " + df_raw["model"]

In [5]:
#clean data
df_raw = df_raw[df_raw["state"] == "WA"]

In [6]:
df = df_raw.copy()
# df = df[df["zip"].isin([1262,98119,98144,99362,99403,99603])]

In [7]:
#see how recent the data is
print(df["trans_date"].min())
print(df["trans_date"].max())
print(len(df["census_tract_2020"].unique()))
print(len(df[df["state"]=="WA"]["census_tract_2020"].unique()))

2010-01-06
2022-09-30
1771
1771


In [8]:
times

[datetime.date(2011, 1, 31),
 datetime.date(2011, 2, 28),
 datetime.date(2011, 3, 31),
 datetime.date(2011, 4, 30),
 datetime.date(2011, 5, 31),
 datetime.date(2011, 6, 30),
 datetime.date(2011, 7, 31),
 datetime.date(2011, 8, 31),
 datetime.date(2011, 9, 30),
 datetime.date(2011, 10, 31),
 datetime.date(2011, 11, 30),
 datetime.date(2011, 12, 31),
 datetime.date(2012, 1, 31),
 datetime.date(2012, 2, 29),
 datetime.date(2012, 3, 31),
 datetime.date(2012, 4, 30),
 datetime.date(2012, 5, 31),
 datetime.date(2012, 6, 30),
 datetime.date(2012, 7, 31),
 datetime.date(2012, 8, 31),
 datetime.date(2012, 9, 30),
 datetime.date(2012, 10, 31),
 datetime.date(2012, 11, 30),
 datetime.date(2012, 12, 31),
 datetime.date(2013, 1, 31),
 datetime.date(2013, 2, 28),
 datetime.date(2013, 3, 31),
 datetime.date(2013, 4, 30),
 datetime.date(2013, 5, 31),
 datetime.date(2013, 6, 30),
 datetime.date(2013, 7, 31),
 datetime.date(2013, 8, 31),
 datetime.date(2013, 9, 30),
 datetime.date(2013, 10, 31),
 dateti

In [15]:
# multiindex = pd.MultiIndex.from_product([times, zip_codes], names=["time", "zip"])
# multiindex = pd.MultiIndex.from_product([times, census_tracts_2020], names=["time", "census_tract_2020"])
columns = ["n_ev", "n_bev", "n_phev", "m_ev", "m_bev", "m_phev"]
columns = ["n_ev", "n_bev", "n_phev", "n_ev_new_sales", "n_bev_new_sales", "n_phev_new_sales", "n_ev_used_sales", "n_bev_used_sales", "n_phev_used_sales", "n_ev_renewals", "n_bev_renewals", "n_phev_renewals", "m_ev", "m_bev", "m_phev"]

In [16]:
# create final dataframe and add county and ZIP code column to it
# times_data = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2023) for m in range(1,13)][:-3]
final_df = u.create_empty_df(times, census_tracts_2020, "census_tract_2020")
final_df[columns] = 0
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,n_bev_used_sales,n_phev_used_sales,n_ev_renewals,n_bev_renewals,n_phev_renewals,m_ev,m_bev,m_phev
time,census_tract_2020,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-31,53001950100,Adams,99169,53001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-31,53001950301,Adams,,53001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-31,53001950302,Adams,,53001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2011-01-31,53001950303,Adams,,53001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940003,Yakima,98903,53077,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-09-30,53077940005,Yakima,98948,53077,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-09-30,53077940006,Yakima,98948,53077,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-09-30,53077940007,Yakima,,53077,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
final_df_empty = final_df.copy()

In [None]:
final_df = final_df_empty.copy()

In [12]:
final_df["zip_code"].isna().sum()
final_df.shape

(251544, 18)

In [23]:
# Method: Count "Original Registration"s and "Registration Renewal"s within the past year for each date
final_df[columns] = 0
# dff = df.loc[df["trans_type"].isin(["Original Registration", "Registration Renewal"])]
dff = df
# for time in times:
for time in times[12*6:]:
    print(time, "", end="")
    
    # dfff = dff.loc[(dff["trans_date"] > time-pd.Timedelta(days=365)) & (dff["trans_date"] < time)]
    dfff = dff.loc[(dff["trans_date"] > dt.datetime(time.year-1, time.month, time.day-1).date()) & (dff["trans_date"] < time)]

    #EV counts
    n_ev = dfff.groupby("census_tract_2020").size()
    n_bev = dfff.loc[dff["ev_type"] == "Battery Electric Vehicle (BEV)"].groupby("census_tract_2020").size()
    n_phev = dfff.loc[dff["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)"].groupby("census_tract_2020").size()

    #vehicle count (0.0015s*12*10000=3min)
    # n_veh = dfff.groupby("census_tract")["vehicle_record_count"].sum()

    n_ev_new_sales = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Original Registration")].groupby("census_tract_2020").size()
    n_ev_used_sales = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Registration at Time of Transfer")].groupby("census_tract_2020").size()
    n_ev_renewals = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Registration Renewal")].groupby("census_tract_2020").size()

    n_bev_new_sales = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Original Registration") & (dff["ev_type"] == "Battery Electric Vehicle (BEV)")].groupby("census_tract_2020").size()
    n_bev_used_sales = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Registration at Time of Transfer") & (dff["ev_type"] == "Battery Electric Vehicle (BEV)")].groupby("census_tract_2020").size()
    n_bev_renewals = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Registration Renewal") & (dff["ev_type"] == "Battery Electric Vehicle (BEV)")].groupby("census_tract_2020").size()

    n_phev_new_sales = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Original Registration") & (dff["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)")].groupby("census_tract_2020").size()
    n_phev_used_sales = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Registration at Time of Transfer") & (dff["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)")].groupby("census_tract_2020").size()
    n_phev_renewals = dfff.loc[(dfff["trans_date"] >= dt.datetime(time.year, time.month, 1).date()) & (dfff["trans_type"] == "Registration Renewal") & (dff["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)")].groupby("census_tract_2020").size()


    #product variety (no. of different EV models)
    m_ev = len(dfff["make_model"].unique())
    m_bev = len(dfff.loc[dff["ev_type"] == "Battery Electric Vehicle (BEV)", "make_model"].unique())
    m_phev = len(dfff.loc[dff["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)", "make_model"].unique())
    
    # for zip_code in zip_codes:
    #     if zip_code in n_ev.index:
    #         final_df.loc[time, zip_code]["n_ev"] = n_ev[zip_code]
    #     if zip_code in n_bev.index:
    #         final_df.loc[time, zip_code]["n_bev"] = n_bev[zip_code]
    #     if zip_code in n_phev.index:
    #         final_df.loc[time, zip_code]["n_phev"] = n_phev[zip_code]
    #     final_df.loc[time, zip_code]["m_ev"] = m_ev
    #     final_df.loc[time, zip_code]["m_bev"] = m_bev
    #     final_df.loc[time, zip_code]["m_phev"] = m_phev

    """
    for census_tract in census_tracts_2020:
        if census_tract in n_ev.index:
            final_df.loc[time, census_tract]["n_ev"] = n_ev[census_tract]
        if census_tract in n_bev.index:
            final_df.loc[time, census_tract]["n_bev"] = n_bev[census_tract]
        if census_tract in n_phev.index:
            final_df.loc[time, census_tract]["n_phev"] = n_phev[census_tract]
        final_df.loc[time, census_tract]["m_ev"] = m_ev
        final_df.loc[time, census_tract]["m_bev"] = m_bev
        final_df.loc[time, census_tract]["m_phev"] = m_phev
    #"""

    # """
    # if len(n_veh) > 0:
        # n_veh = n_veh.loc[pd.Index(census_tracts_2020)]
        # print(n_veh)
        # start = timeit.default_timer()
    n_ev = n_ev.filter(items=census_tracts_2020, axis=0)
    n_bev = n_bev.filter(items=census_tracts_2020, axis=0)
    n_phev = n_phev.filter(items=census_tracts_2020, axis=0)
    n_ev_new_sales = n_ev_new_sales.filter(items=census_tracts_2020, axis=0)
    n_bev_new_sales = n_bev_new_sales.filter(items=census_tracts_2020, axis=0)
    n_phev_new_sales = n_phev_new_sales.filter(items=census_tracts_2020, axis=0)
    n_ev_used_sales = n_ev_used_sales.filter(items=census_tracts_2020, axis=0)
    n_bev_used_sales = n_bev_used_sales.filter(items=census_tracts_2020, axis=0)
    n_phev_used_sales = n_phev_used_sales.filter(items=census_tracts_2020, axis=0)
    n_ev_renewals = n_ev_renewals.filter(items=census_tracts_2020, axis=0)
    n_bev_renewals = n_bev_renewals.filter(items=census_tracts_2020, axis=0)
    n_phev_renewals = n_phev_renewals.filter(items=census_tracts_2020, axis=0)

    final_df.loc[(time, slice(None)), "n_ev"] = final_df["n_ev"].add(n_ev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_bev"] = final_df["n_bev"].add(n_bev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_phev"] = final_df["n_phev"].add(n_phev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_ev_new_sales"] = final_df["n_ev_new_sales"].add(n_ev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_bev_new_sales"] = final_df["n_bev_new_sales"].add(n_bev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_phev_new_sales"] = final_df["n_phev_new_sales"].add(n_phev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_ev_used_sales"] = final_df["n_ev_used_sales"].add(n_ev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_bev_used_sales"] = final_df["n_bev_used_sales"].add(n_bev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_phev_used_sales"] = final_df["n_phev_used_sales"].add(n_phev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_ev_renewals"] = final_df["n_ev_renewals"].add(n_ev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_bev_renewals"] = final_df["n_bev_renewals"].add(n_bev, fill_value=0)
    final_df.loc[(time, slice(None)), "n_phev_renewals"] = final_df["n_phev_renewals"].add(n_phev, fill_value=0)

    final_df.loc[(time, slice(None)), "m_ev"] = m_ev
    final_df.loc[(time, slice(None)), "m_bev"] = m_bev
    final_df.loc[(time, slice(None)), "m_phev"] = m_phev
    #"""

final_df[columns] = final_df[columns].astype(int)
final_df["zip_code"] = final_df["zip_code"].convert_dtypes()
final_df[columns] = final_df[columns].replace({0: None})

print("done")

2017-01-31 2017-02-28 2017-03-31 2017-04-30 2017-05-31 2017-06-30 2017-07-31 2017-08-31 2017-09-30 2017-10-31 2017-11-30 2017-12-31 2018-01-31 2018-02-28 2018-03-31 2018-04-30 2018-05-31 2018-06-30 2018-07-31 2018-08-31 2018-09-30 2018-10-31 2018-11-30 2018-12-31 2019-01-31 2019-02-28 2019-03-31 2019-04-30 2019-05-31 2019-06-30 2019-07-31 2019-08-31 2019-09-30 2019-10-31 2019-11-30 2019-12-31 2020-01-31 2020-02-29 2020-03-31 2020-04-30 2020-05-31 2020-06-30 2020-07-31 2020-08-31 2020-09-30 2020-10-31 2020-11-30 2020-12-31 2021-01-31 2021-02-28 2021-03-31 2021-04-30 2021-05-31 2021-06-30 2021-07-31 2021-08-31 2021-09-30 2021-10-31 2021-11-30 2021-12-31 2022-01-31 2022-02-28 2022-03-31 2022-04-30 2022-05-31 2022-06-30 2022-07-31 2022-08-31 2022-09-30 done


In [24]:
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,n_bev_used_sales,n_phev_used_sales,n_ev_renewals,n_bev_renewals,n_phev_renewals,m_ev,m_bev,m_phev
time,census_tract_2020,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-31,53001950100,Adams,99169,53001,,,,,,,,,,,,,,,
2011-01-31,53001950200,Adams,99371,53001,,,,,,,,,,,,,,,
2011-01-31,53001950301,Adams,,53001,,,,,,,,,,,,,,,
2011-01-31,53001950302,Adams,,53001,,,,,,,,,,,,,,,
2011-01-31,53001950303,Adams,,53001,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,2,4,6,2,4,121,60,64
2022-09-30,53077940005,Yakima,98948,53077,2,2,,2,2,,2,2,,2,2,,121,60,64
2022-09-30,53077940006,Yakima,98948,53077,2,2,,2,2,,2,2,,2,2,,121,60,64
2022-09-30,53077940007,Yakima,,53077,1,1,,1,1,,1,1,,1,1,,121,60,64


In [25]:
# save resulting dataframe to file
# final_df.to_csv("data/vehicles/ev_counts.csv")
final_df.to_csv("data/vehicles/ev_counts_.csv")

In [27]:
#express in terms of 2010 census tracts as the index
final_df_2010tracts = final_df.reset_index()
final_df_2010tracts[columns] = final_df_2010tracts[columns].fillna(0)
final_df_2010tracts["census_tract_2010"] = final_df_2010tracts["census_tract_2020"].map(df_tract_20_10.to_dict())
# final_df_2010tracts[["census_tract_2020", "zip_code", "county"]] = final_df_2010tracts[["census_tract_2020", "zip_code", "county"]].astype(str)

final_df_2010tracts2 = final_df_2010tracts[["time", "census_tract_2010", "census_tract_2020", "zip_code"] + columns[:-3]].groupby(["time", "census_tract_2010"]).sum() #for count variables (n_ev, n_bev, n_phev, ...)
final_df_2010tracts3 = final_df_2010tracts[["time", "census_tract_2010", "census_tract_2020", "zip_code"] + columns[-3:]].groupby(["time", "census_tract_2010"]).mean() #product variety (m_ev, m_bev, m_phev)
final_df_2010tracts4 = final_df_2010tracts2.merge(final_df_2010tracts3, on=["time", "census_tract_2010"])
final_df_2010tracts4 = final_df_2010tracts4.drop(["census_tract_2020_y", "zip_code_y"], axis=1)
final_df_2010tracts4 = final_df_2010tracts4.drop(["census_tract_2020_x", "zip_code_x"], axis=1)
# final_df_2010tracts4 = final_df_2010tracts4.rename({"census_tract_2020_x": "census_tract_2020", "zip_code_x": "zip_code"}, axis=1)

# final_df_2010tracts4.to_csv("data/vehicles/ev_counts_2010tracts.csv")
final_df_2010tracts4.to_csv("data/vehicles/ev_counts__2010tracts.csv")
# final_df_2010tracts4.to_csv("data/vehicles/ev_counts_2010tracts_to_2011.csv")
# final_df_2010tracts4.to_csv("data/vehicles/ev_counts_2010tracts_to_2011.csv")
final_df_2010tracts4

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,n_bev_used_sales,n_phev_used_sales,n_ev_renewals,n_bev_renewals,n_phev_renewals,m_ev,m_bev,m_phev
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-31,53001950100,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2011-01-31,53001950200,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2011-01-31,53001950300,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2011-01-31,53001950400,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2011-01-31,53001950500,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,6,4,2,6,4,2,6,4,2,6,4,2,121.0,60.0,64.0
2022-09-30,53077940003,6,2,4,6,2,4,6,2,4,6,2,4,121.0,60.0,64.0
2022-09-30,53077940004,2,2,0,2,2,0,2,2,0,2,2,0,121.0,60.0,64.0
2022-09-30,53077940005,2,2,0,2,2,0,2,2,0,2,2,0,121.0,60.0,64.0


In [67]:
final_df_2010tracts4["census_tract_2020"].unique()

array([ 53001950100,  53001950200, 159005850906, ..., 106155880015,
        53077940005,  53077940006], dtype=int64)

This concludes the main functionality of this notebook.

In [52]:
#EV Population Data
df_pop = pd.read_csv("data/vehicles/Electric_Vehicle_Population_Data.csv")
df_pop.head()

Unnamed: 0,vin_1_10,county,city,state,zip_code,model_year,make,model,ev_type,cafv_type,electric_range,base_msrp,legislative_district,dol_vehicle_id,geocoded_column,electric_utility,_2020_census_tract
0,1N4BZ0CP9G,Snohomish,Arlington,WA,98223,2016,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,84,0,39.0,238236288,,PUGET SOUND ENERGY INC,53061052701
1,1N4AZ0CP5G,King,Seattle,WA,98118,2016,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,84,0,37.0,182735367,,PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033011102
2,1N4BZ1CP7K,Thurston,Olympia,WA,98502,2019,NISSAN,LEAF,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,150,0,22.0,142814556,POINT (-102.71236 22.94812),,53067012002
3,5YJ3E1EBXK,Frederick,Frederick,MD,21704,2019,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,220,0,,114252337,POINT (-77.368491 39.354463),,24021752205
4,5YJ3E1EAXL,King,Bellevue,WA,98008,2020,TESLA,MODEL 3,Battery Electric Vehicle (BEV),Clean Alternative Fuel Vehicle Eligible,266,0,48.0,2075510,,PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA),53033023100


In [53]:
print(len(df_pop["_2020_census_tract"].unique()))
print(len(df_pop[df_pop["state"]=="WA"]["_2020_census_tract"].unique()))

2031
1766


In [66]:
df_pop_counts = pd.DataFrame(index=census_tracts, columns=["county", "zip"]+columns)
df_pop_counts.index.name = "census_tract"

for census_tract in census_tracts:
    n_ev = df_pop.groupby("_2020_census_tract").size()
    n_bev = df_pop.loc[df_pop["ev_type"] == "Battery Electric Vehicle (BEV)"].groupby("_2020_census_tract").size()
    n_phev = df_pop.loc[df_pop["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)"].groupby("_2020_census_tract").size()

    if census_tract in n_ev.index:
        df_pop_counts.loc[census_tract, "n_ev"] = n_ev[census_tract]
    if census_tract in n_bev.index:
        df_pop_counts.loc[census_tract, "n_bev"] = n_bev[census_tract]
    if census_tract in n_phev.index:
        df_pop_counts.loc[census_tract, "n_phev"] = n_phev[census_tract]


In [69]:
df_pop_counts.to_csv("data/vehicles/ev_counts_from_pop.csv")

In [57]:
df_pop_counts = df_pop.groupby("_2020_census_tract").size()
df_pop_counts

_2020_census_tract
1101001400     1
2020000101     1
2090000500     1
2110000500     1
4013318800     2
              ..
53077940007    1
55031030101    1
55089650300    1
56021000702    1
56033000100    1
Length: 2031, dtype: int64

From here, this notebook contains supplemental code to study the resulting EV counts
+ e.g.: aggregate ZIP code-level counts by county and compare with records in `Electric_Vehicle_Population_Size_History_By_County.csv`

In [323]:
# calculate county-level counts
multiindex_c = pd.MultiIndex.from_product([times, counties], names=["time", "county"])
final_df_c = pd.DataFrame(index=multiindex_c, columns=columns)

for time in times:
    a = final_df.loc[time].groupby("county").sum()
    for county in counties:
        final_df_c.loc[time, county] = a.loc[county]
final_df_c = final_df_c[["n_ev", "n_bev", "n_phev"]]

In [324]:
final_df_c

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,King,12563,8531,4032
2017-01-31,Snohomish,2248,1408,840
2017-01-31,Kittitas,30,16,14
2017-01-31,Kitsap,773,514,259
2017-01-31,Whatcom,497,353,144
...,...,...,...,...
2021-11-30,Benton,1061,650,411
2021-11-30,Walla Walla,240,154,86
2021-11-30,Columbia,10,8,2
2021-11-30,Garfield,3,0,3


In [326]:
# final_df_c.to_csv("data/vehicles/ev_counts_by_county_from_registrations_m1.csv")
final_df_c.to_csv("data/vehicles/ev_counts_by_county_from_registrations.csv")

In [276]:
# compare with Electric_Vehicle_Population_Size_History_By_County
df_by_county_raw = pd.read_csv("data/vehicles/Electric_Vehicle_Population_Size_History_By_County.csv")
df_by_county_raw_key = pd.read_csv("config/Electric_Vehicle_Population_Size_History_By_County_key.csv")

df_by_county_raw.columns = df_by_county_raw_key["name"]

df_by_county_raw = df_by_county_raw[df_by_county_raw["state"] == "WA"]

In [277]:
# convert date column to date format
df_by_county_raw["time"] = pd.to_datetime(df_by_county_raw["time"])

In [278]:
df_by_county_raw = df_by_county_raw.dropna(subset=["county"])

In [279]:
df_by_county_raw = df_by_county_raw.set_index(["time", "county"])

In [280]:
df_by_county_raw.head()

Unnamed: 0_level_0,name,state,primary_use,n_bev,n_phev,n_ev,n_icev,n_total,p_ev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-31,Adams,WA,Passenger,2,1,3,13240,13243,0.022653
2017-01-31,Adams,WA,Truck,0,0,0,5556,5556,0.0
2017-01-31,Asotin,WA,Passenger,6,5,11,13825,13836,0.079503
2017-01-31,Asotin,WA,Truck,0,0,0,7204,7204,0.0
2017-01-31,Benton,WA,Passenger,131,134,265,131266,131531,0.201473


In [283]:
#adding up passenger cars and trucks
df_by_county = pd.DataFrame(index=multiindex_c, columns=columns)

for time in times:
    a = df_by_county_raw.loc[time].groupby("county").sum()
    for county in counties:
        df_by_county.loc[time, county] = a.loc[county]
df_by_county = df_by_county[["n_ev", "n_bev", "n_phev"]].astype(int)

In [284]:
df_by_county

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,King,12850,8841,4009
2017-01-31,Snohomish,2451,1558,893
2017-01-31,Kittitas,35,16,19
2017-01-31,Kitsap,842,563,279
2017-01-31,Whatcom,551,392,159
...,...,...,...,...
2021-11-30,Benton,1085,669,416
2021-11-30,Walla Walla,229,145,84
2021-11-30,Columbia,8,7,1
2021-11-30,Garfield,4,1,3


In [286]:
df_by_county.to_csv("data/vehicles/ev_counts_by_county.csv")

In [327]:
df_by_county_ratio = final_df_c.divide(df_by_county.replace({0: np.nan }))
df_by_county_ratio

Unnamed: 0_level_0,Unnamed: 1_level_0,n_ev,n_bev,n_phev
time,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-31,King,0.977665,0.964936,1.005737
2017-01-31,Snohomish,0.917177,0.903723,0.940649
2017-01-31,Kittitas,0.857143,1.0,0.736842
2017-01-31,Kitsap,0.918052,0.912966,0.928315
2017-01-31,Whatcom,0.901996,0.90051,0.90566
...,...,...,...,...
2021-11-30,Benton,0.97788,0.971599,0.987981
2021-11-30,Walla Walla,1.048035,1.062069,1.02381
2021-11-30,Columbia,1.25,1.142857,2.0
2021-11-30,Garfield,0.75,0.0,1.0


In [328]:
# df_by_county_ratio.to_csv("data/vehicles/ev_counts_by_county_ratio_m1.csv")
df_by_county_ratio.to_csv("data/vehicles/ev_counts_by_county_ratio.csv")

In [329]:
final_df_WA = final_df_c.groupby("time").sum()
df_WA       = df_by_county.groupby("time").sum()
df_WA_ratio = final_df_WA.divide(df_WA)

In [330]:
# df_WA_ratio.to_csv("data/vehicles/ev_counts_WA_ratio_m1.csv")
df_WA_ratio.to_csv("data/vehicles/ev_counts_WA_ratio.csv")

In [1]:
# Alternative Method: Count "Original Registration"s for each month since Jan 2017
# Problem: Significant overcounting (that also increases over time) of registered EVs since EVs can also leave the WA system (by being sold to out-of-state of scraped)
dff = df[df["trans_type"] == "Original Registration"]
for time in times:
    print(time)
    
    dfff = dff[dff["trans_date"] < time]
    
    n_ev = dfff.groupby("zip").size()
    n_bev = dfff.loc[dff["ev_type"] == "Battery Electric Vehicle (BEV)"].groupby("zip").size()
    n_phev = dfff.loc[dff["ev_type"] == "Plug-in Hybrid Electric Vehicle (PHEV)"].groupby("zip").size()

#     m_ev = len(df["make_model"].unique())
    
    for zip_code in zip_codes:
        if zip_code in n_ev.index:
            final_df.loc[time, zip_code]["n_ev"] = n_ev[zip_code]
        if zip_code in n_bev.index:
            final_df.loc[time, zip_code]["n_bev"] = n_bev[zip_code]
        if zip_code in n_phev.index:
            final_df.loc[time, zip_code]["n_phev"] = n_phev[zip_code]

NameError: name 'df' is not defined