In [1]:
# Warning Management
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrames
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Maths
import numpy as np
import math

# Path management
import pathlib
import os
import sys

# Progress tracking
from tqdm import tqdm

# Regular Expressions
import re

# Logging
import logging

# Helpers
from helper_fns import convert_vin_valid, return_matched_vins, create_valid_zip, try_divide

# 1. Import Data

In [2]:
# Version of raw data to get
version = "110723"

# Project root: "rn_home" under the grandparent of the current working directory
path = pathlib.Path().resolve().parent.parent / "rn_home"

# Pick the data directory for the current platform
if sys.platform == 'linux':
    data_path = path / "data"
else:
    # NOTE: the non-Linux data location is still flagged as unresolved
    data_path = path.parent / "data"
    print("YET TO FIX THIS")

# Input files are derived from data_path on every platform, so the loading
# cell never fails with a NameError when run outside Linux.
matched_municipal_path = data_path / "municipal_dataset_matched" / f"municipal_dataset_matched_{version}.csv"
rlpolk_path = data_path / "rlpolk_data" / "rlpolk_data_matched.csv"

In [3]:
# Load both matched tables; the first CSV column is used as the index
matched_municipal_data = pd.read_csv(matched_municipal_path, index_col = [0])
matched_rlpolk_data = pd.read_csv(rlpolk_path, index_col = 0)

# Split YEAR_MONTH into integer columns: first four characters are the year,
# the remainder is the month (assumes a YYYYMM layout — TODO confirm)
year_month_text = matched_rlpolk_data["YEAR_MONTH"].astype(str)
matched_rlpolk_data["year"] = year_month_text.str[0:4].astype(int)
matched_rlpolk_data["month"] = year_month_text.str[4:].astype(int)

  matched_municipal_data = pd.read_csv(matched_municipal_path, index_col = [0])


# 2. Comparison

## Set up data

In [4]:
# Get 2021 data for the municipal dataset.
# na=False makes rows with a missing "record_from" count as non-matching;
# without it the mask would contain NaN, which boolean indexing rejects.
is_mv_21 = matched_municipal_data["record_from"].str.contains("_MV_21", na=False)
matched_municipal_21 = matched_municipal_data[is_mv_21]

# Drop duplicates (the first row for each vehicle_id is kept)
matched_municipal_21_dd = matched_municipal_21.drop_duplicates("vehicle_id")

# Print difference
n_duplicate_rows = len(matched_municipal_21) - len(matched_municipal_21_dd)
print(f"There is a difference of {n_duplicate_rows} when deduplicating by VINs")

There is a difference of 28152 when deduplicating by VINs


In [5]:
# Get 2021 data for RLPolk: July 2020 through June 2021
second_half_2020 = (matched_rlpolk_data["year"] == 2020) & (matched_rlpolk_data["month"] >= 7)
first_half_2021 = (matched_rlpolk_data["year"] == 2021) & (matched_rlpolk_data["month"] < 7)
matched_rlpolk_21_filter = second_half_2020 | first_half_2021
matched_rlpolk_21 = matched_rlpolk_data.loc[matched_rlpolk_21_filter]

## Comparisons

### Compare the number of vehicles between municipal and RLPolk

### Define filters

In [8]:
def get_lease_mask(data):
    """Flag rows that carry any lease address information.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns lease_street, lease_city, lease_state and lease_zip.

    Returns
    -------
    pandas.Series
        Boolean Series on the index of ``data``; True where at least one of the
        four lease columns is non-null.
    """
    lease_columns = ["lease_street", "lease_city", "lease_state", "lease_zip"]
    return data[lease_columns].notna().any(axis=1)

def get_model_year_mask(data, years=(2019, 2020, 2021)):
    """Flag rows whose "Model Year" is one of the requested years.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Model Year" column.
    years : iterable of int, optional
        Model years to keep. Defaults to 2019, 2020 and 2021, which is the set
        this function previously hard-coded.

    Returns
    -------
    pandas.Series
        Boolean Series on the index of ``data``; rows with a missing model year are False.
    """
    return data["Model Year"].isin(list(years))

def get_matched_mask(data):
    """Flag rows that have a manufacturer name.

    Returns a boolean Series on the index of ``data`` that is True where
    "Manufacturer Name" is non-null (presumably meaning the record was
    successfully matched — confirm against the matching code).
    """
    manufacturer = data["Manufacturer Name"]
    return manufacturer.notna()

def get_rlp_mask(data, rlpolk_data=None):
    """Flag rows whose "Model" also appears in the RLPolk data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Model" column.
    rlpolk_data : pandas.DataFrame, optional
        Frame with a "Model" column supplying the reference set of models.
        When omitted, the notebook-level ``matched_rlpolk_21`` frame is used,
        which is what this function always read before the argument existed.

    Returns
    -------
    pandas.Series
        Boolean Series on the index of ``data``.
    """
    if rlpolk_data is None:
        # Keep the original behaviour: read the frame defined in an earlier cell
        rlpolk_data = matched_rlpolk_21
    rlpolk_models = rlpolk_data["Model"].unique()
    return data["Model"].isin(rlpolk_models)

In [12]:
# Leased vehicles: any of the lease address fields is populated
lease_mask = get_lease_mask(matched_municipal_21)

# Model years kept by get_model_year_mask (2019, 2020 and 2021)
model_year_mask = get_model_year_mask(matched_municipal_21)

# Records with a manufacturer name, i.e. matches
matched_mask = get_matched_mask(matched_municipal_21)

# Connecticut: state is "CT", or the corrected zip starts with "06".
# The zip is stringified, its last two characters are cut, and the rest is
# zero-padded to five characters (presumably removing a float ".0" suffix — TODO confirm).
state_is_ct = matched_municipal_21["state"] == "CT"
zip_padded = matched_municipal_21["zip_corrected"].astype(str).str[:-2].str.zfill(5)
zip_starts_06 = zip_padded.str[0:2] == "06"
ct_mask = state_is_ct | zip_starts_06

In [14]:
# Print results
# Series.sum() counts the True values in one vectorised call; the builtin sum()
# would iterate over every row in Python.
n_sold_excl_leases = (~lease_mask & model_year_mask & matched_mask).sum()
n_sold_incl_leases = (model_year_mask & matched_mask).sum()
print(f"In the municipal dataset, {n_sold_excl_leases} vehicles are sold in tax year 2021")
print(f"In the municipal dataset, {n_sold_incl_leases} vehicles are sold in tax year 2021, including leases")
# NOTE(review): the message below says "> 2020 and 2021", but get_model_year_mask
# keeps model years 2019, 2020 and 2021 — confirm which is intended.
print("This is calculated by taking tax year 2021, and filtering for model years > 2020 and 2021")

In the municipal dataset, 392137 vehicles are sold in tax year 2021
In the municipal dataset, 455306 vehicles are sold in tax year 2021, including leases
This is calculated by taking tax year 2021, and filtering for model years > 2020 and 2021


In [15]:
# Do the same for RLPolk.
# Values are computed before formatting: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
rlpolk_fy21_count = matched_rlpolk_21["VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_fy21_count} vehicles are sold in FY 2021")

# Try for the CY
rlpolk_cy21_rows = matched_rlpolk_data["year"] == 2021
rlpolk_cy21_count = matched_rlpolk_data.loc[rlpolk_cy21_rows, "VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_cy21_count} vehicles are sold in CY 2021")

In the RLPolk Dataset, 191187 vehicles are sold in FY 2021
In the RLPolk Dataset, 173449 vehicles are sold in CY 2021


### Compare only those models present in both datasets

In [16]:
# Municipal rows whose model also appears in the RLPolk data
mun_rlpolk_model_filter = get_rlp_mask(matched_municipal_21)

# Selected model years with an RLPolk model; second mask additionally drops leases
num_veh_rlpolk_mun = model_year_mask & mun_rlpolk_model_filter
num_veh_rlpolk_mun_notlease = num_veh_rlpolk_mun & ~lease_mask

# Print results
count_same_models = num_veh_rlpolk_mun.sum()
count_same_models_notlease = num_veh_rlpolk_mun_notlease.sum()
print(f"In the municipal dataset, {count_same_models} vehicles **with the same models as RLPolk** sold in 2021")
print(f"In the municipal dataset, {count_same_models_notlease} vehicles **with the same models as RLPolk** sold in 2021, excl. leases")

In the municipal dataset, 425607 vehicles **with the same models as RLPolk** sold in 2021
In the municipal dataset, 365708 vehicles **with the same models as RLPolk** sold in 2021, excl. leases


The model filter will not make this comparison perfectly like-for-like. Some models sit between light- and heavy-duty trucks: the model name is the same but the class differs, because the class depends on the trim level and not on the model name (e.g. F250-F350). Class is based on Gross Vehicle Weight Rating (the vehicle's weight plus how much it can carry). For example, an F250 with a dual-wheel axle can carry more, so it falls into a higher class.

Should also be able to get class using the NHTSA VIN tool.

### Compare EVs

In [18]:
# EVs among the selected model years with an RLPolk model; second mask drops leases
municipal_is_electric = matched_municipal_21["Fuel Type - Primary"] == "Electric"
num_veh_rlpolk_mun_evs = model_year_mask & mun_rlpolk_model_filter & municipal_is_electric
num_veh_rlpolk_mun_evs_nl = num_veh_rlpolk_mun_evs & ~lease_mask

# Print results
ev_count = num_veh_rlpolk_mun_evs.sum()
ev_count_notlease = num_veh_rlpolk_mun_evs_nl.sum()
print(f"In the municipal dataset, {ev_count} EVs **with the same models as RLPolk** sold in 2021")
print(f"In the municipal dataset, {ev_count_notlease} EVs **with the same models as RLPolk** sold in 2021, excl. leases")

In the municipal dataset, 10505 EVs **with the same models as RLPolk** sold in 2021
In the municipal dataset, 8545 EVs **with the same models as RLPolk** sold in 2021, excl. leases


In [19]:
# Boolean mask: RLPolk rows (July 2020 - June 2021 subset) with electric primary fuel
matched_rlpolk_21_evs = matched_rlpolk_21["Fuel Type - Primary"] == "Electric"

# Do the same for RLPolk.
# Values are computed before formatting: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
rlpolk_fy21_ev_count = matched_rlpolk_21.loc[matched_rlpolk_21_evs, "VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_fy21_ev_count} EVs sold in FY 2021")

# Try with just calendar year
rlpolk_cy21_ev_rows = (matched_rlpolk_data["year"] == 2021) & (matched_rlpolk_data["Fuel Type - Primary"] == "Electric")
rlpolk_cy21_ev_count = matched_rlpolk_data.loc[rlpolk_cy21_ev_rows, "VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_cy21_ev_count} EVs sold in CY 2021")

In the RLPolk Dataset, 6086 EVs sold in FY 2021
In the RLPolk Dataset, 8461 EVs sold in CY 2021


### Compare EVs with CT state

In [115]:
# Count municipal EVs flagged as CT whose model also appears in RLPolk.
# Unlike the EV counts in the earlier cells, no model-year mask is applied here.
ct_is_electric = matched_municipal_21["Fuel Type - Primary"] == "Electric"
ct_rlpolk_evs = matched_municipal_21[ct_mask & mun_rlpolk_model_filter & ct_is_electric]
len(ct_rlpolk_evs)

12193

Could there be information that has been dropped in the municipal compiled file that would be helpful? 
* Check in the compilation code: there are two classes of municipalities in terms of how they report lease data. Some report two addresses:
    * Address where the vehicle is parked
    * Address of the owner (sometimes they are separate)
* For the places that only report the owner's address (so owned and leased vehicles are mixed together), you can replace the zip code: if the zip code is not from the town where the file came from, the zip code is instead matched on the town file name.
    * For most towns in new haven and hartford, there is only one zip code.
* Therefore the risk is whether we are able to capture all the leases from the towns that only report one zip code.
* If you're in a town, with only one zip code, and we get a zip code from outside the town, we should assume that it is a lease.

**Decision to drop leases**
* May want to drop them - as different economic choice
* But we might want to keep them, i.e. not treat leases and owners differently. We might also not want to treat resale owners differently. A car provides a stream of benefits, and you can pay for that stream of benefits in multiple ways. If you own it, you don't know how long that stream of benefits will be. You can still treat the two as the same, because when you resell the car you get a lump-sum payment for the additional stream of benefits.
* Therefore at $t=0$ your expected value of the vehicle should be the same whether or not you lease it or buy it. You are only paying for the benefits while you have it, and the prices should be equal.

**Differences in aggregate**
* Fleets - could be the difference.
* Everything we have is in that folder.

**Ken will ask about the car matching problems**
* What they need on their project: trying to use the RLPolk data to get a more granular time for when cars were purchased. Because municipal data is spanning calendar years.
* Have some updates on matching that I want to talk about, but want to make sure we have time to talk about the empirical strategy, because that is where I need the most guidance.