In [1]:
# Warning Management
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# DataFrames
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Maths
import numpy as np
import math

# Path management
import pathlib
import os
import sys

# Progress tracking
from tqdm import tqdm

# Regular Expressions
import re

# Logging
import logging

# Helpers
from helper_fns import convert_vin_valid, return_matched_vins, create_valid_zip, try_divide

# 1. Import Data

In [9]:
# Version tag of the raw municipal extract; it is embedded in the matched file name
version = "102423"

# Directory the notebook is executed from; every data location is relative to it
path = pathlib.Path().resolve()

# The data directory sits at a different depth depending on the platform
if sys.platform == 'linux':
    data_path = path.parent.parent / "data"
else:
    # NOTE(review): the non-Linux layout was flagged as unfinished by the author --
    # confirm that the data directory really lives one level up on those machines.
    data_path = path.parent / "data"
    print(f"Non-Linux platform detected; assuming data lives in {data_path}")

# Input files are derived from data_path outside the platform branch so that both
# names are defined on every platform and the loading cell never hits a NameError.
matched_municipal_path = data_path / "municipal_dataset_matched" / f"municipal_dataset_matched_{version}.csv"
rlpolk_path = data_path / "rlpolk_data" / "rlpolk_data_matched.csv"

In [10]:
# Read both matched datasets from disk; the RLPolk file uses its first column as the index
matched_municipal_data = pd.read_csv(matched_municipal_path)
matched_rlpolk_data = pd.read_csv(rlpolk_path, index_col=0)

# YEAR_MONTH is sliced as text: the first four characters become the year and
# whatever follows becomes the month (both stored as integers)
year_month_text = matched_rlpolk_data["YEAR_MONTH"].astype(str)
matched_rlpolk_data["year"] = year_month_text.str[:4].astype(int)
matched_rlpolk_data["month"] = year_month_text.str[4:].astype(int)

  matched_municipal_data = pd.read_csv(matched_municipal_path)


# 2. Comparison

## Set up data

In [31]:
# Keep the municipal rows whose source tag contains "_MV_21".
# regex=False makes this a literal substring match; na=False treats a missing
# record_from as "no match" so the mask stays strictly boolean and the indexing
# below cannot fail on NaN entries.
is_mv_21_record = matched_municipal_data["record_from"].str.contains("_MV_21", regex=False, na=False)
matched_municipal_21 = matched_municipal_data[is_mv_21_record]

In [30]:
# Select the RLPolk rows dated July 2020 through June 2021 (inclusive)
rlpolk_year = matched_rlpolk_data["year"]
rlpolk_month = matched_rlpolk_data["month"]

in_second_half_2020 = rlpolk_year.eq(2020) & rlpolk_month.ge(7)
in_first_half_2021 = rlpolk_year.eq(2021) & rlpolk_month.lt(7)

matched_rlpolk_21_filter = in_second_half_2020 | in_first_half_2021
matched_rlpolk_21 = matched_rlpolk_data.loc[matched_rlpolk_21_filter]

## Comparisons

### Compare the number of vehicles between municipal and RLPolk

In [39]:
# Filter - we take vehicles with a valid manufacturer name and a Model Year of 2019, 2020 or 2021
mun_veh_year_mask = matched_municipal_21["Model Year"].isin([2019, 2020, 2021])

# Boolean mask (not a count, despite the name): one entry per municipal row
num_veh_mun = matched_municipal_21["Manufacturer Name"].notna() & mun_veh_year_mask

# Rows attributed to Connecticut: either the state column says "CT" or the zip,
# once rendered as text, starts with "06".
# NOTE(review): dropping the last two characters presumably strips a trailing ".0"
# from a float-typed zip -- confirm the dtype of zip_corrected.
ct_mask = (matched_municipal_21["state"] == "CT") | (matched_municipal_21["zip_corrected"].astype(str).str[:-2].str.zfill(5).str[0:2] == "06")

# Print results
print(f"In the municipal dataset, {num_veh_mun.sum()} vehicles are sold in tax year 2021")
print("This is calculated by taking tax year 2021, and filtering for model years 2019, 2020 and 2021")

In the municipal dataset, 455306 vehicles are sold in tax year 2021
This is calculated by taking tax year 2021, and filtering for model years > 2020 and 2021


In [33]:
# Do the same for RLPolk: total vehicle count over the FY 2021 subset.
# The totals are computed before formatting because reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions older than 3.12.
rlpolk_fy21_total = matched_rlpolk_21["VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_fy21_total} vehicles are sold in FY 2021")

# Try for the CY: every RLPolk row whose year is 2021
rlpolk_cy21_rows = matched_rlpolk_data["year"] == 2021
rlpolk_cy21_total = matched_rlpolk_data.loc[rlpolk_cy21_rows, "VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_cy21_total} vehicles are sold in CY 2021")

In the RLPolk Dataset, 191187 vehicles are sold in FY 2021
In the RLPolk Dataset, 173449 vehicles are sold in CY 2021


### Compare only those models present in both datasets

In [40]:
# To compare apples-for-apples, we keep only the vehicle models that appear in RLPolk.
# Missing model names are dropped first: isin() matches NaN against NaN, so leaving
# them in would count municipal rows with no Model as "present in RLPolk".
# NOTE(review): only the Model column is compared, not the make -- confirm that is intended.
rlpolk_models = matched_rlpolk_21["Model"].dropna().unique().tolist()

# Filter in municipal to those models in RLPolk
mun_rlpolk_model_filter = matched_municipal_21["Model"].isin(rlpolk_models)
num_veh_rlpolk_mun = num_veh_mun & mun_rlpolk_model_filter

# Print results
print(f"In the municipal dataset, {num_veh_rlpolk_mun.sum()} vehicles **with the same models as RLPolk** sold in 2021")

In the municipal dataset, 425607 vehicles **with the same models as RLPolk** sold in 2021


### Compare EVs

In [41]:
# Municipal rows whose primary fuel type is exactly "Electric"
mun_is_electric = matched_municipal_21["Fuel Type - Primary"].eq("Electric")

# Combine with the vehicle filter and the shared-model filter
num_veh_rlpolk_mun_evs = num_veh_mun & mun_rlpolk_model_filter & mun_is_electric

# Report how many rows satisfy all three conditions
print(f"In the municipal dataset, {num_veh_rlpolk_mun_evs.sum()} EVs **with the same models as RLPolk** sold in 2021")

In the municipal dataset, 10505 EVs **with the same models as RLPolk** sold in 2021


In [92]:
# Boolean mask over the FY 2021 RLPolk subset: primary fuel type is "Electric"
matched_rlpolk_21_evs = matched_rlpolk_21["Fuel Type - Primary"] == "Electric"

# Do the same for RLPolk.
# Totals are computed before formatting because reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions older than 3.12.
rlpolk_fy21_ev_total = matched_rlpolk_21.loc[matched_rlpolk_21_evs, "VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_fy21_ev_total} EVs sold in FY 2021")

# Try with just calendar year
rlpolk_cy21_ev_rows = (matched_rlpolk_data["year"] == 2021) & (matched_rlpolk_data["Fuel Type - Primary"] == "Electric")
rlpolk_cy21_ev_total = matched_rlpolk_data.loc[rlpolk_cy21_ev_rows, "VEH_COUNT"].sum()
print(f"In the RLPolk Dataset, {rlpolk_cy21_ev_total} EVs sold in CY 2021")

In the RLPolk Dataset, 6086 EVs sold in FY 2021
In the RLPolk Dataset, 8461 EVs sold in CY 2021


### Compare EVs restricted to Connecticut (CT)

In [115]:
# Number of municipal rows that are in CT, share a model with RLPolk, and are electric.
# NOTE(review): unlike the EV count above, num_veh_mun (manufacturer present and
# model-year filter) is not applied here -- confirm whether that is intended.
ct_ev_mask = ct_mask & mun_rlpolk_model_filter & matched_municipal_21["Fuel Type - Primary"].eq("Electric")
int(ct_ev_mask.sum())

12193