Add into the assignment loop an additional variable: Month adoption probability (add columns). Then can reduce the columns by having it be a quarter instead of monthly. Doesn't need to be this week - can be next week.

In [1]:
import pathlib
import pandas as pd
import numpy as np
from itertools import combinations
import os
from tqdm import tqdm
import requests
from datetime import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import itertools
import re
import difflib

pd.options.display.max_columns = 30

In [2]:
data_path = pathlib.Path().resolve().parent.parent / "data"
rlpolk_data_path = data_path / "rlpolk_data"
vin_matching_path = data_path / "vin_matching"

In [3]:
rlpolk_matched = pd.read_csv(rlpolk_data_path / "rlpolk_data_matched.csv", index_col = [0])

FileNotFoundError: [Errno 2] No such file or directory: '/gpfs/gibbs/project/gillingham/rrn22/data/rlpolk_data/rlpolk_data_matched.csv'

# Prepare Data

In [4]:
# Create a function that determines the start and end of a financial year
def get_fy(cy, month, startmonth = 10, endmonth = 9):
    """Input: Calendar Year and Month.
    Output: Financial Year
    Parameters: FY includes October 1 - September 30 the next year."""
    if (month<0) | (month>12):
        return False
    elif month>endmonth:
        return cy+1
    else:
        return cy

In [5]:
# Get the financial year for each entry
rlpolk_matched["FY"]=rlpolk_matched.apply(lambda x: get_fy(x.year, x.month), axis = 1)

In [6]:
# Create a list of all unique makes, models, and model years
make_model_list = rlpolk_matched["Manufacturer Name"] +"_"+ rlpolk_matched["Model"]+"_"+rlpolk_matched["Model Year"].astype(str)
make_model_list = make_model_list.unique().tolist()

# Create column
rlpolk_matched["make_model_my"] = rlpolk_matched["Manufacturer Name"] +"_"+ rlpolk_matched["Model"]+"_"+rlpolk_matched["Model Year"].astype(str).str[0:4]

# Extract EVs
ev_make_model_list = rlpolk_matched[rlpolk_matched["Fuel Type - Primary"]=="Electric"]["make_model_my"].unique().tolist()

In [7]:
# Create dataframe
evs_combos = [[x,y]+[0]*12 for x in ev_make_model_list for y in [2018,2019,2020,2021,2022,2023]]
months = ["month_"+str(x).zfill(2) for x in range(1,13)]
base_df = pd.DataFrame(evs_combos, columns = ["make_model_my", "fy"]+months)

# Simplify DF to speed it up
rlpolk_filtered = rlpolk_matched[(rlpolk_matched["Fuel Type - Primary"] == "Electric")]

In [278]:
# Create DF
for make_model_my in tqdm(ev_make_model_list):
    for fy in [2018,2019,2020,2021,2022,2023]:
        for month in months:
            cond = (rlpolk_filtered["Fuel Type - Primary"] == "Electric") &(rlpolk_filtered["make_model_my"]==make_model_my) & (rlpolk_filtered["FY"]==fy) & (rlpolk_filtered["month"]==int(month[-2:]))
            count = rlpolk_filtered[cond]["VEH_COUNT"].sum()
            base_df.loc[(base_df["make_model_my"]==make_model_my) &(base_df["fy"]==fy), month] = count

100%|██████████| 292/292 [02:18<00:00,  2.11it/s]


In [150]:
base_df.to_csv(rlpolk_data_path / "my_fy_month_sales_table_11021400.csv")

In [8]:
# Create percentage columns
months_pct = [x+"_pct" for x in months]

In [9]:
# Define a function to produce a percentage
def get_pct(row):
    try:
        return row.iloc[2:14] / row.iloc[2:14].sum()
    except:
        return row.iloc[2:14] / 1

In [350]:
# Produce percentages and sums
base_df[months_pct] = base_df.apply(lambda x: get_pct(x), axis =1)
base_df["pct_sum"] = base_df[months_pct].apply(lambda x: sum(x), axis = 1)
base_df["nonzero"] = base_df[months].apply(lambda x: sum(x)>0, axis = 1)

In [352]:
base_df.to_csv(rlpolk_data_path / "my_fy_month_sales_table_11021400.csv")

In [10]:
base_df=pd.read_csv(rlpolk_data_path / "my_fy_month_sales_table_11021400.csv", index_col = [0])

# Attempt merging to municipal

In [11]:
municipal_evs = pd.read_csv(data_path / "municipal_dataset_extracts" / "all_evs.csv", index_col = [0])

In [12]:
# Create a financial year column
municipal_evs["fy"] = municipal_evs["record_from"].apply(lambda x: int("20"+re.split("\\.", re.split("_ALTERED", x)[0])[0][-2:]))

# Create a make_model_my column
municipal_evs["make_model_my"] = municipal_evs["Manufacturer Name"]+"_"+municipal_evs["Model"]+"_"+municipal_evs["Model Year"].astype(str).str[0:4]

In [92]:
# Create a function to get the most probable month
def get_most_probable_month(fy, make_model_my, lookup_table, kept = True):
    """Inputs: 
    - Financial Year (integer)
    - make_model_my (string)
    - lookup table with at least the following columns
        - fy
        - make_model_my
        - month_01 --> month_12
        - month_01_pct --> month_12_pct
        - For each make_model_fy, all fys are available, even if 0
    Outputs: a randomly chosen month, or one chosen from the given distribution"""

    # Set the conditions
    cond_fy = lookup_table["fy"] == fy
    cond_make_model_my = lookup_table["make_model_my"] == make_model_my

    # First get all rows for same make model and year
    filtered_make_model_my = lookup_table[cond_make_model_my]

    # Then further filter to get fy
    filtered = filtered_make_model_my.loc[cond_fy].reset_index(drop=True)

    if (len(filtered_make_model_my) == 0):
        # The model_make_my is not present in the dataset - it's a different model not in the RLPolk dataset
        
        # Attempt a closest match
        try:
            poss_matches = difflib.get_close_matches(make_model_my, ev_make_model_list)

            # If we get at least one match, then uise it 
            if len(poss_matches) > 0:
                return get_most_probable_month(fy, poss_matches[0], lookup_table, kept = False)
    
            # If no close match is found, just do random
            else: 
                month = np.random.choice(np.arange(0,12), p=[1/12]*12)
                how = "random"
        except:
            # If for some reason there's an error, return random
            month = np.random.choice(np.arange(0,12), p=[1/12]*12)
            how = "random"
    
        
        
    elif filtered.loc[0, "nonzero"]:
        # The year has a distribution in it - we choose from that year
        distribution = filtered.loc[0, months_pct].tolist()
        month = np.random.choice(np.arange(0,12), p = distribution)
        how = "distribution"
        
    else:
        # The year doesn't have a distributuon, but another one does
        filtered = filtered_make_model_my[filtered_make_model_my["nonzero"]].reset_index(drop=True).iloc[0,:]
        distribution = filtered.loc[months_pct].tolist()
        month = np.random.choice(np.arange(0,12), p = distribution)
        how = "distribution"
        
    return pd.Series([month, how, make_model_my, kept])

## Test

In [42]:
# Test this with a subset
test_subset = municipal_evs[municipal_evs["Fuel Type - Primary"]=="Electric"].iloc[0:10]

In [35]:
cols = ["allocated_month", "allocation_method"]

In [87]:
test_subset[["allocated_month", "allocation_method", "matched_make_model_my", "kept"]] = test_subset.apply(lambda x: get_most_probable_month(x.fy, x.make_model_my, base_df), axis = 1)

In [88]:
test_subset

Unnamed: 0.1,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,zip_corrected,vin_corrected,Unnamed: 0,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level,fy,make_model_my,allocated_month,allocation_method,matched_make_model_my,kept
0,126_Shelton_MV_21.xlsx,JACABACCI ROBERT,5 SHELVIEW DR,SHELTON,CT,6484,2018.0,CHEVR,VOLT LT,1.0,1G1RC6S53JU136992,6484.0,1G1RC6S5*JU,1030.0,GENERAL MOTORS LLC,Volt,2018.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,GENERAL MOTORS LLC_Volt_2018,6,distribution,GENERAL MOTORS LLC_Volt_2018,True
1,126_Shelton_MV_21.xlsx,JACKSON EILEEN M,212 FOX RUN,SHELTON,CT,6484,2014.0,TOYOT,PRIUS PL,1.0,JTDKN3DP6E3061619,6484.0,JTDKN3DP*E3,21233.0,TOYOTA MOTOR CORPORATION,Prius Plug-in,2014.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,TOYOTA MOTOR CORPORATION_Prius Plug-in_2014,4,distribution,TOYOTA MOTOR CORPORATION_Prius Prime_2021,False
2,126_Shelton_MV_21.xlsx,JP MORGAN CHASE BANK NA,P O BOX 901098,FORT WORTH,TX,76101,2020.0,LAND,RANGE RO,1.0,SALWR2RY6LA705798,76101.0,SALWR2RY*LA,49545.0,JAGUAR,Range Rover Sport,2020.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,JAGUAR_Range Rover Sport_2020,10,distribution,JAGUAR_Range Rover Sport_2020,True
3,126_Shelton_MV_21.xlsx,JP MORGAN CHASE BANK NA,P O BOX 901098,FORT WORTH,TX,76101,2020.0,JAGUA,I-PACE S,2.0,SADHB2S10L1F84906,76101.0,SADHB2S1*L1,42790.0,JAGUAR,I PACE S,2020.0,Electric,BEV (Battery Electric Vehicle),2021,JAGUAR_I PACE S_2020,10,distribution,JAGUAR_I PACE S_2020,True
4,126_Shelton_MV_21.xlsx,KALMANIDIS NAZIKO M,124 WELLS VIEW RD,SHELTON,CT,6484,2019.0,TESLA,MODEL 3,1.0,5YJ3E1EB2KF421964,6484.0,5YJ3E1EB*KF,2432.0,"TESLA, INC.",Model 3,2019.0,Electric,BEV (Battery Electric Vehicle),2021,"TESLA, INC._Model 3_2019",3,distribution,"TESLA, INC._Model 3_2019",True
5,126_Shelton_MV_21.xlsx,KELKAR SWATI G,235 DEER RUN,SHELTON,CT,6484,2019.0,TESLA,MODEL 3,1.0,5YJ3E1EA0KF297863,6484.0,5YJ3E1EA*KF,1637.0,"TESLA, INC.",Model 3,2019.0,Electric,BEV (Battery Electric Vehicle),2021,"TESLA, INC._Model 3_2019",6,distribution,"TESLA, INC._Model 3_2019",True
6,126_Shelton_MV_21.xlsx,KENNEDY THOMAS J JR,37 TEN COAT LN,SHELTON,CT,6484,2017.0,FORD,FUSION S,1.0,3FA6P0PU3HR121606,6484.0,3FA6P0PU*HR,2593.0,"FORD MOTOR COMPANY, MEXICO",Fusion,2017.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,"FORD MOTOR COMPANY, MEXICO_Fusion_2017",0,distribution,"FORD MOTOR COMPANY, MEXICO_Fusion_2017",True
7,126_Shelton_MV_21.xlsx,KHALIL ESSAM M,151 MEADOW ST,SHELTON,CT,6484,2021.0,TESLA,MODEL 3,1.0,5YJ3E1EB2MF849696,6484.0,5YJ3E1EB*MF,54685.0,"TESLA, INC.",Model 3,2021.0,Electric,BEV (Battery Electric Vehicle),2021,"TESLA, INC._Model 3_2021",2,distribution,"TESLA, INC._Model 3_2021",True
8,126_Shelton_MV_21.xlsx,KIRN STEVEN F,11 MEETING HOUSE LN,HUNTINGTON,CT,6484,2021.0,AUDI,E-TRON P,1.0,WA1VAAGE8MB004375,6484.0,WA1VAAGE*MB,1131.0,VOLKSWAGEN,e tron,2021.0,Electric,BEV (Battery Electric Vehicle),2021,VOLKSWAGEN_e tron_2021,3,distribution,VOLKSWAGEN_e tron_2021,True
9,126_Shelton_MV_21.xlsx,KITCHENMAX LLC,22 FALMOUTH DR,SHELTON,CT,6484,2020.0,TESLA,MODEL X,3.0,5YJXCAE28LF298101,6484.0,5YJXCAE2*LF,61904.0,TESLA,Model X,2020.0,Electric,BEV (Battery Electric Vehicle),2021,TESLA_Model X_2020,9,distribution,TESLA_Model X_2020,True


In [19]:
get_most_probable_month(2021, "GENERAL MOTORS LLC_Volt_2018", base_df)

[3, 'distribution']

## Apply

In [91]:
# Apply
municipal_evs[["allocated_month", "allocation_method", "matched_make_model_my", "kept"]] = municipal_evs.apply(lambda x: get_most_probable_month(x.fy, x.make_model_my, base_df), axis = 1)

In [97]:
municipal_evs.to_csv(data_path / "municipal_dataset_extracts" / "muncipal_evs_month_distributed_11021540.csv")

## Check which didn't work

Our aim is to see whether this is called by slight errors in how things are written out...

In [93]:
# Check teslas
unmatched_teslas = municipal_evs[(municipal_evs["allocation_method"] != "distribution") & (municipal_evs["Manufacturer Name"].str.contains("TESLA"))]
unmatched_teslas["make_model_my"].unique()

array([nan], dtype=object)

In [64]:
# Check teslas in RLPolk
rlpolk_teslas = [x for x in ev_make_model_list if "TESLA" in x]
rlpolk_teslas[0:5]

['TESLA, INC._Model 3_2022',
 'TESLA, INC._Model 3_2023',
 'TESLA_Model Y_2022',
 'TESLA, INC._Model 3_2019',
 'TESLA_Model X_2020']

In [71]:
difflib.get_close_matches('TOYOTA MOTOR CORPORATION_Prius Plug-in_2014', ev_make_model_list)

['TOYOTA MOTOR CORPORATION_Prius Prime_2021',
 'TOYOTA MOTOR CORPORATION_Prius Prime_2019',
 'TOYOTA MOTOR CORPORATION_Prius Prime_2018']

In [81]:
get_most_probable_month(2021,'TESLA, INC._Model S_2016', base_df)

0                          10
1                distribution
2    TESLA, INC._Model S_2021
3                       False
dtype: object

Thus we see that the issue for teslas - even when we look at more of them - is that in the RLPolk Dataset we only find ones for which the model years are much newer....