# rlpolk_probabilities
**Description:** This notebook implements functions to:
* Create a blank probability distribution from RLPolk Data (here, we apply only to EVs in the RLPolk dataset, but you could apply it to all vehicles).
* Fill in the blank probability distribution, producing a distribution over months for every $\text{VIN} \times \text{Financial Year}$ combination in the RLPolk dataset
* Draw a month from a distribution for each EV in the municipal dataset (again, you could apply this to all vehicles)
* Merge this to the municipal EV dataset.

In [37]:
# Path management
import pathlib
import os

# Dataframes and math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from itertools import combinations

# Progress management
from tqdm import tqdm

# Matching
import re
import difflib

pd.options.display.max_columns = 30

In [2]:
# Project directories, all derived from the folder two levels above the working directory
home_path = pathlib.Path().resolve().parent.parent.joinpath("rn_home")
data_path = home_path.joinpath("data")
rlpolk_data_path = data_path.joinpath("rlpolk_data")
vin_matching_path = data_path.joinpath("vin_matching")

# Define functions

In [4]:
# Create a function that determines the start and end of a financial year
def get_fy(cy, month, startmonth = 10, endmonth = 9):
    """Return the financial year that a calendar year/month falls in.

    With the default parameters a financial year runs from October 1 to
    September 30 and is labelled by the calendar year in which it ends
    (October 2020 -> FY 2021, September 2020 -> FY 2020).

    Inputs:
    - cy: Calendar year.
    - month: Calendar month, 1 (January) to 12 (December).
    - startmonth: First month of the financial year. Not used in the
      calculation; kept so existing callers keep working.
    - endmonth: Last month of the financial year. Any later month rolls
      over into the next financial year.

    Outputs:
    - The financial year, or False if month is outside 1..12.
    """
    # Month 0 is not a calendar month, so the lower bound is 1 (it used to be 0)
    if month < 1 or month > 12:
        return False

    # Months after the end of the financial year belong to the next one
    if month > endmonth:
        return cy + 1

    return cy

In [3]:
# Function to create a base DF for the distributions
def create_empty_distribution(input_df, key, additional_cols, yrs):
    """Build a zero-filled month table for every key x year combination.

    Inputs:
    - input_df: Source dataframe.
    - key: Name of the column whose unique values each get a distribution.
    - additional_cols: List of informational columns to carry along for each key.
    - yrs: Years to create a row for, per key.

    Outputs:
    - A dataframe with one row per (unique key, year) pair.
    - Columns, in order: key, the additional columns, "fy", then "month_01" ... "month_12".
    - Every month column is initialised to 0.
    - The additional columns are taken from the first row seen for each key, i.e. they are
      assumed to be identical across all rows sharing that key.
    """
    # Names of the twelve month columns: month_01 ... month_12
    month_names = [f"month_{number:02d}" for number in range(1, 13)]

    # One zero-filled record per unique key and requested year
    records = []
    for key_value in input_df[key].unique().tolist():
        for year in yrs:
            records.append([key_value, year] + [0] * 12)
    skeleton = pd.DataFrame(records, columns=[key, "fy"] + month_names)

    # First occurrence of each key supplies the informational columns
    key_info = input_df.drop_duplicates(key)[additional_cols + [key]]
    skeleton = skeleton.merge(key_info, left_on=key, right_on=key, how="left")

    # Final column order: key, extras, financial year, months
    ordered_columns = [key] + additional_cols + ["fy"] + month_names
    return skeleton[ordered_columns]

In [5]:
# Function to create probability distribution
def create_distribution(df, df_key_col, base_df, base_df_key_col, fys):
    """
    Creates a distribution of sales over months and financial years, for different values of a unique key.
    Inputs:
    - df: The raw data from which to build the distribution. Must contain the columns
    "FY", "month" (month number) and "VEH_COUNT", plus the key column.
    - df_key_col: The name of the column in df which contains the unique keys
    - base_df: The empty distribution. Must contain an "fy" column and one column per month whose
    name contains "month" and ends in the two-digit month number (e.g. "month_03").
    - base_df_key_col: The name of the column in base_df containing the unique keys
    - fys: The financial years for which to build the distribution
    Outputs:
    - A filled-in copy of base_df (base_df itself is not modified). For each combination of key and
    financial year in fys, each month column holds the summed VEH_COUNT for that key, FY and month.
    Combinations with no sales get 0. Rows of base_df whose "fy" is not in fys are left untouched.

    Implementation note: the counts are aggregated with a single groupby and then looked up per row.
    The earlier version re-scanned all of df for every key x FY x month cell, which is why it needed
    a progress bar; this version does not.
    """

    # Output
    out_df = base_df.copy()

    # Month columns and the month number each one stands for
    months = [x for x in base_df.columns.tolist() if "month" in x]
    month_numbers = [int(month[-2:]) for month in months]
    fys = list(fys)

    # Rows to fill: requested financial years only. Rows with a missing key are skipped
    # because a missing key can never equal a key in df.
    fillable = (out_df["fy"].isin(fys) & out_df[base_df_key_col].notna()).to_numpy()
    if not fillable.any():
        return out_df

    # Aggregate once: (key, FY, month number) -> vehicles sold
    relevant = df[df["FY"].isin(fys) & df["month"].isin(month_numbers)]
    totals = relevant.groupby([df_key_col, "FY", "month"])["VEH_COUNT"].sum().to_dict()

    # (key, fy) of every row that gets filled, in row order
    row_ids = [
        (key, fy)
        for key, fy, keep in zip(out_df[base_df_key_col], out_df["fy"], fillable)
        if keep
    ]

    # Write one month column at a time; combinations without sales default to 0
    for month, month_number in zip(months, month_numbers):
        out_df.loc[fillable, month] = [totals.get((key, fy, month_number), 0) for key, fy in row_ids]

    # Return
    return out_df

In [6]:
# Define a function to produce a percentage
def get_pct(row, index):
    """Convert the tail of a row into shares of its total.

    Inputs:
    - row: A pandas Series (e.g. one row of a dataframe).
    - index: Integer position of the first value to include; everything from
    this position to the end of the row is normalised.

    Outputs:
    - A Series with the same labels as row.iloc[index:], where each value is
    divided by the sum of those values. If the sum is 0 the values are
    returned divided by 1 (i.e. all zeros) rather than dividing by zero.
    """
    values = row.iloc[index:]
    total = values.sum()

    # A zero total means there is nothing to distribute. Checking it explicitly
    # (instead of catching any exception) gives the same all-zero result for
    # numeric rows, where 0/0 would otherwise silently produce NaN.
    if total == 0:
        return values / 1

    return values / total

In [38]:
# Create a function to get the most probable month
def get_most_probable_month(fy_to_find,
                            vin_to_find,
                            lookup_table,
                            lookup_table_dist_cols,
                            check_column):
    """
    Draws a month for a given financial year and VIN code. Despite the name, this does NOT return the
    most probable month: it samples a month from a distribution.

    Inputs:
    - fy_to_find: The financial year to find
    - vin_to_find: The VIN code to find (compared against the "vin_corrected" column)
    - lookup_table: A lookup table of distributions. Must contain the columns "vin_corrected", "fy",
    "Manufacturer Name", "Model" and "Model Year"
    - lookup_table_dist_cols: A list of the 12 column names holding the distribution in that lookup table
    - check_column: The name of the column in the lookup table to check, to see if at least one vehicle
    was sold for that FY x VIN combination (non-zero means sales were made)

    Outputs
    - A pandas Series of 7 values:
    [month, allocation method, matched VIN, matched FY, matched make, matched model, matched model year]
    - The month is ZERO-BASED (0-11): 0 corresponds to the first column in lookup_table_dist_cols.
    - The allocation method is "distribution" or "random".

    Method
    - If the VIN is not in the lookup table, return a uniformly random month, marked "random", with
    "vin_not_matched" as the matched VIN and NaN for the remaining details.
    - If the VIN is present for that FY and sales were made, draw from that row's distribution.
    - If the VIN is present, but not for that FY, or no sales were made in that FY, draw from the first
    row for the same VIN that does have sales. E.g. we search "Tesla Model 3, FY = 2020" and get back
    "Tesla Model 3, FY = 2022".
    - If the VIN is present but NO row for it has sales, there is no distribution to draw from: return a
    uniformly random month, marked "random", keeping the VIN and vehicle details but with NaN as the
    matched FY. (Previously this case raised an IndexError.)
    - If a VIN x FY combination appears more than once, the first matching row is used.

    NOTE: Could make this function better - if the VIN is not found, then search for the make and model...
    """

    # All rows for this VIN. Filtering once avoids re-scanning the full table for each check.
    vin_rows = lookup_table[lookup_table["vin_corrected"] == vin_to_find]

    # If the 10-digit VIN is not in the lookup table, return a random month
    if vin_rows.empty:
        month = np.random.choice(np.arange(0,12), p=[1/12]*12)
        return pd.Series([month, "random", "vin_not_matched", np.nan, np.nan, np.nan, np.nan])

    # Rows for this VIN in the requested FY, and whether any vehicles were actually sold that year
    fy_rows = vin_rows[vin_rows["fy"] == fy_to_find]
    sold_in_fy = (not fy_rows.empty) and (fy_rows[check_column].iloc[0] != 0)

    if sold_in_fy:
        # The VIN is present for that FY with non-zero sales: take that row
        filtered_lookup_table = fy_rows.reset_index(drop=True)
    else:
        # Otherwise use the first FY for which this VIN has sales
        sold_rows = vin_rows[vin_rows[check_column] != 0]

        if sold_rows.empty:
            # The VIN is known but has no sales in any FY, so there is no distribution to sample
            known = vin_rows.reset_index(drop=True)
            month = np.random.choice(np.arange(0,12), p=[1/12]*12)
            return pd.Series([month,
                              "random",
                              known.loc[0, "vin_corrected"],
                              np.nan,
                              known.loc[0, "Manufacturer Name"],
                              known.loc[0, "Model"],
                              known.loc[0, "Model Year"]])

        filtered_lookup_table = sold_rows.reset_index(drop=True)

    # Extract information
    matched_fy = filtered_lookup_table.loc[0, "fy"]
    matched_vin = filtered_lookup_table.loc[0, "vin_corrected"]
    matched_make = filtered_lookup_table.loc[0, "Manufacturer Name"]
    matched_model = filtered_lookup_table.loc[0, "Model"]
    matched_model_year = filtered_lookup_table.loc[0, "Model Year"]

    # Get distribution and month
    distribution = filtered_lookup_table.loc[0, lookup_table_dist_cols].tolist()
    month = np.random.choice(np.arange(0,12), p = distribution)
    how = "distribution"

    return pd.Series([month, how, matched_vin, matched_fy, matched_make, matched_model, matched_model_year])

# Prepare Data

In [None]:
# Import data
rlpolk_matched = pd.read_csv(rlpolk_data_path / "rlpolk_data_matched.csv", index_col = [0])

# Get the financial year of each record from its calendar year and month
rlpolk_matched["FY"]=rlpolk_matched.apply(lambda x: get_fy(x.year, x.month), axis = 1)

# Keep electric vehicles only. Take an explicit copy so that the column assignment below
# writes to an independent dataframe instead of a slice of rlpolk_matched
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
rlpolk_filtered = rlpolk_matched[(rlpolk_matched["Fuel Type - Primary"] == "Electric")].copy()
rlpolk_filtered["Model Year"] = rlpolk_filtered["Model Year"].astype(int)

# Create distribution

In [74]:
# Zero-filled month table: one row per unique vin_corrected value and financial year
empty_distribution_vins = create_empty_distribution(
    input_df=rlpolk_filtered,
    key="vin_corrected",
    additional_cols=["Manufacturer Name", "Model", "Model Year"],
    yrs=[2018, 2019, 2020, 2021, 2022, 2023],
)

In [None]:
# Fill the empty table with monthly vehicle counts from the filtered RLPolk data
ev_dist = create_distribution(
    df=rlpolk_filtered,
    df_key_col="vin_corrected",
    base_df=empty_distribution_vins,
    base_df_key_col="vin_corrected",
    fys=[2018, 2019, 2020, 2021, 2022, 2023],
)

**Convert to percentages**

In [108]:
# The month column names are built here so that this cell runs on a fresh kernel;
# otherwise months_pct is only defined in a later cell.
months = ["month_"+str(x).zfill(2) for x in range(1,13)]
months_pct = [x+"_pct" for x in months]

# Position of the first month column. get_pct normalises everything from this position to the
# end of the row, so this cell must run before any extra columns are appended to ev_dist.
first_month_position = ev_dist.columns.get_loc(months[0])

# Convert the monthly counts to shares of the yearly total, then record the sum of the shares
# (1 where sales were made, 0 where no vehicles were sold in that financial year)
ev_dist[months_pct] = ev_dist.apply(lambda x: get_pct(x, first_month_position), axis =1)
ev_dist["pct_sum"] = ev_dist[months_pct].apply(lambda x: sum(x), axis = 1)

**Save**

In [113]:
# Write the distribution table to disk so it can be reloaded without recomputing it
ev_dist.to_csv(path_or_buf=rlpolk_data_path / "vin_distribution_111523.csv")

In [39]:
# Preview the first five rows of the distribution table
ev_dist.head(n=5)

Unnamed: 0,vin_corrected,Manufacturer Name,Model,Model Year,fy,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,month_01_pct,month_02_pct,month_03_pct,month_04_pct,month_05_pct,month_06_pct,month_07_pct,month_08_pct,month_09_pct,month_10_pct,month_11_pct,month_12_pct,pct_sum
0,1C4JJXP6*P,FCA US LLC,Wrangler,2023,2018,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1C4JJXP6*P,FCA US LLC,Wrangler,2023,2019,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1C4JJXP6*P,FCA US LLC,Wrangler,2023,2020,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1C4JJXP6*P,FCA US LLC,Wrangler,2023,2021,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1C4JJXP6*P,FCA US LLC,Wrangler,2023,2022,0,0,0,0,0,0,0,0,37,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


# Load

In [7]:
# Reload the saved distribution table; the first CSV column is the row index
ev_dist = pd.read_csv(
    filepath_or_buffer=rlpolk_data_path / "vin_distribution_111523.csv",
    index_col=[0],
)

# Apply Distribution to Municipal Data

In [None]:
# Load the matched municipal dataset (first CSV column is the row index)
municipal_evs = pd.read_csv(
    data_path / "municipal_dataset_matched" / "municipal_dataset_matched_110723.csv",
    index_col=[0],
)

# Restrict to electric vehicles and renumber the rows from 0
elec_mask = municipal_evs["Fuel Type - Primary"].eq("Electric")
municipal_evs = municipal_evs.loc[elec_mask].reset_index(drop=True)

# Financial year, taken from the source file name: drop anything from "_ALTERED" onwards,
# drop the file extension, and prefix the last two characters with "20"
# (e.g. "126_Shelton_MV_21.xlsx" -> 2021)
municipal_evs["fy"] = municipal_evs["record_from"].map(
    lambda file_name: int("20" + re.split("\\.", re.split("_ALTERED", file_name)[0])[0][-2:])
)

In [10]:
# Names of the monthly count columns (month_01 ... month_12) and their percentage counterparts
months = [f"month_{number:02d}" for number in range(1, 13)]
months_pct = [f"{name}_pct" for name in months]

In [None]:
# Draw a month for every municipal EV, looking it up by financial year and the
# first 10 characters of its corrected VIN
df_to_concat = municipal_evs[["fy", "vin_corrected"]].apply(
    lambda record: get_most_probable_month(
        record["fy"],
        record["vin_corrected"][:10],
        ev_dist,
        months_pct,
        "pct_sum",
    ),
    axis=1,
)

# Name the seven values returned for each vehicle
df_to_concat.columns = [
    "allocated_month",
    "allocation_method",
    "matched_vin",
    "matched_fy",
    "matched_make",
    "matched_model",
    "matched_model_year",
]

In [28]:
# Append the allocation columns to the municipal EV data, side by side
municipal_evs = pd.concat(objs=[municipal_evs, df_to_concat], axis="columns")

In [34]:
# Percentage of vehicles whose VIN had no entry in the distribution table
pct = int(municipal_evs["matched_vin"].eq("vin_not_matched").sum()) * 100 / len(municipal_evs)
print(
    f"{100-pct:0.2f}% of VINs were matched, {pct:0.2f}% of VINs were not matched. For these, months are allocated randomly."
)

80.25% of VINs were matched, 19.75% of VINs were not matched


In [40]:
# Save the municipal EV data together with the allocated months
municipal_evs.to_csv(
    path_or_buf=data_path / "rlpolk_data" / "municipal_evs_with_month_distribution_111723.csv"
)

In [19]:
# Small sample for experimenting: the first ten electric vehicles
test_subset = municipal_evs.loc[municipal_evs["Fuel Type - Primary"].eq("Electric")].head(10)

In [20]:
# Names of the two core allocation columns.
# NOTE(review): `cols` is not referenced by any later cell visible here - confirm whether it is still needed.
cols = ["allocated_month", "allocation_method"]

In [21]:
# NOTE(review): stale cell. This call passes 3 arguments, but get_most_probable_month as defined
# in this notebook requires 5 (fy_to_find, vin_to_find, lookup_table, lookup_table_dist_cols,
# check_column) and returns 7 values, not 4. `base_df` is also not defined at notebook level in
# the visible cells. Presumably written against an earlier make/model/model-year version of the
# function - update or remove before re-running.
test_subset[["allocated_month", "allocation_method", "matched_make_model_my", "kept"]] = test_subset.apply(lambda x: get_most_probable_month(x.fy, x.make_model_my, base_df), axis = 1)

In [53]:
# Display the sample rows together with their allocated months
test_subset

Unnamed: 0.1,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,zip_corrected,vin_corrected,Unnamed: 0,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level,fy,make_model_my,allocated_month,allocation_method,matched_make_model_my,kept
0,126_Shelton_MV_21.xlsx,JACABACCI ROBERT,5 SHELVIEW DR,SHELTON,CT,6484,2018.0,CHEVR,VOLT LT,1.0,1G1RC6S53JU136992,6484.0,1G1RC6S5*JU,1030.0,GENERAL MOTORS LLC,Volt,2018.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,GENERAL MOTORS LLC_Volt_2018,6,distribution,GENERAL MOTORS LLC_Volt_2018,True
1,126_Shelton_MV_21.xlsx,JACKSON EILEEN M,212 FOX RUN,SHELTON,CT,6484,2014.0,TOYOT,PRIUS PL,1.0,JTDKN3DP6E3061619,6484.0,JTDKN3DP*E3,21233.0,TOYOTA MOTOR CORPORATION,Prius Plug-in,2014.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,TOYOTA MOTOR CORPORATION_Prius Plug-in_2014,1,distribution,TOYOTA MOTOR CORPORATION_Prius Prime_2021,False
2,126_Shelton_MV_21.xlsx,JP MORGAN CHASE BANK NA,P O BOX 901098,FORT WORTH,TX,76101,2020.0,LAND,RANGE RO,1.0,SALWR2RY6LA705798,76101.0,SALWR2RY*LA,49545.0,JAGUAR,Range Rover Sport,2020.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,JAGUAR_Range Rover Sport_2020,1,distribution,JAGUAR_Range Rover Sport_2020,True
3,126_Shelton_MV_21.xlsx,JP MORGAN CHASE BANK NA,P O BOX 901098,FORT WORTH,TX,76101,2020.0,JAGUA,I-PACE S,2.0,SADHB2S10L1F84906,76101.0,SADHB2S1*L1,42790.0,JAGUAR,I PACE S,2020.0,Electric,BEV (Battery Electric Vehicle),2021,JAGUAR_I PACE S_2020,10,distribution,JAGUAR_I PACE S_2020,True
4,126_Shelton_MV_21.xlsx,KALMANIDIS NAZIKO M,124 WELLS VIEW RD,SHELTON,CT,6484,2019.0,TESLA,MODEL 3,1.0,5YJ3E1EB2KF421964,6484.0,5YJ3E1EB*KF,2432.0,"TESLA, INC.",Model 3,2019.0,Electric,BEV (Battery Electric Vehicle),2021,"TESLA, INC._Model 3_2019",3,distribution,"TESLA, INC._Model 3_2019",True
5,126_Shelton_MV_21.xlsx,KELKAR SWATI G,235 DEER RUN,SHELTON,CT,6484,2019.0,TESLA,MODEL 3,1.0,5YJ3E1EA0KF297863,6484.0,5YJ3E1EA*KF,1637.0,"TESLA, INC.",Model 3,2019.0,Electric,BEV (Battery Electric Vehicle),2021,"TESLA, INC._Model 3_2019",3,distribution,"TESLA, INC._Model 3_2019",True
6,126_Shelton_MV_21.xlsx,KENNEDY THOMAS J JR,37 TEN COAT LN,SHELTON,CT,6484,2017.0,FORD,FUSION S,1.0,3FA6P0PU3HR121606,6484.0,3FA6P0PU*HR,2593.0,"FORD MOTOR COMPANY, MEXICO",Fusion,2017.0,Electric,PHEV (Plug-in Hybrid Electric Vehicle),2021,"FORD MOTOR COMPANY, MEXICO_Fusion_2017",0,distribution,"FORD MOTOR COMPANY, MEXICO_Fusion_2017",True
7,126_Shelton_MV_21.xlsx,KHALIL ESSAM M,151 MEADOW ST,SHELTON,CT,6484,2021.0,TESLA,MODEL 3,1.0,5YJ3E1EB2MF849696,6484.0,5YJ3E1EB*MF,54685.0,"TESLA, INC.",Model 3,2021.0,Electric,BEV (Battery Electric Vehicle),2021,"TESLA, INC._Model 3_2021",3,distribution,"TESLA, INC._Model 3_2021",True
8,126_Shelton_MV_21.xlsx,KIRN STEVEN F,11 MEETING HOUSE LN,HUNTINGTON,CT,6484,2021.0,AUDI,E-TRON P,1.0,WA1VAAGE8MB004375,6484.0,WA1VAAGE*MB,1131.0,VOLKSWAGEN,e tron,2021.0,Electric,BEV (Battery Electric Vehicle),2021,VOLKSWAGEN_e tron_2021,2,distribution,VOLKSWAGEN_e tron_2021,True
9,126_Shelton_MV_21.xlsx,KITCHENMAX LLC,22 FALMOUTH DR,SHELTON,CT,6484,2020.0,TESLA,MODEL X,3.0,5YJXCAE28LF298101,6484.0,5YJXCAE2*LF,61904.0,TESLA,Model X,2020.0,Electric,BEV (Battery Electric Vehicle),2021,TESLA_Model X_2020,9,distribution,TESLA_Model X_2020,True


In [26]:
# NOTE(review): stale cell. This uses a 3-argument call and a make/model/model-year key, whereas
# get_most_probable_month as defined in this notebook takes 5 arguments and matches on
# "vin_corrected". `base_df` is not defined at notebook level in the visible cells.
get_most_probable_month(2021, "GENERAL MOTORS LLC_Volt_2018", base_df)

0                               4
1                    distribution
2    GENERAL MOTORS LLC_Volt_2018
3                            True
dtype: object

## Check which didn't work

Our aim is to see whether the unmatched cases are caused by slight differences in how things are written out...

In [36]:
# Teslas whose month was NOT drawn from a distribution
unmatched_teslas = municipal_evs[
    municipal_evs["allocation_method"].ne("distribution")
    & municipal_evs["Manufacturer Name"].str.contains("TESLA")
]
unmatched_teslas

Unnamed: 0,original_index,record_from,name,street,city,state,zip,vehicle_year,vehicle_make,vehicle_model,vehicle_class,vehicle_id,lease_street,UID,lease_city,...,zip_corrected,vin_corrected,Manufacturer Name,Model,Model Year,Fuel Type - Primary,Electrification Level,fy,allocated_month,allocation_method,matched_vin,matched_fy,matched_make,matched_model,matched_model_year
24,13404,110_Plainville_MV_21.xlsx,REILLY CHRISTOPHER P,1 SANDSTONE RD,PLAINVILLE,CT,6062,2015,TESLA,MODEL S,1,5YJSA1E26FF117498,,,,...,6062.0,5YJSA1E2*FF,"TESLA, INC.",Model S,2015.0,Electric,BEV (Battery Electric Vehicle),2021,2,random,vin_not_matched,,,,
51,21278,101_North_Haven_MV_21.xlsx,CARDOZO JOSEPH J,71 CLINTONVILLE RD,NORTH HAVEN,CT,6473,2014,TESLA,MODEL S,1,5YJSA1S19EFP62169,,53485.0,,...,6473.0,5YJSA1S1*EF,"TESLA, INC.",Model S,2014.0,Electric,BEV (Battery Electric Vehicle),2021,6,random,vin_not_matched,,,,
126,36941,101_North_Haven_MV_21.xlsx,RJM II ASSOCIATES LLC,281 WASHINGTON AVE,NORTH HAVEN,CT,6473,2013,TESLA,MODEL S,1,5YJSA1CN4DFP19861,,69148.0,,...,6473.0,5YJSA1CN*DF,"TESLA, INC.",Model S,2013.0,Electric,BEV (Battery Electric Vehicle),2021,0,random,vin_not_matched,,,,
127,36942,101_North_Haven_MV_21.xlsx,RJM II ASSOCIATES LLC,281 WASHINGTON AVE,NORTH HAVEN,CT,6473,2013,TESLA,MODEL S,1,5YJSA1CN7DFP27582,,69149.0,,...,6473.0,5YJSA1CN*DF,"TESLA, INC.",Model S,2013.0,Electric,BEV (Battery Electric Vehicle),2021,0,random,vin_not_matched,,,,
237,85028,070_Killingworth_MV_21.xlsx,ALEKSIEJUK JOANNA,119 BURR HILL RD,KILLINGWORTH,CT,6419,2016,TESLA,MODEL S,1.0,5YJSA1E43GF136285,,,,...,6419.0,5YJSA1E4*GF,"TESLA, INC.",Model S,2016.0,Electric,BEV (Battery Electric Vehicle),2021,9,random,vin_not_matched,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30230,5767995,44_East_Lyme_MVData_2019.csv,MASCARO JOSEPH ANTHONY,53 OSWEGATCHIE HILLS RD,NIANTIC,CT,6357.0,2015,TESLA,MODEL S,,5YJSA1H24FFP71864,,509996.0,,...,6357.0,5YJSA1H2*FF,"TESLA, INC.",Model S,2015.0,Electric,BEV (Battery Electric Vehicle),2019,3,random,vin_not_matched,,,,
30269,5777948,56_Granby_MVData_2019.csv,DEGLING DAVID A,396 SALMON BROOK ST,GRANBY,CT,6035.0,2016,TESLA,MODEL S,1.0,5YJSA1E28GF119948,,,,...,6035.0,5YJSA1E2*GF,"TESLA, INC.",Model S,2016.0,Electric,BEV (Battery Electric Vehicle),2019,11,random,vin_not_matched,,,,
30272,5778437,56_Granby_MVData_2019.csv,EMERY JENNY P,71 LOOMIS ST,NORTH GRANBY,CT,6060.0,2013,TESLA,MODEL S,1.0,5YJSA1BG7DFP05066,,,,...,6060.0,5YJSA1BG*DF,"TESLA, INC.",Model S,2013.0,Electric,BEV (Battery Electric Vehicle),2019,7,random,vin_not_matched,,,,
30286,5781619,56_Granby_MVData_2019.csv,LILLANEY ARUN G,295 FELDSPAR RDG,GLASTONBURY,CT,6033.0,2016,TESLA,MODEL S,1.0,5YJSA1E25GF160912,,,,...,6033.0,5YJSA1E2*GF,"TESLA, INC.",Model S,2016.0,Electric,BEV (Battery Electric Vehicle),2019,7,random,vin_not_matched,,,,
