In [1]:
import pandas as pd
import numpy as np
import replicaEVSE.load_curve as sim
import replicaEVSE.datautils as simdu
import os
import joblib
import dask.dataframe as dd
from tqdm import tqdm

%reload_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', None)

datadir = '../../data/'
mode = 'PRIVATE_AUTO'
test = False

# Sample the number of people/vehicles by county segment and year

see  `scripts/sample_counties.py` to run 

In [2]:


#Created in the EIA_data_download.ipynb notebook
existing_load=pd.read_csv(datadir+'EIA_demand_summary.csv') 
if test:
    # df = pd.read_parquet(os.path.join(datadir, 'wa_pop_and_trips_subsample.parquet'))
    df = pd.read_parquet(os.path.join(datadir, 'wa_pop_and_trips_sorted_county.parquet'))
    df = df.head(10000)
    df = df.loc[df['mode'] == mode]
    simulation_id = 'dev'

else: 
    # read in the joined trips and population data sets
    merged_df = pd.read_parquet(os.path.join(datadir, 'wa_pop_and_trips_sorted_county.parquet'))

    # right now, only look at private auto trips
    df = merged_df.loc[merged_df['mode'] == mode]
    # take out the mobile and commercial MHDV

### TODO: revisit taking out mobile home owners
df = df[(df['building_type'] != 'mobile') & (df['building_type'] != None)]



In [3]:
counties = pd.read_parquet(datadir+'/population_counties_dataset.parquet', engine='pyarrow')
df = pd.merge(df, counties, on='person_id', how='left')

In [4]:
df.columns

Index(['activity_id', 'person_id', 'mode', 'travel_purpose',
       'previous_activity_type', 'start_time', 'end_time', 'distance_miles',
       'vehicle_type', 'origin_bgrp', 'origin_bgrp_lat', 'origin_bgrp_lng',
       'destination_bgrp', 'destination_bgrp_lat', 'destination_bgrp_lng',
       'origin_land_use_l1', 'origin_land_use_l2', 'origin_building_use_l1',
       'origin_building_use_l2', 'destination_land_use_l1',
       'destination_land_use_l2', 'destination_building_use_l1',
       'destination_building_use_l2', 'origin_lat', 'origin_lng',
       'destination_lat', 'destination_lng', 'weekday', 'household_id',
       'BLOCKGROUP', 'BLOCKGROUP_work', 'BLOCKGROUP_school', 'TRACT',
       'TRACT_work', 'TRACT_school', 'age_group', 'age', 'sex', 'race',
       'ethnicity', 'individual_income_group', 'individual_income',
       'employment', 'education', 'school_grade_attending', 'industry',
       'household_role', 'subfamily_number', 'subfamily_relationship',
       'commute_mo

### Stock rollover model from Gerard of Evs by county, year, vehicle segment, engine (EV, PHEV) and housing (sfh, mfh).

`ev_df`

In [5]:
stock_rollover = pd.read_csv(datadir+'ldv_population_output_adjusted.csv')
efficiency = pd.read_csv(datadir+'vehicle_inputs.csv')

In [6]:
personal = ['Personal Sedan', 'Personal Crossover', 'Personal Truck/SUV']
commercial = ['Commercial Sedan', 'Commercial Crossover', 'Commercial Truck/SUV']
for cartype in personal + commercial:
    for powertrain in  ['EV', 'PHEV']:
        cond = (efficiency['Vehicle_type']==cartype) & ~efficiency['Powertrain'].isin(['ICE-G', 'ICE-D', 'FCEV']) & (efficiency['Powertrain']==powertrain) & (efficiency['Vintage'] == 2022)
        ef = efficiency[cond]['Efficiency'].values[0]
        print(cartype, powertrain, ef)

Personal Sedan EV 0.25
Personal Sedan PHEV 80.0
Personal Crossover EV 0.3
Personal Crossover PHEV 80.0
Personal Truck/SUV EV 0.49
Personal Truck/SUV PHEV 80.0
Commercial Sedan EV 0.25
Commercial Sedan PHEV 80.0
Commercial Crossover EV 0.3
Commercial Crossover PHEV 80.0
Commercial Truck/SUV EV 0.49
Commercial Truck/SUV PHEV 80.0


# extract the population ldv data from the stock rollover model 
 - What to do about housing == 'other'? 

### include both EV and PHEV

In [7]:
ev_cond = stock_rollover['Powertrain'].isin(['EV', 'PHEV']).reset_index(drop=True)
nev_df = stock_rollover[ev_cond].copy()
nev_df = nev_df[nev_df['domicile'] != 'other'].copy()
nev_df.drop(columns=['Unnamed: 0'], inplace=True)


See the change in stock over the years

We ingest the number of vehicles in each segment for each year and sample from that subset of the trips table for each county segment and year. This takes > 300 minutes for all the years and segments

In [8]:
nev_df

Unnamed: 0,County,Vehicle_type,domicile,Powertrain,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035
0,Adams,Commercial Crossover,mfh,EV,0,0,0,0,0,0,0,1,1,1,1,1,1,1
3,Adams,Commercial Crossover,mfh,PHEV,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Adams,Commercial Crossover,sfh,EV,0,0,1,1,1,2,2,2,3,3,3,5,5,5
7,Adams,Commercial Crossover,sfh,PHEV,0,1,1,1,1,1,1,1,1,1,1,1,1,1
8,Adams,Commercial Sedan,mfh,EV,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,Yakima,Personal Sedan,sfh,PHEV,98,2433,4815,6453,7994,9117,10046,10703,11606,12252,12842,13552,14206,14859
1840,Yakima,Personal Truck/SUV,mfh,EV,34,1780,3504,5143,6768,8393,9963,11285,12851,14524,16279,18064,19987,22314
1843,Yakima,Personal Truck/SUV,mfh,PHEV,18,1063,1757,2265,2617,2945,3170,3367,3593,3845,4102,4380,4667,5020
1844,Yakima,Personal Truck/SUV,sfh,EV,133,6521,12813,18796,24713,30637,36335,41141,46932,53211,59811,66703,74132,83107


In [9]:
# get the unique people in the dataframe
pop_df = df.drop_duplicates(subset=['person_id'])[['person_id', 'home_cty', 'building_type']]

In [10]:
# test run on a single year ~14 mins
# final_df = simdu.sample_people_by_county(df, nev_df, year=2025, fraction=None)
# final_df.head()[['person_id', 'distance_miles', 'destination_county', 'building_type', 'engine', 'segment', 'efficiency', 'year']]

### Can parallelize this by looping over years

In [16]:
if False:
    year_list = np.arange(2022, 2036, 1)
    joblib.Parallel(verbose=10, n_jobs=4)(joblib.delayed(simdu.run_and_save_sampled_populations)(
        df,
        nev_df, 
        year,
        datadir,
        ) for year in year_list)


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Selecting people from each county in year=2022...14 mins per year
Selecting people from each county in year=2023...14 mins per year
Selecting people from each county in year=2024...14 mins per year
Selecting people from each county in year=2025...14 mins per year


Bad pipe message: %s [b'\x06s\xaa\x98k\xcf\xb3s\x8c9\xaf\xe0\xa9\x91\xb7K@A @Ky\x99S\xd3\x88\x97UR\x9e\xcaT\\/Sz\xb1\xd6B\x15']
Bad pipe message: %s [b'\xbc\xe3\x90u\x91\x07\x12\xec;$-\xda\x93\xf0o\xe1\xdf.\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0']
Bad pipe message: %s [b"/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x00"]
Bad pipe message: %s [b'8\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00']
Bad pipe message: %s [b'\x00\t127.0.0.1']
Bad pipe message: %s [b'9\xcdZ\x94\x84\xb9\x7f\x0c\xbe\x00\xed\x00\x8f\x00\x14\xbf\xc8\xf3\x00\x00\xa2', b'\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00:\x00\x89\xc0\x0f\xc0\x05\x005\x00\x84\xc0\x13\xc0\t\x003\x002\x001\x000\x00\x9a\x0

Selecting people from each county in year=2026...14 mins per year
Selecting people from each county in year=2027...14 mins per year
Selecting people from each county in year=2028...14 mins per year
Selecting people from each county in year=2029...14 mins per year


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed: 38.0min


Selecting people from each county in year=2030...14 mins per year
Selecting people from each county in year=2031...14 mins per year
Selecting people from each county in year=2032...14 mins per year
Selecting people from each county in year=2033...14 mins per year


In [2]:
# check outputs
df_2035 = pd.read_parquet(os.path.join(datadir, 'county_samples/county_sample_2035.parquet'))

In [14]:
len(df_2035)

30547656

In [3]:
df_2034 = pd.read_parquet(os.path.join(datadir, 'county_samples/county_sample_2034.parquet'))

In [15]:
len(df_2034)

30547656

In [16]:
cnty_cond = df_2034[df_2034['destination_county']]
df_2034.drop_duplicates(subset=['person_id']).shape, df_2035.drop_duplicates(subset=['person_id']).shape

((252329, 73), (252329, 73))

In [6]:
df_2034[['person_id', 'distance_miles', 'destination_county', 'building_type', 'engine', 'segment', 'efficiency', 'year']]

Unnamed: 0,person_id,distance_miles,destination_county,building_type,engine,segment,efficiency,year
10297,10234240303577394052,1.883100,"Adams County, WA",single_family,EV,Commercial Crossover,0.3,2034
10298,10234240303577394052,1.508437,"Adams County, WA",single_family,EV,Commercial Crossover,0.3,2034
10299,10234240303577394052,1.508437,"Adams County, WA",single_family,EV,Commercial Crossover,0.3,2034
10300,10234240303577394052,1.883100,"Adams County, WA",single_family,EV,Commercial Crossover,0.3,2034
10301,10234240303577394052,1.611954,"Adams County, WA",single_family,EV,Commercial Crossover,0.3,2034
...,...,...,...,...,...,...,...,...
51722247,12974764336925645660,2.129131,"Yakima County, WA",single_family,PHEV,Personal Truck/SUV,0.9,2034
51722248,12974764336925645660,2.131551,"Yakima County, WA",single_family,PHEV,Personal Truck/SUV,0.9,2034
51722249,12974764336925645660,2.129131,"Yakima County, WA",single_family,PHEV,Personal Truck/SUV,0.9,2034
51725545,12975743187318417717,2.232699,"Yakima County, WA",single_family,PHEV,Personal Truck/SUV,0.9,2034


# why are they all the same length?

In [18]:
year_list = np.arange(2022, 2024, 1)
df_list = []
for year in year_list:
    print(year)
    df_i = pd.read_parquet(os.path.join(datadir, 'county_samples/county_sample_{}.parquet'.format(year)))
    print(len(df_i.drop_duplicates(subset=['person_id'])))
    # df_list.append()

# df_out = pd.concat(df_list)

2022
252329
2023
252329


In [18]:
len(pop_df), len(df)

(5046603, 25405990)

# make toy data

In [9]:
ev_cond = stock_rollover['Powertrain'].isin(['EV', 'PHEV']).reset_index(drop=True)
# ev_cond = (stock_rollover['Powertrain']=='EV') | (stock_rollover['Powertrain']=='PHEV')
# nev_df = stock_rollover[(stock_rollover['Powertrain']=='EV')].copy()
nev_df = stock_rollover[ev_cond].copy()
nev_df = nev_df[nev_df['domicile'] != 'other'].copy()
nev_df.drop(columns=['Unnamed: 0'], inplace=True)


# ev_sfh_only = ev_sfh_only[ev_sfh_only['Powertrain']=='EV'].copy()
segmentslist = ['Personal Sedan', 'Personal Crossover']# 'Personal Truck/SUV']
nev_df = nev_df[nev_df['Vehicle_type'].isin(segmentslist)].copy()
nev_df.head(10)

Unnamed: 0,County,Vehicle_type,domicile,Powertrain,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035
24,Adams,Personal Crossover,mfh,EV,0,2,4,5,7,8,9,10,11,13,15,18,20,26
27,Adams,Personal Crossover,mfh,PHEV,0,2,3,3,4,4,4,4,4,4,4,4,4,4
28,Adams,Personal Crossover,sfh,EV,0,9,18,28,38,47,54,62,69,78,86,99,109,119
31,Adams,Personal Crossover,sfh,PHEV,0,8,14,19,23,26,28,30,32,33,35,37,40,42
32,Adams,Personal Sedan,mfh,EV,0,83,173,239,299,339,374,395,420,431,442,455,461,467
35,Adams,Personal Sedan,mfh,PHEV,1,53,106,144,179,203,224,238,258,269,284,297,307,321
36,Adams,Personal Sedan,sfh,EV,12,357,724,984,1223,1389,1526,1608,1709,1766,1806,1859,1900,1924
39,Adams,Personal Sedan,sfh,PHEV,10,225,441,590,729,828,909,965,1041,1094,1141,1198,1252,1304
72,Asotin,Personal Crossover,mfh,EV,0,1,2,4,5,6,8,10,12,14,19,23,27,31
75,Asotin,Personal Crossover,mfh,PHEV,0,1,2,3,3,3,3,3,3,3,3,3,3,3


In [64]:
county_list = nev_df['County'].unique()
multiyear_list = []
unique_df = df.drop_duplicates(subset=['person_id'])[['person_id', 'home_cty', 'building_type']]
for year in [2022, 2023]:
    total_ev = 0
    total_pev = 0
    reduced_df = []
    year = str(year)
    
    for cnty in county_list:

        # slice the nev dataframe to only include the county
        nvehicles_sub = nev_df[nev_df['County']==cnty]
        num_evs_to_select = nvehicles_sub[nvehicles_sub['Powertrain']=='EV'][year].values[0]
        num_pevs_to_select = nvehicles_sub[nvehicles_sub['Powertrain']=='PHEV'][year].values[0]
        
        county_str = cnty + ' County, WA'
        county_df = unique_df[unique_df['home_cty'] == county_str]

        # do the sampling for the EVs and PHEVs
        selected_ev = county_df.person_id.sample(n=num_evs_to_select, replace=False, random_state=42)

        # make sure we dont count people twice. assuming 1 vehicle per person
        notselected_ev = county_df[~county_df['person_id'].isin(selected_ev)]
        selected_pev = notselected_ev.person_id.sample(n=num_pevs_to_select, replace=False, random_state=42) 
   
        # grab only those selected people from the original dataframe
        county_ev_df = df[(df['person_id'].isin(selected_ev))].copy()
        county_pev_df = df[(df['person_id'].isin(selected_pev))].copy()
        county_ev_df['engine'] = 'EV'
        county_pev_df['engine'] = 'PHEV'


        county_ev_df['year'] = year
        county_pev_df['year'] = year
        reduced_df.append(county_ev_df)
        reduced_df.append(county_pev_df)

        total_ev += num_evs_to_select
        total_pev += num_pevs_to_select
        
        # print(len(selected), county_df.shape[0], county_df.drop_duplicates(subset=['person_id']).shape[0])

    

    
    final_df = pd.concat(reduced_df)
    print("total number of people per year", total_ev+total_pev, "number of unique people year", len(final_df.drop_duplicates(subset=['person_id'])))
    multiyear_list.append(final_df)
multiyear_df = pd.concat(multiyear_list)

total number of people per year 45193 number of unique people year 45193
total number of people per year 130043 number of unique people year 130043


In [73]:
len(final_df.drop_duplicates(subset=['person_id'])), final_df.shape[0]

(130043, 675709)

In [15]:
nvehicles_sub = nev_df[nev_df['County']=='Yakima'].copy()
population_df = pop_df.copy()
nvehicles_sub

Unnamed: 0,County,Vehicle_type,domicile,Powertrain,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035
1824,Yakima,Personal Crossover,mfh,EV,0,24,57,99,143,184,221,264,305,358,405,464,518,586
1827,Yakima,Personal Crossover,mfh,PHEV,0,17,32,43,52,59,64,70,75,83,90,99,105,115
1828,Yakima,Personal Crossover,sfh,EV,1,92,217,371,533,685,821,977,1128,1326,1512,1753,1967,2226
1831,Yakima,Personal Crossover,sfh,PHEV,1,67,124,164,197,221,239,260,279,304,328,355,380,410
1832,Yakima,Personal Sedan,mfh,EV,49,1025,2121,2918,3665,4197,4651,4933,5274,5476,5632,5838,5994,6103
1835,Yakima,Personal Sedan,mfh,PHEV,25,688,1363,1828,2264,2581,2846,3032,3290,3470,3634,3836,4021,4209
1836,Yakima,Personal Sedan,sfh,EV,190,3676,7593,10433,13097,14997,16613,17616,18838,19560,20117,20849,21404,21785
1839,Yakima,Personal Sedan,sfh,PHEV,98,2433,4815,6453,7994,9117,10046,10703,11606,12252,12842,13552,14206,14859


In [19]:
year = str(2022)

# Create a list to keep track of selected individuals for each combination
already_sampled_people = []

# Iterate over the county DataFrame and sample individuals from the population DataFrame
for _, row in nvehicles_sub.iterrows():
    county = row['County']
    vehicle_type = row['Vehicle_type']
    domicile = row['domicile']
    count = row[year]
    # engine = row['Powertrain']
    powertrain = row['Powertrain']

    if count < 0:
        count = 0

    # slice the datafrane to only include people with the correct domicile
    if domicile == 'sfh':
        building_type = 'single_family'
        domicile_cond = population_df['building_type'] == 'single_family'
    else:
        domicile_cond = population_df['building_type'] != 'single_family'

    # slice the unique dataframe to only include the county
    county_str = county + ' County, WA'
    county_cond = population_df['home_cty'] == county_str
    
    
    # filter the population based on county and domicile
    filtered_population = population_df[(county_cond) & (domicile_cond)]
    print(f'Number of people in county and domicile = {domicile}: {filtered_population.shape[0]}')
    
    # exclude already selected individuals for this combination
    already_sampled_cond = filtered_population['person_id'].isin(already_sampled_people)
    filtered_people = filtered_population[~already_sampled_cond].copy()
    
    print(f'Filtered population not in the previous sample: {domicile} = {filtered_people.shape[0]}')
    # sample 'count' number of individuals
    sampled_population = filtered_people.sample(n=count, replace=False, random_state=42)
    sampled_individuals = sampled_population['person_id'].to_list()
    
    # Update the selected individuals dictionary
    already_sampled_people.extend(sampled_individuals)

    print(f'Sampled {count} individuals from County: {county}, Vehicle Type: {vehicle_type}, Domicile: {domicile}, Powertrain: {powertrain} \n')



Number of people in county and domicile = mfh: 24001
Filtered population not in the previous sample: mfh = 24001
Sampled 0 individuals from County: Yakima, Vehicle Type: Personal Crossover, Domicile: mfh, Powertrain: EV 

Number of people in county and domicile = mfh: 24001
Filtered population not in the previous sample: mfh = 24001
Sampled 0 individuals from County: Yakima, Vehicle Type: Personal Crossover, Domicile: mfh, Powertrain: PHEV 

Number of people in county and domicile = sfh: 124277
Filtered population not in the previous sample: sfh = 124277
Sampled 1 individuals from County: Yakima, Vehicle Type: Personal Crossover, Domicile: sfh, Powertrain: EV 

Number of people in county and domicile = sfh: 124277
Filtered population not in the previous sample: sfh = 124276
Sampled 1 individuals from County: Yakima, Vehicle Type: Personal Crossover, Domicile: sfh, Powertrain: PHEV 

Number of people in county and domicile = mfh: 24001
Filtered population not in the previous sample: mfh

In [26]:
# grab only those selected people from the original dataframe
cnty_df = df[(df['person_id'].isin(sampled_individuals))].copy()
cnty_df['engine'] = powertrain
cnty_df['segment'] = vehicle_type
cnty_df['efficiency'] = simdu.segment_efficiency(vehicle_type)
cnty_df['year'] = str(2022)
ctny_df = simdu.phev_efficiency_milage(cnty_df, powertrain) 
ctny_df

Unnamed: 0,activity_id,person_id,mode,travel_purpose,previous_activity_type,start_time,end_time,distance_miles,vehicle_type,origin_bgrp,origin_bgrp_lat,origin_bgrp_lng,destination_bgrp,destination_bgrp_lat,destination_bgrp_lng,origin_land_use_l1,origin_land_use_l2,origin_building_use_l1,origin_building_use_l2,destination_land_use_l1,destination_land_use_l2,destination_building_use_l1,destination_building_use_l2,origin_lat,origin_lng,destination_lat,destination_lng,weekday,household_id,BLOCKGROUP,BLOCKGROUP_work,BLOCKGROUP_school,TRACT,TRACT_work,TRACT_school,age_group,age,sex,race,ethnicity,individual_income_group,individual_income,employment,education,school_grade_attending,industry,household_role,subfamily_number,subfamily_relationship,commute_mode,tenure,migration,household_size,household_income_group,household_income,family_structure,vehicles,building_type,resident_type,language,lat,lng,lat_work,lng_work,lat_school,lng_school,wfh,charge_type,destination_county,home_cty,home_st,engine,segment,efficiency,year
184175,2030702661201869804,13086013628987033997,PRIVATE_AUTO,SCHOOL,HOME,0 days 13:47:16,0 days 14:27:15,19.414174,,530770031001,46.673521,-120.502937,530379754013,47.004629,-120.534679,residential,single_family,residential,single_family,civic_institutional,education,civic_institutional,education,46.695539,-120.503394,47.003996,-120.545497,saturday,17936814450731958864,530770031001,530770031001,530379754013,53077003100,53077003100,53037975401,18_24,23.0,M,white,not_hispanic_or_latino,40000_80000,45074.0,employed,high_school,not_attending_school,naics11,non_relative,0,,driving,renter,same_house,2_person,40000_75000,73955.0,nonfamily_single,3_plus,single_family,core,english,46.695539,-120.503394,,,47.003996,-120.545497,worked_in_person,PUBLIC,"Kittitas County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
184176,6829714716442902805,13086013628987033997,PRIVATE_AUTO,HOME,OTHER_ACTIVITY_TYPE,0 days 13:57:22,0 days 14:09:14,2.528796,,530770032002,46.652605,-120.550361,530770031001,46.673521,-120.502937,civic_institutional,education,civic_institutional,education,residential,single_family,residential,single_family,46.657084,-120.548546,46.695539,-120.503394,thursday,17936814450731958864,530770031001,530770031001,530379754013,53077003100,53077003100,53037975401,18_24,23.0,M,white,not_hispanic_or_latino,40000_80000,45074.0,employed,high_school,not_attending_school,naics11,non_relative,0,,driving,renter,same_house,2_person,40000_75000,73955.0,nonfamily_single,3_plus,single_family,core,english,46.695539,-120.503394,,,47.003996,-120.545497,worked_in_person,HOME,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
184177,9167428751569341243,13086013628987033997,PRIVATE_AUTO,SHOP,SOCIAL,0 days 15:20:13,0 days 15:26:23,1.900654,,530770031003,46.717096,-120.601637,530770031001,46.673521,-120.502937,residential,single_family,residential,single_family,commercial,retail,commercial,retail,46.706925,-120.523823,46.664250,-120.519696,saturday,17936814450731958864,530770031001,530770031001,530379754013,53077003100,53077003100,53037975401,18_24,23.0,M,white,not_hispanic_or_latino,40000_80000,45074.0,employed,high_school,not_attending_school,naics11,non_relative,0,,driving,renter,same_house,2_person,40000_75000,73955.0,nonfamily_single,3_plus,single_family,core,english,46.695539,-120.503394,,,47.003996,-120.545497,worked_in_person,PUBLIC,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
184178,8918589629257640941,13086013628987033997,PRIVATE_AUTO,EAT,SCHOOL,0 days 17:29:38,0 days 18:06:14,20.600679,,530379754013,47.004629,-120.534679,530770016023,46.617901,-120.481570,civic_institutional,education,civic_institutional,education,commercial,retail,commercial,retail,47.003996,-120.545497,46.605731,-120.483374,saturday,17936814450731958864,530770031001,530770031001,530379754013,53077003100,53077003100,53037975401,18_24,23.0,M,white,not_hispanic_or_latino,40000_80000,45074.0,employed,high_school,not_attending_school,naics11,non_relative,0,,driving,renter,same_house,2_person,40000_75000,73955.0,nonfamily_single,3_plus,single_family,core,english,46.695539,-120.503394,,,47.003996,-120.545497,worked_in_person,PUBLIC,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
184179,1614998230709356215,13086013628987033997,PRIVATE_AUTO,HOME,SHOP,0 days 19:08:55,0 days 19:42:43,6.671157,,530770016022,46.593199,-120.460963,530770031001,46.673521,-120.502937,commercial,retail,commercial,retail,residential,single_family,residential,single_family,46.601787,-120.479854,46.695539,-120.503394,saturday,17936814450731958864,530770031001,530770031001,530379754013,53077003100,53077003100,53037975401,18_24,23.0,M,white,not_hispanic_or_latino,40000_80000,45074.0,employed,high_school,not_attending_school,naics11,non_relative,0,,driving,renter,same_house,2_person,40000_75000,73955.0,nonfamily_single,3_plus,single_family,core,english,46.695539,-120.503394,,,47.003996,-120.545497,worked_in_person,HOME,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24777048,10938815464645671202,12572807974929600932,PRIVATE_AUTO,HOME,WORK,0 days 15:42:37,0 days 15:46:23,0.758627,,530770012011,46.581500,-120.525117,530770010004,46.589243,-120.555320,civic_institutional,education,civic_institutional,education,residential,single_family,residential,single_family,46.585013,-120.526094,46.587896,-120.551026,thursday,5013236797699972131,530770010004,530770012011,,53077001000,53077001201,,25_34,31.0,F,white,not_hispanic_or_latino,20000_40000,30257.0,employed,advanced_degree,not_attending_school,naics624410,head_of_household,0,,driving,owner,same_house,4_person,40000_75000,65912.0,married_couple,2,single_family,core,english,46.587896,-120.551026,46.585013,-120.526094,,,worked_in_person,HOME,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
24887271,14448532793066132217,12636991097152244102,PRIVATE_AUTO,SOCIAL,HOME,0 days 04:46:00,0 days 05:48:42,2.545092,,530770018004,46.237576,-119.966418,530770019022,46.248998,-119.910264,residential,single_family,residential,single_family,residential,single_family,residential,single_family,46.250742,-119.987126,46.248098,-119.909294,saturday,7905182364825212866,530770018004,530770018004,,53077001800,53077001800,,25_34,28.0,M,white,hispanic_or_latino,20000_40000,30304.0,employed,k_12,not_attending_school,naics11,relative,0,,driving,renter,same_house,6_person,40000_75000,60608.0,married_couple,3_plus,single_family,core,spanish,46.250742,-119.987126,46.251728,-120.032248,,,worked_in_person,PUBLIC,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
24887272,528121279085358245,12636991097152244102,PRIVATE_AUTO,WORK,HOME,0 days 12:39:00,0 days 13:21:29,1.692945,,530770018004,46.237576,-119.966418,530770018004,46.237576,-119.966418,residential,single_family,residential,single_family,agriculture,agriculture,agriculture,agriculture,46.250742,-119.987126,46.251728,-120.032248,thursday,7905182364825212866,530770018004,530770018004,,53077001800,53077001800,,25_34,28.0,M,white,hispanic_or_latino,20000_40000,30304.0,employed,k_12,not_attending_school,naics11,relative,0,,driving,renter,same_house,6_person,40000_75000,60608.0,married_couple,3_plus,single_family,core,spanish,46.250742,-119.987126,46.251728,-120.032248,,,worked_in_person,WORK,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022
24887273,14072600602801378416,12636991097152244102,PRIVATE_AUTO,HOME,SOCIAL,0 days 14:26:00,0 days 14:45:23,2.545092,,530770019022,46.248998,-119.910264,530770018004,46.237576,-119.966418,residential,single_family,residential,single_family,residential,single_family,residential,single_family,46.248098,-119.909294,46.250742,-119.987126,saturday,7905182364825212866,530770018004,530770018004,,53077001800,53077001800,,25_34,28.0,M,white,hispanic_or_latino,20000_40000,30304.0,employed,k_12,not_attending_school,naics11,relative,0,,driving,renter,same_house,6_person,40000_75000,60608.0,married_couple,3_plus,single_family,core,spanish,46.250742,-119.987126,46.251728,-120.032248,,,worked_in_person,HOME,"Yakima County, WA","Yakima County, WA",Washington,PHEV,Personal Sedan,0.9,2022


In [29]:
nvehicles_sub['2022'].sum(), ctny_df.shape[0], ctny_df.drop_duplicates(subset=['person_id']).shape[0]

(364, 645, 98)

In [22]:
124275 - 190

124085

# this is the most recent model that should be working on full dataset.

In [10]:
# slice the unique dataframe to only include people in that county
county = 'Yakima'
county_str = county + ' County, WA'
county_cond = pop_df['home_cty'] == county_str 
county_pop_df = pop_df[county_cond].copy()  
county_trip_df = df[df['home_cty'] == county_str].copy() 
nvehicles_sub = nev_df[nev_df['County'] == county].copy()
df.shape, county_trip_df.shape, pop_df.shape, county_pop_df.shape

((25405990, 71), (930192, 71), (5046603, 3), (148278, 3))

In [45]:
def exclusionary_sampler(df: pd.DataFrame, population_df: pd.DataFrame, nev_df: pd.DataFrame, county: str, year: str) -> pd.DataFrame:
    year = str(year)
    
    # subset the nev_df to only include the county
    # do we prioritize the the most number of vehicles
    # this option has the most remainders
    # nvehicles_sub = nev_df.sort_values(by=year, ascending=False)
    
    # or by ~least number of vehicles
    nvehicles_sub = nev_df

    # Create a list to keep track of selected individuals for each combination
    already_sampled_people = []

    # Create a list to store the cnty dataframes.
    cnty_df_list = []

    

    # Iterate over the county DataFrame and sample individuals from the population DataFrame
    for _, row in nvehicles_sub.iterrows():
        county = row['County']
        vehicle_type = row['Vehicle_type']
        domicile = row['domicile']
        count = row[year]
        # engine = row['Powertrain']
        powertrain = row['Powertrain']

        if count < 0:
            count = 0

        # slice the datafrane to only include people with the correct domicile
        if domicile == 'sfh':
            domicile_cond = population_df['building_type'] == 'single_family'
        else:
            domicile_cond = population_df['building_type'] != 'single_family'

        # filter the county population based on domicile
        filtered_population = population_df[(domicile_cond)]
        print(
            f'Total # of people in county and domicile = {domicile}: {filtered_population.shape[0]}')

        # exclude already selected individuals for this combination
        already_sampled_cond = filtered_population['person_id'].isin(
            already_sampled_people)
        filtered_people = filtered_population[~already_sampled_cond].copy()

        # check if there are more vehicles than people left in the county
        # if so draw from the general population
        if filtered_people.shape[0] <= count:
            
            tot_people_in_sub = filtered_people.shape[0]
            remainder = count - tot_people_in_sub
            # in this case the sampled pop is the entire county leftover.
            sampled_population = filtered_people
            print("total people in county and domicile", tot_people_in_sub)
            print("remainder", remainder)
            print(vehicle_type, domicile, powertrain)
            
            # filtered population is only a cut on the domicile
            # some people with the same domicile will have
            # multiple vehicles.
            sampled_from_domicile_population = population_df.sample(
                n=remainder, replace=False, random_state=42)
            full_county_plus_sampled_poeple = pd.concat([sampled_population, sampled_from_domicile_population])
            sampled_individuals = full_county_plus_sampled_poeple['person_id'].to_list()    
            
            # filtered_people = population_df.copy()
        else:
            # sample 'count' number of individuals
            sampled_population = filtered_people.sample(
                n=count, replace=False, random_state=42)
            sampled_individuals = sampled_population['person_id'].to_list()

        # Update the selected individuals list
        already_sampled_people.extend(sampled_individuals)

        print(f'Sampled {count} individuals from County: {county}, Vehicle Type: {vehicle_type}, Domicile: {domicile}, Powertrain: {powertrain} \n')

        # grab only those selected people with this combination
        # of county, domicile, and vehicle type and powertrain.
        cnty_df = df[(df['person_id'].isin(sampled_individuals))].copy()
        cnty_df['engine'] = powertrain
        cnty_df['segment'] = vehicle_type
        cnty_df['efficiency'] = simdu.segment_efficiency(vehicle_type)
        cnty_df['year'] = str(year)
        ctny_df = simdu.phev_efficiency_milage(cnty_df, powertrain)
        ctny_df['charge_type'] = ctny_df.apply(simdu.map_charge_type, axis=1)
        cnty_df_list.append(cnty_df)

    full_county_df = pd.concat(cnty_df_list)
    return full_county_df

In [46]:
full_county_df = exclusionary_sampler(county_trip_df, county_pop_df, nvehicles_sub, county, 2035)

Total # of people in county and domicile = mfh: 24001
Sampled 1 individuals from County: Yakima, Vehicle Type: Commercial Crossover, Domicile: mfh, Powertrain: EV 

Total # of people in county and domicile = mfh: 24001
Sampled 0 individuals from County: Yakima, Vehicle Type: Commercial Crossover, Domicile: mfh, Powertrain: PHEV 

Total # of people in county and domicile = sfh: 124277
Sampled 13 individuals from County: Yakima, Vehicle Type: Commercial Crossover, Domicile: sfh, Powertrain: EV 

Total # of people in county and domicile = sfh: 124277
Sampled 1 individuals from County: Yakima, Vehicle Type: Commercial Crossover, Domicile: sfh, Powertrain: PHEV 

Total # of people in county and domicile = mfh: 24001
Sampled 36 individuals from County: Yakima, Vehicle Type: Commercial Sedan, Domicile: mfh, Powertrain: EV 

Total # of people in county and domicile = mfh: 24001
Sampled 29 individuals from County: Yakima, Vehicle Type: Commercial Sedan, Domicile: mfh, Powertrain: PHEV 

Total #

In [37]:
nvehicles_sub.sort_values(by='2022', ascending=False)

Unnamed: 0,County,Vehicle_type,domicile,Powertrain,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035
1836,Yakima,Personal Sedan,sfh,EV,190,3676,7593,10433,13097,14997,16613,17616,18838,19560,20117,20849,21404,21785
1844,Yakima,Personal Truck/SUV,sfh,EV,133,6521,12813,18796,24713,30637,36335,41141,46932,53211,59811,66703,74132,83107
1839,Yakima,Personal Sedan,sfh,PHEV,98,2433,4815,6453,7994,9117,10046,10703,11606,12252,12842,13552,14206,14859
1847,Yakima,Personal Truck/SUV,sfh,PHEV,65,3862,6381,8226,9505,10701,11522,12233,13039,13915,14799,15789,16813,18077
1832,Yakima,Personal Sedan,mfh,EV,49,1025,2121,2918,3665,4197,4651,4933,5274,5476,5632,5838,5994,6103
1840,Yakima,Personal Truck/SUV,mfh,EV,34,1780,3504,5143,6768,8393,9963,11285,12851,14524,16279,18064,19987,22314
1835,Yakima,Personal Sedan,mfh,PHEV,25,688,1363,1828,2264,2581,2846,3032,3290,3470,3634,3836,4021,4209
1843,Yakima,Personal Truck/SUV,mfh,PHEV,18,1063,1757,2265,2617,2945,3170,3367,3593,3845,4102,4380,4667,5020
1812,Yakima,Commercial Sedan,sfh,EV,8,18,36,45,57,65,74,79,88,95,101,112,118,126
1820,Yakima,Commercial Truck/SUV,sfh,EV,6,143,319,507,711,930,1156,1360,1618,1914,2241,2574,2940,3381


In [32]:
full_county_df.drop_duplicates(subset=['person_id']).shape, county_pop_df.shape, full_county_df.shape

((119316, 75), (148278, 3), (753335, 75))