In [1]:
import pandas as pd
import dask.dataframe as dd
datadir = '../../data/'

In [2]:
ev_df = pd.DataFrame({
    'year': [2022, 2022, 2023, 2023],
    'county': ['King', 'Spokane', 'King', 'Spokane'],
    'nEV': [5, 2, 6, 3],
    'efficiency_kwh_mi': [0.3, 0.3, 0.4, 0.4],
})
ev_df

Unnamed: 0,year,county,nEV,efficiency_kwh_mi
0,2022,King,5,0.3
1,2022,Spokane,2,0.3
2,2023,King,6,0.4
3,2023,Spokane,3,0.4


In [3]:
import numpy as np


In [4]:
import pandas as pd
import numpy as np

# Sample dataframe with people and their home county
people_df = pd.DataFrame({
    'person_id': np.arange(20),
    'County': ['King']*10 + ['Spokane']*10,
})
# Population dataframe with county, year, and number of people to select
ev_df = pd.DataFrame({
    'Year': [2022, 2022, 2023, 2023],
    'County': ['King', 'Spokane', 'King', 'Spokane'],
    'nEV': [5, 2, 6, 3],
    'efficiency_kwh_mi': [0.3, 0.3, 0.4, 0.4],
})

selected_people = []
for _, row in ev_df.iterrows():
    county = row['County']
    year = row['Year']
    efficiency_kwh_mi = row['efficiency_kwh_mi']
    num_to_select = row['nEV']
    county_df = people_df[(people_df['County'] == county)]
    selected = county_df.sample(n=num_to_select, replace=False, random_state=42)
    # Add the year column to the selected dataframe
    selected['Year'] = year  
    # Add the efficiency column to the selected dataframe
    selected['efficiency_kwh_mi'] = efficiency_kwh_mi  
    selected_people.append(selected)

# Concatenate the selected people into a final dataframe
final_df = pd.concat(selected_people)
final_df  

Unnamed: 0,person_id,County,Year,efficiency_kwh_mi
8,8,King,2022,0.3
1,1,King,2022,0.3
5,5,King,2022,0.3
0,0,King,2022,0.3
7,7,King,2022,0.3
18,18,Spokane,2022,0.3
11,11,Spokane,2022,0.3
8,8,King,2023,0.4
1,1,King,2023,0.4
5,5,King,2023,0.4


# to make this work, we need to add counties to the large dataframe. 

The problem with this is that commercial trips dont aren't modeled in the population table so they do not have home counties. We can use distination blockgroups for them and then match the blockgroups to counties. 

In [2]:
df = pd.read_parquet(datadir+'/wa_pop_and_trips_sorted.parquet') # len = 51727268
counties = pd.read_parquet(datadir+'/population_counties_dataset.parquet', engine='pyarrow')
cdf = pd.merge(df, counties, on='person_id', how='left')

# change data types for consistency
cdf['home_cty'] = cdf['home_cty'].astype(str)
cdf['home_st'] = cdf['home_st'].astype(str)

cdf.to_parquet(datadir+'/wa_pop_and_trips_sorted_county.parquet', engine='pyarrow')

In [2]:
cdf = dd.read_parquet(datadir+'/wa_pop_and_trips_sorted_county.parquet', engine='pyarrow')

In [6]:
cdf['mode'].value_counts()

mode
PRIVATE_AUTO         27124460
CARPOOL              16417151
WALKING               4937625
COMMERCIAL            1452450
OTHER_TRAVEL_MODE      632101
ON_DEMAND_AUTO         465640
PUBLIC_TRANSIT         406620
BIKING                 291221
Name: count, dtype: int64

In [13]:
cdf.loc[cdf['mode'] == 'PRIVATE_AUTO']['home_cty']

0             Grant County, WA
1             Grant County, WA
6             Grant County, WA
7             Grant County, WA
8             Grant County, WA
                   ...        
51727263       King County, WA
51727264       King County, WA
51727265       King County, WA
51727266    Spokane County, WA
51727267    Spokane County, WA
Name: home_cty, Length: 27124460, dtype: object

In [3]:
# read in blockgroup info
bg_df = dd.read_csv(datadir+'blockgroup_counties.csv')
bg_df['destination_bgrp'] = bg_df.destination_bgrp.astype(str)
bg_df['destination_county'] = bg_df.County.astype(str)

In [9]:
weekday = 'thursday'
mode = 'COMMERCIAL'
vehicle_type = 'HEAVY_COMMERCIAL'
hdvs = cdf.loc[cdf['vehicle_type'] == vehicle_type]
# merge with the blockgroup county info
hdvs_cnty = pd.merge(hdvs, bg_df, on='destination_bgrp', how='left')

In [4]:
merged_df = dd.merge(cdf, bg_df, on='destination_bgrp', how='left')

In [12]:
merged_df[['home_cty', 'County']]

Unnamed: 0,home_cty,County
0,"Grant County, WA",Grant County
1,"Grant County, WA",Grant County
2,"Grant County, WA",Grant County
3,"Grant County, WA",Grant County
4,"Grant County, WA",Grant County
...,...,...
57909189,"King County, WA",King County
57909190,"King County, WA",King County
57909191,"King County, WA",King County
57909192,"Spokane County, WA",Spokane County


In [15]:
merged_df[['home_cty', 'County']].value_counts()

home_cty              County            
King County, WA       King County           13053211
Pierce County, WA     Pierce County          5351887
                      Washington (state)     5351887
Snohomish County, WA  Snohomish County       4745474
Spokane County, WA    Spokane County         3398524
                                              ...   
Wasco County, OR      Adams County                 1
Multnomah County, OR  San Juan County              1
Wallowa County, OR    Columbia County              1
Clatsop County, OR    Whatcom County               1
Shoshone County, ID   Lincoln County               1
Name: count, Length: 2510, dtype: int64

In [18]:
counts = merged_df['home_cty'].value_counts(dropna=False)

In [22]:
counts

home_cty
King County, WA           14216415
Pierce County, WA         11450816
Snohomish County, WA       5535369
Spokane County, WA         3447740
Clark County, WA           2932717
                            ...   
Carter County, MT                1
Sweet Grass County, MT           1
Lincoln County, ID               1
Clark County, ID                 1
Power County, ID                 1
Name: count, Length: 168, dtype: int64


In [23]:
merged_df['home_cty'].unique()

array(['Grant County, WA', 'Clark County, WA', 'Kitsap County, WA',
       'Thurston County, WA', 'nan', 'Snohomish County, WA',
       'Jefferson County, WA', 'Spokane County, WA', 'Whatcom County, WA',
       'King County, WA', 'Pierce County, WA', 'Island County, WA',
       'Klickitat County, WA', 'Mason County, WA', 'Cowlitz County, WA',
       'Benton County, WA', 'Multnomah County, OR', 'Skagit County, WA',
       'Kittitas County, WA', 'Yakima County, WA', 'Chelan County, WA',
       'Franklin County, WA', 'Grays Harbor County, WA',
       'Stevens County, WA', 'Okanogan County, WA', 'Lincoln County, WA',
       'Clallam County, WA', 'Walla Walla County, WA',
       'Bonner County, ID', 'Lewis County, WA', 'Latah County, ID', '\\N',
       'Pend Oreille County, WA', 'Whitman County, WA',
       'Washington County, OR', 'Douglas County, WA',
       'Skamania County, WA', 'Adams County, WA', 'Marion County, OR',
       'Wahkiakum County, WA', 'Asotin County, WA', 'Pacific County,

In [24]:
merged_df['County'].unique()

array(['Grant County', 'Clark County', 'King County', 'Kitsap County',
       'Pierce County', 'Washington (state)', 'Thurston County',
       'Spokane County', 'Snohomish County', 'Jefferson County',
       'Skagit County', 'Whatcom County', 'Island County',
       'Klickitat County', 'Lewis County', 'Mason County',
       'Cowlitz County', 'Benton County', 'Yakima County',
       'Kittitas County', 'Chelan County', 'Franklin County',
       'Grays Harbor County', 'Douglas County', 'Walla Walla County',
       'Stevens County', 'Okanogan County', 'Whitman County',
       'Lincoln County', 'Clallam County', 'Pend Oreille County',
       'Asotin County', 'Adams County', 'Skamania County',
       'Pacific County', 'Wahkiakum County', 'Columbia County',
       'Ferry County', 'San Juan County', 'Garfield County'], dtype=object)

In [5]:
merged_df['destination_county'] = merged_df['County'] + ', WA'

In [6]:
merged_df.to_parquet(datadir+'wa_pop_and_trips_sorted_county_2.parquet', engine='pyarrow')