In [57]:
import pandas as pd
import os

### Active bidders preprocessing

In [58]:
active_buyers = pd.read_csv('../data/raw/active_buyers.csv')
active_buyers.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Consumer,TN,71973155,496756,774468-MSTRBDR-41de7231-2f92-41ec-afe2-9236d15...,7300.0,2025-10-23,2023,CADI,XT6,36434.0,38925.0,0.0,7,3965
1,Consumer,TN,84613455,307472,307472cprt_dmmy_MASTERBDR_774468_309348@copart...,1750.0,2025-10-22,2017,CADI,XTS,16824.0,13225.0,15591.11,9,3618
2,Dismantler,CA,84636425,648809,info@cps.bike,90.0,2025-10-22,2025,HYUN,VENUE,21000.0,23000.0,6836.0,5,1008
3,Dismantler,AL,81820645,418252,duartedlm@gmail.com,450.0,2025-10-23,2016,ACUR,TLX,19085.0,14475.0,15839.0,3,141
4,Consumer,TX,70296175,865646,lg954932@gmail.com,175.0,2025-10-24,2004,MAZD,6,4715.72,0.0,4715.72,4,2


In [59]:
active_buyers.shape

(315231, 15)

In [60]:
active_buyers.isnull().sum()

mbr_lic_type                        842
mbr_state                             0
lot_nbr                               0
buyer_nbr                             0
mbr_email                             0
max_bid                               0
inv_dt                                0
lot_year                              0
lot_make_cd                           0
grp_model                          9044
acv                                   0
plug_lot_acv                          0
repair_cost                           0
total_unique_buyers_on_that_lot       0
total_unique_lots_bid_by_buyers       0
dtype: int64

### Missing value imputations based on:
1. mbr lic type: Mode
2. mbr state: Mode
3. grp model : Mode within year and make

In [61]:
def _fill_grp_model_year_make(group: pd.DataFrame) -> pd.DataFrame:
    """Helper: fill grp_model within (lot_year, lot_make_cd)."""
    mode_val = group['grp_model'].mode()
    if not mode_val.empty:
        group['grp_model'] = group['grp_model'].fillna(mode_val[0])
    return group

def _fill_grp_model_make(group: pd.DataFrame) -> pd.DataFrame:
    """Helper: fill grp_model within lot_make_cd."""
    mode_val = group['grp_model'].mode()
    if not mode_val.empty:
        group['grp_model'] = group['grp_model'].fillna(mode_val[0])
    return group

def fill_missing_grp_model(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing 'grp_model' using hierarchical mode logic:
    1. Within (lot_year, lot_make_cd)
    2. Within (lot_make_cd)
    3. Drop remaining rows where grp_model is still NaN
    """
    df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
    df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)
    df = df.dropna(subset=['grp_model'])
    return df

In [62]:
def clean_active_buyers(df: pd.DataFrame) -> pd.DataFrame:

    df['mbr_lic_type'] = df['mbr_lic_type'].fillna(df['mbr_lic_type'].mode()[0])
    df['mbr_state'] = df['mbr_state'].fillna(df['mbr_state'].mode()[0])
    df['mbr_lic_type'] = df['mbr_lic_type'].replace('Automotive Related Business', 'General Business')

    df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
    df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)
    df = df.dropna(subset=['grp_model'])

    df['acv'] = df['acv'].mask(df['acv']<=0, df['plug_lot_acv'])

    return df

In [63]:
active_buyers = clean_active_buyers(active_buyers)

  df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
  df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)


In [64]:
active_buyers

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Consumer,TN,71973155,496756,774468-MSTRBDR-41de7231-2f92-41ec-afe2-9236d15...,7300.0,2025-10-23,2023,CADI,XT6,36434.00,38925.0,0.00,7,3965
1,Consumer,TN,84613455,307472,307472cprt_dmmy_MASTERBDR_774468_309348@copart...,1750.0,2025-10-22,2017,CADI,XTS,16824.00,13225.0,15591.11,9,3618
2,Dismantler,CA,84636425,648809,info@cps.bike,90.0,2025-10-22,2025,HYUN,VENUE,21000.00,23000.0,6836.00,5,1008
3,Dismantler,AL,81820645,418252,duartedlm@gmail.com,450.0,2025-10-23,2016,ACUR,TLX,19085.00,14475.0,15839.00,3,141
4,Consumer,TX,70296175,865646,lg954932@gmail.com,175.0,2025-10-24,2004,MAZD,6,4715.72,0.0,4715.72,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315226,General Business,AR,61362375,807902,elvillaautosales@gmail.com,30.0,2025-10-27,2009,JEP,WRANGLER,4141.00,7850.0,8825.67,8,5
315227,Dismantler,TN,81744215,489065,bobmcap@yahoo.com,2150.0,2025-10-24,2004,JEP,WRANGLER,9465.20,0.0,9465.20,9,23
315228,Dealer,FL,67261295,115359,masterbidllc@gmail.com,2800.0,2025-10-22,2008,JEP,WRANGLER,11200.00,8400.0,0.00,15,135
315229,Dealer,GA,89850035,445680,iamg2022@yahoo.com,7400.0,2025-10-24,2014,JEP,WRANGLER,12350.00,12350.0,0.00,2,32


In [65]:
active_buyers.isnull().sum()

mbr_lic_type                       0
mbr_state                          0
lot_nbr                            0
buyer_nbr                          0
mbr_email                          0
max_bid                            0
inv_dt                             0
lot_year                           0
lot_make_cd                        0
grp_model                          0
acv                                0
plug_lot_acv                       0
repair_cost                        0
total_unique_buyers_on_that_lot    0
total_unique_lots_bid_by_buyers    0
dtype: int64

In [66]:
active_buyers['buyer_nbr'].nunique()

29820

In [67]:
active_buyers.to_csv('../data/processed/active_buyers.csv', index=False)

### Non Active Bidders Preprocessing

In [68]:
non_active_buyers = pd.read_csv('../data/raw/non_active_buyers.csv')
non_active_buyers.head()

Unnamed: 0,mbr_lic_type,mbr_state,mbr_nbr,mbr_email
0,General Business,HI,813714,misterrozayroze@gmail.com
1,Consumer,AK,683392,dnkampong@gmail.com
2,Consumer,ME,688950,mecawilliam1981agosto@gmail.com
3,General Business,AK,916734,jon.boehmler@gmail.com
4,Consumer,HI,932785,tavaressunnee845@gmail.com


In [69]:
def clean_non_active_buyers(df: pd.DataFrame) -> pd.DataFrame:

    df['mbr_lic_type'] = df['mbr_lic_type'].fillna(df['mbr_lic_type'].mode()[0])
    df['mbr_state'] = df['mbr_state'].fillna(df['mbr_state'].mode()[0])
    df['mbr_lic_type'] = df['mbr_lic_type'].replace('Automotive Related Business', 'General Business')
    df = df.rename(columns={'mbr_lic_type': 'buyer_type'})

    return df

In [70]:
non_active_buyers = clean_non_active_buyers(non_active_buyers)

In [71]:
non_active_buyers.head()

Unnamed: 0,buyer_type,mbr_state,mbr_nbr,mbr_email
0,General Business,HI,813714,misterrozayroze@gmail.com
1,Consumer,AK,683392,dnkampong@gmail.com
2,Consumer,ME,688950,mecawilliam1981agosto@gmail.com
3,General Business,AK,916734,jon.boehmler@gmail.com
4,Consumer,HI,932785,tavaressunnee845@gmail.com


In [72]:
non_active_buyers['buyer_type'].value_counts()

buyer_type
Consumer            174083
Dealer               13978
General Business      4358
Dismantler            2664
Export                1329
Name: count, dtype: int64

In [73]:
non_active_buyers.isnull().sum()

buyer_type    0
mbr_state     0
mbr_nbr       0
mbr_email     0
dtype: int64

In [74]:
non_active_buyers['mbr_nbr'].nunique()

196412

In [75]:
non_active_buyers.to_csv('../data/processed/non_active_buyers.csv', index=False)

### Popular Lots Preprocessing

In [92]:
popular_lots = pd.read_csv('../data/raw/popular_lots.csv')

In [93]:
def clean_popular_lots(popular_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and ranks popular lots data:
    - Replace 'Automotive Related Business' → 'General Business'
    - Fill missing buyer_type with mode
    - Fill grp_model hierarchically
    - Deduplicate and keep top 6 per (buyer_type, mbr_state)
    """
    # Replace business type
    popular_df['buyer_type'] = popular_df['buyer_type'].replace(
        'Automotive Related Business', 'General Business'
    )

    # Fill buyer_type with mode
    mode_val = popular_df['buyer_type'].mode()
    if not mode_val.empty:
        popular_df['buyer_type'] = popular_df['buyer_type'].fillna(mode_val[0])

    # Fill grp_model using make-level mode
    popular_df = popular_df.groupby('grp_model', group_keys=False).apply(_fill_grp_model_make)

    # Replace acv with plug_lot_acv where acv is 0 or negative
    popular_df['median_acv'] = popular_df['median_acv'].mask(popular_df['median_acv']<=0, popular_df['median_plug_lot_acv'])

    # Sort + deduplicate
    popular_df_sorted = (
        popular_df
        .sort_values(['buyer_type', 'mbr_state', 'cnt'], ascending=[True, True, False])
        .drop_duplicates(subset=['buyer_type', 'mbr_state', 'lot_make_cd', 'grp_model'])
    )

    # Rank within buyer_type + state
    popular_df_sorted['rank_clean'] = (
        popular_df_sorted.groupby(['buyer_type', 'mbr_state']).cumcount() + 1
    )

    # Keep only top 6
    return popular_df_sorted[popular_df_sorted['rank_clean'] <= 6]

In [94]:
popular_lots = clean_popular_lots(popular_lots)

  popular_df = popular_df.groupby('grp_model', group_keys=False).apply(_fill_grp_model_make)


In [95]:
popular_lots

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
180,Consumer,AK,JEP,COMPASS,1,16089.00,12825.0,17984.54,1,1,1
181,Consumer,AK,CHEV,EQUINOX,1,14942.00,16325.0,14463.44,1,2,2
182,Consumer,AK,HOND,RIDGELINE,1,33517.00,35600.0,31893.54,1,3,3
183,Consumer,AK,NISS,ROGUE,1,14197.89,14325.0,14197.89,1,4,4
184,Consumer,AK,JEP,RENEGADE,1,13625.00,13625.0,0.00,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
865,General Business,TX,NISS,VERSA,1,14552.00,14550.0,7337.38,1,6,6
474,General Business,VA,TOYT,COROLLA,3,16431.00,16950.0,16376.31,1,1,1
588,General Business,WA,CHRY,PACIFICA,1,17982.00,25050.0,15576.33,1,1,1
589,General Business,WA,HOND,FIT,1,14224.00,14300.0,18035.98,1,2,2


In [96]:
popular_lots[(popular_lots['buyer_type']=='Consumer') & (popular_lots['mbr_state']=='NJ')]

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
209,Consumer,NJ,FORD,MUSTANG,7,21931.0,28175.0,18924.64,1,1,1
210,Consumer,NJ,HOND,CRV,5,33115.0,32650.0,0.0,1,2,2
211,Consumer,NJ,NISS,SENTRA,5,19097.0,22700.0,17159.33,1,3,3
212,Consumer,NJ,TOYT,CAMRY,5,22745.0,27150.0,18632.89,1,4,4
213,Consumer,NJ,SUBA,FORESTER,4,14807.0,22925.0,15394.49,1,5,5
214,Consumer,NJ,TOYT,COROLLA,4,19121.0,20075.0,13365.99,1,6,6


In [97]:
popular_lots.isnull().sum()

buyer_type             0
mbr_state              0
lot_make_cd            0
grp_model              0
cnt                    0
median_acv             0
median_plug_lot_acv    0
median_repair_cost     0
model_rank             0
rank                   0
rank_clean             0
dtype: int64

In [98]:
popular_lots.to_csv('../data/processed/popular_lots.csv', index=False)

### Upcoming Lots Preprocessing

In [99]:
upcoming_lots = pd.read_csv('../data/raw/upcoming_lots.csv')

In [100]:
def clean_upcoming_lots(df: pd.DataFrame) -> pd.DataFrame:

    mode_val = df['damage_type_desc'].mode()
    if not mode_val.empty:
        df['damage_type_desc'] = df['damage_type_desc'].fillna(mode_val[0])

    df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
    df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)
    df = df.dropna(subset=['grp_model'])

    df['acv'] = df['acv'].mask(df['acv']<=0, df['plug_lot_acv'])

    return df

In [101]:
upcoming_lots = clean_upcoming_lots(upcoming_lots)

  df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
  df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)


In [102]:
upcoming_lots.isnull().sum()

lot_nbr             0
lot_year            0
lot_make_cd         0
grp_model           0
damage_type_desc    0
repair_cost         0
acv                 0
plug_lot_acv        0
auc_dt              0
proquote_amt        0
dtype: int64

In [103]:
upcoming_lots.to_csv('../data/processed/upcoming_lots.csv', index=False)
active_buyers.to_csv('../data/processed/active_buyers.csv', index=False)
popular_lots.to_csv('../data/processed/popular_lots.csv', index=False)
non_active_buyers.to_csv('../data/processed/non_active_buyers.csv', index=False)

### Dividing in groups , Collaborative vs one-to-one

In [104]:
def divide_in_groups(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    This function split the data frame into Collaborative and 1-1 filtering groups
    """
    data_high = df[df['total_unique_lots_bid_by_buyers']>=7]
    data_low = df[df['total_unique_lots_bid_by_buyers']<7]

    return data_high, data_low

### Splitting in odd(test) and even(holdout)

In [105]:
def odd_even_split(df: pd.DataFrame, buyer_col) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    This function split the data frame into Test and Holdout groups based on odd and even buyer numbers
    """
    df = df.copy()
    df['last_digit'] = df[buyer_col] % 10
    holdout_df = df[df['last_digit'] % 2 == 0].drop(columns='last_digit')
    test_df = df[df['last_digit'] % 2 != 0].drop(columns='last_digit')

    return holdout_df, test_df

In [106]:
def save_split_data(df: pd.DataFrame, output_path: str) -> None:
    """This function will save the split data into csv files"""
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f" Saved: {output_path} ({len(df):,} rows)")


In [107]:
active_buyers = pd.read_csv('../data/processed/active_buyers.csv')
nonactive_buyers = pd.read_csv('../data/processed/non_active_buyers.csv')

In [108]:
print(f"Number of active buyers: {active_buyers['buyer_nbr'].nunique()}")

## Group split
data_high, data_low = divide_in_groups(active_buyers)
print(f"Number of active buyers in CF: {data_high['buyer_nbr'].nunique()}")
print(f"Number of active buyers in one-to-one: {data_low['buyer_nbr'].nunique()}")

## Test vs Holdout split
holdout_df_cf, test_df_cf = odd_even_split(data_high, buyer_col='buyer_nbr')
print(f"Number of CF buyers in Test: {test_df_cf['buyer_nbr'].nunique()}")
print(f"Number of CF buyers in Control: {holdout_df_cf['buyer_nbr'].nunique()}")

holdout_df_onetoone, test_df_onetoone = odd_even_split(data_low, buyer_col='buyer_nbr')
print(f"Number of one-to-one buyers in Test: {test_df_onetoone['buyer_nbr'].nunique()}")
print(f"Number of one-to-one buyers in Control: {holdout_df_onetoone['buyer_nbr'].nunique()}")
## Non-active buyers
print(f"Number of non-active buyers: {nonactive_buyers['mbr_nbr'].nunique()}")
holdout_df_nonactive, test_df_nonactive = odd_even_split(nonactive_buyers, buyer_col='mbr_nbr')
print(f"Number of non-active buyers in Test: {test_df_nonactive['mbr_nbr'].nunique()}")
print(f"Number of non-active buyers in Control: {holdout_df_nonactive['mbr_nbr'].nunique()}")

Number of active buyers: 29820
Number of active buyers in CF: 7312
Number of active buyers in one-to-one: 22508
Number of CF buyers in Test: 3618
Number of CF buyers in Control: 3694
Number of one-to-one buyers in Test: 11250
Number of one-to-one buyers in Control: 11258
Number of non-active buyers: 196412
Number of non-active buyers in Test: 98100
Number of non-active buyers in Control: 98312


In [109]:
test_df_nonactive.to_csv('../data/split/nonactive_test.csv', index=False)
holdout_df_nonactive.to_csv('../data/split/nonactive_holdout.csv', index=False)

test_df_cf.to_csv('../data/split/cf_test.csv', index=False)
holdout_df_cf.to_csv('../data/split/cf_holdout.csv', index=False)

test_df_onetoone.to_csv('../data/split/one_to_one_test.csv', index=False)
holdout_df_onetoone.to_csv('../data/split/one_to_one_holdout.csv', index=False)

### Westlake

In [52]:
import pandas as pd
df_wl = pd.read_csv('../data/westlake/wl_lots.csv')

In [53]:
df_wl.head()

Unnamed: 0,lot_nbr,lot_stg,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,84820275,50,2023,DODG,CHARGER,FRONT END,7776.0,24820.0,29250.0,,9558.85
1,87468275,40,2017,HYUN,SONATA,MINOR DENT/SCRATCHES,3218.0,8600.0,10400.0,,0.0
2,89514045,40,2014,GMC,SIERRA,MINOR DENT/SCRATCHES,15422.0,10060.0,13025.0,,2683.11
3,86683585,50,2010,GMC,YUKON,MECHANICAL,18297.0,7920.0,8250.0,,886.33
4,89451755,40,2014,RAM,1500,NORMAL WEAR,0.0,12920.0,16600.0,,0.0


In [54]:
df_wl.isnull().sum()

lot_nbr               0
lot_stg               0
lot_year              0
lot_make_cd           0
grp_model             1
damage_type_desc      0
repair_cost           0
acv                   0
plug_lot_acv          0
auc_dt              222
proquote_amt          0
dtype: int64

In [55]:
df_wl['grp_model'] = df_wl['grp_model'].fillna(df_wl['grp_model'].mode()[0])

In [56]:
df_wl.to_csv('../data/westlake/wl_lots_cleaned.csv',index=False)