In [1]:
import pandas as pd
import os

### Active bidders preprocessing

In [13]:
active_buyers = pd.read_csv('../data/raw/active_buyers.csv')
active_buyers.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Consumer,NV,66043575,720911,zdemian0@gmail.com,1300.0,2025-08-08,2014,HYUN,SONATA,325.0,7350.0,0.0,12,1
1,Dealer,SD,64790825,67677,swish@itctel.com,2450.0,2025-08-18,2017,CHEV,CRUZE,13001.0,11000.0,10252.0,5,20
2,Consumer,AK,62290835,600349,dwightgrossnickle10@gmail.com,20.0,2025-08-06,2016,CHEV,EQUINOX,7220.0,8550.0,6849.0,8,295
3,Dealer,NH,68126475,514361,allan_kouba@hotmail.com,9300.0,2025-09-11,2014,BMW,X5,0.0,10475.0,0.0,10,946
4,Dealer,NH,65413175,219425,realdz1969@aol.com,275.0,2025-09-02,2011,FORD,TAURUS,7022.0,5525.0,7563.73,4,2798


In [14]:
active_buyers.shape

(3833551, 15)

In [15]:
active_buyers.isnull().sum()

mbr_lic_type                        14377
mbr_state                               0
lot_nbr                                 0
buyer_nbr                               0
mbr_email                               0
max_bid                                 0
inv_dt                                  0
lot_year                                0
lot_make_cd                             0
grp_model                          103716
acv                                     0
plug_lot_acv                            0
repair_cost                             0
total_unique_buyers_on_that_lot         0
total_unique_lots_bid_by_buyers         0
dtype: int64

### Missing value imputations based on:
1. mbr lic type: Mode
2. mbr state: Mode
3. grp model : Mode within year and make

In [16]:
def _fill_grp_model_year_make(group: pd.DataFrame) -> pd.DataFrame:
    """Helper: fill grp_model within (lot_year, lot_make_cd)."""
    mode_val = group['grp_model'].mode()
    if not mode_val.empty:
        group['grp_model'] = group['grp_model'].fillna(mode_val[0])
    return group

def _fill_grp_model_make(group: pd.DataFrame) -> pd.DataFrame:
    """Helper: fill grp_model within lot_make_cd."""
    mode_val = group['grp_model'].mode()
    if not mode_val.empty:
        group['grp_model'] = group['grp_model'].fillna(mode_val[0])
    return group

def fill_missing_grp_model(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing 'grp_model' using hierarchical mode logic:
    1. Within (lot_year, lot_make_cd)
    2. Within (lot_make_cd)
    3. Drop remaining rows where grp_model is still NaN
    """
    df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
    df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)
    df = df.dropna(subset=['grp_model'])
    return df

In [17]:
def clean_active_buyers(df: pd.DataFrame) -> pd.DataFrame:

    df['mbr_lic_type'] = df['mbr_lic_type'].fillna(df['mbr_lic_type'].mode()[0])
    df['mbr_state'] = df['mbr_state'].fillna(df['mbr_state'].mode()[0])
    df['mbr_lic_type'] = df['mbr_lic_type'].replace('Automotive Related Business', 'General Business')

    df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
    df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)
    df = df.dropna(subset=['grp_model'])

    df['acv'] = df['acv'].mask(df['acv']<=0, df['plug_lot_acv'])

    return df

In [18]:
active_buyers = clean_active_buyers(active_buyers)

  df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
  df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)


In [19]:
active_buyers

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Consumer,NV,66043575,720911,zdemian0@gmail.com,1300.0,2025-08-08,2014,HYUN,SONATA,325.0,7350.0,0.00,12,1
1,Dealer,SD,64790825,67677,swish@itctel.com,2450.0,2025-08-18,2017,CHEV,CRUZE,13001.0,11000.0,10252.00,5,20
2,Consumer,AK,62290835,600349,dwightgrossnickle10@gmail.com,20.0,2025-08-06,2016,CHEV,EQUINOX,7220.0,8550.0,6849.00,8,295
3,Dealer,NH,68126475,514361,allan_kouba@hotmail.com,9300.0,2025-09-11,2014,BMW,X5,10475.0,10475.0,0.00,10,946
4,Dealer,NH,65413175,219425,realdz1969@aol.com,275.0,2025-09-02,2011,FORD,TAURUS,7022.0,5525.0,7563.73,4,2798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3833546,Dealer,MA,59103975,396746,phillip.messina577@gmail.com,4250.0,2025-08-19,2017,JEP,WRANGLER,19173.0,23200.0,14798.21,7,286
3833547,Dismantler,TN,65029795,764700,rhonda.colding@roadtestedparts.com,3100.0,2025-08-05,2013,JEP,WRANGLER,13422.0,12775.0,12069.54,4,5445
3833548,Dismantler,TN,70660125,489065,bobmcap@yahoo.com,1300.0,2025-10-06,2003,JEP,WRANGLER,6942.0,0.0,6772.00,7,191
3833549,Dismantler,AL,80331235,506597,sabinomotors2015@hotmail.com,300.0,2025-09-22,2015,JEP,WRANGLER,18277.0,13125.0,29417.39,6,287


In [20]:
active_buyers.isnull().sum()

mbr_lic_type                       0
mbr_state                          0
lot_nbr                            0
buyer_nbr                          0
mbr_email                          0
max_bid                            0
inv_dt                             0
lot_year                           0
lot_make_cd                        0
grp_model                          0
acv                                0
plug_lot_acv                       0
repair_cost                        0
total_unique_buyers_on_that_lot    0
total_unique_lots_bid_by_buyers    0
dtype: int64

In [21]:
active_buyers['buyer_nbr'].nunique()

80757

In [22]:
active_buyers.to_csv('../data/processed/active_buyers.csv', index=False)

### Non Active Bidders Preprocessing

In [24]:
non_active_buyers = pd.read_csv('../data/raw/non_active_buyers.csv')
non_active_buyers.head()

Unnamed: 0,mbr_lic_type,mbr_state,mbr_nbr,mbr_email
0,Consumer,VT,773347,hybridepoxyfloors@icloud.com
1,Consumer,DE,157710,travisbarron.tb@gmail.com
2,Consumer,HI,819770,greg@athomehawaii.com
3,Consumer,ME,119551,Bquimby20@gmail.com
4,Consumer,DE,621209,info@dooillc.org


In [25]:
def clean_non_active_buyers(df: pd.DataFrame) -> pd.DataFrame:

    df['mbr_lic_type'] = df['mbr_lic_type'].fillna(df['mbr_lic_type'].mode()[0])
    df['mbr_state'] = df['mbr_state'].fillna(df['mbr_state'].mode()[0])
    df['mbr_lic_type'] = df['mbr_lic_type'].replace('Automotive Related Business', 'General Business')
    df = df.rename(columns={'mbr_lic_type': 'buyer_type'})

    return df

In [26]:
non_active_buyers = clean_non_active_buyers(non_active_buyers)

In [27]:
non_active_buyers.head()

Unnamed: 0,buyer_type,mbr_state,mbr_nbr,mbr_email
0,Consumer,VT,773347,hybridepoxyfloors@icloud.com
1,Consumer,DE,157710,travisbarron.tb@gmail.com
2,Consumer,HI,819770,greg@athomehawaii.com
3,Consumer,ME,119551,Bquimby20@gmail.com
4,Consumer,DE,621209,info@dooillc.org


In [28]:
non_active_buyers['buyer_type'].value_counts()

buyer_type
Consumer            130711
Dealer                7238
General Business      2825
Dismantler            1204
Export                 709
Name: count, dtype: int64

In [29]:
non_active_buyers.isnull().sum()

buyer_type    0
mbr_state     0
mbr_nbr       0
mbr_email     0
dtype: int64

In [30]:
non_active_buyers['mbr_nbr'].nunique()

142687

In [31]:
non_active_buyers.to_csv('../data/processed/non_active_buyers.csv', index=False)

### Popular Lots Preprocessing

In [32]:
popular_lots = pd.read_csv('../data/raw/popular_lots.csv')

In [33]:
def clean_popular_lots(popular_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and ranks popular lots data:
    - Replace 'Automotive Related Business' → 'General Business'
    - Fill missing buyer_type with mode
    - Fill grp_model hierarchically
    - Deduplicate and keep top 6 per (buyer_type, mbr_state)
    """
    # Replace business type
    popular_df['buyer_type'] = popular_df['buyer_type'].replace(
        'Automotive Related Business', 'General Business'
    )

    # Fill buyer_type with mode
    mode_val = popular_df['buyer_type'].mode()
    if not mode_val.empty:
        popular_df['buyer_type'] = popular_df['buyer_type'].fillna(mode_val[0])

    # Fill grp_model using make-level mode
    popular_df = popular_df.groupby('lot_make_cd', group_keys=False).apply(_fill_grp_model_make)

    # Replace acv with plug_lot_acv where acv is 0 or negative
    popular_df['median_acv'] = popular_df['median_acv'].mask(popular_df['median_acv']<=0, popular_df['median_plug_lot_acv'])

    # Sort + deduplicate
    popular_df_sorted = (
        popular_df
        .sort_values(['buyer_type', 'mbr_state', 'cnt'], ascending=[True, True, False])
        .drop_duplicates(subset=['buyer_type', 'mbr_state', 'lot_make_cd', 'grp_model'])
    )

    # Rank within buyer_type + state
    popular_df_sorted['rank_clean'] = (
        popular_df_sorted.groupby(['buyer_type', 'mbr_state']).cumcount() + 1
    )

    # Keep only top 6
    return popular_df_sorted[popular_df_sorted['rank_clean'] <= 6]

In [34]:
popular_lots = clean_popular_lots(popular_lots)

  popular_df = popular_df.groupby('lot_make_cd', group_keys=False).apply(_fill_grp_model_make)


In [35]:
popular_lots

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
206,Consumer,AK,JEP,RENEGADE,6,6316.05,10900.0,961.00,1,1,1
207,Consumer,AK,KIA,SOUL,4,14091.00,18400.0,14091.00,1,2,2
208,Consumer,AK,JEP,COMPASS,4,16089.00,15175.0,20069.70,1,3,3
209,Consumer,AK,JEP,CHEROKEE,3,13012.00,18725.0,13485.75,1,4,4
210,Consumer,AK,JEP,WRANGLER,3,24008.70,22725.0,16335.57,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
1059,General Business,WY,HOND,CRV,4,19349.00,18525.0,15725.46,1,1,1
1060,General Business,WY,TOYT,COROLLA,4,16219.00,12050.0,11816.03,1,2,2
1061,General Business,WY,TOYT,RAV4,2,23603.00,19725.0,17271.13,1,3,3
1062,General Business,WY,MITS,OUTLANDER,1,13952.00,13200.0,21315.89,1,4,4


In [36]:
popular_lots[(popular_lots['buyer_type']=='Consumer') & (popular_lots['mbr_state']=='NJ')]

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
174,Consumer,NJ,JEP,CHEROKEE,49,22400.0,23675.0,14613.98,1,1,1
175,Consumer,NJ,CHRY,PACIFICA,45,20221.0,23450.0,13979.88,1,2,2
176,Consumer,NJ,TOYT,CAMRY,37,18331.0,22225.0,15069.26,1,3,3
177,Consumer,NJ,TOYT,RAV4,35,24856.0,30800.0,20315.0,1,4,4
178,Consumer,NJ,HYUN,ELANTRA,32,15982.25,20150.0,12584.77,1,5,5
179,Consumer,NJ,TOYT,COROLLA,30,16238.0,22300.0,12868.07,1,6,6


In [37]:
popular_lots.isnull().sum()

buyer_type             0
mbr_state              0
lot_make_cd            0
grp_model              0
cnt                    0
median_acv             0
median_plug_lot_acv    0
median_repair_cost     0
model_rank             0
rank                   0
rank_clean             0
dtype: int64

In [38]:
popular_lots.to_csv('../data/processed/popular_lots.csv', index=False)

### Upcoming Lots Preprocessing

In [41]:
upcoming_lots = pd.read_csv('../data/raw/upcoming_lots.csv')

In [42]:
def clean_upcoming_lots(df: pd.DataFrame) -> pd.DataFrame:

    mode_val = df['damage_type_desc'].mode()
    if not mode_val.empty:
        df['damage_type_desc'] = df['damage_type_desc'].fillna(mode_val[0])

    df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
    df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)
    df = df.dropna(subset=['grp_model'])

    df['acv'] = df['acv'].mask(df['acv']<=0, df['plug_lot_acv'])

    return df

In [43]:
upcoming_lots = clean_upcoming_lots(upcoming_lots)

  df = df.groupby(['lot_year', 'lot_make_cd'], group_keys=False).apply(_fill_grp_model_year_make)
  df = df.groupby(['lot_make_cd'], group_keys=False).apply(_fill_grp_model_make)


In [44]:
upcoming_lots.isnull().sum()

lot_nbr             0
lot_year            0
lot_make_cd         0
grp_model           0
damage_type_desc    0
repair_cost         0
acv                 0
plug_lot_acv        0
auc_dt              0
proquote_amt        0
dtype: int64

In [45]:
upcoming_lots.to_csv('../data/processed/upcoming_lots.csv', index=False)
active_buyers.to_csv('../data/processed/active_buyers.csv', index=False)
popular_lots.to_csv('../data/processed/popular_lots.csv', index=False)
non_active_buyers.to_csv('../data/processed/non_active_buyers.csv', index=False)

### Dividing in groups , Collaborative vs one-to-one

In [46]:
def divide_in_groups(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    This function split the data frame into Collaborative and 1-1 filtering groups
    """
    data_high = df[df['total_unique_lots_bid_by_buyers']>=7]
    data_low = df[df['total_unique_lots_bid_by_buyers']<7]

    return data_high, data_low

### Splitting in odd(test) and even(holdout)

In [47]:
def odd_even_split(df: pd.DataFrame, buyer_col) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    This function split the data frame into Test and Holdout groups based on odd and even buyer numbers
    """
    df = df.copy()
    df['last_digit'] = df[buyer_col] % 10
    holdout_df = df[df['last_digit'] % 2 == 0].drop(columns='last_digit')
    test_df = df[df['last_digit'] % 2 != 0].drop(columns='last_digit')

    return holdout_df, test_df

In [48]:
def save_split_data(df: pd.DataFrame, output_path: str) -> None:
    """This function will save the split data into csv files"""
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f" Saved: {output_path} ({len(df):,} rows)")


In [49]:
active_buyers = pd.read_csv('../data/processed/active_buyers.csv')
nonactive_buyers = pd.read_csv('../data/processed/non_active_buyers.csv')

In [50]:
print(f"Number of active buyers: {active_buyers['buyer_nbr'].nunique()}")

## Group split
data_high, data_low = divide_in_groups(active_buyers)
print(f"Number of active buyers in CF: {data_high['buyer_nbr'].nunique()}")
print(f"Number of active buyers in one-to-one: {data_low['buyer_nbr'].nunique()}")

## Test vs Holdout split
holdout_df_cf, test_df_cf = odd_even_split(data_high, buyer_col='buyer_nbr')
print(f"Number of CF buyers in Test: {test_df_cf['buyer_nbr'].nunique()}")
print(f"Number of CF buyers in Control: {holdout_df_cf['buyer_nbr'].nunique()}")

holdout_df_onetoone, test_df_onetoone = odd_even_split(data_low, buyer_col='buyer_nbr')
print(f"Number of one-to-one buyers in Test: {test_df_onetoone['buyer_nbr'].nunique()}")
print(f"Number of one-to-one buyers in Control: {holdout_df_onetoone['buyer_nbr'].nunique()}")
## Non-active buyers
print(f"Number of non-active buyers: {nonactive_buyers['mbr_nbr'].nunique()}")
holdout_df_nonactive, test_df_nonactive = odd_even_split(nonactive_buyers, buyer_col='mbr_nbr')
print(f"Number of non-active buyers in Test: {test_df_nonactive['mbr_nbr'].nunique()}")
print(f"Number of non-active buyers in Control: {holdout_df_nonactive['mbr_nbr'].nunique()}")

Number of active buyers: 80757
Number of active buyers in CF: 33179
Number of active buyers in one-to-one: 47578
Number of CF buyers in Test: 16486
Number of CF buyers in Control: 16693
Number of one-to-one buyers in Test: 23702
Number of one-to-one buyers in Control: 23876
Number of non-active buyers: 142687
Number of non-active buyers in Test: 71327
Number of non-active buyers in Control: 71360


In [51]:
test_df_nonactive.to_csv('../data/split/nonactive_test.csv', index=False)
holdout_df_nonactive.to_csv('../data/split/nonactive_holdout.csv', index=False)

test_df_cf.to_csv('../data/split/cf_test.csv', index=False)
holdout_df_cf.to_csv('../data/split/cf_holdout.csv', index=False)

test_df_onetoone.to_csv('../data/split/one_to_one_test.csv', index=False)
holdout_df_onetoone.to_csv('../data/split/one_to_one_holdout.csv', index=False)

In [123]:
holdout_df_cf.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Dealer,WA,71511795,504922,21ramirezcars@gmail.com,3300.0,2025-09-23,2016,AUDI,A3,13561.0,10650.0,15542.18,8,121
2,Consumer,NY,60592415,634574,americanu961@yahoo.com,3000.0,2025-07-17,2020,BMW,X3,24637.0,22275.0,14269.97,3,495
4,Dismantler,TX,62463125,34736,chavas1428@msn.com,1850.0,2025-08-06,2016,BUIC,ENCLAVE,13241.0,11150.0,11016.75,15,56
5,Dismantler,NY,62464515,549138,RICK.WILBERT@WILBERTS.COM,1900.0,2025-07-14,2016,BMW,2 SERIES,19121.0,14475.0,16830.37,5,1477
6,Dismantler,MN,62474775,62636,elite-autoparts@hotmail.com,1200.0,2025-08-04,1989,CADI,ALL OTHER,4985.0,0.0,3067.21,8,7


In [124]:
holdout_df_cf[holdout_df_cf['buyer_nbr']==9484]

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
277,General Business,AK,89401545,9484,vitaliy.bocharov@yahoo.com,175.0,2025-07-15,2018,KIA,SOUL,12054.0,12750.0,11650.29,4,91


In [128]:
popular_lots[(popular_lots['buyer_type']=='General Business') & (popular_lots['mbr_state']=='AK')]

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
1056,General Business,AK,JEP,CHEROKEE,4,21150.0,21150.0,0.0,1,1,1
1057,General Business,AK,KIA,SPORTAGE,3,14698.59,19450.0,0.0,1,2,2
1058,General Business,AK,CHEV,SILVERADO,3,32394.26,36925.0,13008.31,1,3,3
1059,General Business,AK,KIA,SOUL,3,10160.0,12750.0,4796.0,1,4,4
1060,General Business,AK,FORD,F150,3,25991.88,31750.0,11984.0,1,5,5
1061,General Business,AK,RAM,1500,3,20625.0,20625.0,0.0,1,6,6


### Westlake

In [25]:
import pandas as pd
df_wl = pd.read_csv('data/raw/wl_lots.csv')

In [26]:
df_wl.head()

Unnamed: 0,lot_nbr,lot_stg,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,80762855,40,2017,MERZ,C-CLASS,NORMAL WEAR,7261.0,13220.0,15850.0,,0.0
1,85948435,40,2012,GMC,TERRAIN,FRONT END,2783.0,5260.0,6250.0,,0.0
2,85654195,40,2011,HOND,CIVIC,NORMAL WEAR,8092.0,6900.0,7975.0,,879.94
3,80588215,50,2011,GMC,SIERRA,MECHANICAL,11315.0,7440.0,8625.0,,1012.41
4,86334335,40,2017,TOYT,HIGHLANDER,SIDE,30808.0,17100.0,20850.0,,5008.09


In [27]:
df_wl.isnull().sum()

lot_nbr               0
lot_stg               0
lot_year              0
lot_make_cd           0
grp_model             0
damage_type_desc      0
repair_cost           0
acv                   0
plug_lot_acv          0
auc_dt              320
proquote_amt          0
dtype: int64

In [6]:
df_wl['grp_model'] = df_wl['grp_model'].fillna(df_wl['grp_model'].mode()[0])

In [28]:
df_wl.to_csv('data/interim/wl_lots.csv',index=False)