In [1]:
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
import pandas as pd
popular_lots = pd.read_csv('../data/processed/popular_lots.csv')

In [30]:
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict
from tqdm import tqdm

def build_future_dict(df):
    gb = df.groupby(['lot_make_cd', 'grp_model'])
    return {k: v.reset_index(drop=True) for k, v in gb}

def find_best_match(row_dict, future_dict, fallback_arr, fallback_idx):
    make, model = row_dict['lot_make_cd'], row_dict['grp_model']
    acv, repair = row_dict['median_acv'], row_dict['median_repair_cost']

    match_df = future_dict.get((make, model), None)

    if match_df is not None and len(match_df) > 0:
        acv_arr = match_df['acv'].values
        repair_arr = match_df['repair_cost'].values
        dist = np.abs(acv_arr - acv) + np.abs(repair_arr - repair)
        i = dist.argmin()
        selected = match_df.iloc[i]
        distance_value = dist[i]
        fallback_reason = "YMM match"
    else:
        dist = np.abs(fallback_arr[:,0] - acv) + np.abs(fallback_arr[:,1] - repair)
        i = dist.argmin()
        selected = fallback_idx.iloc[i]
        distance_value = dist[i]
        fallback_reason = "ACV+Repair fallback"

    return {
        'mbr_nbr': row_dict['mbr_nbr'],
        'recommended_lot_nbr': selected['lot_nbr'],
        'distance': distance_value,
        'fallback_reason': fallback_reason
    }

def match_recommendations_to_future_lots_fast(final_recommendations, future_lots, workers=10):
    future_filtered = future_lots[['lot_nbr','lot_make_cd','grp_model','acv','repair_cost']].copy()
    future_filtered['acv'] = future_filtered['acv'].astype(np.float32)
    future_filtered['repair_cost'] = future_filtered['repair_cost'].astype(np.float32)

    future_dict = build_future_dict(future_filtered)

    fallback_arr = future_filtered[['acv','repair_cost']].values
    fallback_idx = future_filtered

    results = []

    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(find_best_match, row, future_dict, fallback_arr, fallback_idx)
            for row in final_recommendations.to_dict('records')
        ]
        for f in tqdm(as_completed(futures), total=len(futures)):
            results.append(f.result())

    return pd.DataFrame(results)


In [3]:
popular_lots

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
0,Consumer,AK,JEP,RENEGADE,6,6316.05,10900.0,961.00,1,1,1
1,Consumer,AK,KIA,SOUL,4,14091.00,18400.0,14091.00,1,2,2
2,Consumer,AK,JEP,COMPASS,4,16089.00,15175.0,20069.70,1,3,3
3,Consumer,AK,JEP,CHEROKEE,3,13012.00,18725.0,13485.75,1,4,4
4,Consumer,AK,JEP,WRANGLER,3,24008.70,22725.0,16335.57,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
1223,General Business,WY,HOND,CRV,4,19349.00,18525.0,15725.46,1,1,1
1224,General Business,WY,TOYT,COROLLA,4,16219.00,12050.0,11816.03,1,2,2
1225,General Business,WY,TOYT,RAV4,2,23603.00,19725.0,17271.13,1,3,3
1226,General Business,WY,MITS,OUTLANDER,1,13952.00,13200.0,21315.89,1,4,4


In [4]:
popular_lots['buyer_type'].value_counts()

buyer_type
Dealer              306
Consumer            295
Dismantler          268
General Business    195
Export              164
Name: count, dtype: int64

In [5]:
popular_lots[(popular_lots['buyer_type']=='Consumer') & (popular_lots['mbr_state']=='NJ')]

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
179,Consumer,NJ,JEP,CHEROKEE,49,22400.0,23675.0,14613.98,1,1,1
180,Consumer,NJ,CHRY,PACIFICA,45,20221.0,23450.0,13979.88,1,2,2
181,Consumer,NJ,TOYT,CAMRY,37,18331.0,22225.0,15069.26,1,3,3
182,Consumer,NJ,TOYT,RAV4,35,24856.0,30800.0,20315.0,1,4,4
183,Consumer,NJ,HYUN,ELANTRA,32,15982.25,20150.0,12584.77,1,5,5
184,Consumer,NJ,TOYT,COROLLA,30,16238.0,22300.0,12868.07,1,6,6


In [6]:
popular_lots.shape

(1228, 11)

In [7]:
nonactive_buyers_test = pd.read_csv('../data/split/nonactive_test.csv')

In [8]:
nonactive_buyers_test['mbr_nbr'].nunique()

71327

In [27]:
def generate_final_recommendations(data, popular_lots_top6):

    if 'mbr_lic_type' in data.columns:
        data = data.rename(columns={'mbr_lic_type': 'buyer_type'})

    if 'buyer_nbr' in data.columns:
        data = data.rename(columns={'buyer_nbr': 'mbr_nbr'})

    if 'acv' in data.columns:
        data = data.rename(columns={'acv': 'median_acv'})

    if 'repair_cost' in data.columns:
        data = data.rename(columns={'repair_cost': 'median_repair_cost'})

    # Step 1: Merge based on buyer_type and mbr_state

    merged = data.merge(
        popular_lots_top6,
        on=['buyer_type', 'mbr_state'],
        how='inner'
    )

    # Step 2: Format initial recommendations
    initial_recommendations = merged[[
        'mbr_nbr', 'mbr_email', 'buyer_type', 'mbr_state',
        'lot_make_cd', 'grp_model', 'rank', 'rank_clean',
        'median_acv', 'median_repair_cost'
    ]]

    final_reco_list = []
    processed_buyers = set()

    # Step 3: For buyers with initial matches
    for mbr_nbr, group in initial_recommendations.groupby('mbr_nbr'):
        buyer_type = group['buyer_type'].iloc[0]
        mbr_email = group['mbr_email'].iloc[0]
        mbr_state = group['mbr_state'].iloc[0]

        recos = group.sort_values('rank_clean').to_dict('records')
        processed_buyers.add(mbr_nbr)

        if len(recos) < 6:
            needed = 6 - len(recos)

            fallback_pool = (
                popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
                .sort_values('rank_clean')
            )

            already_recoed = {(r['lot_make_cd'], r['grp_model']) for r in recos}

            for _, row in fallback_pool.iterrows():
                key = (row['lot_make_cd'], row['grp_model'])
                if key in already_recoed:
                    continue

                recos.append({
                    'mbr_nbr': mbr_nbr,
                    'mbr_email': mbr_email,
                    'buyer_type': buyer_type,
                    'mbr_state': mbr_state,
                    'lot_make_cd': row['lot_make_cd'],
                    'grp_model': row['grp_model'],
                    'rank': row.get('rank'),
                    'rank_clean': row.get('rank_clean'),
                    'median_acv': row.get('median_acv'),
                    'median_repair_cost': row.get('median_repair_cost')
                })

                already_recoed.add(key)
                if len(recos) == 6:
                    break

        final_reco_list.extend(recos)

    # Step 4: Handle buyers with no initial match
    missing_mbrs = set(data['mbr_nbr'].unique()) - processed_buyers
    fallback_missing = data[data['mbr_nbr'].isin(missing_mbrs)]

    for _, row in fallback_missing.iterrows():
        mbr_nbr = row['mbr_nbr']
        mbr_email = row['mbr_email']
        buyer_type = row['buyer_type']
        mbr_state = row['mbr_state']

        fallback_pool = (
            popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
            .sort_values('cnt', ascending=False)
            .drop_duplicates(subset=['lot_make_cd', 'grp_model'])
            .head(6)
        )

        for _, lot in fallback_pool.iterrows():
            final_reco_list.append({
                'mbr_nbr': mbr_nbr,
                'mbr_email': mbr_email,
                'buyer_type': buyer_type,
                'mbr_state': mbr_state,
                'lot_make_cd': lot['lot_make_cd'],
                'grp_model': lot['grp_model'],
                'rank': lot.get('rank'),
                'rank_clean': lot.get('rank_clean'),
                'median_acv': lot.get('median_acv'),
                'median_repair_cost': lot.get('median_repair_cost')
            })

    # Step 5: Return final DataFrame
    return pd.DataFrame(final_reco_list).sort_values(by=['mbr_nbr', 'rank_clean'])


In [10]:
nonactive_buyers_test_past_reco = generate_final_recommendations(nonactive_buyers_test, popular_lots)

In [11]:
nonactive_buyers_holdout = pd.read_csv('../data/split/nonactive_holdout.csv')

In [12]:
nonactive_buyers_holdout_past_reco = generate_final_recommendations(nonactive_buyers_holdout, popular_lots)

In [58]:
## cf_holdout
cf_holdout = pd.read_csv('../data/split/cf_holdout.csv')
cf_holdout = cf_holdout[['mbr_lic_type','mbr_state','buyer_nbr','mbr_email']]
cf_holdout = cf_holdout.drop_duplicates(subset=['buyer_nbr'])
cf_holdout_past_reco = generate_final_recommendations(cf_holdout, popular_lots)


In [61]:
## cf_holdout
one_to_one_holdout = pd.read_csv('../data/split/one_to_one_holdout.csv')
one_to_one_holdout = one_to_one_holdout[['mbr_lic_type','mbr_state','buyer_nbr','mbr_email']]
one_to_one_holdout = one_to_one_holdout.drop_duplicates(subset=['buyer_nbr'])
one_to_one_holdout_past_reco = generate_final_recommendations(one_to_one_holdout, popular_lots)


In [62]:
upcoming_lots = pd.read_csv('../data/processed/upcoming_lots.csv')


In [63]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm

def match_recommendations_fast(final_recommendations, future_lots):
    # Build lookup for YMM exact matches
    future_groups = {
        k: v.reset_index(drop=True)
        for k, v in future_lots.groupby(["lot_make_cd", "grp_model"])
    }

    # Pre-store full arrays for fallback
    fallback_df = future_lots.reset_index(drop=True)
    fallback_vals = fallback_df[['acv','repair_cost']].values.astype(np.float32)

    results = []

    for _, row in tqdm(final_recommendations.iterrows(), total=len(final_recommendations)):
        make = row['lot_make_cd']
        model = row['grp_model']
        acv = row['median_acv']
        repair = row['median_repair_cost']

        # Step 1: Try fast group lookup
        match_df = future_groups.get((make, model), None)

        if match_df is not None and len(match_df) > 0:
            arr = match_df[['acv','repair_cost']].values.astype(np.float32)
            dist = np.abs(arr[:,0] - acv) + np.abs(arr[:,1] - repair)
            i = dist.argmin()
            selected = match_df.iloc[i]
            fallback_reason = "YMM"
        else:
            # 🔥 Only compute fallback if YMM missing
            dist = np.abs(fallback_vals[:,0] - acv) + np.abs(fallback_vals[:,1] - repair)
            i = dist.argmin()
            selected = fallback_df.iloc[i]
            fallback_reason = "Global"

        results.append({
            "mbr_nbr": row['mbr_nbr'],
            "recommended_lot_nbr": selected["lot_nbr"],
            "distance": float(dist[i]),
            "fallback_reason": fallback_reason
        })

    return pd.DataFrame(results)


In [17]:
nonactive_buyers_test_reco = match_recommendations_fast(nonactive_buyers_test_past_reco, upcoming_lots)

100%|██████████| 427962/427962 [13:20<00:00, 534.56it/s]


In [18]:
nonactive_buyers_test_reco['mbr_nbr'].nunique()

71327

In [19]:
nonactive_buyers_test_reco.to_excel('../data/results/nonactive_test_reco.xlsx')

In [20]:
nonactive_buyers_holdout_reco = match_recommendations_fast(nonactive_buyers_holdout_past_reco, upcoming_lots)

100%|██████████| 428160/428160 [13:10<00:00, 541.78it/s]


In [46]:
nonactive_buyers_holdout_past_reco

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL 3,1,1,24714.0,19824.66
1,4,godwinomoosagie@ymail.com,Consumer,NY,FORD,ESCAPE,2,2,14546.0,9336.25
2,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL Y,3,3,31443.0,21993.42
3,4,godwinomoosagie@ymail.com,Consumer,NY,NISS,ROGUE,4,4,18046.0,12867.00
4,4,godwinomoosagie@ymail.com,Consumer,NY,VOLK,TIGUAN,5,5,13400.0,10016.76
...,...,...,...,...,...,...,...,...,...,...
427711,999678,bufddy@rogers.com,Consumer,OH,HOND,CRV,2,2,22565.0,18180.00
427712,999678,bufddy@rogers.com,Consumer,OH,HYUN,ELANTRA,3,3,9975.5,11752.45
427713,999678,bufddy@rogers.com,Consumer,OH,BUIC,ENCORE,4,4,13435.0,0.00
427714,999678,bufddy@rogers.com,Consumer,OH,CHEV,MALIBU,5,5,11161.0,9315.54


In [47]:
cf_holdout_past_reco

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.00
1,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.00
2,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.00
3,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.00
4,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.00
...,...,...,...,...,...,...,...,...,...,...
11069978,995026,995026cprt_dmmy_MASTERBDR_545946_773843@copart...,Consumer,FL,CHEV,EQUINOX,6,6,15645.0,13291.26
11069979,995026,995026cprt_dmmy_MASTERBDR_545946_773843@copart...,Consumer,FL,CHEV,EQUINOX,6,6,15645.0,13291.26
11069980,995026,995026cprt_dmmy_MASTERBDR_545946_773843@copart...,Consumer,FL,CHEV,EQUINOX,6,6,15645.0,13291.26
11069981,995026,995026cprt_dmmy_MASTERBDR_545946_773843@copart...,Consumer,FL,CHEV,EQUINOX,6,6,15645.0,13291.26


In [21]:
nonactive_buyers_holdout_reco.to_excel('../data/results/nonactive_holdout_reco.xlsx')

In [40]:
cf_holdout_past_reco.head()

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.0
1,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.0
2,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.0
3,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.0
4,28,TORRESSILVE@GMAIL.COM,Consumer,MI,NISS,ALTIMA,1,1,9897.0,10046.0


In [64]:
cf_holdout_reco = match_recommendations_fast(cf_holdout_past_reco,upcoming_lots)

100%|██████████| 100158/100158 [54:01<00:00, 30.90it/s]   


In [65]:
one_to_one_holdout_reco = match_recommendations_fast(one_to_one_holdout_past_reco, upcoming_lots)

100%|██████████| 143256/143256 [22:04<00:00, 108.15it/s]


In [68]:
cf_holdout_reco.to_excel('../data/results/cf_holdout_reco.xlsx', index = False)
one_to_one_holdout_reco.to_excel('../data/results/onetoone_holdout_reco.xlsx', index = False)


In [72]:
one_to_one_holdout_reco['mbr_nbr'].value_counts()

mbr_nbr
82        6
722474    6
722638    6
722588    6
722578    6
         ..
392386    6
392308    6
392264    6
392120    6
998724    6
Name: count, Length: 23876, dtype: int64

In [178]:
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def match_recommendations_to_future_lots_parallel(final_recommendations, upcoming_lots, max_workers=12):
    future_filtered = upcoming_lots[['lot_nbr', 'lot_make_cd', 'grp_model', 'acv', 'repair_cost']].copy()
    results = []

    def process_row(row):
        buyer_nbr = row['mbr_nbr']
        make = row['lot_make_cd']
        model = row['grp_model']
        acv = row['median_acv']
        repair = row['median_repair_cost']

        matching_lots = future_filtered[
            (future_filtered['lot_make_cd'] == make) &
            (future_filtered['grp_model'] == model)
        ]

        if not matching_lots.empty:
            distances = np.abs(matching_lots['acv'] - acv) + np.abs(matching_lots['repair_cost'] - repair)
            min_idx = distances.idxmin()
            best_lot = matching_lots.loc[min_idx]
            distance_value = distances[min_idx]
        else:
            distances = np.abs(future_filtered['acv'] - acv) + np.abs(future_filtered['repair_cost'] - repair)
            min_idx = distances.idxmin()
            best_lot = future_filtered.loc[min_idx]
            distance_value = distances[min_idx]

        return {
            'mbr_nbr': buyer_nbr,
            'lot_make_cd': make,
            'grp_model': model,
            'recommended_lot_nbr': best_lot['lot_nbr'],
            'matched_lot_make_cd': best_lot['lot_make_cd'],
            'matched_grp_model': best_lot['grp_model'],
            'distance': distance_value
        }

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_row, row) for _, row in final_recommendations.iterrows()]
        for future in tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())

    return pd.DataFrame(results)

In [192]:
nonactive_holdout = pd.read_csv('data/split/nonactive_holdout.csv')

In [180]:
nonactive_holdout_past_reco = generate_final_recommendations(nonactive_buyers_holdout, popular_lots)

In [177]:
nonactive_holdout_reco = match_recommendations_to_future_lots_parallel(nonactive_holdout_past_reco, upcoming_lots, max_workers=12)


  4%|▍         | 17270/436482 [00:27<11:19, 617.23it/s]


KeyboardInterrupt: 

In [195]:
nonactive_holdout.head()

Unnamed: 0,buyer_type,mbr_state,mbr_nbr,mbr_email
0,Consumer,NH,252896,cmcgrath09@gmail.com
1,Consumer,DE,727700,dorzetshrt@yahoo.com
2,Consumer,SD,596008,mhayenga@nvc.net
3,Dealer,ND,235198,jimm@riverwoodrvs.com
4,Consumer,AK,783812,ywjpheejyaj101@gmail.com


In [196]:
nonactive_holdout_past_reco.head()

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL 3,1,1,23368.0,21130.38
1,4,godwinomoosagie@ymail.com,Consumer,NY,FORD,ESCAPE,2,2,14014.0,8420.92
2,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL Y,3,3,30119.0,21993.42
3,4,godwinomoosagie@ymail.com,Consumer,NY,TOYT,RAV4,4,4,21719.36,14052.0
4,4,godwinomoosagie@ymail.com,Consumer,NY,VOLK,TIGUAN,5,5,14197.86,10210.0


In [197]:
cf_holdout = pd.read_csv("data/split/CollaborativeFiltering_holdout.csv")
cf_holdout.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Dealer,NV,62520895,835040,cncauto775@gmail.com,1950.0,2025-07-31,2015,BMW,X1,9778.0,6525.0,0.0,5,332
1,Dealer,NV,80901305,260760,olea8086@gmail.com,3250.0,2025-10-08,2017,ACUR,MDX,16297.0,16025.0,11752.67,5,223
2,Dismantler,RI,65463145,617140,copart.617140@picknpull.com,500.0,2025-09-03,2002,DODG,RAM 2500,7000.0,0.0,0.0,5,211
3,Dismantler,RI,68813285,794584,northendtowing101@yahoo.com,400.0,2025-09-17,2015,CHRY,MINIVAN,5965.0,7150.0,8133.38,5,1617
4,Consumer,MT,61917475,574298,chuck.raup@yahoo.com,600.0,2025-09-24,2018,HYUN,KONA,13028.92,12150.0,0.0,3,16


In [203]:
cf_holdout = cf_holdout[['mbr_lic_type','buyer_nbr','mbr_state','mbr_email']].head(5000)

In [1]:
cf_holdout[cf_holdout['buyer_nbr']==9484]

NameError: name 'cf_holdout' is not defined

In [204]:
cf_holdout_past_reco = generate_final_recommendations(cf_holdout, popular_lots)

In [None]:
cf_holdout_past_reco

In [88]:
# Step 3: Merge with buyers using cross join logic per group
merged = not_active_bidders_test.merge(
    popular_lots_top6,
    on=['buyer_type', 'mbr_state'],
    how='inner'
)

# Step 4: Format initial recommendations
initial_recommendations = merged[[
    'mbr_nbr', 'mbr_email', 'buyer_type', 'mbr_state',
    'lot_make_cd', 'grp_model', 'rank', 'rank_clean',
    'median_acv', 'median_repair_cost'
]]

# Step 5: Ensure 6 recommendations per buyer, fallback if fewer
final_reco_list = []

# Track which buyers have been processed
processed_buyers = set()

grouped = initial_recommendations.groupby('mbr_nbr')

for mbr_nbr, group in grouped:
    buyer_type = group['buyer_type'].iloc[0]
    mbr_email = group['mbr_email'].iloc[0]
    mbr_state = group['mbr_state'].iloc[0]

    recos = group.sort_values('rank_clean').to_dict('records')
    processed_buyers.add(mbr_nbr)

    # Fallback if <6 using group-level popular lots
    if len(recos) < 6:
        needed = 6 - len(recos)

        fallback_pool = (
            popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
            .sort_values('rank_clean')
        )

        already_recoed = {(r['lot_make_cd'], r['grp_model']) for r in recos}

        for _, row in fallback_pool.iterrows():
            key = (row['lot_make_cd'], row['grp_model'])
            if key in already_recoed:
                continue

            recos.append({
                'mbr_nbr': mbr_nbr,
                'mbr_email': mbr_email,
                'buyer_type': buyer_type,
                'mbr_state': mbr_state,
                'lot_make_cd': row['lot_make_cd'],
                'grp_model': row['grp_model'],
                'rank': row.get('rank'),
                'rank_clean': row.get('rank_clean'),
                'median_acv': row.get('median_acv'),
                'median_repair_cost': row.get('median_repair_cost')
            })

            already_recoed.add(key)
            if len(recos) == 6:
                break

    final_reco_list.extend(recos)

# Step 5.1: Handle buyers who received 0 recommendations (i.e., not merged at all)
missing_mbrs = set(not_active_bidders_test['mbr_nbr'].unique()) - processed_buyers
fallback_missing = not_active_bidders_test[not_active_bidders_test['mbr_nbr'].isin(missing_mbrs)]

for _, row in fallback_missing.iterrows():
    mbr_nbr = row['mbr_nbr']
    mbr_email = row['mbr_email']
    buyer_type = row['buyer_type']
    mbr_state = row['mbr_state']

    fallback_pool = (
        popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
        .sort_values('cnt', ascending=False)
        .drop_duplicates(subset=['lot_make_cd', 'grp_model'])  # avoid duplicates
        .head(6)
    )

    for _, lot in fallback_pool.iterrows():
        final_reco_list.append({
            'mbr_nbr': mbr_nbr,
            'mbr_email': mbr_email,
            'buyer_type': buyer_type,
            'mbr_state': mbr_state,
            'lot_make_cd': lot['lot_make_cd'],
            'grp_model': lot['grp_model'],
            'rank': lot.get('rank'),
            'rank_clean': lot.get('rank_clean'),
            'median_acv': lot.get('median_acv'),
            'median_repair_cost': lot.get('median_repair_cost')
        })

# Step 6: Final DataFrame
final_recommendations = pd.DataFrame(final_reco_list).sort_values(by=['mbr_nbr', 'rank_clean'])

In [89]:
final_recommendations

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,1,MCNILR@QA-COPART-TEST.COM,Dealer,IA,CHEV,MALIBU,1,1,12953.0,8942.06
1,1,MCNILR@QA-COPART-TEST.COM,Dealer,IA,CHEV,SILVERADO,2,2,37468.0,19498.12
2,1,MCNILR@QA-COPART-TEST.COM,Dealer,IA,CHEV,EQUINOX,3,3,13845.0,11142.81
3,1,MCNILR@QA-COPART-TEST.COM,Dealer,IA,CHEV,CRUZE,4,4,9165.0,7360.48
4,1,MCNILR@QA-COPART-TEST.COM,Dealer,IA,FORD,ESCAPE,5,5,13105.0,10572.42
...,...,...,...,...,...,...,...,...,...,...
436153,999993,copartarchivebuyerdnd@qa-copart-test.com,Consumer,TX,CHEV,SILVERADO,2,2,27429.0,17066.01
436154,999993,copartarchivebuyerdnd@qa-copart-test.com,Consumer,TX,NISS,ROGUE,3,3,12203.0,11550.19
436155,999993,copartarchivebuyerdnd@qa-copart-test.com,Consumer,TX,TOYT,CAMRY,4,4,21332.0,15159.61
436156,999993,copartarchivebuyerdnd@qa-copart-test.com,Consumer,TX,CHEV,MALIBU,5,5,11657.0,9914.36


In [90]:
final_recommendations['mbr_email'].nunique()

72762

z### Upcoming lots

In [91]:
future_lots = pd.read_csv('data/interim/future_lots.csv')

In [92]:
future_lots

Unnamed: 0,lot_nbr,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,71123705,2020,HOND,PASSPORT,FRONT END,20089.12,23445.00,24225.0,2025-10-21,9536.78
1,62754925,2003,BMW,Z4,FRONT END,3200.00,3247.00,0.0,2025-10-22,1259.12
2,85277025,2020,INFI,QX50,FRONT END,21489.38,25417.00,21100.0,2025-10-22,6788.11
3,67924785,2013,AUDI,Q7,MECHANICAL,0.00,3380.00,6725.0,2025-10-22,858.85
4,86845485,2017,SUBA,WRX,MINOR DENT/SCRATCHES,0.00,15425.00,15425.0,2025-10-21,7563.92
...,...,...,...,...,...,...,...,...,...,...
45146,80203955,2017,JEP,WRANGLER,ALL OVER,20755.29,20449.00,22525.0,2025-10-24,6150.38
45147,71829755,2015,JEP,WRANGLER,VANDALISM,0.00,24500.00,16625.0,2025-10-21,12232.06
45148,85198465,2017,JEP,WRANGLER,MECHANICAL,16758.25,19574.96,18175.0,2025-10-24,7409.81
45149,81837765,2006,JEP,WRANGLER,FRONT END,0.00,8806.84,7275.0,2025-10-24,1238.04


In [93]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Keep only relevant columns
future_filtered = future_lots[['lot_nbr', 'lot_make_cd', 'grp_model', 'acv', 'repair_cost']].copy()

results = []

# Loop through each buyer-lot recommendation
for idx, row in tqdm(final_recommendations.iterrows(), total=len(final_recommendations)):
    buyer_nbr = row['mbr_nbr']
    make = row['lot_make_cd']
    model = row['grp_model']
    acv = row['median_acv']
    repair = row['median_repair_cost']

    # Step 1: Exact match on make and model
    matching_lots = future_filtered[
        (future_filtered['lot_make_cd'] == make) &
        (future_filtered['grp_model'] == model)
    ]

    if not matching_lots.empty:
        # Step 2: Compute Manhattan distance for exact match
        distances = np.abs(matching_lots['acv'] - acv) + np.abs(matching_lots['repair_cost'] - repair)
        min_idx = distances.idxmin()
        best_lot = matching_lots.loc[min_idx]
        distance_value = distances[min_idx]
    else:
        # Fallback: find best lot based on lowest Manhattan distance across all future lots
        distances = np.abs(future_filtered['acv'] - acv) + np.abs(future_filtered['repair_cost'] - repair)
        min_idx = distances.idxmin()
        best_lot = future_filtered.loc[min_idx]
        distance_value = distances[min_idx]

    # Save result (works for both exact and fallback cases)
    results.append({
        'mbr_nbr': buyer_nbr,
        'lot_make_cd': make,
        'grp_model': model,
        'recommended_lot_nbr': best_lot['lot_nbr'],
        'matched_lot_make_cd': best_lot['lot_make_cd'],
        'matched_grp_model': best_lot['grp_model'],
        'distance': distance_value
    })


100%|██████████| 436572/436572 [15:01<00:00, 484.48it/s]


In [94]:
recommendation_output_df = pd.DataFrame(results)
recommendation_output_df

Unnamed: 0,mbr_nbr,lot_make_cd,grp_model,recommended_lot_nbr,matched_lot_make_cd,matched_grp_model,distance
0,1,CHEV,MALIBU,68163075,CHEV,MALIBU,237.64
1,1,CHEV,SILVERADO,81566335,CHEV,SILVERADO,3345.12
2,1,CHEV,EQUINOX,70869825,CHEV,EQUINOX,203.97
3,1,CHEV,CRUZE,71849425,CHEV,CRUZE,488.81
4,1,FORD,ESCAPE,86779465,FORD,ESCAPE,431.09
...,...,...,...,...,...,...,...
436567,999993,CHEV,SILVERADO,68705015,CHEV,SILVERADO,737.01
436568,999993,NISS,ROGUE,71707835,NISS,ROGUE,324.46
436569,999993,TOYT,CAMRY,81389845,TOYT,CAMRY,418.56
436570,999993,CHEV,MALIBU,70644295,CHEV,MALIBU,176.64


In [95]:
recommendation_output_df['mbr_nbr'].nunique()

72762

In [96]:
not_active_bidders_test['mbr_nbr'].nunique()

72762

In [97]:
recommendation_output_df.to_excel("data/processed/recommended_popular_non_active_bidders_test.xlsx", index=False)

In [98]:
data_high_holdout = pd.read_csv('data/interim/data_high_holdout.csv')
data_low_holdout = pd.read_csv('data/interim/data_low_holdout.csv')
not_active_bidders_holdout = pd.read_csv('data/interim/not_active_bidders_holdout.csv')

In [99]:
data_high_holdout['identifier'] = 1
data_low_holdout['identifier'] = 2
not_active_bidders_holdout['identifier'] = 3

In [100]:
data_high_holdout['buyer_nbr'].nunique()

16749

In [101]:
data_low_holdout['buyer_nbr'].nunique()

23810

In [102]:
not_active_bidders_holdout['mbr_nbr'].nunique()

72747

In [103]:
import pandas as pd

# 1. Select and rename from data_high_holdout
high_df = data_high_holdout[['mbr_lic_type', 'mbr_state', 'buyer_nbr', 'mbr_email']].copy()
high_df.rename(columns={'buyer_nbr': 'mbr_nbr'}, inplace=True)

# 2. Select and rename from data_low_holdout
low_df = data_low_holdout[['mbr_lic_type', 'mbr_state', 'buyer_nbr', 'mbr_email']].copy()
low_df.rename(columns={'buyer_nbr': 'mbr_nbr'}, inplace=True)

# 3. Already has desired columns in not_active_bidders_holdout
inactive_df = not_active_bidders_holdout[['mbr_lic_type', 'mbr_state', 'mbr_nbr', 'mbr_email']].copy()

# 4. Combine all
holdout_group = pd.concat([high_df, low_df, inactive_df], ignore_index=True)

In [104]:
holdout_group

Unnamed: 0,mbr_lic_type,mbr_state,mbr_nbr,mbr_email
0,Dealer,NV,835040,cncauto775@gmail.com
1,Dealer,NV,260760,olea8086@gmail.com
2,Dismantler,RI,617140,copart.617140@picknpull.com
3,Dismantler,RI,794584,northendtowing101@yahoo.com
4,Consumer,MT,574298,chuck.raup@yahoo.com
...,...,...,...,...
2005403,Consumer,WI,678874,ebd300792@gmail.com
2005404,Consumer,WI,203256,jofu3@icloud.com
2005405,Dealer,WI,191070,info@jmpaintingcontractors.com
2005406,Consumer,WI,562736,bennettbrown25@gmail.com


In [105]:
holdout_group.rename(columns={
    'mbr_lic_type': 'buyer_type'
}, inplace=True)

In [106]:
holdout_group['mbr_nbr'].nunique()

111953

In [107]:
holdout_group[holdout_group['mbr_nbr']==866]

Unnamed: 0,buyer_type,mbr_state,mbr_nbr,mbr_email
19827,Dealer,ID,866,victorcapellan13@hotmail.com
44579,Dealer,ID,866,victorcapellan13@hotmail.com
55404,Dealer,ID,866,victorcapellan13@hotmail.com
58426,Dealer,ID,866,victorcapellan13@hotmail.com
79027,Dealer,ID,866,victorcapellan13@hotmail.com
...,...,...,...,...
1833757,Dealer,ID,866,victorcapellan13@hotmail.com
1856631,Dealer,ID,866,victorcapellan13@hotmail.com
1856633,Dealer,ID,866,victorcapellan13@hotmail.com
1859785,Dealer,ID,866,victorcapellan13@hotmail.com


In [108]:
holdout_group = holdout_group.drop_duplicates(subset='mbr_nbr', keep='first')


In [109]:
holdout_group[holdout_group['mbr_nbr']==866]

Unnamed: 0,buyer_type,mbr_state,mbr_nbr,mbr_email
19827,Dealer,ID,866,victorcapellan13@hotmail.com


In [110]:
# Step 3: Merge with buyers using cross join logic per group
merged = holdout_group.merge(
    popular_lots_top6,
    on=['buyer_type', 'mbr_state'],
    how='inner'
)

# Step 4: Format initial recommendations
initial_recommendations = merged[[
    'mbr_nbr', 'mbr_email', 'buyer_type', 'mbr_state',
    'lot_make_cd', 'grp_model', 'rank', 'rank_clean',
    'median_acv', 'median_repair_cost'
]]

# Step 5: Ensure 6 recommendations per buyer, fallback if fewer
final_reco_list = []

# Track which buyers have been processed
processed_buyers = set()

grouped = initial_recommendations.groupby('mbr_nbr')

for mbr_nbr, group in grouped:
    buyer_type = group['buyer_type'].iloc[0]
    mbr_email = group['mbr_email'].iloc[0]
    mbr_state = group['mbr_state'].iloc[0]

    recos = group.sort_values('rank_clean').to_dict('records')
    processed_buyers.add(mbr_nbr)

    # Fallback if <6 using group-level popular lots
    if len(recos) < 6:
        needed = 6 - len(recos)

        fallback_pool = (
            popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
            .sort_values('rank_clean')
        )

        already_recoed = {(r['lot_make_cd'], r['grp_model']) for r in recos}

        for _, row in fallback_pool.iterrows():
            key = (row['lot_make_cd'], row['grp_model'])
            if key in already_recoed:
                continue

            recos.append({
                'mbr_nbr': mbr_nbr,
                'mbr_email': mbr_email,
                'buyer_type': buyer_type,
                'mbr_state': mbr_state,
                'lot_make_cd': row['lot_make_cd'],
                'grp_model': row['grp_model'],
                'rank': row.get('rank'),
                'rank_clean': row.get('rank_clean'),
                'median_acv': row.get('median_acv'),
                'median_repair_cost': row.get('median_repair_cost')
            })

            already_recoed.add(key)
            if len(recos) == 6:
                break

    final_reco_list.extend(recos)

# Step 5.1: Handle buyers who received 0 recommendations (i.e., not merged at all)
missing_mbrs = set(holdout_group['mbr_nbr'].unique()) - processed_buyers
fallback_missing = holdout_group[holdout_group['mbr_nbr'].isin(missing_mbrs)]

for _, row in fallback_missing.iterrows():
    mbr_nbr = row['mbr_nbr']
    mbr_email = row['mbr_email']
    buyer_type = row['buyer_type']
    mbr_state = row['mbr_state']

    fallback_pool = (
        popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
        .sort_values('cnt', ascending=False)
        .drop_duplicates(subset=['lot_make_cd', 'grp_model'])  # avoid duplicates
        .head(6)
    )

    for _, lot in fallback_pool.iterrows():
        final_reco_list.append({
            'mbr_nbr': mbr_nbr,
            'mbr_email': mbr_email,
            'buyer_type': buyer_type,
            'mbr_state': mbr_state,
            'lot_make_cd': lot['lot_make_cd'],
            'grp_model': lot['grp_model'],
            'rank': lot.get('rank'),
            'rank_clean': lot.get('rank_clean'),
            'median_acv': lot.get('median_acv'),
            'median_repair_cost': lot.get('median_repair_cost')
        })

# Step 6: Final DataFrame
final_recommendations = pd.DataFrame(final_reco_list).sort_values(by=['mbr_nbr', 'rank_clean'])

In [111]:
final_recommendations

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL 3,1,1,23368.00,21130.38
1,4,godwinomoosagie@ymail.com,Consumer,NY,FORD,ESCAPE,2,2,14014.00,8420.92
2,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL Y,3,3,30119.00,21993.42
3,4,godwinomoosagie@ymail.com,Consumer,NY,TOYT,RAV4,4,4,21719.36,14052.00
4,4,godwinomoosagie@ymail.com,Consumer,NY,VOLK,TIGUAN,5,5,14197.86,10210.00
...,...,...,...,...,...,...,...,...,...,...
668701,999678,bufddy@rogers.com,Consumer,OH,HOND,CRV,2,2,22565.00,18180.00
668702,999678,bufddy@rogers.com,Consumer,OH,FORD,FUSION,3,3,11405.00,10268.00
668703,999678,bufddy@rogers.com,Consumer,OH,CHEV,CRUZE,4,4,8440.00,5789.00
668704,999678,bufddy@rogers.com,Consumer,OH,HOND,CIVIC,5,5,15197.00,14267.91


In [112]:
final_recommendations['mbr_nbr'].value_counts()

mbr_nbr
4         6
689710    6
689820    6
689814    6
689812    6
         ..
349050    6
349046    6
349040    6
349030    6
999678    6
Name: count, Length: 111554, dtype: int64

In [113]:
final_recommendations['mbr_nbr'].nunique()

111554

In [114]:
future_lots.head()

Unnamed: 0,lot_nbr,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,71123705,2020,HOND,PASSPORT,FRONT END,20089.12,23445.0,24225.0,2025-10-21,9536.78
1,62754925,2003,BMW,Z4,FRONT END,3200.0,3247.0,0.0,2025-10-22,1259.12
2,85277025,2020,INFI,QX50,FRONT END,21489.38,25417.0,21100.0,2025-10-22,6788.11
3,67924785,2013,AUDI,Q7,MECHANICAL,0.0,3380.0,6725.0,2025-10-22,858.85
4,86845485,2017,SUBA,WRX,MINOR DENT/SCRATCHES,0.0,15425.0,15425.0,2025-10-21,7563.92


In [115]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Convert to NumPy arrays early for speed
future_filtered = future_lots[['lot_nbr', 'lot_make_cd', 'grp_model', 'acv', 'repair_cost']].copy()
future_filtered[['acv', 'repair_cost']] = future_filtered[['acv', 'repair_cost']].fillna(0)

future_vecs = future_filtered[['acv', 'repair_cost']].to_numpy()

results = []

# Group future lots by (make, model) once to avoid repeated filtering
future_groups = {
    key: group.reset_index(drop=True)
    for key, group in future_filtered.groupby(['lot_make_cd', 'grp_model'])
}

# Main loop
for idx, row in tqdm(final_recommendations.iterrows(), total=len(final_recommendations)):
    buyer_nbr = row['mbr_nbr']
    make = row['lot_make_cd']
    model = row['grp_model']
    acv = row['median_acv']
    repair = row['median_repair_cost']
    input_vec = np.array([acv, repair])

    key = (make, model)

    if key in future_groups:
        group_df = future_groups[key]
        group_vecs = group_df[['acv', 'repair_cost']].to_numpy()
        distances = np.abs(group_vecs - input_vec).sum(axis=1)
        min_idx = distances.argmin()
        best_lot = group_df.iloc[min_idx]
        distance_value = distances[min_idx]
    else:
        # fallback to all
        distances = np.abs(future_vecs - input_vec).sum(axis=1)
        min_idx = distances.argmin()
        best_lot = future_filtered.iloc[min_idx]
        distance_value = distances[min_idx]

    # Append result
    results.append({
        'mbr_nbr': buyer_nbr,
        'lot_make_cd': make,
        'grp_model': model,
        'recommended_lot_nbr': best_lot['lot_nbr'],
        'matched_lot_make_cd': best_lot['lot_make_cd'],
        'matched_grp_model': best_lot['grp_model'],
        'distance': distance_value
    })

100%|██████████| 669324/669324 [01:07<00:00, 9915.86it/s] 


In [116]:
recommendation_output_df = pd.DataFrame(results)
recommendation_output_df

Unnamed: 0,mbr_nbr,lot_make_cd,grp_model,recommended_lot_nbr,matched_lot_make_cd,matched_grp_model,distance
0,4,TESL,MODEL 3,80156725,TESL,MODEL 3,413.38
1,4,FORD,ESCAPE,85594715,FORD,ESCAPE,593.34
2,4,TESL,MODEL Y,80667025,TESL,MODEL Y,264.58
3,4,TOYT,RAV4,67438605,TOYT,RAV4,879.83
4,4,VOLK,TIGUAN,71628985,VOLK,TIGUAN,647.60
...,...,...,...,...,...,...,...
669319,999678,HOND,CRV,81230735,HOND,CRV,450.00
669320,999678,FORD,FUSION,69794885,FORD,FUSION,291.74
669321,999678,CHEV,CRUZE,66462935,CHEV,CRUZE,355.12
669322,999678,HOND,CIVIC,80930005,HOND,CIVIC,453.91


In [117]:
recommendation_output_df['mbr_nbr'].nunique()

111554

In [118]:
recommendation_output_df.rename(columns={
    'mbr_nbr': 'input_buyer_nbr',
    'recommended_lot_nbr': 'recommended_lot'
}, inplace=True)

In [119]:
recommendation_output_df = recommendation_output_df[['input_buyer_nbr', 'recommended_lot']]

In [120]:
recommendation_output_df

Unnamed: 0,input_buyer_nbr,recommended_lot
0,4,80156725
1,4,85594715
2,4,80667025
3,4,67438605
4,4,71628985
...,...,...
669319,999678,81230735
669320,999678,69794885
669321,999678,66462935
669322,999678,80930005


In [121]:
import pandas as pd
import numpy as np

# ✅ Step 0: Ensure we work on a clean copy
df = recommendation_output_df[['input_buyer_nbr', 'recommended_lot']].copy()

# ✅ Step 1: Assign rank efficiently
# Use NumPy group index trick (much faster than groupby().cumcount())
df['rank'] = df.groupby('input_buyer_nbr', sort=False).cumcount().to_numpy() + 1

# ✅ Step 2: Filter only 6 lots per buyer (optional sanity check)
df = df[df['rank'] <= 6]

# ✅ Step 3: Convert to pivot (wide format) using pivot_table for stability
pivot_df = df.pivot_table(
    index='input_buyer_nbr',
    columns='rank',
    values='recommended_lot',
    aggfunc='first'
)

# ✅ Step 4: Rename columns
pivot_df.columns = [f'lot_{int(col)}' for col in pivot_df.columns]

# ✅ Step 5: Reset index
pivot_df = pivot_df.reset_index(drop=False)

In [122]:
pivot_df

Unnamed: 0,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6
0,4,80156725,85594715,80667025,67438605,71628985,80494945
1,28,78445034,86144575,71733305,70869825,70834965,85744625
2,32,71960335,81139975,69472435,68706185,80749095,58803955
3,36,66614255,72093995,86085605,85566595,81060465,85910165
4,44,62283675,82377575,86139785,81337525,80339485,81564585
...,...,...,...,...,...,...,...
111549,998374,80431674,70834965,71285335,82066565,70471535,70548125
111550,998724,68352335,81393735,69664385,70647505,65770505,72046315
111551,999222,84457995,63790615,83762405,71476485,68726465,80170555
111552,999662,81684625,85475495,71161585,70046845,84885265,64930505


In [123]:
data_high_holdout['identifier'] = 1
data_low_holdout['identifier'] = 2
not_active_bidders_holdout['identifier'] = 3

# Step 2: Concatenate all into one lookup table
all_holdouts = pd.concat([
    data_high_holdout[['buyer_nbr', 'identifier']],
    data_low_holdout[['buyer_nbr', 'identifier']],
    not_active_bidders_holdout[['mbr_nbr', 'identifier']].rename(columns={'mbr_nbr': 'buyer_nbr'})
], axis=0)

# Drop duplicates just in case
all_holdouts = all_holdouts.drop_duplicates(subset=['buyer_nbr'])

# Step 3: Merge into pivot_df
pivot_df = pivot_df.copy()
pivot_df = pivot_df.merge(all_holdouts, how='left', left_on='input_buyer_nbr', right_on='buyer_nbr')

In [124]:
pivot_df

Unnamed: 0,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6,buyer_nbr,identifier
0,4,80156725,85594715,80667025,67438605,71628985,80494945,4,3
1,28,78445034,86144575,71733305,70869825,70834965,85744625,28,1
2,32,71960335,81139975,69472435,68706185,80749095,58803955,32,3
3,36,66614255,72093995,86085605,85566595,81060465,85910165,36,3
4,44,62283675,82377575,86139785,81337525,80339485,81564585,44,3
...,...,...,...,...,...,...,...,...,...
111549,998374,80431674,70834965,71285335,82066565,70471535,70548125,998374,2
111550,998724,68352335,81393735,69664385,70647505,65770505,72046315,998724,2
111551,999222,84457995,63790615,83762405,71476485,68726465,80170555,999222,3
111552,999662,81684625,85475495,71161585,70046845,84885265,64930505,999662,3


In [125]:
# Step 1: Add the 'group' column with value 'holdout'
pivot_df['group'] = 'holdout'

# Step 2: Reorder columns
# Grab current lot columns
lot_cols = [col for col in pivot_df.columns if col.startswith('lot_')]

# Reorder: identifier, group, buyer_nbr, lots...
pivot_df = pivot_df[['identifier', 'group', 'input_buyer_nbr'] + lot_cols]


In [126]:
pivot_df

Unnamed: 0,identifier,group,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6
0,3,holdout,4,80156725,85594715,80667025,67438605,71628985,80494945
1,1,holdout,28,78445034,86144575,71733305,70869825,70834965,85744625
2,3,holdout,32,71960335,81139975,69472435,68706185,80749095,58803955
3,3,holdout,36,66614255,72093995,86085605,85566595,81060465,85910165
4,3,holdout,44,62283675,82377575,86139785,81337525,80339485,81564585
...,...,...,...,...,...,...,...,...,...
111549,2,holdout,998374,80431674,70834965,71285335,82066565,70471535,70548125
111550,2,holdout,998724,68352335,81393735,69664385,70647505,65770505,72046315
111551,3,holdout,999222,84457995,63790615,83762405,71476485,68726465,80170555
111552,3,holdout,999662,81684625,85475495,71161585,70046845,84885265,64930505


In [127]:
pivot_df[pivot_df['input_buyer_nbr']==4]

Unnamed: 0,identifier,group,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6
0,3,holdout,4,80156725,85594715,80667025,67438605,71628985,80494945


In [128]:
pivot_df.to_csv("data/processed/recommended_holdout.csv",index=False)

### Would have for holdout

In [129]:
recommendation_output_df_holdout3 = pivot_df[(pivot_df['identifier']==3) & (pivot_df['group']=='holdout')]

In [130]:
recommendation_output_df_holdout3

Unnamed: 0,identifier,group,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6
0,3,holdout,4,80156725,85594715,80667025,67438605,71628985,80494945
2,3,holdout,32,71960335,81139975,69472435,68706185,80749095,58803955
3,3,holdout,36,66614255,72093995,86085605,85566595,81060465,85910165
4,3,holdout,44,62283675,82377575,86139785,81337525,80339485,81564585
5,3,holdout,58,62543695,81095435,80689125,65699525,69551855,50593775
...,...,...,...,...,...,...,...,...,...
111547,3,holdout,997866,80516675,70896325,86255335,71468125,80914135,87190475
111548,3,holdout,998042,71876405,67132335,85276355,64218355,86151785,68762775
111551,3,holdout,999222,84457995,63790615,83762405,71476485,68726465,80170555
111552,3,holdout,999662,81684625,85475495,71161585,70046845,84885265,64930505


In [131]:
recommendation_output_df_holdout3['input_buyer_nbr'].nunique()

71209

In [132]:
recommendation_output_df_holdout3.to_excel("data/would_have/recommended_popular_non_active_bidders_holdout.xlsx", index=False)