In [1]:
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
import pandas as pd
# Load the popular-lots table; per the preview further down it carries buyer_type, mbr_state,
# lot_make_cd, grp_model, cnt, the median_* value columns and the rank columns.
popular_lots = pd.read_csv('../data/processed/popular_lots.csv')

In [3]:
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict
from tqdm import tqdm

def build_future_dict(df):
    """Index lots by their (make, model) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``lot_make_cd`` and ``grp_model`` columns.

    Returns
    -------
    dict
        Maps each ``(lot_make_cd, grp_model)`` tuple to the sub-frame holding
        that group's rows, with its index reset to ``0..n-1``.
    """
    lookup = {}
    for make_model, lots in df.groupby(['lot_make_cd', 'grp_model']):
        lookup[make_model] = lots.reset_index(drop=True)
    return lookup

def find_best_match(row_dict, future_dict, fallback_arr, fallback_idx):
    """Pick the lot closest to one recommendation in (ACV, repair cost) space.

    The distance is the L1 distance
    ``|acv - median_acv| + |repair_cost - median_repair_cost|``.
    Lots sharing the recommendation's make/model are preferred; when
    ``future_dict`` has no such group, every lot in ``fallback_arr`` is
    considered instead.

    Parameters
    ----------
    row_dict : dict
        One recommendation with keys ``mbr_nbr``, ``lot_make_cd``,
        ``grp_model``, ``median_acv`` and ``median_repair_cost``.
    future_dict : dict
        ``(lot_make_cd, grp_model) -> DataFrame`` with ``acv``,
        ``repair_cost`` and ``lot_nbr`` columns.
    fallback_arr : numpy.ndarray
        2-D array whose column 0 is ACV and column 1 is repair cost, row-aligned
        with ``fallback_idx``.
    fallback_idx : pandas.DataFrame
        Frame with a ``lot_nbr`` column, indexed positionally via ``iloc``.

    Returns
    -------
    dict
        ``mbr_nbr``, ``recommended_lot_nbr``, ``distance`` and
        ``fallback_reason`` (``"YMM match"`` or ``"ACV+Repair fallback"``).
    """
    make, model = row_dict['lot_make_cd'], row_dict['grp_model']
    acv, repair = row_dict['median_acv'], row_dict['median_repair_cost']

    match_df = future_dict.get((make, model), None)

    if match_df is not None and len(match_df) > 0:
        pool = match_df
        dist = (
            np.abs(match_df['acv'].values - acv)
            + np.abs(match_df['repair_cost'].values - repair)
        )
        fallback_reason = "YMM match"
    else:
        pool = fallback_idx
        dist = np.abs(fallback_arr[:, 0] - acv) + np.abs(fallback_arr[:, 1] - repair)
        fallback_reason = "ACV+Repair fallback"

    # argmin() returns the position of the first NaN when one is present, so a
    # lot with a missing ACV or repair cost would always win. Rank NaN
    # distances last instead; if every distance is NaN the first lot is still
    # chosen and the reported distance stays NaN.
    i = np.where(np.isnan(dist), np.inf, dist).argmin()

    return {
        'mbr_nbr': row_dict['mbr_nbr'],
        'recommended_lot_nbr': pool.iloc[i]['lot_nbr'],
        'distance': dist[i],
        'fallback_reason': fallback_reason
    }

def match_recommendations_to_future_lots_fast(final_recommendations, future_lots, workers=10):
    """Match every recommendation row to its closest lot using a process pool.

    Parameters
    ----------
    final_recommendations : pandas.DataFrame
        One row per recommendation; each row is converted to a dict and passed
        to ``find_best_match``.
    future_lots : pandas.DataFrame
        Must contain ``lot_nbr``, ``lot_make_cd``, ``grp_model``, ``acv`` and
        ``repair_cost``. The two value columns are cast to float32 on a copy.
    workers : int, default 10
        ``max_workers`` for the ``ProcessPoolExecutor``.

    Returns
    -------
    pandas.DataFrame
        One row per input recommendation, in input order, with the keys
        returned by ``find_best_match``. Empty input yields an empty frame
        with those columns.
    """
    result_columns = ['mbr_nbr', 'recommended_lot_nbr', 'distance', 'fallback_reason']

    future_filtered = future_lots[['lot_nbr','lot_make_cd','grp_model','acv','repair_cost']].copy()
    future_filtered['acv'] = future_filtered['acv'].astype(np.float32)
    future_filtered['repair_cost'] = future_filtered['repair_cost'].astype(np.float32)

    records = final_recommendations.to_dict('records')
    if not records:
        # Nothing to match: skip building lookups and starting worker processes.
        return pd.DataFrame(columns=result_columns)

    future_dict = build_future_dict(future_filtered)

    fallback_arr = future_filtered[['acv','repair_cost']].values
    fallback_idx = future_filtered

    results = []

    # NOTE(review): every submitted task carries future_dict / fallback_arr /
    # fallback_idx as arguments to a worker process; that transfer is likely the
    # dominant cost here — consider batching several rows per task.
    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(find_best_match, row, future_dict, fallback_arr, fallback_idx)
            for row in records
        ]
        # Collect in submission order (not completion order) so the output rows
        # line up with the input rows and reruns produce identical files.
        for f in tqdm(futures, total=len(futures)):
            results.append(f.result())

    return pd.DataFrame(results)


In [4]:
popular_lots

Unnamed: 0,buyer_type,mbr_state,lot_make_cd,grp_model,cnt,median_acv,median_plug_lot_acv,median_repair_cost,model_rank,rank,rank_clean
0,Consumer,AK,JEP,COMPASS,1,16089.00,12825.0,17984.54,1,1,1
1,Consumer,AK,CHEV,EQUINOX,1,14942.00,16325.0,14463.44,1,2,2
2,Consumer,AK,HOND,RIDGELINE,1,33517.00,35600.0,31893.54,1,3,3
3,Consumer,AK,NISS,ROGUE,1,14197.89,14325.0,14197.89,1,4,4
4,Consumer,AK,JEP,RENEGADE,1,13625.00,13625.0,0.00,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...
918,General Business,TX,NISS,VERSA,1,14552.00,14550.0,7337.38,1,6,6
919,General Business,VA,TOYT,COROLLA,3,16431.00,16950.0,16376.31,1,1,1
920,General Business,WA,CHRY,PACIFICA,1,17982.00,25050.0,15576.33,1,1,1
921,General Business,WA,HOND,FIT,1,14224.00,14300.0,18035.98,1,2,2


In [5]:
popular_lots['buyer_type'].value_counts()

buyer_type
Dealer              274
Consumer            247
Dismantler          240
Export               90
General Business     72
Name: count, dtype: int64

In [8]:
# Load the test split of non-active buyers.
nonactive_buyers_test = pd.read_csv('../data/split/nonactive_test.csv')

In [9]:
nonactive_buyers_test['mbr_nbr'].nunique()

98100

In [10]:
def generate_final_recommendations(data, popular_lots_top6, n_recos=6):
    """Build make/model recommendations for each buyer from a popularity table.

    Buyers are first matched to popular lots sharing their ``buyer_type`` and
    ``mbr_state``. Buyers with fewer than ``n_recos`` matches are topped up from
    the lots popular for their ``buyer_type`` in any state (ordered by
    ``rank_clean``); buyers with no state match at all receive the ``n_recos``
    highest-``cnt`` make/models for their ``buyer_type``.

    Parameters
    ----------
    data : pandas.DataFrame
        Buyers. ``mbr_lic_type``/``buyer_nbr``/``acv``/``repair_cost`` are
        renamed to ``buyer_type``/``mbr_nbr``/``median_acv``/
        ``median_repair_cost`` when present. After renaming, ``mbr_nbr``,
        ``mbr_email``, ``buyer_type`` and ``mbr_state`` are required. Only the
        first row per ``mbr_nbr`` is used.
    popular_lots_top6 : pandas.DataFrame
        Popularity table with ``buyer_type``, ``mbr_state``, ``lot_make_cd``,
        ``grp_model``, ``cnt``, ``rank``, ``rank_clean``, ``median_acv`` and
        ``median_repair_cost``.
    n_recos : int, default 6
        Target number of recommendations per buyer. Buyers whose state match
        already yields more rows keep all of them (no truncation).

    Returns
    -------
    pandas.DataFrame
        One row per (buyer, recommended make/model), sorted by ``mbr_nbr`` and
        ``rank_clean``. An empty frame with the same columns is returned when no
        recommendation could be produced.
    """
    identity_columns = ['mbr_nbr', 'mbr_email', 'buyer_type', 'mbr_state']
    reco_columns = identity_columns + [
        'lot_make_cd', 'grp_model', 'rank', 'rank_clean',
        'median_acv', 'median_repair_cost'
    ]

    # rename() ignores keys that are not present, so this is equivalent to
    # renaming each column only when it exists.
    data = data.rename(columns={
        'mbr_lic_type': 'buyer_type',
        'buyer_nbr': 'mbr_nbr',
        'acv': 'median_acv',
        'repair_cost': 'median_repair_cost',
    })

    # Columns present on both sides of the merge (other than the buyer identity
    # columns) would come back suffixed _x/_y and break the column selection
    # below, so the popularity table's version wins.
    colliding = [
        col for col in data.columns
        if col in popular_lots_top6.columns and col not in identity_columns
    ]
    # One row per buyer: repeated buyer rows would otherwise multiply the
    # merged matches and emit the fallback recommendations once per repeat.
    data = data.drop(columns=colliding).drop_duplicates(subset='mbr_nbr')

    # Step 1: Merge based on buyer_type and mbr_state
    merged = data.merge(
        popular_lots_top6,
        on=['buyer_type', 'mbr_state'],
        how='inner'
    )

    # Step 2: Format initial recommendations
    initial_recommendations = merged[reco_columns]

    # The fallback pools depend only on buyer_type, so each is built once.
    ranked_pool_cache = {}
    popular_pool_cache = {}

    def _pool_by_rank(buyer_type):
        """Return all lots for ``buyer_type`` ordered by rank_clean, as dict records."""
        if buyer_type not in ranked_pool_cache:
            pool = (
                popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
                .sort_values('rank_clean')
            )
            ranked_pool_cache[buyer_type] = pool.to_dict('records')
        return ranked_pool_cache[buyer_type]

    def _pool_by_popularity(buyer_type):
        """Return the top ``n_recos`` distinct make/models for ``buyer_type`` by cnt."""
        if buyer_type not in popular_pool_cache:
            pool = (
                popular_lots_top6[popular_lots_top6['buyer_type'] == buyer_type]
                .sort_values('cnt', ascending=False)
                .drop_duplicates(subset=['lot_make_cd', 'grp_model'])
                .head(n_recos)
            )
            popular_pool_cache[buyer_type] = pool.to_dict('records')
        return popular_pool_cache[buyer_type]

    def _as_reco(mbr_nbr, mbr_email, buyer_type, mbr_state, lot):
        """Combine a buyer's identity with one popularity-table record."""
        return {
            'mbr_nbr': mbr_nbr,
            'mbr_email': mbr_email,
            'buyer_type': buyer_type,
            'mbr_state': mbr_state,
            'lot_make_cd': lot['lot_make_cd'],
            'grp_model': lot['grp_model'],
            'rank': lot.get('rank'),
            'rank_clean': lot.get('rank_clean'),
            'median_acv': lot.get('median_acv'),
            'median_repair_cost': lot.get('median_repair_cost')
        }

    final_reco_list = []
    processed_buyers = set()

    # Step 3: For buyers with initial matches
    for mbr_nbr, group in initial_recommendations.groupby('mbr_nbr'):
        buyer_type = group['buyer_type'].iloc[0]
        mbr_email = group['mbr_email'].iloc[0]
        mbr_state = group['mbr_state'].iloc[0]

        recos = group.sort_values('rank_clean').to_dict('records')
        processed_buyers.add(mbr_nbr)

        if len(recos) < n_recos:
            # Top up with other-state lots for the same buyer_type, skipping
            # make/models the buyer already has.
            already_recoed = {(r['lot_make_cd'], r['grp_model']) for r in recos}

            for lot in _pool_by_rank(buyer_type):
                key = (lot['lot_make_cd'], lot['grp_model'])
                if key in already_recoed:
                    continue

                recos.append(_as_reco(mbr_nbr, mbr_email, buyer_type, mbr_state, lot))
                already_recoed.add(key)
                if len(recos) >= n_recos:
                    break

        final_reco_list.extend(recos)

    # Step 4: Handle buyers with no initial match
    fallback_missing = data[~data['mbr_nbr'].isin(list(processed_buyers))]

    for buyer in fallback_missing[identity_columns].to_dict('records'):
        for lot in _pool_by_popularity(buyer['buyer_type']):
            final_reco_list.append(_as_reco(
                buyer['mbr_nbr'], buyer['mbr_email'],
                buyer['buyer_type'], buyer['mbr_state'], lot
            ))

    # Step 5: Return final DataFrame
    if not final_reco_list:
        # sort_values would raise KeyError on a frame that has no columns.
        return pd.DataFrame(columns=reco_columns)
    return pd.DataFrame(final_reco_list).sort_values(by=['mbr_nbr', 'rank_clean'])


In [11]:
# Popularity-based make/model recommendations for the non-active test buyers.
nonactive_buyers_test_past_reco = generate_final_recommendations(nonactive_buyers_test, popular_lots)

In [12]:
# Load the holdout split of non-active buyers.
nonactive_buyers_holdout = pd.read_csv('../data/split/nonactive_holdout.csv')

In [13]:
# Popularity-based make/model recommendations for the non-active holdout buyers.
nonactive_buyers_holdout_past_reco = generate_final_recommendations(nonactive_buyers_holdout, popular_lots)

In [14]:
## cf_holdout
# Keep only the buyer identity columns and one row per buyer_nbr, then generate
# popularity-based recommendations for those buyers.
cf_holdout = pd.read_csv('../data/split/cf_holdout.csv')
cf_holdout = cf_holdout[['mbr_lic_type','mbr_state','buyer_nbr','mbr_email']]
cf_holdout = cf_holdout.drop_duplicates(subset=['buyer_nbr'])
cf_holdout_past_reco = generate_final_recommendations(cf_holdout, popular_lots)


In [15]:
## one_to_one_holdout
# Keep only the buyer identity columns and one row per buyer_nbr, then generate
# popularity-based recommendations for those buyers.
one_to_one_holdout = pd.read_csv('../data/split/one_to_one_holdout.csv')
one_to_one_holdout = one_to_one_holdout[['mbr_lic_type','mbr_state','buyer_nbr','mbr_email']]
one_to_one_holdout = one_to_one_holdout.drop_duplicates(subset=['buyer_nbr'])
one_to_one_holdout_past_reco = generate_final_recommendations(one_to_one_holdout, popular_lots)


In [16]:
# Load the upcoming lots that the make/model recommendations are matched against below.
upcoming_lots = pd.read_csv('../data/processed/upcoming_lots.csv')


In [17]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm

def match_recommendations_fast(final_recommendations, future_lots):
    """Match each recommendation to the closest lot in (ACV, repair cost) space.

    For every recommendation the lots sharing its ``lot_make_cd``/``grp_model``
    are searched first; if there are none, all of ``future_lots`` is searched.
    Closeness is the L1 distance
    ``|acv - median_acv| + |repair_cost - median_repair_cost|`` computed on
    float32 copies of the lot values.

    Parameters
    ----------
    final_recommendations : pandas.DataFrame
        Needs ``mbr_nbr``, ``lot_make_cd``, ``grp_model``, ``median_acv`` and
        ``median_repair_cost``.
    future_lots : pandas.DataFrame
        Needs ``lot_nbr``, ``lot_make_cd``, ``grp_model``, ``acv`` and
        ``repair_cost``.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, in input order, with ``mbr_nbr``,
        ``recommended_lot_nbr``, ``distance`` and ``fallback_reason``
        (``"YMM"`` or ``"Global"``). Empty input yields an empty frame with
        those columns.
    """
    result_columns = ["mbr_nbr", "recommended_lot_nbr", "distance", "fallback_reason"]
    if len(final_recommendations) == 0:
        return pd.DataFrame(columns=result_columns)

    def _lot_arrays(lots):
        """Return (lot numbers, float32 [acv, repair_cost] matrix) for ``lots``."""
        return (
            lots['lot_nbr'].to_numpy(),
            lots[['acv', 'repair_cost']].to_numpy().astype(np.float32),
        )

    def _closest(values, acv, repair):
        """Return (row position, distance) of the nearest row in ``values``."""
        dist = np.abs(values[:, 0] - acv) + np.abs(values[:, 1] - repair)
        # argmin() returns the first NaN position when one is present, which
        # would make any lot with a missing value the "closest". Rank NaN
        # distances last; if all are NaN, position 0 is kept and the reported
        # distance stays NaN.
        i = np.where(np.isnan(dist), np.inf, dist).argmin()
        return i, float(dist[i])

    # Extract and cast each make/model group's arrays once, instead of once per
    # recommendation row.
    group_arrays = {
        key: _lot_arrays(lots)
        for key, lots in future_lots.groupby(["lot_make_cd", "grp_model"])
    }

    # Pre-store full arrays for fallback
    fallback_lot_nbrs, fallback_vals = _lot_arrays(future_lots)

    # Iterate the needed columns directly; this avoids building a Series per
    # row as iterrows() does.
    rows = zip(
        final_recommendations['mbr_nbr'],
        final_recommendations['lot_make_cd'],
        final_recommendations['grp_model'],
        final_recommendations['median_acv'],
        final_recommendations['median_repair_cost'],
    )

    results = []

    for mbr_nbr, make, model, acv, repair in tqdm(rows, total=len(final_recommendations)):
        # Step 1: Try fast group lookup
        group = group_arrays.get((make, model))

        if group is not None and len(group[0]) > 0:
            lot_nbrs, values = group
            fallback_reason = "YMM"
        else:
            # Only search all lots when no make/model group exists
            lot_nbrs, values = fallback_lot_nbrs, fallback_vals
            fallback_reason = "Global"

        i, distance = _closest(values, acv, repair)

        results.append({
            "mbr_nbr": mbr_nbr,
            "recommended_lot_nbr": lot_nbrs[i],
            "distance": distance,
            "fallback_reason": fallback_reason
        })

    return pd.DataFrame(results)


In [18]:
# Map every recommended make/model for the non-active test buyers to a concrete upcoming lot number.
nonactive_buyers_test_reco = match_recommendations_fast(nonactive_buyers_test_past_reco, upcoming_lots)

100%|██████████| 588600/588600 [01:01<00:00, 9536.38it/s]


In [19]:
nonactive_buyers_test_reco['mbr_nbr'].nunique()

98100

In [21]:
# Save the matched lots for the non-active test buyers to Excel, without the row index.
nonactive_buyers_test_reco.to_excel('../data/results/nonactive_test_reco.xlsx',index=False)

In [22]:
# Map every recommended make/model for the non-active holdout buyers to a concrete upcoming lot number.
nonactive_buyers_holdout_reco = match_recommendations_fast(nonactive_buyers_holdout_past_reco, upcoming_lots)

100%|██████████| 589872/589872 [01:01<00:00, 9645.30it/s]


In [24]:
# Save the matched lots for the non-active holdout buyers to Excel, without the row index.
nonactive_buyers_holdout_reco.to_excel('../data/results/nonactive_holdout_reco.xlsx',index=False)

In [25]:
# Map every recommended make/model for the cf_holdout buyers to a concrete upcoming lot number.
cf_holdout_reco = match_recommendations_fast(cf_holdout_past_reco,upcoming_lots)

100%|██████████| 22164/22164 [00:02<00:00, 9475.15it/s]


In [26]:
# Map every recommended make/model for the one_to_one_holdout buyers to a concrete upcoming lot number.
one_to_one_holdout_reco = match_recommendations_fast(one_to_one_holdout_past_reco, upcoming_lots)

100%|██████████| 67548/67548 [00:07<00:00, 9562.47it/s]


In [27]:
# Save both holdout result sets to Excel, without the row index.
cf_holdout_reco.to_excel('../data/results/cf_holdout_reco.xlsx', index = False)
one_to_one_holdout_reco.to_excel('../data/results/onetoone_holdout_reco.xlsx', index = False)


In [178]:
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def match_recommendations_to_future_lots_parallel(final_recommendations, upcoming_lots, max_workers=12):
    """Match each recommendation to its closest upcoming lot using a thread pool.

    Lots with the same ``lot_make_cd``/``grp_model`` as the recommendation are
    searched first; when none exist, all upcoming lots are searched. Closeness
    is ``|acv - median_acv| + |repair_cost - median_repair_cost|``.

    Parameters
    ----------
    final_recommendations : pandas.DataFrame
        Needs ``mbr_nbr``, ``lot_make_cd``, ``grp_model``, ``median_acv`` and
        ``median_repair_cost``.
    upcoming_lots : pandas.DataFrame
        Needs ``lot_nbr``, ``lot_make_cd``, ``grp_model``, ``acv`` and
        ``repair_cost``.
    max_workers : int, default 12
        Thread count for the ``ThreadPoolExecutor``.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, in input order, with the requested and the
        matched make/model, the matched ``lot_nbr`` and the distance. Empty
        input yields an empty frame with those columns.
    """
    result_columns = [
        'mbr_nbr', 'lot_make_cd', 'grp_model', 'recommended_lot_nbr',
        'matched_lot_make_cd', 'matched_grp_model', 'distance'
    ]
    if len(final_recommendations) == 0:
        return pd.DataFrame(columns=result_columns)

    future_filtered = upcoming_lots[['lot_nbr', 'lot_make_cd', 'grp_model', 'acv', 'repair_cost']].copy()

    # Group the lots once. Each sub-frame keeps the original index labels and
    # row order, so it holds the same rows a per-row boolean mask on make and
    # model would select, without re-scanning the whole frame for every row.
    lots_by_make_model = {
        key: lots
        for key, lots in future_filtered.groupby(['lot_make_cd', 'grp_model'])
    }

    results = []

    def process_row(row):
        """Return the best-matching lot for one recommendation record."""
        buyer_nbr = row['mbr_nbr']
        make = row['lot_make_cd']
        model = row['grp_model']
        acv = row['median_acv']
        repair = row['median_repair_cost']

        matching_lots = lots_by_make_model.get((make, model))
        # No lot with this make/model: fall back to searching every lot.
        candidates = matching_lots if matching_lots is not None else future_filtered

        distances = np.abs(candidates['acv'] - acv) + np.abs(candidates['repair_cost'] - repair)
        min_idx = distances.idxmin()
        best_lot = candidates.loc[min_idx]

        return {
            'mbr_nbr': buyer_nbr,
            'lot_make_cd': make,
            'grp_model': model,
            'recommended_lot_nbr': best_lot['lot_nbr'],
            'matched_lot_make_cd': best_lot['lot_make_cd'],
            'matched_grp_model': best_lot['grp_model'],
            'distance': distances.loc[min_idx]
        }

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_row, row)
            for row in final_recommendations.to_dict('records')
        ]
        # Collect in submission order (not completion order) so output rows
        # line up with the input rows and reruns are reproducible.
        for future in tqdm(futures, total=len(futures)):
            results.append(future.result())

    return pd.DataFrame(results)

In [192]:
# NOTE(review): this path is relative to the working directory ('data/split/...'), whereas the
# earlier load of the same file name uses '../data/split/...' — confirm which location is correct.
nonactive_holdout = pd.read_csv('data/split/nonactive_holdout.csv')

In [180]:
# NOTE(review): this passes nonactive_buyers_holdout (loaded in an earlier cell), not the
# nonactive_holdout frame read in the neighbouring cell — confirm which input is intended.
nonactive_holdout_past_reco = generate_final_recommendations(nonactive_buyers_holdout, popular_lots)

In [177]:
# Threaded matcher run; the recorded output below shows it was interrupted (KeyboardInterrupt) at about 4%.
nonactive_holdout_reco = match_recommendations_to_future_lots_parallel(nonactive_holdout_past_reco, upcoming_lots, max_workers=12)


  4%|▍         | 17270/436482 [00:27<11:19, 617.23it/s]


KeyboardInterrupt: 

In [195]:
nonactive_holdout.head()

Unnamed: 0,buyer_type,mbr_state,mbr_nbr,mbr_email
0,Consumer,NH,252896,cmcgrath09@gmail.com
1,Consumer,DE,727700,dorzetshrt@yahoo.com
2,Consumer,SD,596008,mhayenga@nvc.net
3,Dealer,ND,235198,jimm@riverwoodrvs.com
4,Consumer,AK,783812,ywjpheejyaj101@gmail.com


In [196]:
nonactive_holdout_past_reco.head()

Unnamed: 0,mbr_nbr,mbr_email,buyer_type,mbr_state,lot_make_cd,grp_model,rank,rank_clean,median_acv,median_repair_cost
0,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL 3,1,1,23368.0,21130.38
1,4,godwinomoosagie@ymail.com,Consumer,NY,FORD,ESCAPE,2,2,14014.0,8420.92
2,4,godwinomoosagie@ymail.com,Consumer,NY,TESL,MODEL Y,3,3,30119.0,21993.42
3,4,godwinomoosagie@ymail.com,Consumer,NY,TOYT,RAV4,4,4,21719.36,14052.0
4,4,godwinomoosagie@ymail.com,Consumer,NY,VOLK,TIGUAN,5,5,14197.86,10210.0


In [197]:
# Reload cf_holdout from a file that also carries lot and bid columns (lot_nbr, max_bid, acv, ...),
# as the preview below shows.
# NOTE(review): file name and relative path differ from the earlier '../data/split/cf_holdout.csv' load.
cf_holdout = pd.read_csv("data/split/CollaborativeFiltering_holdout.csv")
cf_holdout.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Dealer,NV,62520895,835040,cncauto775@gmail.com,1950.0,2025-07-31,2015,BMW,X1,9778.0,6525.0,0.0,5,332
1,Dealer,NV,80901305,260760,olea8086@gmail.com,3250.0,2025-10-08,2017,ACUR,MDX,16297.0,16025.0,11752.67,5,223
2,Dismantler,RI,65463145,617140,copart.617140@picknpull.com,500.0,2025-09-03,2002,DODG,RAM 2500,7000.0,0.0,0.0,5,211
3,Dismantler,RI,68813285,794584,northendtowing101@yahoo.com,400.0,2025-09-17,2015,CHRY,MINIVAN,5965.0,7150.0,8133.38,5,1617
4,Consumer,MT,61917475,574298,chuck.raup@yahoo.com,600.0,2025-09-24,2018,HYUN,KONA,13028.92,12150.0,0.0,3,16


In [203]:
# Keep the buyer identity columns and only the first 5000 rows.
# NOTE(review): unlike the earlier cf_holdout cell, buyer_nbr is not de-duplicated here — confirm intended.
cf_holdout = cf_holdout[['mbr_lic_type','buyer_nbr','mbr_state','mbr_email']].head(5000)

In [1]:
cf_holdout[cf_holdout['buyer_nbr']==9484]

NameError: name 'cf_holdout' is not defined

In [204]:
# Regenerate the popularity-based recommendations for the trimmed cf_holdout frame.
cf_holdout_past_reco = generate_final_recommendations(cf_holdout, popular_lots)

In [None]:
cf_holdout_past_reco