In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import manhattan_distances
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
one_to_one_test = pd.read_csv('../data/split/one_to_one_test.csv')

print(one_to_one_test['buyer_nbr'].nunique())
print(one_to_one_test['lot_nbr'].nunique())

23702
47591


In [3]:
upcoming_lots = pd.read_csv("../data/processed/upcoming_lots.csv")

In [4]:
def recommend_lots_for_buyer(buyer_id, buyer_lots_df, upcoming_df, top_k=6):
    results = []
    used_lots = set()

    # Step 1: Loop over each lot the buyer has seen
    for _, row in buyer_lots_df.iterrows():
        acv, repair = row['acv'], row['repair_cost']
        year, make, model = row['lot_year'], row['lot_make_cd'], row['grp_model']
        input_vec = np.array([[acv, repair]])

        ymm_candidates = upcoming_df[
            (upcoming_df['lot_year'] == year) &
            (upcoming_df['lot_make_cd'] == make) &
            (upcoming_df['grp_model'] == model)
        ][['lot_nbr', 'acv', 'repair_cost']].dropna()

        if ymm_candidates.empty:
            ymm_candidates = upcoming_df[['lot_nbr', 'acv', 'repair_cost']].dropna()

        ymm_candidates = ymm_candidates[~ymm_candidates['lot_nbr'].isin(used_lots)]
        if ymm_candidates.empty:
            continue

        ymm_candidates = ymm_candidates.copy()
        ymm_candidates['manhattan_dist'] = manhattan_distances(
            ymm_candidates[['acv', 'repair_cost']].values, input_vec
        ).flatten()

        best_match = ymm_candidates.sort_values('manhattan_dist').iloc[0]

        results.append({
            'input_buyer_nbr': buyer_id,
            'original_lot': int(row['recommended_lot']),
            'recommended_lot': int(best_match['lot_nbr']),
            'manhattan_distance': float(best_match['manhattan_dist']),
            'source': 'Step 1 - YMM/Manhattan'
        })
        used_lots.add(int(best_match['lot_nbr']))

    # Step 2: Recent YMM
    if len(results) < top_k:
        most_recent = buyer_lots_df.sort_values('inv_dt', ascending=False).iloc[0]
        acv, repair = most_recent['acv'], most_recent['repair_cost']
        year, make, model = most_recent['lot_year'], most_recent['lot_make_cd'], most_recent['grp_model']
        input_vec = np.array([[acv, repair]])

        ymm_candidates = upcoming_df[
            (upcoming_df['lot_year'] == year) &
            (upcoming_df['lot_make_cd'] == make) &
            (upcoming_df['grp_model'] == model)
        ][['lot_nbr', 'acv', 'repair_cost']].dropna()

        ymm_candidates = ymm_candidates[~ymm_candidates['lot_nbr'].isin(used_lots)]
        if not ymm_candidates.empty:
            ymm_candidates = ymm_candidates.copy()
            ymm_candidates['manhattan_dist'] = manhattan_distances(
                ymm_candidates[['acv', 'repair_cost']].values, input_vec
            ).flatten()

            for _, r in ymm_candidates.sort_values('manhattan_dist').iterrows():
                results.append({
                    'input_buyer_nbr': buyer_id,
                    'original_lot': int(most_recent['recommended_lot']),
                    'recommended_lot': int(r['lot_nbr']),
                    'manhattan_distance': float(r['manhattan_dist']),
                    'source': 'Step 2 - Recent YMM/Manhattan'
                })
                used_lots.add(int(r['lot_nbr']))
                if len(results) >= top_k:
                    break

    # Step 3: Make-level fallback
    if len(results) < top_k:
        make = most_recent['lot_make_cd']
        input_vec = np.array([[acv, repair]])

        make_candidates = upcoming_df[
            (upcoming_df['lot_make_cd'] == make)
        ][['lot_nbr', 'acv', 'repair_cost']].dropna()
        make_candidates = make_candidates[~make_candidates['lot_nbr'].isin(used_lots)]

        if not make_candidates.empty:
            make_candidates = make_candidates.copy()
            make_candidates['manhattan_dist'] = manhattan_distances(
                make_candidates[['acv', 'repair_cost']].values, input_vec).flatten()

            for _, r in make_candidates.sort_values('manhattan_dist').iterrows():
                results.append({
                    'input_buyer_nbr': buyer_id,
                    'original_lot': int(most_recent['recommended_lot']),
                    'recommended_lot': int(r['lot_nbr']),
                    'manhattan_distance': float(r['manhattan_dist']),
                    'source': 'Step 3 - Global Make/Manhattan'
                })
                used_lots.add(int(r['lot_nbr']))
                if len(results) >= top_k:
                    break

    # Step 4: Global fallback
    if len(results) < top_k:
        input_vec = np.array([[acv, repair]])
        global_candidates = upcoming_df[['lot_nbr', 'acv', 'repair_cost']].dropna()
        global_candidates = global_candidates[~global_candidates['lot_nbr'].isin(used_lots)]

        if not global_candidates.empty:
            global_candidates = global_candidates.copy()
            global_candidates['manhattan_dist'] = manhattan_distances(
                global_candidates[['acv', 'repair_cost']].values, input_vec
            ).flatten()

            for _, r in global_candidates.sort_values('manhattan_dist').iterrows():
                results.append({
                    'input_buyer_nbr': buyer_id,
                    'original_lot': int(most_recent['recommended_lot']),
                    'recommended_lot': int(r['lot_nbr']),
                    'manhattan_distance': float(r['manhattan_dist']),
                    'source': 'Step 4 - Global Fallback Manhattan'
                })
                used_lots.add(int(r['lot_nbr']))
                if len(results) >= top_k:
                    break
    return results


In [5]:
def refine_recommendations_parallel_per_buyer(reco_df, upcoming_df, max_workers=4):
    # 🛠 Rename buyer_nbr and lot_nbr to match expected inputs
    reco_df.columns = reco_df.columns.str.strip().str.lower()

    reco_df = reco_df.rename(columns={
        'buyer_nbr': 'input_buyer_nbr',
        'lot_nbr': 'recommended_lot'
    })

    results = []
    futures = []
    grouped = list(reco_df.groupby('input_buyer_nbr'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for buyer_id, group_df in grouped:
            futures.append(executor.submit(recommend_lots_for_buyer, buyer_id, group_df, upcoming_df))

        for f in tqdm(as_completed(futures), total=len(futures), desc="Refining recos"):
            try:
                results.extend(f.result())
            except Exception as e:
                print(f"⚠️ Skipped buyer due to error: {e}")

    return pd.DataFrame(results)

In [6]:
one_to_one_test_reco = refine_recommendations_parallel_per_buyer(one_to_one_test, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 23702/23702 [03:08<00:00, 125.82it/s]


In [7]:
one_to_one_test_reco

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,source
0,467,70378335,63457395,2337.33,Step 1 - YMM/Manhattan
1,467,70720935,82109425,2204.37,Step 1 - YMM/Manhattan
2,467,68690975,87224095,6533.15,Step 1 - YMM/Manhattan
3,467,70378335,81966165,3798.55,Step 2 - Recent YMM/Manhattan
4,467,70378335,81230315,4287.50,Step 2 - Recent YMM/Manhattan
...,...,...,...,...,...
142207,988383,68906855,80529235,2443.00,Step 1 - YMM/Manhattan
142208,988383,66651015,65450105,232.33,Step 1 - YMM/Manhattan
142209,988383,69319655,67687855,1593.79,Step 1 - YMM/Manhattan
142210,988383,59697224,68573555,217996.92,Step 3 - Global Make/Manhattan


In [8]:
one_to_one_test_reco['input_buyer_nbr'].nunique()

23702

In [9]:
one_to_one_test_reco.to_excel('../data/results/onetoone_test_reco.xlsx',index=False)

In [11]:
one_to_one_holdout = pd.read_csv('../data/split/one_to_one_holdout.csv')

print(one_to_one_holdout['buyer_nbr'].nunique())
print(one_to_one_holdout['lot_nbr'].nunique())

23876
47539


In [12]:
one_to_one_holdout_reco = refine_recommendations_parallel_per_buyer(one_to_one_holdout, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 23876/23876 [03:12<00:00, 124.22it/s]


In [13]:
one_to_one_holdout_reco

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,source
0,144,63028865,82602225,3562.18,Step 1 - YMM/Manhattan
1,144,61231425,86861125,1459.57,Step 1 - YMM/Manhattan
2,144,80797995,80825505,1020.66,Step 1 - YMM/Manhattan
3,144,85478845,87279265,1629.00,Step 1 - YMM/Manhattan
4,144,63028865,87252715,6122.78,Step 2 - Recent YMM/Manhattan
...,...,...,...,...,...
143251,995336,69071085,83840965,0.00,Step 1 - YMM/Manhattan
143252,995336,63008385,85560105,3300.00,Step 1 - YMM/Manhattan
143253,995336,69071085,87478795,0.00,Step 3 - Global Make/Manhattan
143254,995336,69071085,89813695,0.00,Step 3 - Global Make/Manhattan


In [14]:
one_to_one_holdout_reco['input_buyer_nbr'].nunique()

23876

In [15]:
one_to_one_holdout_reco.to_excel('../data/results/onetoone_holdout_would_have_reco.xlsx',index=False)

In [16]:
cf_test = pd.read_excel('../data/past_reco/cf_test_reco.xlsx')

print(cf_test['input_buyer_nbr'].nunique())
print(cf_test['recommended_lot'].nunique())

16486
58256


In [17]:
cf_test_reco = refine_recommendations_parallel_per_buyer(cf_test, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 16486/16486 [03:39<00:00, 75.06it/s]


In [18]:
cf_test_reco['input_buyer_nbr'].nunique()

16486

In [19]:
cf_test_reco.to_excel('../data/results/cf_test_reco.xlsx',index=False)

In [20]:
cf_holdout_would_have = pd.read_excel('../data/past_reco/cf_holdout_would_have_reco.xlsx')

print(cf_holdout_would_have['input_buyer_nbr'].nunique())
print(cf_holdout_would_have['recommended_lot'].nunique())

16693
59019


In [21]:
cf_holdout_would_have_reco = refine_recommendations_parallel_per_buyer(cf_holdout_would_have, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 16693/16693 [03:42<00:00, 74.90it/s]


In [22]:
cf_holdout_would_have['input_buyer_nbr'].nunique()

16693

In [23]:
cf_holdout_would_have.to_excel('../data/results/cf_holdout_would_have_reco.xlsx',index=False)

In [6]:
data_low_test = data_low_test[['buyer_nbr','mbr_email','lot_nbr', 'lot_year', 'lot_make_cd', 'grp_model', 'acv', 'repair_cost', 'inv_dt']]
data_low_test

Unnamed: 0,buyer_nbr,mbr_email,lot_nbr,lot_year,lot_make_cd,grp_model,acv,repair_cost,inv_dt
0,845199,jdpcdp7280@gmail.com,64241495,2016,CHEV,CORVETTE,35661.00,30000.00,2025-10-01
1,440309,koci.besim@gmail.com,71565645,2010,SUBA,IMPREZA,8892.77,7119.03,2025-09-30
2,78097,windyvalleyfarmmatt@icloud.com,62702845,2020,RAM,PROMASTER,15120.00,1975.00,2025-08-08
3,511537,damianjames309@gmail.com,65201845,2011,TOYT,COROLLA,4214.00,3986.00,2025-08-04
4,471975,nidopod@gmail.com,55886865,2016,FORD,F250,22855.00,12639.11,2025-08-13
...,...,...,...,...,...,...,...,...,...
57609,37727,djackson.aui@gmail.com,56553155,2019,JEP,WRANGLER,26515.00,17993.23,2025-07-28
57610,45301,masonmollman@hotmail.com,64722285,2004,JEP,WRANGLER,0.00,0.00,2025-09-25
57611,633103,babys4444@gmail.com,61585715,2015,JEP,WRANGLER,20939.00,13405.12,2025-09-02
57612,870563,CSTOKES@NETWORKFACTORY.COM,60593265,2014,JEP,WRANGLER,12865.00,0.00,2025-07-24


In [7]:
data_low_test.rename(columns={
    'buyer_nbr': 'input_buyer_nbr',
    'lot_nbr': 'recommended_lot'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_low_test.rename(columns={


In [8]:
data2 = pd.read_csv('data/interim/future_lots.csv')

In [9]:
data2.isnull().sum()

lot_nbr             0
lot_year            0
lot_make_cd         0
grp_model           0
damage_type_desc    0
repair_cost         0
acv                 0
plug_lot_acv        0
auc_dt              0
proquote_amt        0
dtype: int64

In [10]:
def recommend_lots_for_buyer(buyer_id, buyer_lots_df, upcoming_df, top_k=6):
    from sklearn.metrics.pairwise import manhattan_distances
    import numpy as np
    import pandas as pd

    results = []
    used_lots = set()

    # ------------------------
    # STEP 1: One-to-one exact YMM + fallback
    # ------------------------
    for _, row in buyer_lots_df.iterrows():
        acv, repair = row['acv'], row['repair_cost']
        year, make, model = row['lot_year'], row['lot_make_cd'], row['grp_model']
        input_vec = np.array([[acv, repair]])

        ymm_candidates = upcoming_df[
            (upcoming_df['lot_year'] == year) &
            (upcoming_df['lot_make_cd'] == make) &
            (upcoming_df['grp_model'] == model)
        ][['lot_nbr', 'acv', 'repair_cost']].dropna()

        if ymm_candidates.empty:
            ymm_candidates = upcoming_df[['lot_nbr', 'acv', 'repair_cost']].dropna()

        ymm_candidates = ymm_candidates[~ymm_candidates['lot_nbr'].isin(used_lots)]
        if ymm_candidates.empty:
            continue

        ymm_candidates['manhattan_dist'] = manhattan_distances(
            ymm_candidates[['acv', 'repair_cost']].values, input_vec
        ).flatten()

        best_match = ymm_candidates.sort_values('manhattan_dist').iloc[0]

        results.append({
            'input_buyer_nbr': buyer_id,
            'original_lot': int(row['recommended_lot']),
            'recommended_lot': int(best_match['lot_nbr']),
            'manhattan_distance': float(best_match['manhattan_dist']),
            'source': 'Step 1 - YMM/Manhattan'
        })
        used_lots.add(int(best_match['lot_nbr']))

    # ------------------------
    # STEP 2: Try YMM on most recent interacted lot
    # ------------------------
    if len(results) < top_k:
        most_recent = buyer_lots_df.sort_values('inv_dt', ascending=False).iloc[0]
        acv, repair = most_recent['acv'], most_recent['repair_cost']
        year, make, model = most_recent['lot_year'], most_recent['lot_make_cd'], most_recent['grp_model']
        input_vec = np.array([[acv, repair]])

        ymm_candidates = upcoming_df[
            (upcoming_df['lot_year'] == year) &
            (upcoming_df['lot_make_cd'] == make) &
            (upcoming_df['grp_model'] == model)
        ][['lot_nbr', 'acv', 'repair_cost']].dropna()

        ymm_candidates = ymm_candidates[~ymm_candidates['lot_nbr'].isin(used_lots)]

        if not ymm_candidates.empty:
            ymm_candidates['manhattan_dist'] = manhattan_distances(
                ymm_candidates[['acv', 'repair_cost']].values, input_vec
            ).flatten()

            for _, r in ymm_candidates.sort_values('manhattan_dist').iterrows():
                results.append({
                    'input_buyer_nbr': buyer_id,
                    'original_lot': int(most_recent['recommended_lot']),
                    'recommended_lot': int(r['lot_nbr']),
                    'manhattan_distance': float(r['manhattan_dist']),
                    'source': 'Step 2 - Recent YMM/Manhattan'
                })
                used_lots.add(int(r['lot_nbr']))
                if len(results) >= top_k:
                    break

    # ------------------------
    # STEP 3: Global MAKE match on most recent lot
    # ------------------------
    if len(results) < top_k:
        acv, repair = most_recent['acv'], most_recent['repair_cost']
        make = most_recent['lot_make_cd']
        input_vec = np.array([[acv, repair]])

        make_candidates = upcoming_df[
            (upcoming_df['lot_make_cd'] == make)
        ][['lot_nbr', 'acv', 'repair_cost']].dropna()

        make_candidates = make_candidates[~make_candidates['lot_nbr'].isin(used_lots)]

        if not make_candidates.empty:
            make_candidates['manhattan_dist'] = manhattan_distances(
                make_candidates[['acv', 'repair_cost']].values, input_vec
            ).flatten()

            for _, r in make_candidates.sort_values('manhattan_dist').iterrows():
                results.append({
                    'input_buyer_nbr': buyer_id,
                    'original_lot': int(most_recent['recommended_lot']),
                    'recommended_lot': int(r['lot_nbr']),
                    'manhattan_distance': float(r['manhattan_dist']),
                    'source': 'Step 3 - Global Make/Manhattan'
                })
                used_lots.add(int(r['lot_nbr']))
                if len(results) >= top_k:
                    break

    # ------------------------
    # STEP 4: Global fallback - No filters at all
    # ------------------------
    if len(results) < top_k:
        acv, repair = most_recent['acv'], most_recent['repair_cost']
        input_vec = np.array([[acv, repair]])

        global_candidates = upcoming_df[['lot_nbr', 'acv', 'repair_cost']].dropna()
        global_candidates = global_candidates[~global_candidates['lot_nbr'].isin(used_lots)]

        if not global_candidates.empty:
            global_candidates['manhattan_dist'] = manhattan_distances(
                global_candidates[['acv', 'repair_cost']].values, input_vec
            ).flatten()

            for _, r in global_candidates.sort_values('manhattan_dist').iterrows():
                results.append({
                    'input_buyer_nbr': buyer_id,
                    'original_lot': int(most_recent['recommended_lot']),
                    'recommended_lot': int(r['lot_nbr']),
                    'manhattan_distance': float(r['manhattan_dist']),
                    'source': 'Step 4 - Global Fallback Manhattan'
                })
                used_lots.add(int(r['lot_nbr']))
                if len(results) >= top_k:
                    break

    return results


In [11]:
def refine_recommendations_parallel_per_buyer(reco_df, data2_df, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from tqdm import tqdm

    results = []
    futures = []

    grouped = list(reco_df.groupby('input_buyer_nbr'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for buyer_id, group_df in grouped:
            futures.append(executor.submit(recommend_lots_for_buyer, buyer_id, group_df, data2_df))

        for f in tqdm(as_completed(futures), total=len(futures), desc="Refining recos"):
            try:
                results.extend(f.result())  # each result is a list of 6 recos
            except Exception as e:
                print(f"⚠️ Skipped buyer due to error: {e}")

    return pd.DataFrame(results)

In [12]:
# final input:
# - data_low_test → contains columns: input_buyer_nbr, recommended_lot, acv, repair_cost, lot_year, lot_make_cd, grp_model, inv_dt
# - future_lots → upcoming inventory to match against

recommended_upcoming_df_lt6 = refine_recommendations_parallel_per_buyer(data_low_test, data2, max_workers=8)


Refining recos: 100%|██████████| 23684/23684 [03:34<00:00, 110.55it/s]


In [13]:
recommended_upcoming_df_lt6.isnull().sum()

input_buyer_nbr       0
original_lot          0
recommended_lot       0
manhattan_distance    0
source                0
dtype: int64

In [14]:
recommended_upcoming_df_lt6['input_buyer_nbr'].nunique()

23684

In [15]:
recommended_upcoming_df_lt6.to_excel('data/processed/recommended_onetoone_test.xlsx', index=False)

### Reco for holdout

In [16]:
data_low_holdout = pd.read_csv('data/interim/data_low_holdout.csv')
data_low_holdout.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Dealer,VT,57688775,64494,mountainviewautosalesservice@gmail.com,6600.0,2025-07-24,2015,SUBA,FORESTER,16445.0,11175.0,11579.0,6,5
1,Consumer,PR,63809825,767398,abdiel.xaviel@gmail.com,300.0,2025-09-19,2025,NISS,FRONTIER,38408.0,0.0,24984.93,20,3
2,Consumer,HI,60894405,299872,piginaloha@gmail.com,2400.0,2025-09-23,2020,SUBA,OUTBACK,22060.0,21175.0,15573.23,7,3
3,Consumer,SD,51295585,690560,michiel.steyn55@gmail.com,425.0,2025-09-19,2016,FORD,EXPEDITION,10613.0,17825.0,14889.6,13,1
4,Consumer,MT,47185205,141040,pandaman101crew@gmail.com,225.0,2025-09-02,2013,CHEV,SILVERADO,11164.0,14500.0,5881.12,12,1


In [17]:
data_low_holdout['buyer_nbr'].nunique()

23810

In [18]:
data_low_holdout.isnull().sum()

mbr_lic_type                       0
mbr_state                          0
lot_nbr                            0
buyer_nbr                          0
mbr_email                          0
max_bid                            0
inv_dt                             0
lot_year                           0
lot_make_cd                        0
grp_model                          0
acv                                0
plug_lot_acv                       0
repair_cost                        0
total_unique_buyers_on_that_lot    0
total_unique_lots_bid_by_buyers    0
dtype: int64

In [19]:
data_low_holdout = data_low_holdout[['buyer_nbr','mbr_email','lot_nbr', 'lot_year', 'lot_make_cd', 'grp_model', 'acv', 'repair_cost', 'inv_dt']]
data_low_holdout

Unnamed: 0,buyer_nbr,mbr_email,lot_nbr,lot_year,lot_make_cd,grp_model,acv,repair_cost,inv_dt
0,64494,mountainviewautosalesservice@gmail.com,57688775,2015,SUBA,FORESTER,16445.0,11579.00,2025-07-24
1,767398,abdiel.xaviel@gmail.com,63809825,2025,NISS,FRONTIER,38408.0,24984.93,2025-09-19
2,299872,piginaloha@gmail.com,60894405,2020,SUBA,OUTBACK,22060.0,15573.23,2025-09-23
3,690560,michiel.steyn55@gmail.com,51295585,2016,FORD,EXPEDITION,10613.0,14889.60,2025-09-19
4,141040,pandaman101crew@gmail.com,47185205,2013,CHEV,SILVERADO,11164.0,5881.12,2025-09-02
...,...,...,...,...,...,...,...,...,...
58069,156970,ajones3656@gmail.com,54426285,2011,JEP,WRANGLER,11269.0,11269.00,2025-08-07
58070,770546,770546cprt_dmmy_BROKERBDR_307147_30901@copart.com,61507695,2023,JEP,WRANGLER,31052.0,10714.65,2025-07-28
58071,227838,Eduarsoto2092@gmail.com,65616315,2016,JEP,WRANGLER,17550.0,0.00,2025-09-11
58072,828732,joeawyatt@gmail.com,82098545,2020,JEP,WRANGLER,32833.0,18604.98,2025-10-13


In [20]:
data_low_holdout.rename(columns={
    'buyer_nbr': 'input_buyer_nbr',
    'lot_nbr': 'recommended_lot'
}, inplace=True)

In [21]:
recommended_upcoming_df_lt6_holdout = refine_recommendations_parallel_per_buyer(data_low_holdout, data2, max_workers=8)


Refining recos: 100%|██████████| 23810/23810 [03:43<00:00, 106.51it/s]


In [22]:
recommended_upcoming_df_lt6_holdout.to_excel('data/would_have/recommended_onetoone_holdout.xlsx', index=False)

In [23]:
recommended_upcoming_df_lt6_holdout['input_buyer_nbr'].nunique()

23810