In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import manhattan_distances
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
# Read the one-to-one test split, then print how many distinct buyer_nbr and
# lot_nbr values it contains (one count per line).
one_to_one_test = pd.read_csv('../data/split/one_to_one_test.csv')

print(*one_to_one_test[['buyer_nbr', 'lot_nbr']].nunique(), sep='\n')

11250
15163


In [3]:
# Load the upcoming lots; this frame is passed as the candidate pool to every
# refinement call later in the notebook.
upcoming_lots = pd.read_csv("../data/processed/upcoming_lots.csv")

In [4]:
# Columns a candidate lot must have populated before it can be ranked.
_CANDIDATE_COLS = ['lot_nbr', 'acv', 'repair_cost']
# Vehicle attributes used to narrow the candidate pool.
_YMM_COLS = ['lot_year', 'lot_make_cd', 'grp_model']


def _same_ymm(candidates, lot):
    """Return the rows of ``candidates`` sharing year, make and model with ``lot``."""
    same = (
        (candidates['lot_year'] == lot['lot_year'])
        & (candidates['lot_make_cd'] == lot['lot_make_cd'])
        & (candidates['grp_model'] == lot['grp_model'])
    )
    return candidates[same]


def _rank_by_manhattan(candidates, acv, repair, used_lots):
    """Return the not-yet-used candidates ordered by Manhattan distance to (acv, repair)."""
    pool = candidates[~candidates['lot_nbr'].isin(used_lots)]
    if pool.empty:
        # Nothing to rank: skip validation so an empty pool never raises.
        return pool.assign(manhattan_dist=np.empty(0))

    target = np.array([acv, repair], dtype=float)
    points = pool[['acv', 'repair_cost']].to_numpy(dtype=float)
    if not (np.isfinite(target).all() and np.isfinite(points).all()):
        # A distance to a missing/infinite value is meaningless; fail loudly
        # instead of silently ranking on NaN.
        raise ValueError("acv and repair_cost must be finite to compute a Manhattan distance")

    distances = np.abs(points - target).sum(axis=1)
    # Stable sort: equally distant candidates keep their order from upcoming_df.
    return pool.assign(manhattan_dist=distances).sort_values('manhattan_dist', kind='stable')


def recommend_lots_for_buyer(buyer_id, buyer_lots_df, upcoming_df, top_k=6):
    """Recommend upcoming lots that resemble the lots in ``buyer_lots_df``.

    Similarity is the Manhattan distance over ``(acv, repair_cost)``. A lot is
    never recommended again once an earlier step has picked it.

    Step 1 emits at most one recommendation per row of ``buyer_lots_df``: the
    closest upcoming lot with the same year/make/model, or the closest lot
    overall when no upcoming lot shares that year/make/model. Step 1 is not
    capped by ``top_k``. If fewer than ``top_k`` recommendations exist after
    Step 1, the row with the greatest ``inv_dt`` becomes the anchor and the
    list is topped up, in order, from: lots with the anchor's
    year/make/model (Step 2), lots with the anchor's make (Step 3), and all
    lots (Step 4).

    Args:
        buyer_id: Value copied into ``input_buyer_nbr`` of every result.
        buyer_lots_df: DataFrame with columns ``recommended_lot``, ``acv``,
            ``repair_cost``, ``lot_year``, ``lot_make_cd``, ``grp_model`` and
            ``inv_dt`` (``inv_dt`` is only read when a top-up is needed).
        upcoming_df: DataFrame with columns ``lot_nbr``, ``acv``,
            ``repair_cost``, ``lot_year``, ``lot_make_cd`` and ``grp_model``.
            Rows missing ``lot_nbr``, ``acv`` or ``repair_cost`` are ignored.
        top_k: Target number of recommendations for the top-up steps.

    Returns:
        A list of dicts with keys ``input_buyer_nbr``, ``original_lot``,
        ``recommended_lot``, ``manhattan_distance`` and ``source``. Empty when
        ``buyer_lots_df`` has no rows.

    Raises:
        ValueError: If a distance has to be computed against a non-finite
            ``acv`` or ``repair_cost``.
    """
    results = []
    used_lots = set()

    if buyer_lots_df.empty:
        return results

    # Loop-invariant: drop unusable candidates once instead of once per lookup.
    candidates = upcoming_df[_CANDIDATE_COLS + _YMM_COLS].dropna(subset=_CANDIDATE_COLS)

    def collect(ranked, reference_lot, source, limit):
        # Record the `limit` closest candidates and mark them as used.
        for match in ranked.head(limit).itertuples(index=False):
            results.append({
                'input_buyer_nbr': buyer_id,
                'original_lot': int(reference_lot['recommended_lot']),
                'recommended_lot': int(match.lot_nbr),
                'manhattan_distance': float(match.manhattan_dist),
                'source': source
            })
            used_lots.add(int(match.lot_nbr))

    # Step 1: one closest match per lot in buyer_lots_df.
    for _, seen_lot in buyer_lots_df.iterrows():
        pool = _same_ymm(candidates, seen_lot)
        if pool.empty:
            # No upcoming lot with this year/make/model: search every lot.
            pool = candidates
        ranked = _rank_by_manhattan(pool, seen_lot['acv'], seen_lot['repair_cost'], used_lots)
        collect(ranked, seen_lot, 'Step 1 - YMM/Manhattan', 1)

    if len(results) >= top_k:
        return results

    # Steps 2-4: top up around the most recent lot, widening the pool each step.
    anchor = buyer_lots_df.sort_values('inv_dt', ascending=False).iloc[0]
    fallback_pools = (
        ('Step 2 - Recent YMM/Manhattan',
         lambda: _same_ymm(candidates, anchor)),
        ('Step 3 - Global Make/Manhattan',
         lambda: candidates[candidates['lot_make_cd'] == anchor['lot_make_cd']]),
        ('Step 4 - Global Fallback Manhattan',
         lambda: candidates),
    )
    for source, build_pool in fallback_pools:
        missing = top_k - len(results)
        if missing <= 0:
            break
        ranked = _rank_by_manhattan(build_pool(), anchor['acv'], anchor['repair_cost'], used_lots)
        collect(ranked, anchor, source, missing)

    return results


In [5]:
def refine_recommendations_parallel_per_buyer(reco_df, upcoming_df, max_workers=4, top_k=6):
    """Run ``recommend_lots_for_buyer`` for every buyer in ``reco_df`` on a thread pool.

    Column labels are stripped and lower-cased, then ``buyer_nbr`` and
    ``lot_nbr`` are renamed to ``input_buyer_nbr`` and ``recommended_lot``.
    This happens on a renamed copy, so the caller's ``reco_df`` is left
    untouched. Rows are grouped by ``input_buyer_nbr`` and one task is
    submitted per buyer.

    Args:
        reco_df: Per-buyer lots; must provide ``buyer_nbr`` or
            ``input_buyer_nbr`` after label normalisation, plus whatever
            columns ``recommend_lots_for_buyer`` reads.
        upcoming_df: Candidate lots, passed through unchanged to every task.
        max_workers: Size of the thread pool.
        top_k: Forwarded to ``recommend_lots_for_buyer``.

    Returns:
        A DataFrame with one row per recommendation, ordered by buyer in
        groupby order regardless of which task finished first. A buyer whose
        task raised is reported on stdout and contributes no rows. If no
        recommendation was produced, the frame is empty but still carries the
        result columns.
    """
    # rename() returns a new frame; assigning to reco_df.columns would rename
    # the caller's DataFrame as a hidden side effect.
    cleaned_labels = dict(zip(reco_df.columns, reco_df.columns.str.strip().str.lower()))
    buyer_recos = reco_df.rename(columns=cleaned_labels).rename(columns={
        'buyer_nbr': 'input_buyer_nbr',
        'lot_nbr': 'recommended_lot'
    })

    grouped = list(buyer_recos.groupby('input_buyer_nbr'))
    per_buyer = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which buyer each future belongs to so failures are traceable.
        future_to_buyer = {
            executor.submit(recommend_lots_for_buyer, buyer_id, group_df, upcoming_df, top_k): buyer_id
            for buyer_id, group_df in grouped
        }

        for future in tqdm(as_completed(future_to_buyer), total=len(future_to_buyer), desc="Refining recos"):
            buyer_id = future_to_buyer[future]
            try:
                per_buyer[buyer_id] = future.result()
            except Exception as e:
                # Best effort: one failing buyer must not abort the whole run.
                print(f"⚠️ Skipped buyer {buyer_id} due to error: {e}")

    # Assemble in submission order so the output does not depend on thread timing.
    results = []
    for buyer_id, _ in grouped:
        results.extend(per_buyer.get(buyer_id, []))

    if not results:
        return pd.DataFrame(columns=[
            'input_buyer_nbr', 'original_lot', 'recommended_lot',
            'manhattan_distance', 'source'
        ])
    return pd.DataFrame(results)

In [6]:
# Refine the one-to-one test split against the upcoming lots, using 8 worker threads.
one_to_one_test_reco = refine_recommendations_parallel_per_buyer(one_to_one_test, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 11250/11250 [01:28<00:00, 127.80it/s]


In [7]:
# Display the refined one-to-one test recommendations.
one_to_one_test_reco

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,source
0,617,84271655,71233705,1573.62,Step 1 - YMM/Manhattan
1,617,84271655,72042405,2388.24,Step 2 - Recent YMM/Manhattan
2,617,84271655,77395814,2518.52,Step 2 - Recent YMM/Manhattan
3,617,84271655,85086145,3016.27,Step 2 - Recent YMM/Manhattan
4,617,84271655,83852145,3412.52,Step 2 - Recent YMM/Manhattan
...,...,...,...,...,...
67495,984099,65579335,80752685,150.00,Step 1 - YMM/Manhattan
67496,984099,69138305,80088025,11055.00,Step 1 - YMM/Manhattan
67497,984099,64254235,62076005,25.00,Step 1 - YMM/Manhattan
67498,984099,68469665,69436705,26762.00,Step 1 - YMM/Manhattan


In [8]:
# Number of distinct buyers present in the refined output.
one_to_one_test_reco['input_buyer_nbr'].nunique()

11250

In [9]:
# Write the refined one-to-one test recommendations to Excel, omitting the index.
one_to_one_test_reco.to_excel('../data/results/onetoone_test_reco.xlsx',index=False)

In [10]:
# Read the one-to-one holdout split, then print how many distinct buyer_nbr and
# lot_nbr values it contains (one count per line).
one_to_one_holdout = pd.read_csv('../data/split/one_to_one_holdout.csv')

print(*one_to_one_holdout[['buyer_nbr', 'lot_nbr']].nunique(), sep='\n')

11258
15245


In [11]:
# Refine the one-to-one holdout split against the upcoming lots, using 8 worker threads.
one_to_one_holdout_reco = refine_recommendations_parallel_per_buyer(one_to_one_holdout, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 11258/11258 [01:29<00:00, 125.89it/s]


In [12]:
# Display the refined one-to-one holdout recommendations.
one_to_one_holdout_reco

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,source
0,406,80269295,72069185,2042.68,Step 1 - YMM/Manhattan
1,406,86244445,41840135,2007.78,Step 1 - YMM/Manhattan
2,406,80269295,85140265,2541.12,Step 2 - Recent YMM/Manhattan
3,406,80269295,71862725,3938.41,Step 2 - Recent YMM/Manhattan
4,406,80269295,68831635,4013.00,Step 2 - Recent YMM/Manhattan
...,...,...,...,...,...
67543,988440,81375995,63988545,4136.75,Step 1 - YMM/Manhattan
67544,988440,71806915,85835015,2412.41,Step 1 - YMM/Manhattan
67545,988440,70291305,80278775,666.00,Step 1 - YMM/Manhattan
67546,988440,85287725,68333575,410.00,Step 3 - Global Make/Manhattan


In [13]:
# Number of distinct buyers present in the refined output.
one_to_one_holdout_reco['input_buyer_nbr'].nunique()

11258

In [14]:
# Write the refined one-to-one holdout recommendations to Excel, omitting the index.
one_to_one_holdout_reco.to_excel('../data/results/onetoone_holdout_would_have_reco.xlsx',index=False)

In [15]:
# Read the past CF test recommendations, then print how many distinct
# input_buyer_nbr and recommended_lot values they contain (one count per line).
cf_test = pd.read_excel('../data/past_reco/cf_test_reco.xlsx')

print(*cf_test[['input_buyer_nbr', 'recommended_lot']].nunique(), sep='\n')

3618
12820


In [16]:
# Refine the past CF test recommendations against the upcoming lots, using 8 worker threads.
cf_test_reco = refine_recommendations_parallel_per_buyer(cf_test, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 3618/3618 [00:52<00:00, 68.77it/s]


In [17]:
# Number of distinct buyers present in the refined output.
cf_test_reco['input_buyer_nbr'].nunique()

3618

In [18]:
# Write the refined CF test recommendations to Excel, omitting the index.
cf_test_reco.to_excel('../data/results/cf_test_reco.xlsx',index=False)

In [19]:
# Read the past CF holdout recommendations, then print how many distinct
# input_buyer_nbr and recommended_lot values they contain (one count per line).
cf_holdout_would_have = pd.read_excel('../data/past_reco/cf_holdout_would_have_reco.xlsx')

print(*cf_holdout_would_have[['input_buyer_nbr', 'recommended_lot']].nunique(), sep='\n')

3694
12566


In [20]:
# Refine the past CF holdout recommendations against the upcoming lots, using 8 worker threads.
cf_holdout_would_have_reco = refine_recommendations_parallel_per_buyer(cf_holdout_would_have, upcoming_lots, max_workers=8)

Refining recos: 100%|██████████| 3694/3694 [00:54<00:00, 67.42it/s]


In [21]:
# Number of distinct buyers present in the refined output. This checks the
# refined frame (`*_reco`), not the input frame that was read from Excel.
cf_holdout_would_have_reco['input_buyer_nbr'].nunique()

3694

In [22]:
# Write the refined CF holdout recommendations to Excel, omitting the index.
# The refined frame (`*_reco`) is saved; writing `cf_holdout_would_have` would
# only copy the unrefined input into the results folder.
cf_holdout_would_have_reco.to_excel('../data/results/cf_holdout_would_have_reco.xlsx',index=False)