In [2]:
import numpy as np
import pandas as pd
import faiss
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
cf_test = pd.read_csv('../data/split/cf_test.csv')
print(cf_test['buyer_nbr'].nunique())
print(cf_test['lot_nbr'].nunique())

16486
679207


In [4]:
def build_encoders(data: pd.DataFrame):
    """Fit label encoders for buyer and lot IDs."""
    buyer_encoder = LabelEncoder()
    lot_encoder = LabelEncoder()

    buyer_ids = buyer_encoder.fit_transform(data['buyer_nbr'])
    lot_ids = lot_encoder.fit_transform(data['lot_nbr'])

    return buyer_encoder, lot_encoder, buyer_ids, lot_ids

In [5]:
def build_sparse_matrix(data: pd.DataFrame, buyer_ids, lot_ids):
    """Create a buyer-lot sparse matrix weighted by max_bid."""
    max_bid_values = data['max_bid'].fillna(0).astype(float)
    n_buyers = len(np.unique(buyer_ids))
    n_lots = len(np.unique(lot_ids))

    sparse_matrix = csr_matrix((max_bid_values, (buyer_ids, lot_ids)), shape=(n_buyers, n_lots))
    return sparse_matrix


In [6]:
def train_als_model(sparse_matrix, factors=32, regularization=0.5, iterations=30, use_gpu=False):
    """Train implicit ALS model."""
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        use_gpu=use_gpu
    )
    model.fit(sparse_matrix)
    return model

In [7]:
def extract_embeddings(als_model):
    """Extract buyer and lot embeddings from the trained ALS model."""
    buyer_embeddings = als_model.user_factors.astype('float32')
    lot_embeddings = als_model.item_factors.astype('float32')
    return buyer_embeddings, lot_embeddings


In [8]:
def build_faiss_index(buyer_embeddings):
    """Build FAISS index using normalized buyer embeddings."""
    faiss.normalize_L2(buyer_embeddings)
    dim = buyer_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(buyer_embeddings)
    return index

In [9]:
def get_similar_buyers_faiss(input_buyer_id, buyer_encoder, buyer_embeddings, faiss_index, als_model,top_k=5):
    """Return top-k similar buyers for a given buyer using FAISS."""
    if input_buyer_id not in buyer_encoder.classes_:
        raise ValueError("Buyer not in training data")

    internal_buyer_id = buyer_encoder.transform([input_buyer_id])[0]

    # Get query embedding and normalize
    query_vec = als_model.user_factors[internal_buyer_id].astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_vec)

    distances, indices = faiss_index.search(query_vec, top_k + 1)
    similar_ids = indices[0]
    similar_ids = [i for i in similar_ids if i != internal_buyer_id][:top_k]
    similar_buyers = buyer_encoder.inverse_transform(similar_ids)

    return similar_buyers


In [10]:
def recommend_lots_cosine_from_similar_buyers(input_buyer_id, data, buyer_encoder, lot_encoder, buyer_embeddings, lot_embeddings,
    als_model, faiss_index, top_k_buyers=5, top_k_lots=6):
    """Generate lot recommendations for a buyer based on similar buyers' behavior."""
    # Step 1: Find similar buyers
    similar_buyers = get_similar_buyers_faiss(
        input_buyer_id, buyer_encoder, buyer_embeddings, faiss_index, als_model, top_k=top_k_buyers
    )

    # Step 2: Get lots interacted by similar buyers
    sim_buyer_lots = data[data['buyer_nbr'].isin(similar_buyers)]
    candidate_lot_ids = sim_buyer_lots['lot_nbr'].unique()

    # Step 3: Remove already seen lots
    input_buyer_lot_ids = data[data['buyer_nbr'] == input_buyer_id]['lot_nbr'].unique()
    candidate_lot_ids = list(set(candidate_lot_ids) - set(input_buyer_lot_ids))
    if not candidate_lot_ids:
        return pd.DataFrame()

    # Step 4: Get buyer and lot embeddings
    input_buyer_idx = buyer_encoder.transform([input_buyer_id])[0]
    buyer_vec = buyer_embeddings[input_buyer_idx].reshape(1, -1)
    lot_indices = lot_encoder.transform(candidate_lot_ids)
    lot_vecs = lot_embeddings[lot_indices]
    faiss.normalize_L2(lot_vecs)

    # Step 5: Cosine similarity = dot product of normalized vectors
    cosine_scores = np.dot(lot_vecs, buyer_vec.T).flatten()

    # Step 6: Top lots
    top_indices = np.argsort(-cosine_scores)[:top_k_lots]
    top_lot_ids = [candidate_lot_ids[i] for i in top_indices]
    top_scores = cosine_scores[top_indices]

    # Step 7: Build recommendation DataFrame
    top_rows = []
    for lot_id, score in zip(top_lot_ids, top_scores):
        matching_rows = sim_buyer_lots[(sim_buyer_lots['lot_nbr'] == lot_id) & (sim_buyer_lots['buyer_nbr'].isin(similar_buyers))]

        if matching_rows.empty:
            continue  # skip this lot if no matching similar buyer row found

        row = matching_rows.iloc[0]

        top_rows.append({
            'input_buyer_nbr': input_buyer_id,
            'mbr_email': row['mbr_email'],
            'recommended_lot': lot_id,
            'lot_year': row['lot_year'],
            'lot_make_cd': row['lot_make_cd'],
            'grp_model': row['grp_model'],
            'acv': row['acv'],
            'repair_cost': row['repair_cost'],
            'inv_dt': row['inv_dt'],
            'cosine_similarity': score
        })

    return pd.DataFrame(top_rows)

In [11]:
def run_batch_recommendations(data):

    print("\nBuilding encoders and sparse matrix...")
    buyer_encoder, lot_encoder, buyer_ids, lot_ids = build_encoders(data)
    sparse_matrix = build_sparse_matrix(data, buyer_ids, lot_ids)

    print("Training ALS model...")
    als_model = train_als_model(sparse_matrix)

    print("Extracting embeddings and building FAISS index...")
    buyer_embeddings, lot_embeddings = extract_embeddings(als_model)
    faiss_index = build_faiss_index(buyer_embeddings)

    print("Generating recommendations...")
    all_buyers = data['buyer_nbr'].unique()
    all_recos = []

    for buyer in tqdm(all_buyers):
        try:
            df = recommend_lots_cosine_from_similar_buyers(
                input_buyer_id=buyer,
                data=data,
                buyer_encoder=buyer_encoder,
                lot_encoder=lot_encoder,
                buyer_embeddings=buyer_embeddings,
                lot_embeddings=lot_embeddings,
                als_model=als_model,
                faiss_index=faiss_index
            )
            if not df.empty:
                all_recos.append(df)
        except Exception as e:
            print(f"⚠️ Error for buyer {buyer}: {e}")

    recommendations_df = pd.concat(all_recos, ignore_index=True)
    return recommendations_df, buyer_encoder, buyer_embeddings, faiss_index, als_model

In [12]:
cf_test_past_reco, be, be_emb, fi, als = run_batch_recommendations(cf_test)


Building encoders and sparse matrix...
Training ALS model...


100%|██████████| 30/30 [00:16<00:00,  1.82it/s]


Extracting embeddings and building FAISS index...
Generating recommendations...


100%|██████████| 16486/16486 [06:31<00:00, 42.12it/s]


In [13]:
cf_test_past_reco

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,inv_dt,source_buyer_nbr,cosine_similarity
0,67677,juliorozna@hotmail.com,67557145,2016,TOYT,TACOMA,23464.0,0.00,2025-08-19,518543,0.651200
1,67677,juliorozna@hotmail.com,66901555,2000,CHEV,SILVERADO,7920.0,5403.09,2025-08-21,518543,0.638709
2,67677,juliorozna@hotmail.com,55647485,2017,FORD,F150,21042.0,13487.21,2025-08-18,518543,0.636010
3,67677,juliorozna@hotmail.com,66919015,2014,GMC,SIERRA,15033.0,7977.29,2025-09-02,518543,0.627677
4,67677,ztmotors@yahoo.com,68432375,2017,CHEV,CAMARO,34608.0,28214.78,2025-08-21,870295,0.624715
...,...,...,...,...,...,...,...,...,...,...,...
98911,564955,savelo@gmail.com,61808535,2016,TOYT,CAMRY,12580.0,7986.12,2025-08-04,139553,0.859184
98912,564955,savelo@gmail.com,65421985,2015,TOYT,CAMRY,8289.0,9814.13,2025-08-01,139553,0.847755
98913,564955,savelo@gmail.com,43920205,2020,TOYT,CAMRY,18669.0,18229.00,2025-07-31,139553,0.839058
98914,564955,savelo@gmail.com,57503425,2017,MAZD,6,16743.0,17677.85,2025-07-31,139553,0.824435


In [14]:
get_similar_buyers_faiss(438059, be, be_emb, fi, als)

array([751839, 814569, 232575, 479301, 721495])

In [15]:
cf_test_past_reco['input_buyer_nbr'].nunique()

16486

In [16]:
cf_test_past_reco.to_excel('../data/past_reco/cf_test_reco.xlsx',index=False)

### CF holdout would have

In [17]:
cf_holdout = pd.read_csv("../data/split/cf_holdout.csv")

In [18]:
cf_holdout_would_have_reco, be, be_emb, fi, als = run_batch_recommendations(cf_holdout)


Building encoders and sparse matrix...
Training ALS model...


100%|██████████| 30/30 [00:15<00:00,  1.88it/s]


Extracting embeddings and building FAISS index...
Generating recommendations...


100%|██████████| 16693/16693 [06:35<00:00, 42.21it/s]


In [19]:
cf_holdout_would_have_reco

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,inv_dt,source_buyer_nbr,cosine_similarity
0,689940,josewilson15@myyahoo.com,80381845,2021,RAM,1500,30850.00,0.00,2025-10-14,590036,0.977582
1,689940,ivans500@hotmail.com,67398015,2016,HOND,PILOT,14475.00,0.00,2025-08-10,435606,0.976755
2,689940,ivans500@hotmail.com,66383305,2021,CHEV,BLAZER,20478.00,15504.98,2025-08-19,435606,0.976563
3,689940,josewilson15@myyahoo.com,64571745,2017,KIA,SPORTAGE,12155.98,8665.71,2025-10-07,590036,0.976517
4,689940,denisvasilik@yahoo.com,60545795,2008,TOYT,COROLLA,5060.39,5060.39,2025-09-03,836014,0.976031
...,...,...,...,...,...,...,...,...,...,...,...
100153,560266,littlebody76@yahoo.com,87273624,2001,TOYT,TACOMA,0.00,0.00,2025-07-30,890532,0.740654
100154,560266,littlebody76@yahoo.com,70381505,2018,GMC,SIERRA,18760.00,11247.55,2025-10-06,890532,0.705407
100155,560266,littlebody76@yahoo.com,61206085,2012,CHEV,SILVERADO,6433.00,7375.87,2025-09-29,890532,0.693949
100156,560266,littlebody76@yahoo.com,67284685,2007,GMC,SIERRA,13709.14,12644.08,2025-09-24,890532,0.687807


In [20]:
cf_holdout_would_have_reco['input_buyer_nbr'].nunique()

16693

In [21]:
cf_holdout_would_have_reco.to_excel('../data/past_reco/cf_holdout_would_have_reco.xlsx',index=False)

### For BQ push

In [23]:
cf_test_past_reco.head()

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,inv_dt,source_buyer_nbr,cosine_similarity
0,67677,juliorozna@hotmail.com,67557145,2016,TOYT,TACOMA,23464.0,0.0,2025-08-19,518543,0.6512
1,67677,juliorozna@hotmail.com,66901555,2000,CHEV,SILVERADO,7920.0,5403.09,2025-08-21,518543,0.638709
2,67677,juliorozna@hotmail.com,55647485,2017,FORD,F150,21042.0,13487.21,2025-08-18,518543,0.63601
3,67677,juliorozna@hotmail.com,66919015,2014,GMC,SIERRA,15033.0,7977.29,2025-09-02,518543,0.627677
4,67677,ztmotors@yahoo.com,68432375,2017,CHEV,CAMARO,34608.0,28214.78,2025-08-21,870295,0.624715


In [24]:
cf_holdout_would_have_reco.head()

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,inv_dt,source_buyer_nbr,cosine_similarity
0,689940,josewilson15@myyahoo.com,80381845,2021,RAM,1500,30850.0,0.0,2025-10-14,590036,0.977582
1,689940,ivans500@hotmail.com,67398015,2016,HOND,PILOT,14475.0,0.0,2025-08-10,435606,0.976755
2,689940,ivans500@hotmail.com,66383305,2021,CHEV,BLAZER,20478.0,15504.98,2025-08-19,435606,0.976563
3,689940,josewilson15@myyahoo.com,64571745,2017,KIA,SPORTAGE,12155.98,8665.71,2025-10-07,590036,0.976517
4,689940,denisvasilik@yahoo.com,60545795,2008,TOYT,COROLLA,5060.39,5060.39,2025-09-03,836014,0.976031


In [32]:
import pandas as pd
import datetime
from datetime import datetime, timedelta
import pytz

def format_and_concat_two_groups(df1, df2, group1="test", group2="would_have", identifier=1):
    def format_one(df, group_label):
        df = df.copy()
        df['identifier'] = identifier
        df['group'] = group_label
        df = df[['identifier', 'group', 'input_buyer_nbr', 'recommended_lot']]
        df['rank'] = df.groupby('input_buyer_nbr').cumcount() + 1
        pivoted = df.pivot(index=['identifier', 'group', 'input_buyer_nbr'],
                           columns='rank',
                           values='recommended_lot').reset_index()
        pivoted.columns = [
            f'lot_{int(col)}' if isinstance(col, int) else col
            for col in pivoted.columns
        ]
        lot_cols = [f'lot_{i}' for i in range(1, 7)]
        for col in lot_cols:
            if col not in pivoted.columns:
                pivoted[col] = 0
        pivoted = pivoted[['identifier', 'group', 'input_buyer_nbr'] + lot_cols]
        pivoted[lot_cols] = pivoted[lot_cols].fillna(0).astype(int)
        return pivoted

    df1_formatted = format_one(df1, group1)
    df2_formatted = format_one(df2, group2)

    combined = pd.concat([df1_formatted, df2_formatted], ignore_index=True)

    cst = pytz.timezone('US/Central')

    now_cst = datetime.now(cst)
    next_day_7am_cst = (now_cst + timedelta(days=1)).replace(hour=7, minute=0, second=0, microsecond=0)

    combined['created_at'] = now_cst
    combined['sent_at'] = next_day_7am_cst
    return combined


In [33]:
combined_cf = format_and_concat_two_groups(df1=cf_test_past_reco,df2=cf_holdout_would_have_reco,group1="test",group2="would_have",identifier=1)

In [34]:
combined_cf.groupby(['identifier', 'group'])['input_buyer_nbr'].nunique()

identifier  group     
1           test          16486
            would_have    16693
Name: input_buyer_nbr, dtype: int64

In [36]:
combined_cf.head()

Unnamed: 0,identifier,group,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6,created_at,sent_at
0,1,test,75,68343725,65582345,63489295,68858295,85924905,70952715,2025-10-27 14:06:59.382688-05:00,2025-10-28 07:00:00-05:00
1,1,test,107,65363455,69821515,66562805,66684345,63452495,66751235,2025-10-27 14:06:59.382688-05:00,2025-10-28 07:00:00-05:00
2,1,test,153,65718995,68334535,70014105,70579665,66926725,64571955,2025-10-27 14:06:59.382688-05:00,2025-10-28 07:00:00-05:00
3,1,test,223,62334645,66906415,70962795,71680685,69975815,45607915,2025-10-27 14:06:59.382688-05:00,2025-10-28 07:00:00-05:00
4,1,test,295,68773125,69539685,81613565,60546995,63511715,56038715,2025-10-27 14:06:59.382688-05:00,2025-10-28 07:00:00-05:00


In [38]:
from datetime import datetime, timedelta
import pytz

def save_combined_df_to_excel(combined_cf, save_dir="../data/final", prefix="recommendations_past_reco"):
    # Get current CST time
    cst = pytz.timezone('US/Central')
    now_cst = datetime.now(cst)

    # Get tomorrow's date as string
    tomorrow_date = (now_cst + timedelta(days=1)).strftime("%Y-%m-%d")

    # Build file path
    file_path = f"{save_dir}/{prefix}_{tomorrow_date}.xlsx"

    # Drop timezone from datetime columns (Excel doesn't support timezones)
    for col in ['created_at', 'sent_at']:
        if col in combined_cf.columns:
            combined_cf[col] = pd.to_datetime(combined_cf[col]).dt.tz_localize(None)

    # Save to Excel
    combined_cf.to_excel(file_path, index=False)
    print(f"✅ File saved successfully as: {file_path}")

save_combined_df_to_excel(combined_cf)


✅ File saved successfully as: ../data/final/recommendations_past_reco_2025-10-28.xlsx


In [39]:
from google.cloud import bigquery

def upload_to_bigquery(dataframe, table_id, project_id, credentials_path):
    """
    Uploads a DataFrame to a BigQuery table.

    Args:
        dataframe (pd.DataFrame): The DataFrame to upload.
        table_id (str): The BigQuery table ID in the format `dataset.table`.
        project_id (str): The GCP project ID.
        credentials_path (str): Path to the service account JSON credentials file.
    """
    # Initialize BigQuery client
    client = bigquery.Client.from_service_account_json(credentials_path)

    # Define job configuration
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_APPEND",  # Appends data to the table if it exists
        autodetect=True,  # Automatically detects schema
    )

    # Upload the DataFrame to BigQuery
    job = client.load_table_from_dataframe(dataframe, table_id, job_config=job_config)

    # Wait for the job to complete
    job.result()

    print(f"Data appended to {table_id} in project {project_id}.")

# Example usage
upload_to_bigquery(
    dataframe=combined_cf,  # Replace with your DataFrame
    table_id="member_reco.member_past_reco_match",  # Replace with your dataset and table name
    project_id="cprtqa_strategicanalytics-sp1",  # Replace with your GCP project ID
    credentials_path='/Users/srdeo/OneDrive - Copart, Inc/cprtqa-strategicanalytics-sp1-8b7a00c4fbae.json'  # Replace with your credentials file path
)




Data appended to member_reco.member_past_reco_match in project cprtqa_strategicanalytics-sp1.


In [48]:
### Build the buyer-lot matrix
#buyer_lot_matrix = data_high.pivot_table(
#    index='buyer_nbr',
#    columns='lot_nbr',
#    values='max_bid',
#    fill_value=0
#)

#buyer_lot_matrix = buyer_lot_matrix.astype(int)
#buyer_lot_matrix

### Traditional matrix will not be optimal. Hence creating CSR matrix

In [63]:
### CSR matrix

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

buyer_encoder = LabelEncoder()
lot_encoder = LabelEncoder()

buyer_ids = buyer_encoder.fit_transform(cf_test['buyer_nbr'])
lot_ids = lot_encoder.fit_transform(cf_test['lot_nbr'])

# Max bid values for matrix
max_bid_values = cf_test['max_bid'].fillna(0).astype(float)

# Create CSR matrix (buyers as rows, lots as columns)
n_buyers = len(buyer_encoder.classes_)
n_lots = len(lot_encoder.classes_)

buyer_lot_max_bid_sparse = csr_matrix(
    (max_bid_values, (buyer_ids, lot_ids)),
    shape=(n_buyers, n_lots)
)
print(buyer_lot_max_bid_sparse.shape)


(16498, 689730)


In [64]:
from implicit.als import AlternatingLeastSquares

In [65]:
als_model = AlternatingLeastSquares(
    factors=32,         # Embedding dimension
    regularization=0.5, # Controls overfitting
    iterations=30,      # Number of ALS training iterations
    use_gpu=False       # Set to True if you want to use GPU
)

# ALS expects (items x users), so transpose
# Implicit also assumes higher values = higher confidence
als_model.fit(buyer_lot_max_bid_sparse)

100%|██████████| 30/30 [00:16<00:00,  1.78it/s]


In [66]:
import faiss
import numpy as np

# Step 1: Normalize buyer embeddings
buyer_embeddings = als_model.user_factors.astype('float32')  # shape (n_buyers, dim)
lot_embeddings = als_model.item_factors.astype('float32')  # shape: (n_lots, dim)

# Step 2: Normalize both embeddings for cosine similarity
faiss.normalize_L2(buyer_embeddings)  # in-place normalization
faiss.normalize_L2(lot_embeddings)    # normalize lots too!

# Step 2: Create FAISS index (cosine similarity is L2 on normalized vectors)
faiss_index = faiss.IndexFlatIP(buyer_embeddings.shape[1])  # IP = inner product
faiss_index.add(buyer_embeddings)  # Add all buyer vectors to index

#### Get similar buyers

In [67]:
def get_similar_buyers_faiss(input_buyer_id, top_k=5):
    if input_buyer_id not in buyer_encoder.classes_:
        raise ValueError("Buyer not in training data")

    internal_buyer_id = buyer_encoder.transform([input_buyer_id])[0]

    # Step 1: Get embedding of input buyer
    query_vec = als_model.user_factors[internal_buyer_id].astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_vec)  # normalize for cosine

    # Step 2: Search FAISS index
    distances, indices = faiss_index.search(query_vec, top_k + 1)  # +1 to exclude self
    similar_ids = indices[0]

    # Step 3: Remove the buyer themselves if present
    similar_ids = [i for i in similar_ids if i != internal_buyer_id][:top_k]

    # Decode to original buyer_nbrs
    similar_buyers = buyer_encoder.inverse_transform(similar_ids)

    return similar_buyers


In [68]:
similar_buyers = get_similar_buyers_faiss(205873, top_k=5)

In [69]:
similar_buyers

array([626943, 866235, 419617, 700559, 686839])

In [71]:
import numpy as np
import pandas as pd

def recommend_lots_cosine_from_similar_buyers(input_buyer_id, top_k_buyers=5, top_k_lots=6):
    # Step 1: Get similar buyers
    similar_buyers = get_similar_buyers_faiss(input_buyer_id, top_k=top_k_buyers)

    # Step 2: Get lots interacted by similar buyers
    sim_buyer_lots = data_high_test[data_high_test['buyer_nbr'].isin(similar_buyers)]
    candidate_lot_ids = sim_buyer_lots['lot_nbr'].unique()

    # Step 3: Remove lots already interacted by the input buyer
    input_buyer_lot_ids = data_high_test[data_high_test['buyer_nbr'] == input_buyer_id]['lot_nbr'].unique()
    candidate_lot_ids = list(set(candidate_lot_ids) - set(input_buyer_lot_ids))

    if not candidate_lot_ids:
        return pd.DataFrame()  # No candidates to recommend

   # Step 4: Get buyer and lot embeddings (already normalized)
    input_buyer_idx = buyer_encoder.transform([input_buyer_id])[0]
    buyer_vec = buyer_embeddings[input_buyer_idx].reshape(1, -1)
    lot_indices = lot_encoder.transform(candidate_lot_ids)
    lot_vecs = lot_embeddings[lot_indices]

    # Step 5: Cosine similarity = dot product of normalized vectors
    cosine_scores = np.dot(lot_vecs, buyer_vec.T).flatten()

    # Step 6: Select top N lots with highest cosine score
    top_indices = np.argsort(-cosine_scores)[:top_k_lots]
    top_lot_ids = [candidate_lot_ids[i] for i in top_indices]
    top_scores = cosine_scores[top_indices]

    # Step 7: Build output
    top_rows = []
    for lot_id, score in zip(top_lot_ids, top_scores):
        row = data_high_test[data_high_test['lot_nbr'] == lot_id].iloc[0]
        top_rows.append({
            'input_buyer_nbr': input_buyer_id,
            'mbr_email': row['mbr_email'],
            'recommended_lot': lot_id,
            'lot_year': row['lot_year'],
            'lot_make_cd': row['lot_make_cd'],
            'grp_model': row['grp_model'],
            'acv': row['acv'],
            'repair_cost': row['repair_cost'],
            'cosine_similarity': score
        })

    return pd.DataFrame(top_rows)

In [72]:
from tqdm import tqdm

all_buyers = cf_test['buyer_nbr'].unique()
all_recos = []

for buyer in tqdm(all_buyers):
    try:
        df = recommend_lots_cosine_from_similar_buyers(buyer)
        all_recos.append(df)
    except Exception as e:
        print(f"⚠️ Error for buyer {buyer}: {e}")

recommendations_df = pd.concat(all_recos, ignore_index=True)

100%|██████████| 16498/16498 [07:25<00:00, 37.01it/s]


In [73]:
recommendations_df_copy = recommendations_df.copy()

In [74]:
recommendations_df_copy

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,cosine_similarity
0,39955,Motors5rs@outlook.com,71750285,2002,TOYT,TACOMA,9573.82,8094.44,0.675246
1,39955,krandall95@hotmail.com,67138785,2005,GMC,SIERRA,10741.50,10741.50,0.658057
2,39955,emr1vstar29@gmail.com,68749485,2010,TOYT,CAMRY,6467.00,11557.17,0.652185
3,39955,emr1vstar29@gmail.com,72063235,2012,HOND,CIVIC,9272.02,9272.02,0.651740
4,39955,odinaseric@gmail.com,81817225,1984,CHEV,EL CAMINO,9848.00,11780.58,0.650558
...,...,...,...,...,...,...,...,...,...
98983,335927,alexcristi@icloud.com,83992455,2002,NISS,QUEST,2200.00,6524.84,0.669703
98984,335927,donsbuying@fenixparts.com,82030925,2013,TOYT,RAV4,14754.00,13339.69,0.657292
98985,335927,spcompleteautorepair@gmail.com,85758245,2022,HOND,CIVIC,27557.00,0.00,0.654426
98986,335927,spcompleteautorepair@gmail.com,80475765,2015,NISS,SENTRA,8360.50,7910.28,0.653975


In [75]:
recommendations_df_copy[recommendations_df_copy['input_buyer_nbr']==205873]

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,cosine_similarity
18,205873,toddandsonsauto@gmail.com,64665515,2018,JEP,CHEROKEE,11849.2,11849.2,0.930395
19,205873,myv8autosales@gmail.com,68630945,2006,TOYT,RAV4,10926.2,6633.86,0.925143
20,205873,importmotoring@gmail.com,68976965,2012,CHEV,IMPALA,6507.5,7562.78,0.924479
21,205873,myv8autosales@gmail.com,67168945,2006,DODG,DAKOTA,8345.0,7101.82,0.914087
22,205873,toddandsonsauto@gmail.com,59636005,2010,JEP,CHEROKEE,5247.0,3370.85,0.910548
23,205873,northwestautosalesplus@gmail.com,66170405,2019,SUBA,CROSSTREK,14498.0,10590.54,0.906774


In [285]:
recommendations_df_copy['input_buyer_nbr'].nunique()

16498

In [286]:
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_rmse(als_model, interaction_matrix, sample_size=100_000, random_state=42):
    # nonzero entries (buyer, lot, true value)
    users, items = interaction_matrix.nonzero()
    vals = interaction_matrix.data.astype(np.float64)
    n = len(vals)
    if n == 0:
        raise ValueError("interaction_matrix has no non-zero entries")

    # sample for speed
    rng = np.random.default_rng(random_state)
    if sample_size and sample_size < n:
        idx = rng.choice(n, size=sample_size, replace=False)
    else:
        idx = np.arange(n)

    u_idx = users[idx]
    i_idx = items[idx]
    y_true = vals[idx]

    # ALS predictions = dot(user_vec, item_vec)
    u_vecs = als_model.user_factors[u_idx]
    i_vecs = als_model.item_factors[i_idx]
    y_pred = np.einsum("ij,ij->i", u_vecs, i_vecs)

    # sklearn-compatible RMSE (works on old/new versions)
    try:
        # new sklearn
        rmse = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        # old sklearn
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    return float(rmse)


In [287]:
rmse_value = calculate_rmse(als_model, buyer_lot_max_bid_sparse, sample_size=50000)
print(f"ALS RMSE: {rmse_value:.4f}")


ALS RMSE: 6222.1831


#### Future lots

In [294]:
data2 = pd.read_csv('data/interim/future_lots.csv')

In [295]:
data2

Unnamed: 0,lot_nbr,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,71123705,2020,HOND,PASSPORT,FRONT END,20089.12,23445.00,24225.0,2025-10-21,9536.78
1,62754925,2003,BMW,Z4,FRONT END,3200.00,3247.00,0.0,2025-10-22,1259.12
2,85277025,2020,INFI,QX50,FRONT END,21489.38,25417.00,21100.0,2025-10-22,6788.11
3,67924785,2013,AUDI,Q7,MECHANICAL,0.00,3380.00,6725.0,2025-10-22,858.85
4,86845485,2017,SUBA,WRX,MINOR DENT/SCRATCHES,0.00,15425.00,15425.0,2025-10-21,7563.92
...,...,...,...,...,...,...,...,...,...,...
45146,80203955,2017,JEP,WRANGLER,ALL OVER,20755.29,20449.00,22525.0,2025-10-24,6150.38
45147,71829755,2015,JEP,WRANGLER,VANDALISM,0.00,24500.00,16625.0,2025-10-21,12232.06
45148,85198465,2017,JEP,WRANGLER,MECHANICAL,16758.25,19574.96,18175.0,2025-10-24,7409.81
45149,81837765,2006,JEP,WRANGLER,FRONT END,0.00,8806.84,7275.0,2025-10-24,1238.04


In [296]:
data2.isnull().sum()

lot_nbr             0
lot_year            0
lot_make_cd         0
grp_model           0
damage_type_desc    0
repair_cost         0
acv                 0
plug_lot_acv        0
auc_dt              0
proquote_amt        0
dtype: int64

In [297]:
from sklearn.metrics.pairwise import manhattan_distances
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import numpy as np

# --- Step 1: Define per-row function
def process_row(row_dict, data2_df):
    year, make, model = row_dict['lot_year'], row_dict['lot_make_cd'], row_dict['grp_model']
    acv, repair = row_dict['acv'], row_dict['repair_cost']
    input_vec = np.array([[acv, repair]])

    # Step 1: Try YMM match
    ymm_match = data2_df[
        (data2_df['lot_year'] == year) &
        (data2_df['lot_make_cd'] == make) &
        (data2_df['grp_model'] == model)
    ]

    candidates = ymm_match.copy() if not ymm_match.empty else data2_df.copy()
    candidate_vecs = candidates[['acv', 'repair_cost']].values
    candidates['manhattan_dist'] = manhattan_distances(candidate_vecs, input_vec).flatten()
    best_match = candidates.sort_values('manhattan_dist').iloc[0]

    return {
        'input_buyer_nbr': row_dict['input_buyer_nbr'],
        'original_lot': row_dict['recommended_lot'],
        'recommended_lot': best_match['lot_nbr'],
        'manhattan_distance': best_match['manhattan_dist']
    }


In [298]:
from tqdm import tqdm

def refine_recommendations_parallel(reco_df, data2_df, max_workers=4):
    results = []
    futures = []

    # Convert rows to dictionaries
    row_dicts = reco_df.to_dict(orient='records')

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for row in row_dicts:
            futures.append(executor.submit(process_row, row, data2_df))

        for f in tqdm(as_completed(futures), total=len(futures), desc="Refining recos"):
            try:
                results.append(f.result())
            except Exception as e:
                print(f"⚠️ Skipped one row due to: {e}")

    return pd.DataFrame(results)


In [299]:
recommended_upcoming_df_gt6 = refine_recommendations_parallel(recommendations_df_copy, data2, max_workers=6)


Refining recos: 100%|██████████| 98988/98988 [03:49<00:00, 430.96it/s]


In [300]:
recommended_upcoming_df_gt6

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance
0,514361,80355025,85763995,2360.00
1,509689,71001835,80941725,0.00
2,438059,67144265,58424585,1435.00
3,899609,63557875,64784225,2437.30
4,771135,71073235,84928615,195.00
...,...,...,...,...
98983,335927,67550205,81331915,10333.00
98984,335927,57335575,58743135,1584.01
98985,335927,69567985,70641455,337.00
98986,335927,65034115,59233435,3153.33


In [1]:
recommendations_df_copy[recommendations_df_copy['input_buyer_nbr'] == 2581]

NameError: name 'recommendations_df_copy' is not defined

In [302]:
recommended_upcoming_df_gt6[recommended_upcoming_df_gt6['input_buyer_nbr'] == 1345]

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance
85624,1345,63399205,84012795,2011.63
85626,1345,49860375,84647465,5549.1
85627,1345,62561135,84970075,2291.85
85628,1345,62687425,80221785,3039.0
85630,1345,66360885,83850395,1976.91
85634,1345,58948375,66024135,402.0


In [303]:
recommended_upcoming_df_gt6.isnull().sum()

input_buyer_nbr       0
original_lot          0
recommended_lot       0
manhattan_distance    0
dtype: int64

In [304]:
recommended_upcoming_df_gt6['input_buyer_nbr'].nunique()

16498

In [305]:
recommended_upcoming_df_gt6.to_excel("data/processed/recommended_CF_test.xlsx", index=False)

### Westlake

In [244]:
df_wl = pd.read_csv('data/interim/wl_lots.csv')

In [245]:
df_wl

Unnamed: 0,lot_nbr,lot_stg,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,80762855,40,2017,MERZ,C-CLASS,NORMAL WEAR,7261.0,13220.0,15850.0,,0.00
1,85948435,40,2012,GMC,TERRAIN,FRONT END,2783.0,5260.0,6250.0,,0.00
2,85654195,40,2011,HOND,CIVIC,NORMAL WEAR,8092.0,6900.0,7975.0,,879.94
3,80588215,50,2011,GMC,SIERRA,MECHANICAL,11315.0,7440.0,8625.0,,1012.41
4,86334335,40,2017,TOYT,HIGHLANDER,SIDE,30808.0,17100.0,20850.0,,5008.09
...,...,...,...,...,...,...,...,...,...,...,...
319,84297605,50,2019,FORD,EXPLORER,FRONT END,5643.0,21420.0,24850.0,,2359.35
320,84837845,40,2016,CHEV,CRUZE,FRONT END,0.0,6900.0,8600.0,,0.00
321,85654165,40,2020,HYUN,SANTA FE,SIDE,6485.0,14620.0,17525.0,,4508.17
322,85544495,40,2006,LAND,LR3,MINOR DENT/SCRATCHES,835.0,3960.0,4000.0,,909.14


In [246]:
# If 'acv' is 0 or NaN, take value from 'plug_lot_acv'
df_wl['acv'] = df_wl['acv'].mask((df_wl['acv'] == 0) | (df_wl['acv'].isna()), df_wl['plug_lot_acv'])

In [247]:
df_wl

Unnamed: 0,lot_nbr,lot_stg,lot_year,lot_make_cd,grp_model,damage_type_desc,repair_cost,acv,plug_lot_acv,auc_dt,proquote_amt
0,80762855,40,2017,MERZ,C-CLASS,NORMAL WEAR,7261.0,13220.0,15850.0,,0.00
1,85948435,40,2012,GMC,TERRAIN,FRONT END,2783.0,5260.0,6250.0,,0.00
2,85654195,40,2011,HOND,CIVIC,NORMAL WEAR,8092.0,6900.0,7975.0,,879.94
3,80588215,50,2011,GMC,SIERRA,MECHANICAL,11315.0,7440.0,8625.0,,1012.41
4,86334335,40,2017,TOYT,HIGHLANDER,SIDE,30808.0,17100.0,20850.0,,5008.09
...,...,...,...,...,...,...,...,...,...,...,...
319,84297605,50,2019,FORD,EXPLORER,FRONT END,5643.0,21420.0,24850.0,,2359.35
320,84837845,40,2016,CHEV,CRUZE,FRONT END,0.0,6900.0,8600.0,,0.00
321,85654165,40,2020,HYUN,SANTA FE,SIDE,6485.0,14620.0,17525.0,,4508.17
322,85544495,40,2006,LAND,LR3,MINOR DENT/SCRATCHES,835.0,3960.0,4000.0,,909.14


In [248]:
from sklearn.metrics.pairwise import manhattan_distances
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import numpy as np

# --- Step 1: Define per-row function
def process_row(row_dict, data2_df):
    year, make, model = row_dict['lot_year'], row_dict['lot_make_cd'], row_dict['grp_model']
    acv, repair = row_dict['acv'], row_dict['repair_cost']
    input_vec = np.array([[acv, repair]])

    # --- Step 1: Try exact YMM (Year, Make, Model) match
    ymm_match = data2_df[
        (data2_df['lot_year'] == year) &
        (data2_df['lot_make_cd'] == make) &
        (data2_df['grp_model'] == model)
    ]

    if not ymm_match.empty:
        candidates = ymm_match.copy()
        fallback_reason = 'YMM match'

    # --- Step 2: If no YMM match, try MM (Make, Model) match
    else:
        mm_match = data2_df[
            (data2_df['lot_make_cd'] == make) &
            (data2_df['grp_model'] == model)
        ]
        if not mm_match.empty:
            candidates = mm_match.copy()
            fallback_reason = 'MM match'
        else:
            # --- Step 3: Fallback to all rows (only based on acv, repair)
            candidates = data2_df.copy()
            fallback_reason = 'No YMM/MM match'

    # Compute Manhattan distances
    candidate_vecs = candidates[['acv', 'repair_cost']].values
    candidates['manhattan_dist'] = manhattan_distances(candidate_vecs, input_vec).flatten()

    # Pick the closest match
    best_match = candidates.sort_values('manhattan_dist').iloc[0]

    return {
        'input_buyer_nbr': row_dict['input_buyer_nbr'],
        'original_lot': row_dict['recommended_lot'],
        'recommended_lot': best_match['lot_nbr'],
        'manhattan_distance': best_match['manhattan_dist'],
        'fallback_reason': fallback_reason  # Optional: track which step was used
    }


In [249]:
recommended_upcoming_df_wl_gt6 = refine_recommendations_parallel(recommendations_df_copy, df_wl, max_workers=6)

Refining recos: 100%|██████████| 99054/99054 [01:01<00:00, 1597.98it/s]


In [250]:
recommended_upcoming_df_wl_gt6

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,fallback_reason
0,127607,70503295,55223885,19674.00,MM match
1,175059,58639345,81348135,14077.71,MM match
2,175059,69602765,85544415,1794.00,MM match
3,649757,63895915,75679514,100.00,No YMM/MM match
4,175059,65615925,80929265,12754.76,MM match
...,...,...,...,...,...
99049,28777,70430715,80929255,2615.00,No YMM/MM match
99050,158785,67091135,83762415,132.00,No YMM/MM match
99051,470235,70221035,81528675,250.00,No YMM/MM match
99052,470235,68872315,85544545,750.77,No YMM/MM match


In [251]:
recommended_upcoming_df_wl_gt6['input_buyer_nbr'].nunique()

16509

In [252]:
westlake_members = pd.read_csv('data/raw/westlake_members.csv')

In [253]:
westlake_members_lst = westlake_members['buyer_nbr'].tolist()

In [254]:
abc = recommended_upcoming_df_wl_gt6[recommended_upcoming_df_wl_gt6['input_buyer_nbr'].isin(westlake_members_lst)]
abc

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,fallback_reason
1,175059,58639345,81348135,14077.71,MM match
2,175059,69602765,85544415,1794.00,MM match
4,175059,65615925,80929265,12754.76,MM match
5,659773,61739115,85544435,12896.27,MM match
6,659773,64311295,84837825,1279.00,MM match
...,...,...,...,...,...
98991,341511,62154655,81164295,11502.57,MM match
98993,341511,58437615,66832105,21109.83,YMM match
98997,341511,69486435,80588195,7773.00,MM match
99002,341511,64776795,70962555,51.60,No YMM/MM match


In [255]:
abc['input_buyer_nbr'].nunique()

3409

In [256]:
# Step 1: Min-Max Normalize the 'manhattan_distance' column
min_val = abc['manhattan_distance'].min()
max_val = abc['manhattan_distance'].max()

abc['manhattan_distance_normalized'] = (
    (abc['manhattan_distance'] - min_val) / (max_val - min_val)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abc['manhattan_distance_normalized'] = (


In [257]:
abc

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,fallback_reason,manhattan_distance_normalized
1,175059,58639345,81348135,14077.71,MM match,0.013800
2,175059,69602765,85544415,1794.00,MM match,0.001759
4,175059,65615925,80929265,12754.76,MM match,0.012503
5,659773,61739115,85544435,12896.27,MM match,0.012642
6,659773,64311295,84837825,1279.00,MM match,0.001254
...,...,...,...,...,...,...
98991,341511,62154655,81164295,11502.57,MM match,0.011276
98993,341511,58437615,66832105,21109.83,YMM match,0.020693
98997,341511,69486435,80588195,7773.00,MM match,0.007620
99002,341511,64776795,70962555,51.60,No YMM/MM match,0.000051


In [258]:
recommendations_df_copy[recommendations_df_copy['input_buyer_nbr'] == 653227]

Unnamed: 0,input_buyer_nbr,mbr_email,recommended_lot,lot_year,lot_make_cd,grp_model,acv,repair_cost,cosine_similarity
49590,653227,dependableautosportsllc@yahoo.com,66749645,2011,HOND,PILOT,0.0,0.0,0.716331
49591,653227,429959cprt_dmmy_BROKERBDR_388238_646628@copart...,67322265,2016,TOYT,RAV4,8320.0,0.0,0.693707
49592,653227,429959cprt_dmmy_BROKERBDR_388238_646628@copart...,66271405,2010,ACUR,TL,1.0,0.0,0.683602
49593,653227,191657-MSTRBDR-a0b769e0-af44-490b-9093-732fe8d...,62612665,2014,TOYT,PRIUS,0.0,0.0,0.679204
49594,653227,marcod77@live.com,64490755,2014,TOYT,PRIUS,0.0,0.0,0.678819
49595,653227,RAKHSHANITRD123@GMAIL.COM,81457705,2012,TOYT,PRIUS,2405.0,0.0,0.67744


In [259]:
# Rename 'recommended_lot' to 'original_lot' temporarily in recommendations_df_copy
recommendations_df_copy_renamed = recommendations_df_copy.rename(columns={'recommended_lot': 'original_lot'})

# Perform the merge on both input_buyer_nbr and original_lot
merged_df = abc.merge(
    recommendations_df_copy_renamed,
    how='left',
    on=['input_buyer_nbr', 'original_lot']
)


In [224]:
merged_df_short = merged_df[['input_buyer_nbr','original_lot','recommended_lot','manhattan_distance','manhattan_distance_normalized', 'acv','repair_cost']]
merged_df_short

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,manhattan_distance_normalized,acv,repair_cost
0,905135,71661045,84296275,31199.41,0.030584,43733.00,33840.41
1,660769,68030405,85381705,23767.86,0.023299,25885.00,23444.86
2,905135,57981245,81164375,49852.77,0.048869,46654.00,36746.77
3,660769,57577165,80588225,26152.00,0.025636,30623.00,22828.00
4,905135,62903665,84608695,15066.17,0.014769,39372.00,27112.17
...,...,...,...,...,...,...,...
20449,341511,62154655,81164295,11502.57,0.011276,7068.00,9721.57
20450,341511,69486435,80588195,7773.00,0.007620,19597.00,12276.00
20451,341511,58437615,66832105,21109.83,0.020693,18190.00,14899.83
20452,341511,64776795,70962555,51.60,0.000051,7132.56,9438.96


In [226]:
merged_df[(merged_df['acv']>1000) & (merged_df['repair_cost']>1000)]

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,fallback_reason,manhattan_distance_normalized,mbr_email,lot_year,lot_make_cd,grp_model,acv,repair_cost,cosine_similarity
0,905135,71661045,84296275,31199.41,MM match,0.030584,karlosdavtyan@yahoo.com,2024,TOYT,4RUNNER,43733.00,33840.41,0.761977
1,660769,68030405,85381705,23767.86,YMM match,0.023299,timkorchak@gmail.com,2022,HOND,CIVIC,25885.00,23444.86,0.674539
2,905135,57981245,81164375,49852.77,MM match,0.048869,336343-MSTRBDR-d1cf820b-f6cc-459c-a47f-9c77b92...,2025,RAM,1500,46654.00,36746.77,0.766689
3,660769,57577165,80588225,26152.00,MM match,0.025636,timkorchak@gmail.com,2025,HOND,CIVIC,30623.00,22828.00,0.715982
4,905135,62903665,84608695,15066.17,MM match,0.014769,sales@74auto.com,2023,CHEV,SILVERADO,39372.00,27112.17,0.790351
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20449,341511,62154655,81164295,11502.57,MM match,0.011276,marioac927@gmail.com,2019,JEP,RENEGADE,7068.00,9721.57,0.649155
20450,341511,69486435,80588195,7773.00,MM match,0.007620,blackhawkautos@gmail.com,2020,HOND,CIVIC,19597.00,12276.00,0.637506
20451,341511,58437615,66832105,21109.83,YMM match,0.020693,sumnerautosales281@gmail.com,2020,HOND,ACCORD,18190.00,14899.83,0.620764
20452,341511,64776795,70962555,51.60,No YMM/MM match,0.000051,marioac927@gmail.com,2005,TOYT,TUNDRA,7132.56,9438.96,0.649168


In [260]:
merged_df[(merged_df['fallback_reason'] == 'YMM match')]

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,fallback_reason,manhattan_distance_normalized,mbr_email,lot_year,lot_make_cd,grp_model,acv,repair_cost,cosine_similarity
6,659773,62591475,84300295,22456.00,YMM match,0.022013,mhvehicleenterprises@yahoo.com,2021,INFI,Q50,21988.0,21988.00,0.758903
12,660075,65472425,84061035,10009.00,YMM match,0.009812,n.americanauto@gmail.com,2021,TOYT,COROLLA,15030.0,0.00,0.743260
25,514361,68937515,54020485,2805.47,YMM match,0.002750,fhachem1010@gmail.com,2015,TOYT,COROLLA,7859.0,7090.53,0.908003
62,660769,52982175,85381705,13456.45,YMM match,0.013191,N.Eautorepair2001@gmail.com,2022,HOND,CIVIC,20952.0,17250.45,0.656515
68,660769,68030405,85381705,23767.86,YMM match,0.023299,timkorchak@gmail.com,2022,HOND,CIVIC,25885.0,23444.86,0.674539
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20430,537079,80181885,85592955,5580.00,YMM match,0.005470,ilaservicecenter@gmail.com,2016,DODG,CARAVAN,0.0,0.00,0.659087
20432,537079,60738655,70063705,7651.00,YMM match,0.007500,sergio_aldape@yahoo.com,2014,GMC,SIERRA,12359.0,11133.00,0.632715
20433,537079,67830965,85907675,20675.06,YMM match,0.020267,rapiditosprinting@yahoo.com,2015,GMC,SIERRA,11603.0,10453.06,0.632281
20434,537079,82225265,84837815,14462.00,YMM match,0.014177,rgmparts956@gmail.com,2014,CHEV,IMPALA,11425.0,0.00,0.631239


In [261]:
merged_df[(merged_df['fallback_reason'] == 'YMM match')]['manhattan_distance_normalized'].quantile(0.2)

np.float64(0.005322748163152678)

In [267]:
merged_df[(merged_df['fallback_reason'] == 'YMM match') &(merged_df['manhattan_distance_normalized'] < 0.1)]

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,fallback_reason,manhattan_distance_normalized,mbr_email,lot_year,lot_make_cd,grp_model,acv,repair_cost,cosine_similarity
6,659773,62591475,84300295,22456.00,YMM match,0.022013,mhvehicleenterprises@yahoo.com,2021,INFI,Q50,21988.0,21988.00,0.758903
12,660075,65472425,84061035,10009.00,YMM match,0.009812,n.americanauto@gmail.com,2021,TOYT,COROLLA,15030.0,0.00,0.743260
25,514361,68937515,54020485,2805.47,YMM match,0.002750,fhachem1010@gmail.com,2015,TOYT,COROLLA,7859.0,7090.53,0.908003
62,660769,52982175,85381705,13456.45,YMM match,0.013191,N.Eautorepair2001@gmail.com,2022,HOND,CIVIC,20952.0,17250.45,0.656515
68,660769,68030405,85381705,23767.86,YMM match,0.023299,timkorchak@gmail.com,2022,HOND,CIVIC,25885.0,23444.86,0.674539
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20430,537079,80181885,85592955,5580.00,YMM match,0.005470,ilaservicecenter@gmail.com,2016,DODG,CARAVAN,0.0,0.00,0.659087
20432,537079,60738655,70063705,7651.00,YMM match,0.007500,sergio_aldape@yahoo.com,2014,GMC,SIERRA,12359.0,11133.00,0.632715
20433,537079,67830965,85907675,20675.06,YMM match,0.020267,rapiditosprinting@yahoo.com,2015,GMC,SIERRA,11603.0,10453.06,0.632281
20434,537079,82225265,84837815,14462.00,YMM match,0.014177,rgmparts956@gmail.com,2014,CHEV,IMPALA,11425.0,0.00,0.631239


In [None]:
merged_df.to

In [263]:
merged_df[(merged_df['fallback_reason'] == 'YMM match') &(merged_df['manhattan_distance_normalized'] < 0.005)]['input_buyer_nbr'].nunique()

491

In [264]:
merged_df[(merged_df['fallback_reason'] == 'YMM match') &(merged_df['manhattan_distance_normalized'] < 0.005)]['recommended_lot'].nunique()

168

In [265]:
merged_df[(merged_df['fallback_reason'] == 'YMM match') &(merged_df['manhattan_distance_normalized'] < 0.005)]['input_buyer_nbr'].value_counts()

input_buyer_nbr
922717    3
844923    3
516451    3
743905    3
303845    2
         ..
112235    1
450417    1
355987    1
536903    1
896601    1
Name: count, Length: 491, dtype: int64

#### Reco for holdout

In [306]:
data_high_holdout = pd.read_csv("data/interim/data_high_holdout.csv")
data_high_holdout.head()

Unnamed: 0,mbr_lic_type,mbr_state,lot_nbr,buyer_nbr,mbr_email,max_bid,inv_dt,lot_year,lot_make_cd,grp_model,acv,plug_lot_acv,repair_cost,total_unique_buyers_on_that_lot,total_unique_lots_bid_by_buyers
0,Dealer,NV,62520895,835040,cncauto775@gmail.com,1950.0,2025-07-31,2015,BMW,X1,9778.0,6525.0,0.0,5,332
1,Dealer,NV,80901305,260760,olea8086@gmail.com,3250.0,2025-10-08,2017,ACUR,MDX,16297.0,16025.0,11752.67,5,223
2,Dismantler,RI,65463145,617140,copart.617140@picknpull.com,500.0,2025-09-03,2002,DODG,RAM 2500,7000.0,0.0,0.0,5,211
3,Dismantler,RI,68813285,794584,northendtowing101@yahoo.com,400.0,2025-09-17,2015,CHRY,MINIVAN,5965.0,7150.0,8133.38,5,1617
4,Consumer,MT,61917475,574298,chuck.raup@yahoo.com,600.0,2025-09-24,2018,HYUN,KONA,13028.92,12150.0,0.0,3,16


In [307]:
data_high_holdout['buyer_nbr'].nunique()

16749

In [308]:
data_high_holdout.isnull().sum()

mbr_lic_type                       0
mbr_state                          0
lot_nbr                            0
buyer_nbr                          0
mbr_email                          0
max_bid                            0
inv_dt                             0
lot_year                           0
lot_make_cd                        0
grp_model                          0
acv                                0
plug_lot_acv                       0
repair_cost                        0
total_unique_buyers_on_that_lot    0
total_unique_lots_bid_by_buyers    0
dtype: int64

In [309]:
### To include only popular lots
#data_high_holdout = data_high_holdout[(data_high_holdout['total_unique_buyers_on_that_lot']>=10)]
print(data_high_holdout['buyer_nbr'].nunique())
print(data_high_holdout['lot_nbr'].nunique())

16749
686014


In [310]:
### CSR matrix

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

buyer_encoder = LabelEncoder()
lot_encoder = LabelEncoder()

buyer_ids = buyer_encoder.fit_transform(data_high_holdout['buyer_nbr'])
lot_ids = lot_encoder.fit_transform(data_high_holdout['lot_nbr'])

# Max bid values for matrix
max_bid_values = data_high_holdout['max_bid'].fillna(0).astype(float)

# Create CSR matrix (buyers as rows, lots as columns)
n_buyers = len(buyer_encoder.classes_)
n_lots = len(lot_encoder.classes_)

buyer_lot_max_bid_sparse = csr_matrix(
    (max_bid_values, (buyer_ids, lot_ids)),
    shape=(n_buyers, n_lots)
)

print(buyer_lot_max_bid_sparse.shape)

(16749, 686014)


In [311]:
als_model = AlternatingLeastSquares(
    factors=32,         # Embedding dimension
    regularization=0.5, # Controls overfitting
    iterations=30,      # Number of ALS training iterations
    use_gpu=False       # Set to True if you want to use GPU
)

# ALS expects (items x users), so transpose
# Implicit also assumes higher values = higher confidence
als_model.fit(buyer_lot_max_bid_sparse)

100%|██████████| 30/30 [00:16<00:00,  1.80it/s]


In [312]:
import faiss
import numpy as np

# Step 1: Normalize buyer embeddings
buyer_embeddings = als_model.user_factors.astype('float32')  # shape (n_buyers, dim)
lot_embeddings = als_model.item_factors.astype('float32')  # shape: (n_lots, dim)

# Step 2: Normalize both embeddings for cosine similarity
faiss.normalize_L2(buyer_embeddings)  # in-place normalization
faiss.normalize_L2(lot_embeddings)    # normalize lots too!

# Step 2: Create FAISS index (cosine similarity is L2 on normalized vectors)
faiss_index = faiss.IndexFlatIP(buyer_embeddings.shape[1])  # IP = inner product
faiss_index.add(buyer_embeddings)  # Add all buyer vectors to index

In [313]:
def get_similar_buyers_faiss(input_buyer_id, top_k=5):
    if input_buyer_id not in buyer_encoder.classes_:
        raise ValueError("Buyer not in training data")

    internal_buyer_id = buyer_encoder.transform([input_buyer_id])[0]

    # Step 1: Get embedding of input buyer
    query_vec = als_model.user_factors[internal_buyer_id].astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_vec)  # normalize for cosine

    # Step 2: Search FAISS index
    distances, indices = faiss_index.search(query_vec, top_k + 1)  # +1 to exclude self
    similar_ids = indices[0]

    # Step 3: Remove the buyer themselves if present
    similar_ids = [i for i in similar_ids if i != internal_buyer_id][:top_k]

    # Decode to original buyer_nbrs
    similar_buyers = buyer_encoder.inverse_transform(similar_ids)

    return similar_buyers


In [314]:
import numpy as np
import pandas as pd

def recommend_lots_cosine_from_similar_buyers(input_buyer_id, top_k_buyers=5, top_k_lots=6):
    # Step 1: Get similar buyers
    similar_buyers = get_similar_buyers_faiss(input_buyer_id, top_k=top_k_buyers)

    # Step 2: Get lots interacted by similar buyers
    sim_buyer_lots = data_high_holdout[data_high_holdout['buyer_nbr'].isin(similar_buyers)]
    candidate_lot_ids = sim_buyer_lots['lot_nbr'].unique()

    # Step 3: Remove lots already interacted by the input buyer
    input_buyer_lot_ids = data_high_holdout[data_high_holdout['buyer_nbr'] == input_buyer_id]['lot_nbr'].unique()
    candidate_lot_ids = list(set(candidate_lot_ids) - set(input_buyer_lot_ids))

    if not candidate_lot_ids:
        return pd.DataFrame()  # No candidates to recommend

   # Step 4: Get buyer and lot embeddings (already normalized)
    input_buyer_idx = buyer_encoder.transform([input_buyer_id])[0]
    buyer_vec = buyer_embeddings[input_buyer_idx].reshape(1, -1)
    lot_indices = lot_encoder.transform(candidate_lot_ids)
    lot_vecs = lot_embeddings[lot_indices]

    # Step 5: Cosine similarity = dot product of normalized vectors
    cosine_scores = np.dot(lot_vecs, buyer_vec.T).flatten()

    # Step 6: Select top N lots with highest cosine score
    top_indices = np.argsort(-cosine_scores)[:top_k_lots]
    top_lot_ids = [candidate_lot_ids[i] for i in top_indices]
    top_scores = cosine_scores[top_indices]

    # Step 7: Build output
    top_rows = []
    for lot_id, score in zip(top_lot_ids, top_scores):
        row = data_high_holdout[data_high_holdout['lot_nbr'] == lot_id].iloc[0]
        top_rows.append({
            'input_buyer_nbr': input_buyer_id,
            'recommended_lot': lot_id,
            'lot_year': row['lot_year'],
            'lot_make_cd': row['lot_make_cd'],
            'grp_model': row['grp_model'],
            'acv': row['acv'],
            'repair_cost': row['repair_cost'],
            'cosine_similarity': score
        })

    return pd.DataFrame(top_rows)

In [315]:
from tqdm import tqdm

all_buyers = data_high_holdout['buyer_nbr'].unique()
all_recos = []

for buyer in tqdm(all_buyers):
    try:
        df = recommend_lots_cosine_from_similar_buyers(buyer)
        all_recos.append(df)
    except Exception as e:
        print(f"⚠️ Error for buyer {buyer}: {e}")

recommendations_df = pd.concat(all_recos, ignore_index=True)

100%|██████████| 16749/16749 [07:38<00:00, 36.55it/s]


In [316]:
recommendations_df.isnull().sum()

input_buyer_nbr      0
recommended_lot      0
lot_year             0
lot_make_cd          0
grp_model            0
acv                  0
repair_cost          0
cosine_similarity    0
dtype: int64

In [317]:
recommendations_df['input_buyer_nbr'].nunique()

16749

In [318]:
recommendations_df_copy = recommendations_df.copy()

In [319]:
recommendations_df_copy['input_buyer_nbr'].nunique()

16749

In [320]:
recommended_upcoming_df_gt6_holdout = refine_recommendations_parallel(recommendations_df_copy, data2, max_workers=6)


Refining recos: 100%|██████████| 100494/100494 [03:57<00:00, 423.37it/s]


In [321]:
recommended_upcoming_df_gt6_holdout['input_buyer_nbr'].nunique()

16749

In [322]:
recommended_upcoming_df_gt6_holdout.to_excel('data/would_have/data_high_holdout_reco.xlsx', index=False)