In [2]:
import pandas as pd

In [3]:
# Test-group recommendation workbooks (CF, one-to-one, nonactive).
cf_test_reco, one_to_one_test_reco, nonactive_test_reco = (
    pd.read_excel(workbook)
    for workbook in (
        '../data/results/cf_test_reco.xlsx',
        '../data/results/onetoone_test_reco.xlsx',
        '../data/results/nonactive_test_reco.xlsx',
    )
)

# Holdout-group recommendation workbooks, in the same recommender order.
cf_holdout_reco, one_to_one_holdout_reco, nonactive_holdout_reco = (
    pd.read_excel(workbook)
    for workbook in (
        '../data/results/cf_holdout_reco.xlsx',
        '../data/results/onetoone_holdout_reco.xlsx',
        '../data/results/nonactive_holdout_reco.xlsx',
    )
)

# "Would have" workbooks for the holdout group; only CF and one-to-one
# files are read here (no nonactive counterpart is loaded).
cf_holdout_would_have, one_to_one_holdout_would_have = (
    pd.read_excel(workbook)
    for workbook in (
        '../data/results/cf_holdout_would_have_reco.xlsx',
        '../data/results/onetoone_holdout_would_have_reco.xlsx',
    )
)

In [9]:
# Preview the first rows of the CF test recommendations to check the loaded columns.
cf_test_reco.head()

Unnamed: 0,input_buyer_nbr,original_lot,recommended_lot,manhattan_distance,source
0,223,62334645,61841485,1247.6,Step 1 - YMM/Manhattan
1,223,66906415,81628545,1883.0,Step 1 - YMM/Manhattan
2,223,70962795,85878315,9302.0,Step 1 - YMM/Manhattan
3,223,71680685,56195315,575.0,Step 1 - YMM/Manhattan
4,223,69975815,71277285,4394.88,Step 1 - YMM/Manhattan


In [25]:
import pandas as pd
from datetime import datetime, timedelta
import pytz

def rename_tag_concat_and_pivot(
    cf_test_reco,
    one_to_one_test_reco,
    nonactive_test_reco,
    cf_holdout_reco,
    one_to_one_holdout_reco,
    nonactive_holdout_reco,
    cf_holdout_would_have,
    one_to_one_holdout_would_have,
    max_lots=6,
):
    """
    Combine the per-recommender result frames into one wide table with one
    row per (identifier, group, input_buyer_nbr).

    Pipeline:
    1. Renames, where present:
       - 'mbr_nbr' → 'input_buyer_nbr'
       - 'recommended_lot_nbr' → 'recommended_lot'
    2. Keeps only ['input_buyer_nbr', 'recommended_lot'] (those that exist).
    3. Tags each frame with 'identifier' (1=CF, 2=One-to-One, 3=Nonactive)
       and 'group' (test / holdout / would_have).
    4. Concatenates 9 frames, duplicates kept. Only 8 are passed in: the
       nonactive 'would_have' frame is a copy of ``nonactive_holdout_reco``.
    5. Ranks each buyer's recommendations in their original row order and
       pivots them into columns lot_1 ... lot_<max_lots>.
    6. Converts the lot columns to int; missing slots become 0 and
       recommendations ranked beyond ``max_lots`` are dropped.
    7. Adds 'created_at' (current US/Central time) and 'sent_at'
       (07:00 US/Central on the following calendar day).

    Args:
        cf_test_reco, one_to_one_test_reco, nonactive_test_reco:
            Test-group recommendation frames.
        cf_holdout_reco, one_to_one_holdout_reco, nonactive_holdout_reco:
            Holdout-group recommendation frames.
        cf_holdout_would_have, one_to_one_holdout_would_have:
            'would_have' recommendation frames for the holdout group.
        max_lots (int): Number of lot columns to produce. Defaults to 6.

    Returns:
        pd.DataFrame: columns identifier, group, input_buyer_nbr,
        lot_1..lot_<max_lots>, created_at, sent_at. The input frames are
        not modified.

    Raises:
        ValueError: If ``max_lots`` is smaller than 1.
    """
    if max_lots < 1:
        raise ValueError("max_lots must be at least 1")

    key_cols = ['identifier', 'group', 'input_buyer_nbr']
    lot_cols = [f'lot_{i}' for i in range(1, max_lots + 1)]

    def _rename_and_trim(df):
        # rename() ignores keys that are absent, so no membership checks are needed.
        renamed = df.rename(columns={
            'mbr_nbr': 'input_buyer_nbr',
            'recommended_lot_nbr': 'recommended_lot',
        })
        keep_cols = [
            col for col in ('input_buyer_nbr', 'recommended_lot')
            if col in renamed.columns
        ]
        # Explicit copy: the result gets new columns later, and writing to a
        # bare column slice triggers pandas' SettingWithCopyWarning.
        return renamed[keep_cols].copy()

    # (frame, identifier, group) in the order the rows are stacked.
    sources = [
        # TEST
        (cf_test_reco, 1, 'test'),
        (one_to_one_test_reco, 2, 'test'),
        (nonactive_test_reco, 3, 'test'),

        # HOLDOUT
        (cf_holdout_reco, 1, 'holdout'),
        (one_to_one_holdout_reco, 2, 'holdout'),
        (nonactive_holdout_reco, 3, 'holdout'),

        # WOULD_HAVE (the nonactive entry reuses the nonactive holdout frame)
        (cf_holdout_would_have, 1, 'would_have'),
        (one_to_one_holdout_would_have, 2, 'would_have'),
        (nonactive_holdout_reco, 3, 'would_have'),
    ]

    tagged = [
        _rename_and_trim(df).assign(identifier=id_val, group=grp)
        for df, id_val, grp in sources
    ]
    combined_recos = pd.concat(tagged, ignore_index=True)[key_cols + ['recommended_lot']]

    # Position of each recommendation within its buyer, starting at 1.
    combined_recos = combined_recos.assign(
        rank=combined_recos.groupby(key_cols).cumcount() + 1
    )

    # Dropping ranks above max_lots before the pivot gives the same result as
    # discarding the surplus columns afterwards (every buyer still has rank 1).
    wide = combined_recos.loc[combined_recos['rank'] <= max_lots].pivot(
        index=key_cols,
        columns='rank',
        values='recommended_lot',
    )

    # Rename while the columns are still purely the rank labels, so the result
    # does not depend on whether a label is a Python int or a NumPy integer.
    wide.columns = [f'lot_{int(rank)}' for rank in wide.columns]

    # Guarantee every lot column exists, fill empty slots with 0, cast to int.
    wide = wide.reindex(columns=lot_cols).fillna(0).astype(int)
    pivoted = wide.reset_index()

    # Build the timestamps from the naive wall-clock time and localize
    # afterwards, so 'sent_at' carries the UTC offset valid on the target day
    # even when a DST change falls between now and tomorrow 07:00.
    now_cst = pd.Timestamp.now(tz='US/Central')
    next_day_7am_cst = (
        now_cst.tz_localize(None).normalize() + pd.Timedelta(days=1, hours=7)
    ).tz_localize('US/Central')

    pivoted['created_at'] = now_cst
    pivoted['sent_at'] = next_day_7am_cst

    return pivoted


In [26]:
# Run the full rename → tag → concat → pivot pipeline on the loaded frames.
# Arguments are passed by keyword so each frame is visibly matched to its slot.
final_pivoted_recos = rename_tag_concat_and_pivot(
    cf_test_reco=cf_test_reco,
    one_to_one_test_reco=one_to_one_test_reco,
    nonactive_test_reco=nonactive_test_reco,
    cf_holdout_reco=cf_holdout_reco,
    one_to_one_holdout_reco=one_to_one_holdout_reco,
    nonactive_holdout_reco=nonactive_holdout_reco,
    cf_holdout_would_have=cf_holdout_would_have,
    one_to_one_holdout_would_have=one_to_one_holdout_would_have,
)

# Quick look at the first rows of the result.
final_pivoted_recos.head()

Unnamed: 0,identifier,group,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6,created_at,sent_at
0,1,holdout,28,59312945,82164455,80790925,69942755,70914395,80687015,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
1,1,holdout,86,59312945,82164455,80790925,69942755,70914395,80687015,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
2,1,holdout,154,81887675,68198055,71482135,81285705,70871365,70933795,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
3,1,holdout,390,81295775,67637415,85781165,71059075,48435405,68577795,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
4,1,holdout,408,71166555,80366115,64550245,81246825,81661895,83858065,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00


In [27]:
# Display the pivoted table; the rendered output elides the middle rows.
final_pivoted_recos

Unnamed: 0,identifier,group,input_buyer_nbr,lot_1,lot_2,lot_3,lot_4,lot_5,lot_6,created_at,sent_at
0,1,holdout,28,59312945,82164455,80790925,69942755,70914395,80687015,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
1,1,holdout,86,59312945,82164455,80790925,69942755,70914395,80687015,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
2,1,holdout,154,81887675,68198055,71482135,81285705,70871365,70933795,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
3,1,holdout,390,81295775,67637415,85781165,71059075,48435405,68577795,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
4,1,holdout,408,71166555,80366115,64550245,81246825,81661895,83858065,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
...,...,...,...,...,...,...,...,...,...,...,...
335368,3,would_have,997866,82352255,68440305,80914135,85472465,83784285,85202385,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
335369,3,would_have,998042,82100865,86057595,81497045,67396195,85578555,87254885,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
335370,3,would_have,999222,68726465,60554255,84016405,81863605,65647295,88603765,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00
335371,3,would_have,999662,65906055,84026085,65647295,71742185,85584215,84012855,2025-10-27 13:35:48.782046-05:00,2025-10-28 07:00:00-05:00


In [28]:
# Sanity check: count missing values in every column of the pivoted table.
final_pivoted_recos.isnull().sum()

identifier         0
group              0
input_buyer_nbr    0
lot_1              0
lot_2              0
lot_3              0
lot_4              0
lot_5              0
lot_6              0
created_at         0
sent_at            0
dtype: int64

In [29]:
# Sanity check: number of distinct buyers for each (identifier, group) combination.
final_pivoted_recos.groupby(['identifier', 'group'])['input_buyer_nbr'].nunique()

identifier  group     
1           holdout       16693
            test          16486
            would_have    16693
2           holdout       23876
            test          23702
            would_have    23876
3           holdout       71360
            test          71327
            would_have    71360
Name: input_buyer_nbr, dtype: int64

In [23]:
from datetime import datetime, timedelta
import pytz

# Current time in the US/Central zone
cst = pytz.timezone('US/Central')
now_cst = datetime.now(cst)

# Tomorrow's calendar date in US/Central (YYYY-MM-DD format), used in the file name
tomorrow_date = (now_cst + timedelta(days=1)).strftime("%Y-%m-%d")

# Build the file path
file_path = f"../data/final/recommendations_{tomorrow_date}.xlsx"

# Excel cannot store timezone-aware datetimes and pandas raises ValueError
# when asked to write them. Export a copy whose tz-aware columns (e.g.
# created_at / sent_at) are converted to naive local wall-clock times;
# final_pivoted_recos itself keeps its timezone information.
excel_ready = final_pivoted_recos.copy()
for tz_col in excel_ready.select_dtypes(include=['datetimetz']).columns:
    excel_ready[tz_col] = excel_ready[tz_col].dt.tz_localize(None)

# Save DataFrame
excel_ready.to_excel(file_path, index=False)

print(f"✅ File saved successfully as: {file_path}")

✅ File saved successfully as: ../data/final/recommendations_2025-10-28.xlsx


In [30]:
from google.cloud import bigquery

def upload_to_bigquery(dataframe, table_id, project_id, credentials_path):
    """
    Append a DataFrame to a BigQuery table and wait for the load job to finish.

    Args:
        dataframe (pd.DataFrame): Rows to append.
        table_id (str): Destination table, written as `dataset.table`.
        project_id (str): GCP project ID. It is only interpolated into the
            confirmation message; the client is created from
            `credentials_path` alone.
        credentials_path (str): Path to the service account JSON credentials file.
    """
    # Authenticate with the service account key file.
    bq_client = bigquery.Client.from_service_account_json(credentials_path)

    # Append to the table rather than replacing it, with schema autodetection on.
    load_settings = bigquery.LoadJobConfig(
        autodetect=True,
        write_disposition="WRITE_APPEND",
    )

    # Submit the load job and block until it completes (errors surface here).
    bq_client.load_table_from_dataframe(
        dataframe,
        table_id,
        job_config=load_settings,
    ).result()

    print(f"Data appended to {table_id} in project {project_id}.")

# Append the pivoted recommendations to the BigQuery table.
# NOTE(review): credentials_path is an absolute, user-specific location — confirm it before running on another machine.
# NOTE(review): project_id is spelled with an underscore while the credentials file name uses a hyphen
# ("cprtqa-strategicanalytics-sp1") — confirm which spelling is intended. In the visible definition of
# upload_to_bigquery the value is only echoed in the confirmation message.
upload_to_bigquery(
    dataframe=final_pivoted_recos,  # pivoted table returned by rename_tag_concat_and_pivot
    table_id="member_reco.member_future_reco",  # destination in `dataset.table` form
    project_id="cprtqa_strategicanalytics-sp1",  # GCP project ID shown in the confirmation message
    credentials_path='/Users/srdeo/OneDrive - Copart, Inc/cprtqa-strategicanalytics-sp1-8b7a00c4fbae.json'  # service account JSON key file
)




Data appended to member_reco.member_future_reco in project cprtqa_strategicanalytics-sp1.
