In [8]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import joblib

# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 1) Connect to your DB
# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
conn = sqlite3.connect("cross_selling.db")

# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 2) Build OBSERVATION features (Jan 1–May 31, 2024)
# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
obs_start = "01/01/2024"
obs_end   = "05/31/2024"

# a) Pull raw transactions in obs window
tx = pd.read_sql(f"""
    SELECT Customer_ID,
           Amount,
           Category,
           date(Timestamp) AS dt
    FROM transactions
    WHERE dt BETWEEN '{obs_start}' AND '{obs_end}'
""", conn)

# b) Aggregate basic spend features
agg = tx.groupby("Customer_ID").agg(
    Total_Spend       = ("Amount","sum"),
    Num_Transactions  = ("Amount","count"),
    Avg_Txn_Amount    = ("Amount","mean"),
    Max_Txn_Amount    = ("Amount","max"),
).reset_index()

# c) Pivot out category‐spends
cat_spend = (
    tx.groupby(["Customer_ID","Category"])["Amount"]
      .sum()
      .unstack(fill_value=0)
      .add_prefix("Spend_")
      .reset_index()
)

# d) Bring in static demographics
cust = pd.read_sql("SELECT Customer_ID, Age, Annual_Income, Credit_Score FROM customers", conn)
features = (
    agg
    .merge(cat_spend, on="Customer_ID", how="left")
    .merge(cust,      on="Customer_ID", how="left")
)
features.fillna(0, inplace=True)
features.set_index("Customer_ID", inplace=True)

# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 3) Build LABELS on June 1–30, 2024 acquisitions
# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
label_df = pd.read_sql(f"""
    SELECT Customer_ID, Product_ID
    FROM customer_products
    WHERE Acquisition_Date BETWEEN '2024-06-01' AND '2024-06-30'
""", conn)

# For product P003, binary label = 1 if acquired in June
prod = "P003"
label_df["label"] = (label_df["Product_ID"] == prod).astype(int)

# Keep one row per customer (0 if missing)
labels = (
    label_df[["Customer_ID","label"]]
    .groupby("Customer_ID")["label"]
    .max()
    .reindex(features.index, fill_value=0)
)

# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 4) Prepare X/y
# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
df = features.copy()
df["label"] = labels

# Separate features & target
y      = df["label"]
X_all  = df.drop("label", axis=1)

# Numeric vs categorical (if any)
X_num = X_all.select_dtypes(include=[np.number])
X_cat = X_all.select_dtypes(exclude=[np.number]).fillna("Unknown")

# One‐hot encode categoricals
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat_enc = pd.DataFrame(
    encoder.fit_transform(X_cat),
    index=X_cat.index,
    columns=encoder.get_feature_names_out(X_cat.columns)
)

# Final feature matrix
X_prepared = pd.concat([X_num, X_cat_enc], axis=1)

# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 5) Train/Test Split & Model
# —–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
X_train, X_test, y_train, y_test = train_test_split(
    X_prepared, y, stratify=y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
print(f"AUC for product {prod}: {auc:.3f}")

# Persist both model and encoder for inference
joblib.dump(model, f"propensity_{prod}.joblib")
joblib.dump(encoder, f"encoder_{prod}.joblib")

print("✅ Observation window features built, label window trained, and model saved.")


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [10]:
tx

Unnamed: 0,Customer_ID,Amount,Category,dt
