
# Dynamic Discount Targeting (Uplift Modeling)

This notebook builds a simple uplift-style model to **target discounts only to customers who are likely to change behavior because of a discount**.

**Pipeline:**  
1) Load customer-level data (`processed_customers.csv`)  
2) Fit one shared preprocessor on the full data (one-hot encodes `country`; numeric features pass through unchanged)  
3) Train two models:  
   - `P(buy | discount)` on treated group (`saw_discount = 1`)  
   - `P(buy | no discount)` on control group (`saw_discount = 0`)  
4) Score everyone and compute `uplift = P(buy|disc) - P(buy|no-disc)`  
5) Save outputs and small visuals


In [None]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder


## 1) Load processed customer data

In [None]:

# Location of the customer-level CSV; edit this if the file is stored elsewhere.
DATA_PATH = "processed_customers.csv"

# Read the file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)


## 2) Shared preprocessing on full data

In [None]:

# Column roles. `features` is derived from the two typed lists so the columns
# handed to the transformer cannot drift from the columns it actually handles.
num_features = ["total_spend", "num_invoices", "num_unique_items", "recency_days"]
cat_features = ["country"]
features = num_features + cat_features
target = "purchased_again"

# A single transformer fitted on every row: numeric columns pass through
# untouched and `country` is one-hot encoded. Because the treated and control
# matrices below are both row-slices of X_all, they share one column layout.
preprocess_full = ColumnTransformer(
    [
        ("num", "passthrough", num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)
preprocess_full.fit(df[features])

X_all = preprocess_full.transform(df[features])
y_all = df[target].to_numpy()

# masks for treated/control
# Rows whose saw_discount is neither 1 nor 0 (e.g. missing) fall in neither group.
treated_mask = df["saw_discount"] == 1
control_mask = df["saw_discount"] == 0

# Fail early with a readable message instead of an obscure error from the
# model fit when one of the two groups has no rows at all.
for group_name, group_mask in (("treated", treated_mask), ("control", control_mask)):
    if not group_mask.any():
        raise ValueError(
            f"No rows in the {group_name} group (column 'saw_discount'); "
            "cannot build its training set."
        )

# Select rows with plain boolean arrays so the selection is purely positional
# on X_all / y_all, whatever index `df` carries and whether X_all is dense or sparse.
treated_rows = treated_mask.to_numpy()
control_rows = control_mask.to_numpy()
X_t, y_t = X_all[treated_rows], y_all[treated_rows]
X_c, y_c = X_all[control_rows], y_all[control_rows]
X_all.shape, X_t.shape, X_c.shape


## 3) Train treated/control models, score all customers, and compute uplift

In [None]:

def positive_class_proba(proba, classes):
    """Return the per-row probability of the second class (column 1).

    Parameters
    ----------
    proba : array of shape (n_rows, n_classes)
        Class-probability matrix such as `predict_proba` output.
    classes : sequence
        The class labels matching the columns of `proba` (e.g. `model.classes_`).

    Returns
    -------
    1-D array of length n_rows.

    With two or more classes this is `proba[:, 1]`, exactly what the notebook
    used before. If the model only ever saw one outcome class, `proba` has a
    single column and `[:, 1]` would raise IndexError; in that case the result
    is a constant 1.0 when that sole class equals 1, otherwise 0.0.
    """
    if len(classes) == 1:
        certain = 1.0 if classes[0] == 1 else 0.0
        return np.full(proba.shape[0], certain)
    return proba[:, 1]


def fit_and_score(X_fit, y_fit, fit_rows, X_score):
    """Fit one arm's forest and score every row without in-sample leakage.

    Parameters
    ----------
    X_fit, y_fit : training matrix / labels for one arm (treated or control).
    fit_rows : boolean mask over the rows of `X_score` marking which of them
        are the rows of `X_fit`, in the same order.
    X_score : matrix holding every row to score.

    Returns
    -------
    (model, scores) : the fitted forest and a 1-D array with one probability
        per row of `X_score`.

    Raises
    ------
    ValueError
        If `fit_rows` does not select exactly as many rows as `X_fit` has.

    Rows from the other arm get the ordinary `predict_proba` value. Rows the
    forest was trained on get their out-of-bag (OOB) probability instead: a
    forest's prediction for its own training rows is pulled towards the
    observed label, which would otherwise make `uplift` depend on which arm a
    customer happened to be in rather than on their features.
    """
    own_positions = np.flatnonzero(np.asarray(fit_rows))
    if len(own_positions) != X_fit.shape[0]:
        raise ValueError("fit_rows must select exactly the rows contained in X_fit")

    # oob_score=True only adds the OOB bookkeeping; the trees themselves are
    # still determined by random_state, so other-arm predictions are unchanged.
    model = RandomForestClassifier(random_state=42, oob_score=True).fit(X_fit, y_fit)
    scores = positive_class_proba(model.predict_proba(X_score), model.classes_).copy()

    oob_matrix = model.oob_decision_function_
    oob = positive_class_proba(oob_matrix, model.classes_)
    # A row that was never out-of-bag has no usable estimate (NaN or all-zero
    # row, depending on the scikit-learn version); keep its in-sample value.
    has_oob = np.isclose(oob_matrix.sum(axis=1), 1.0)
    scores[own_positions[has_oob]] = oob[has_oob]
    return model, scores


# score everyone
treated_model, p_treated = fit_and_score(X_t, y_t, treated_mask, X_all)
control_model, p_control = fit_and_score(X_c, y_c, control_mask, X_all)

df["p_treated"] = p_treated
df["p_control"] = p_control
df["uplift"] = df["p_treated"] - df["p_control"]

df[["CustomerID","p_treated","p_control","uplift"]].head(10)


## 4) Save outputs

In [None]:

# Write the entire DataFrame (including the probability and uplift columns) to CSV.
out_scored = "discount_targeting_scored.csv"
df.to_csv(out_scored, index=False)
print("Saved:", out_scored)

# Write the 50 highest-uplift rows, restricted to a handful of columns.
sample_out = "top_uplift_sample.csv"
sample = (
    df.sort_values(by="uplift", ascending=False)
    .loc[
        :,
        [
            "CustomerID",
            "total_spend",
            "num_invoices",
            "recency_days",
            "country",
            "p_treated",
            "p_control",
            "uplift",
        ],
    ]
    .head(50)
)
sample.to_csv(sample_out, index=False)
print("Saved:", sample_out)


## 5) Create a small visual of top uplift customers

In [None]:

# `os` is already imported in the notebook's first code cell.
os.makedirs("img", exist_ok=True)

cols_to_show = ["CustomerID","p_treated","p_control","uplift"]
top10 = df.sort_values("uplift", ascending=False)[cols_to_show].head(10)

# Build the cell strings column by column. Passing `top10.values` directly
# collapses the mixed frame to one dtype, so integer IDs would be printed as
# floats and probabilities with full float precision, which is unreadable at
# an 8pt font. Three decimals are enough to rank customers by eye.
score_cols = ["p_treated", "p_control", "uplift"]
top10_text = top10.assign(
    **{col: top10[col].map("{:.3f}".format) for col in score_cols}
).astype(str)

# Explicit figure/axes instead of the implicit pyplot "current figure".
# The height grows with the number of rows so the table is not clipped.
fig, ax = plt.subplots(figsize=(6, 2 + 0.35*len(top10)))
ax.axis('off')
tbl = ax.table(cellText=top10_text.values.tolist(),
               colLabels=list(top10.columns),
               cellLoc='center', loc='center')
tbl.auto_set_font_size(False)  # keep the fixed size set on the next line
tbl.set_fontsize(8)
tbl.scale(1, 1.2)

png_path = os.path.join("img", "top_uplift_table.png")
fig.savefig(png_path, bbox_inches="tight", dpi=200)
print("Saved:", png_path)
