In [30]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
import pandas as pd

csv_path = "/content/drive/MyDrive/Dataset/Causal_Digital_Marketing_Campaign.xlsx"
df = pd.read_excel(csv_path)
df.shape, df.head()

((5000, 15),
    user_id campaign_id channel   device country segment  prior_visits_30d  \
 0   221958      CMP003  Social  Desktop      AE    Food                 3   
 1   771155      CMP006  Social   Mobile      IN    Food                 0   
 2   231932      CMP004  Search   Mobile      FR    Tech                 4   
 3   465838      CMP006  Search   Tablet      IN    Food                 2   
 4   359178      CMP003  Search  Desktop      IN    Tech                 4   
 
    prior_spend_180d  treatment_exposed  impressions  clicks  spend_usd  \
 0             93.14                  0            9       1       0.29   
 1             55.14                  0           16       1       0.37   
 2            278.79                  0           10       0       0.04   
 3            292.60                  1           84       1       0.74   
 4             16.98                  0            8       0       0.03   
 
    conversion  revenue_usd  roi  
 0           0          0.0 -1

In [32]:
import numpy as np

# Inspect dataset
print("Columns:", df.columns.tolist())
print("Missingness (top 10):")
print(df.isna().mean().sort_values(ascending=False).head(10))

# Map your actual treatment and outcome columns here
# Common choices:
# - Treatment: 'exposed' or 'treatment'
# - Outcome: 'converted' or 'response'
df["T"] = df["treatment_exposed"].astype(int)      # change to your col name
df["Y"] = df["conversion"].astype(int)    # change to your col name

# Drop identifiers and post-treatment/leaky features
id_cols = [c for c in df.columns if c.lower() in {"user_id","customer_id"}]
leaky_cols = [
    # Add any columns created after exposure (e.g., post-click metrics)
    # "post_click_conversions"
]
drop_cols = ["T", "Y"] + id_cols + leaky_cols

X = df.drop(columns=drop_cols, errors="ignore")
y = df["Y"]
t = df["T"]

# Train/test split (stratify by treatment)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
    X, y, t, test_size=0.3, random_state=42, stratify=t
)

print("Treatment share (train):", t_train.mean())
print("Base conversion rate - control:", y_train[t_train==0].mean())
print("Base conversion rate - treatment:", y_train[t_train==1].mean())

Columns: ['user_id', 'campaign_id', 'channel', 'device', 'country', 'segment', 'prior_visits_30d', 'prior_spend_180d', 'treatment_exposed', 'impressions', 'clicks', 'spend_usd', 'conversion', 'revenue_usd', 'roi']
Missingness (top 10):
user_id              0.0
campaign_id          0.0
channel              0.0
device               0.0
country              0.0
segment              0.0
prior_visits_30d     0.0
prior_spend_180d     0.0
treatment_exposed    0.0
impressions          0.0
dtype: float64
Treatment share (train): 0.464
Base conversion rate - control: 0.08422174840085288
Base conversion rate - treatment: 0.1354679802955665


In [33]:
# Preprocessing: one-hot for categorical, passthrough for numeric
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

preprocess.fit(X_train)
Xtr = preprocess.transform(X_train)
Xte = preprocess.transform(X_test)

print("Transformed shapes:", Xtr.shape, Xte.shape)


Transformed shapes: (3500, 38) (1500, 38)


In [34]:
# T-learner: two outcome models (treated vs. control)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import clone

base_model = GradientBoostingClassifier(random_state=42)

model_t = clone(base_model)
model_c = clone(base_model)

model_t.fit(Xtr[t_train.values==1], y_train[t_train.values==1])
model_c.fit(Xtr[t_train.values==0], y_train[t_train.values==0])

mu1 = model_t.predict_proba(Xte)[:, 1]  # E[Y|X,T=1]
mu0 = model_c.predict_proba(Xte)[:, 1]  # E[Y|X,T=0]
uplift_tlearner = mu1 - mu0

print("Uplift (T-learner) sample:", uplift_tlearner[:5])


Uplift (T-learner) sample: [2.29820444e-06 2.29820444e-06 2.29820444e-06 2.29820444e-06
 2.29820444e-06]


In [35]:
# DR Learner (robust uplift)
!pip install -q econml

from econml.dr import DRLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

dr = DRLearner(
    model_propensity=LogisticRegression(max_iter=1000),
    model_regression=RandomForestRegressor(random_state=42),
    model_final=RandomForestRegressor(random_state=42)
)

dr.fit(y_train.values, t_train.values, X=Xtr)
uplift_dr = dr.effect(Xte)  # CATE estimates

print("Uplift (DR) sample:", uplift_dr[:5])

Uplift (DR) sample: [0. 0. 0. 0. 0.]


In [36]:
# Uplift metrics: Qini and AUUC
!pip install -q scikit-uplift

from sklift.metrics import qini_auc_score, uplift_auc_score

# Choose a prediction
uplift_pred = uplift_tlearner  # alternatively: uplift_dr

qini = qini_auc_score(y_true=y_test.values, uplift=uplift_pred, treatment=t_test.values)
auuc = uplift_auc_score(y_true=y_test.values, uplift=uplift_pred, treatment=t_test.values)

print(f"Qini: {qini:.4f}, AUUC: {auuc:.4f}")


Qini: -0.1907, AUUC: -0.0726


In [37]:
# Decile analysis for interpretable segments
import pandas as pd
import numpy as np

deciles = pd.qcut(uplift_pred, 10, labels=False, duplicates="drop")
summary = pd.DataFrame({
    "decile": deciles,
    "treatment": t_test.values,
    "outcome": y_test.values
})

seg = summary.groupby("decile").apply(
    lambda g: pd.Series({
        "n": len(g),
        "t_rate": g[g["treatment"]==1]["outcome"].mean(),
        "c_rate": g[g["treatment"]==0]["outcome"].mean(),
        "uplift": g[g["treatment"]==1]["outcome"].mean() - g[g["treatment"]==0]["outcome"].mean()
    })
).reset_index().sort_values("decile", ascending=False)

seg


  seg = summary.groupby("decile").apply(


Unnamed: 0,decile,n,t_rate,c_rate,uplift
1,1,10.0,1.0,,
0,0,1490.0,0.107872,0.05597,0.051902


In [38]:
# Attach features back to deciles for interpretation and segment rules
Xte_df = pd.DataFrame(X_test.reset_index(drop=True))
Xte_df["decile"] = deciles
Xte_df["uplift_pred"] = uplift_pred

def summarize_segments(df, min_size=50, top_k=3):
    candidates = []
    # Example categorical features commonly present in marketing datasets
    for col in df.columns:
        if col in {"decile","uplift_pred"}:
            continue
        if df[col].dtype == "object" or str(df[col].dtype).startswith("category"):
            for v in df[col].astype(str).unique():
                mask = df[col].astype(str) == v
                if mask.sum() >= min_size:
                    candidates.append((f"{col}={v}", df.loc[mask, "uplift_pred"].mean(), mask.sum()))
        elif np.issubdtype(df[col].dtype, np.number):
            # High vs. low bins (75th percentile)
            p75 = np.percentile(df[col].dropna(), 75)
            p25 = np.percentile(df[col].dropna(), 25)
            m_high = df[col] >= p75
            m_low  = df[col] <= p25
            if m_high.sum() >= min_size:
                candidates.append((f"{col} high (>=P75)", df.loc[m_high, "uplift_pred"].mean(), m_high.sum()))
            if m_low.sum() >= min_size:
                candidates.append((f"{col} low (<=P25)", df.loc[m_low, "uplift_pred"].mean(), m_low.sum()))
    candidates_sorted = sorted(candidates, key=lambda x: x[1], reverse=True)
    top = candidates_sorted[:top_k]
    bottom = sorted(candidates, key=lambda x: x[1])[:top_k]
    return top, bottom

# Top deciles for persuadables
top_dec = Xte_df[Xte_df["decile"] >= Xte_df["decile"].max()-2]
bottom_dec = Xte_df[Xte_df["decile"] <= Xte_df["decile"].min()+2]

top3, bottom3 = summarize_segments(top_dec, min_size=30, top_k=3)
top3, bottom3


([('campaign_id=CMP007', np.float64(0.012667187644477097), np.int64(148)),
  ('spend_usd high (>=P75)', np.float64(0.012301496271456498), np.int64(381)),
  ('clicks high (>=P75)', np.float64(0.009244860282757688), np.int64(507))],
 [('campaign_id=CMP009', np.float64(2.1133892123540198e-06), np.int64(149)),
  ('campaign_id=CMP010', np.float64(2.1468996659237065e-06), np.int64(182)),
  ('country=DE', np.float64(2.1514446737061345e-06), np.int64(258))])

In [39]:
# Propensity overlap diagnostics
from sklearn.linear_model import LogisticRegression
ps_model = LogisticRegression(max_iter=1000)
ps_model.fit(Xtr, t_train.values)
ps = ps_model.predict_proba(Xte)[:, 1]

print(pd.Series(ps).describe())
print("Extreme propensities share (<0.05 or >0.95):",
      ((ps < 0.05) | (ps > 0.95)).mean())


count    1.500000e+03
mean     4.639988e-01
std      4.988603e-01
min      2.589389e-47
25%      1.596056e-06
50%      2.018963e-05
75%      1.000000e+00
max      1.000000e+00
dtype: float64
Extreme propensities share (<0.05 or >0.95): 1.0


In [40]:
# Hyperparameter sensitivity: see if Qini is stable with different depths
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import clone
from sklift.metrics import qini_auc_score

def tlearner_qini(max_depth):
    m = GradientBoostingClassifier(max_depth=max_depth, random_state=42)
    mt, mc = clone(m), clone(m)
    mt.fit(Xtr[t_train.values==1], y_train[t_train.values==1])
    mc.fit(Xtr[t_train.values==0], y_train[t_train.values==0])
    u = mt.predict_proba(Xte)[:,1] - mc.predict_proba(Xte)[:,1]
    return qini_auc_score(y_true=y_test.values, uplift=u, treatment=t_test.values)

for d in [2, 3, 4]:
    print(f"Max_depth {d} Qini:", tlearner_qini(d))


Max_depth 2 Qini: -0.19073056238409128
Max_depth 3 Qini: -0.19073056238409128
Max_depth 4 Qini: -0.19073056238409128
