In [6]:


from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score

from xgboost import XGBClassifier

# ---------------------------------------------------------------------------
# Configuration: input location, label column, and the two feature sets.
# ---------------------------------------------------------------------------
PROC_DIR = Path("data/processed")
labeled_path = PROC_DIR.joinpath("features_labeled.parquet")

# Column holding the label that every model in this notebook predicts.
TARGET_COL = "label_high_engagement"

# Baseline feature set: account size only.
BASELINE_FEATURES = ["log1p_user_followers", "log1p_user_friends"]

# Full feature set: account size plus network-position features.
# The intent is to leave direct engagement counts out of the inputs.
FULL_FEATURES = [
    *BASELINE_FEATURES,
    "log1p_in_degree",
    "log1p_out_degree",
    "log1p_pagerank",
    "kcore",
]

# ---------------------------------------------------------------------------
# Load the labeled feature table and report what was read.
# ---------------------------------------------------------------------------
data = pd.read_parquet(labeled_path)
print(f"Loaded labeled data: {len(data):,} rows, {len(data.columns)} columns")
print("Campaigns:", sorted(data["product_name"].unique()))

print("\nBaseline feature columns:", BASELINE_FEATURES)
print("\nFull model feature columns:", FULL_FEATURES)

Loaded labeled data: 30,089 rows, 22 columns
Campaigns: ['abc_reading', 'electric_toothbrush', 'intelligent_floor_scrubber', 'ruby_face_cream', 'spark_thinking', 'supor_boosted_showerhead']

Baseline feature columns: ['log1p_user_followers', 'log1p_user_friends']

Full model feature columns: ['log1p_user_followers', 'log1p_user_friends', 'log1p_in_degree', 'log1p_out_degree', 'log1p_pagerank', 'kcore']


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from xgboost import XGBClassifier

# Campaign under study: every model below is trained and evaluated on the
# rows of this single product only.
CAMPAIGN = "spark_thinking"   # change as needed

# Work on an independent copy so later edits never touch `data`.
df_c = data.loc[data["product_name"].eq(CAMPAIGN)].copy()
print(f"Campaign: {CAMPAIGN}")
print(f"Total rows (candidates): {df_c.shape[0]:,}")

# 70/30 train/test split, stratified on the label so both parts keep the
# same positive rate; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df_c,
    stratify=df_c[TARGET_COL],
    test_size=0.3,
    random_state=42,
)

def label_summary(name, df, target_col=None):
    """Print the size and class balance of one data split on a single line.

    Parameters
    ----------
    name : str
        Text printed at the start of the line (e.g. "Train").
    df : pandas.DataFrame
        Split to summarise; must contain the label column.
    target_col : str, optional
        Name of the label column. When omitted, the notebook-level
        ``TARGET_COL`` is used. The column's sum is reported as the number
        of positives, so it is assumed to hold 0/1 labels.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    if target_col is None:
        target_col = TARGET_COL

    n = len(df)
    pos = int(df[target_col].sum())
    neg = n - pos

    # An empty split has no class shares; report 0% instead of dividing by zero.
    pos_share = pos / n if n else 0.0
    neg_share = neg / n if n else 0.0

    print(f"{name}: {n:5d} rows  "
          f"positives={pos:4d} ({pos_share:5.1%}),  "
          f"negatives={neg:4d} ({neg_share:5.1%})")

# Report size and class balance of each part of the split.
print("\nTrain/test split:")
label_summary("Train", df_train)
label_summary("Test ", df_test)

# NumPy inputs for the estimators, always in (train, test) order.
# Baseline feature matrices.
Xb_train, Xb_test = (
    part[BASELINE_FEATURES].to_numpy() for part in (df_train, df_test)
)

# Full feature matrices.
Xf_train, Xf_test = (
    part[FULL_FEATURES].to_numpy() for part in (df_train, df_test)
)

# Label vectors.
y_train, y_test = (
    part[TARGET_COL].to_numpy() for part in (df_train, df_test)
)

# Helper: precision@k
def precision_at_k(y_true, y_score, k):
    """Return the fraction of positives among the ``k`` highest-scoring samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels; their sum over the selected samples is taken as
        the number of hits (so 0/1 labels are assumed).
    y_score : array-like of shape (n_samples,)
        Model scores; higher means "more likely positive".
    k : int
        Number of top-ranked samples to inspect. Values larger than
        ``n_samples`` are clipped to ``n_samples``.

    Returns
    -------
    float
        ``hits / k`` after clipping ``k``; ``0.0`` when there are no samples.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or the two inputs differ in length.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Convert to plain arrays so that selection below is positional. A pandas
    # Series indexed with an integer array would be looked up by label, and a
    # Python list cannot be indexed with an array at all.
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)

    n = len(labels)
    if len(scores) != n:
        raise ValueError(
            f"y_true and y_score must have the same length, got {n} and {len(scores)}"
        )
    if n == 0:
        # Nothing to rank; avoid a division by zero.
        return 0.0

    k = min(k, n)
    # Ascending argsort reversed -> indices of the k largest scores.
    idx = np.argsort(scores)[::-1][:k]
    return float(labels[idx].sum()) / k

def model_metrics(name, y_true, scores, threshold=0.5, ks=(50, 100)):
    """Collect the evaluation metrics of one model into a flat dict.

    Parameters
    ----------
    name : str
        Model identifier stored under the ``"model"`` key.
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    scores : array-like of shape (n_samples,)
        Predicted scores for the positive class.
    threshold : float, default 0.5
        Scores greater than or equal to this value are counted as positive
        predictions. Only the F1 score depends on it.
    ks : iterable of int, default (50, 100)
        Cut-offs for which precision@k is reported, one ``"p@<k>"`` entry each.

    Returns
    -------
    dict
        Keys ``"model"``, ``"f1"``, ``"roc_auc"`` followed by one ``"p@<k>"``
        key per entry of ``ks``, in that order.
    """
    # Array inputs make the threshold comparison work for lists as well and
    # keep indexing positional inside precision_at_k.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    # Hard predictions are needed for F1 only; ROC AUC and precision@k are
    # computed from the raw scores.
    y_pred = (scores >= threshold).astype(int)

    result = {
        "model": name,
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, scores),
    }
    for k in ks:
        result[f"p@{k}"] = precision_at_k(y_true, scores, k)
    return result

# ---------------------------------------------------------------------------
# Fit three models on the training split and score the test split.
# Each model gets its own StandardScaler, fitted on training rows only and
# then applied unchanged to the test rows.
# ---------------------------------------------------------------------------

# 1) Logistic regression on the baseline features
sc_base = StandardScaler().fit(Xb_train)
Xb_train_s = sc_base.transform(Xb_train)
Xb_test_s = sc_base.transform(Xb_test)

log_base = LogisticRegression(random_state=42, max_iter=1000)
log_base.fit(Xb_train_s, y_train)
p_log_base = log_base.predict_proba(Xb_test_s)[:, 1]  # column 1 = positive class

# 2) Logistic regression on the full features
sc_full_log = StandardScaler().fit(Xf_train)
Xf_train_s_log = sc_full_log.transform(Xf_train)
Xf_test_s_log = sc_full_log.transform(Xf_test)

log_full = LogisticRegression(random_state=42, max_iter=1000)
log_full.fit(Xf_train_s_log, y_train)
p_log_full = log_full.predict_proba(Xf_test_s_log)[:, 1]

# 3) XGBoost on the full features (scaled with a separate scaler instance
#    fitted on the same training matrix)
sc_full_xgb = StandardScaler().fit(Xf_train)
Xf_train_s_xgb = sc_full_xgb.transform(Xf_train)
Xf_test_s_xgb = sc_full_xgb.transform(Xf_test)

xgb_full = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42,
)
xgb_full.fit(Xf_train_s_xgb, y_train)
p_xgb_full = xgb_full.predict_proba(Xf_test_s_xgb)[:, 1]

# One metrics row per model, in the order the models were fitted.
metrics = [
    model_metrics("logistic_baseline", y_test, p_log_base),
    model_metrics("logistic_full", y_test, p_log_full),
    model_metrics("xgboost_full", y_test, p_xgb_full),
]

# Test rows with one extra score column per model; df_test stays untouched.
test_with_scores = df_test.assign(
    score_logistic_baseline=p_log_base,
    score_logistic_full=p_log_full,
    score_xgb_full=p_xgb_full,
)

# NOTE(review): the captured output shows p@50 = p@100 = 1.0 for both full
# models. It may be worth confirming that none of the network features is
# derived from the same interactions as the label - TODO confirm.
print("\nTest metrics for campaign:", CAMPAIGN)
metrics_df = pd.DataFrame(metrics)
display(metrics_df)

# Columns shown when inspecting the highest-ranked test accounts:
# identifiers and raw account stats, then the label, then the three scores.
cols_show = [
    "user_id", "is_official_influencer", "user_followers",
    "total_engagement", "in_degree", "pagerank",
    TARGET_COL,
    "score_logistic_baseline", "score_logistic_full", "score_xgb_full",
]

print("\nTop 10 test accounts by xgboost_full score:")
display(
    test_with_scores
    .sort_values(by="score_xgb_full", ascending=False)
    .loc[:, cols_show]
    .head(10)
)

Campaign: spark_thinking
Total rows (candidates): 7,138

Train/test split:
Train:  4996 rows  positives=1060 (21.2%),  negatives=3936 (78.8%)
Test :  2142 rows  positives= 454 (21.2%),  negatives=1688 (78.8%)

Test metrics for campaign: spark_thinking


Unnamed: 0,model,f1,roc_auc,p@50,p@100
0,logistic_baseline,0.0,0.623758,0.4,0.36
1,logistic_full,0.729763,0.889994,1.0,1.0
2,xgboost_full,0.727717,0.911015,1.0,1.0



Top 10 test accounts by xgboost_full score:


Unnamed: 0,user_id,is_official_influencer,user_followers,total_engagement,in_degree,pagerank,label_high_engagement,score_logistic_baseline,score_logistic_full,score_xgb_full
20060,104815,0,11350,2654,165,0.001559,1,0.224648,0.999999,0.999684
18133,83142,0,698524,2811,907,0.003817,1,0.347959,1.0,0.999683
18893,88275,0,63501,80,22,0.000322,1,0.283495,0.99895,0.999681
20255,105746,0,18383,107,24,0.000252,1,0.237261,0.999096,0.999677
19470,97952,0,15122,28,17,0.000332,1,0.232878,0.997246,0.999653
21005,123443,0,145480,2577,206,0.001473,1,0.281977,1.0,0.999652
18112,83087,0,186930,5524,526,0.005346,1,0.3042,1.0,0.99963
18355,84395,0,23523,106,25,0.000377,1,0.240742,0.999284,0.999621
20864,119321,0,43224,655,80,0.000916,1,0.256487,0.999991,0.999619
18196,83448,0,25037,90,29,0.00036,1,0.241486,0.999601,0.999615
