In [1]:

from pathlib import Path
import pandas as pd

# Location of the per-product feature table produced upstream.
PROC_DIR = Path("data/processed")
path = PROC_DIR / "features_per_product.parquet"

features = pd.read_parquet(path)

# Quick shape / schema sanity check: first 14 column names, alphabetically.
n_rows, n_cols = features.shape
column_preview = sorted(features.columns.tolist())[:14]
print("Loaded features_per_product.parquet")
print(f"rows: {n_rows:,}, cols: {n_cols}")
print("columns:", column_preview, "...")

# Row count per product, largest first. Shown as "n_users", which presumes the
# table holds one row per (user, product) -- TODO confirm upstream.
rows_per_product = features.groupby("product_name").size()
summary = rows_per_product.sort_values(ascending=False)
print("\nUsers with engagement per product:")
display(summary.to_frame("n_users"))

Loaded features_per_product.parquet
rows: 42,224, cols: 21
columns: ['comment', 'engagement_per_follower', 'in_degree', 'is_official_influencer', 'kcore', 'log1p_comment', 'log1p_in_degree', 'log1p_out_degree', 'log1p_pagerank', 'log1p_reposts', 'log1p_total_engagement', 'log1p_user_followers', 'log1p_user_friends', 'out_degree'] ...

Users with engagement per product:


Unnamed: 0_level_0,n_users
product_name,Unnamed: 1_level_1
spark_thinking,9616
abc_reading,9068
supor_boosted_showerhead,8355
intelligent_floor_scrubber,7556
ruby_face_cream,4871
electric_toothbrush,2758


In [2]:
# Tunable parameters
TOP_PCT = 0.20          # share of each product's candidates labelled high-engagement
MIN_FOLLOWERS = 100     # follower floor for admitting non-official users

# Candidate pool: a row qualifies if the user is flagged as an official
# influencer OR has at least MIN_FOLLOWERS followers.
is_official = features["is_official_influencer"].eq(1)
has_enough_followers = features["user_followers"].ge(MIN_FOLLOWERS)
candidates = is_official | has_enough_followers

# Work on an independent copy so `features` itself is never modified downstream.
df = features.loc[candidates].copy()

# We'll rank by total_engagement (you can switch to engagement_per_follower if you prefer)
def label_top_pct(group, pct):
    """Flag the top ``pct`` share of rows in ``group`` by ``total_engagement``.

    The cut-off is the ``(1 - pct)`` quantile of ``group["total_engagement"]``.
    Rows at or above the cut-off get ``label_high_engagement = 1``; all others
    get 0. Because the comparison is ``>=``, ties at the cut-off are all
    labelled positive, so the positive share can exceed ``pct``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to label; must contain a ``total_engagement`` column.
    pct : float
        Fraction of rows to treat as high engagement (e.g. ``0.20``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``group`` plus
        ``label_high_engagement`` (int8) and ``_threshold`` (the cut-off,
        constant across the frame). ``group`` itself is left untouched:
        pandas does not support group-wise functions that mutate their input.
    """
    engagement = group["total_engagement"]
    # threshold at the (1 - pct) quantile
    thr = engagement.quantile(1 - pct)
    is_high = engagement.ge(thr).astype("int8")
    # .assign builds a fresh frame instead of writing into the caller's object
    return group.assign(label_high_engagement=is_high, _threshold=thr)

# Label each product's candidates independently.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# apply() operating on the grouping column is deprecated, and pandas versions
# that exclude grouping columns would drop `product_name` from the result and
# break the per-product report below. Each group is copied before labelling so
# nothing is ever written into a slice of `df`.
# Rows come out grouped by product (sorted by name), keeping their original
# order within each product.
labeled = pd.concat(
    [label_top_pct(grp.copy(), pct=TOP_PCT) for _, grp in df.groupby("product_name")]
)

# Small report per product
rep = (
    labeled.groupby("product_name")
    .agg(
        candidates=("user_id", "nunique"),
        positives=("label_high_engagement", "sum"),
        threshold=("_threshold", "max"),  # constant within a product, so max == the value
    )
    # share of unique candidate users that were labelled positive
    .assign(positive_rate=lambda x: x["positives"] / x["candidates"])
    .sort_values("candidates", ascending=False)
)

print("Label summary per product")
display(rep)

# Save for modeling; the helper `_threshold` column is report-only and is not persisted.
out_path = PROC_DIR / "features_labeled.parquet"
labeled.drop(columns=["_threshold"], errors="ignore").to_parquet(out_path, index=False)
print(f"Saved: {out_path}")

Label summary per product


  labeled = df.groupby("product_name", group_keys=False).apply(label_top_pct, pct=TOP_PCT)


Unnamed: 0_level_0,candidates,positives,threshold,positive_rate
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
spark_thinking,7138,1514,8.0,0.212104
abc_reading,6869,1520,7.0,0.221284
intelligent_floor_scrubber,5959,1334,7.0,0.223863
supor_boosted_showerhead,5503,1128,5.0,0.204979
ruby_face_cream,3059,700,4.0,0.228833
electric_toothbrush,1561,330,9.0,0.211403


Saved: data/processed/features_labeled.parquet


In [3]:
# Inspect the labeled table on disk
from pathlib import Path
import pandas as pd

PROC = Path("data/processed")
path = PROC / "features_labeled.parquet"
assert path.exists(), f"Missing {path}"

# Reload from disk so the preview reflects exactly what was saved.
labeled = pd.read_parquet(path)
print(f"rows: {len(labeled):,}  columns: {len(labeled.columns)}")
print(sorted(labeled.columns)[:18], "...")  # first 18 column names, alphabetically

# Columns shown in the per-product preview.
preview_cols = [
    "user_id", "is_official_influencer", "total_engagement",
    "in_degree", "pagerank", "kcore", "user_followers",
    "label_high_engagement",
]

# For every product (alphabetical), show its 10 highest-engagement rows.
for prod in sorted(labeled["product_name"].unique()):
    print(f"\n=== {prod} ===")
    product_rows = labeled.loc[labeled["product_name"].eq(prod)]
    ranked = product_rows.sort_values("total_engagement", ascending=False)
    view = ranked[preview_cols].head(10)
    display(view)

rows: 30,089  columns: 22
['comment', 'engagement_per_follower', 'in_degree', 'is_official_influencer', 'kcore', 'label_high_engagement', 'log1p_comment', 'log1p_in_degree', 'log1p_out_degree', 'log1p_pagerank', 'log1p_reposts', 'log1p_total_engagement', 'log1p_user_followers', 'log1p_user_friends', 'out_degree', 'pagerank', 'product_name', 'reposts'] ...

=== abc_reading ===


Unnamed: 0,user_id,is_official_influencer,total_engagement,in_degree,pagerank,kcore,user_followers,label_high_engagement
518,82937,0,14317,414,0.004271,27,1969114,1
579,83090,0,14075,1225,0.002917,24,1812943,1
770,83806,0,13224,496,0.002503,13,375878,1
635,83253,0,10950,527,0.0027,14,323900,1
955,84700,0,10213,2712,0.008042,14,385320,1
1014,84769,0,9290,5676,0.012914,19,953194,1
2951,104785,0,9129,5077,0.023752,13,6815996,1
2,185,0,8325,4700,0.00984,12,16153761,1
571,83082,0,8163,706,0.002875,27,1098010,1
448,82098,0,7877,1536,0.003949,14,1261987,1



=== electric_toothbrush ===


Unnamed: 0,user_id,is_official_influencer,total_engagement,in_degree,pagerank,kcore,user_followers,label_high_engagement
7157,79288,0,12276,9527,0.074524,15,3957652,1
7124,75497,1,8615,1910,0.012698,26,1186069,1
7967,146880,1,7981,1029,0.006307,33,6697975,1
6930,67092,1,7611,2801,0.019093,28,2453462,1
8072,150218,0,7442,2196,0.01179,10,152133,1
7985,146931,0,6773,1032,0.005525,33,5973577,1
6994,70148,1,6540,4688,0.032478,18,1764111,1
8062,148762,1,6344,2481,0.009232,23,2187839,1
6869,185,0,5988,3497,0.012578,23,16155400,1
8021,147719,0,5633,2564,0.011751,23,3851880,1



=== intelligent_floor_scrubber ===


Unnamed: 0,user_id,is_official_influencer,total_engagement,in_degree,pagerank,kcore,user_followers,label_high_engagement
10198,83806,0,11333,425,0.001609,13,375864,1
9936,83090,0,10616,1131,0.002958,21,1812313,1
9842,82937,0,10503,368,0.004092,26,1969108,1
10592,84769,0,10439,5341,0.013044,17,973230,1
10532,84700,0,8596,2511,0.007916,17,385344,1
10007,83253,0,8203,487,0.002568,16,324196,1
9928,83082,0,6197,718,0.003137,26,1098019,1
8431,185,0,6096,3750,0.007473,8,16155412,1
8809,69736,0,5630,1060,0.008916,17,1185392,1
9776,82621,0,5584,5077,0.027878,13,1237119,1



=== ruby_face_cream ===


Unnamed: 0,user_id,is_official_influencer,total_engagement,in_degree,pagerank,kcore,user_followers,label_high_engagement
14495,695,0,8168,7963,0.063174,4,564434,1
14818,20278,0,7284,6835,0.037278,6,1148655,1
14584,8724,1,7107,6167,0.039965,6,787985,1
15205,30328,1,6890,4282,0.028438,19,877526,1
14836,20350,0,4720,3390,0.028185,12,4062074,1
15283,30711,0,3982,1750,0.006952,19,331260,1
14817,20277,1,3767,3120,0.023017,7,602027,1
14761,16798,1,3343,2999,0.023279,3,607365,1
15334,31027,1,3074,2160,0.014657,19,301144,1
14799,19789,0,2878,511,0.003553,8,681793,1



=== spark_thinking ===


Unnamed: 0,user_id,is_official_influencer,total_engagement,in_degree,pagerank,kcore,user_followers,label_high_engagement
18115,83090,0,13261,872,0.002843,24,1816181,1
19293,96960,0,13102,1053,0.007926,33,1049567,1
18052,82937,0,12863,393,0.004045,30,1969184,1
20321,105989,0,10782,1149,0.005021,29,1072407,1
18107,83082,0,8555,819,0.003195,30,1098069,1
21394,162303,0,8414,8159,0.033867,13,754017,1
19299,96967,0,6970,3307,0.018607,33,31194,1
17900,81584,0,6275,1537,0.012182,29,233592,1
18112,83087,0,5524,526,0.005346,33,186930,1
17679,71739,0,5326,4484,0.014462,10,269963,1



=== supor_boosted_showerhead ===


Unnamed: 0,user_id,is_official_influencer,total_engagement,in_degree,pagerank,kcore,user_followers,label_high_engagement
27939,243714,0,18077,16120,0.083984,5,531853,1
29984,309714,0,7964,7772,0.040168,5,80021,1
29216,276725,0,6248,5620,0.029368,7,1955155,1
26559,226927,0,6132,5542,0.025158,4,136643,1
28034,259928,0,5220,2948,0.007694,15,942696,1
24988,39487,0,4567,1362,0.003907,15,297083,1
26754,233624,0,3455,2711,0.00878,15,1604054,1
25244,52748,0,3442,1842,0.005704,15,297012,1
24713,30711,0,3359,1494,0.00333,15,337285,1
26492,226099,0,3304,1070,0.005533,5,151292,1
