In [30]:
from pathlib import Path

import pandas as pd
import networkx as nx
from pyvis.network import Network

# Where the pipeline keeps its outputs: final model scores and cleaned edge lists.
PROC_DIR = Path("data/processed")
INTERIM_DIR = Path("data/interim")

# Campaign used for the first network visual (must be a value of `product_name`).
CAMPAIGN = "abc_reading"  # we can change this later if needed

# Final scores for all campaigns are stored together in one parquet file.
scores_path = PROC_DIR / "model_final_scores_all_campaigns.parquet"
scores = pd.read_parquet(scores_path)

n_score_rows, n_score_cols = scores.shape
print(f"Scores table: {n_score_rows:,} rows, {n_score_cols} columns")
print("Campaigns in scores:", sorted(scores["product_name"].unique()))

# Keep only the rows that belong to the chosen campaign.
in_campaign = scores["product_name"] == CAMPAIGN
scores_c = scores.loc[in_campaign].copy()
print(f"\nCampaign: {CAMPAIGN}")
print(f"Accounts in campaign: {len(scores_c):,}")

# The cleaned edge list is stored per campaign, keyed by the campaign name.
edges_path = INTERIM_DIR / f"{CAMPAIGN}_edges_clean.parquet"
edges = pd.read_parquet(edges_path)

n_edge_rows, n_edge_cols = edges.shape
print(f"\nEdges table for {CAMPAIGN}: {n_edge_rows:,} rows, {n_edge_cols} columns")
print("Edge columns:", list(edges.columns))

# Preview the first rows of both tables, each under its own caption.
for caption, frame in (
    ("\nSample of scores for this campaign:", scores_c),
    ("\nSample of edges for this campaign:", edges),
):
    print(caption)
    display(frame.head())

Scores table: 30,089 rows, 24 columns
Campaigns in scores: ['abc_reading', 'electric_toothbrush', 'intelligent_floor_scrubber', 'ruby_face_cream', 'spark_thinking', 'supor_boosted_showerhead']

Campaign: abc_reading
Accounts in campaign: 6,869

Edges table for abc_reading: 439,620 rows, 3 columns
Edge columns: ['src_user_id', 'dst_user_id', 'interact_type']

Sample of scores for this campaign:


Unnamed: 0,user_id,comment,reposts,total_engagement,user_followers,user_friends,engagement_per_follower,in_degree,out_degree,pagerank,...,log1p_user_friends,log1p_comment,log1p_reposts,log1p_total_engagement,log1p_in_degree,log1p_out_degree,log1p_pagerank,label_high_engagement,score_logistic_full_final,rank_by_score_final
0,84769,3260,6030,9290,953194,898,0.009746,5676,44,0.012914,...,6.801283,8.089789,8.704668,9.136801,8.644178,3.806662,0.012831,1,1.0,1
1,185,2500,5825,8325,16153761,1324,0.000515,4700,6,0.00984,...,7.189168,7.824446,8.670086,9.027138,8.455531,1.94591,0.009792,1,1.0,2
2,84700,6890,3323,10213,385320,2564,0.026505,2712,122,0.008042,...,7.849714,8.837971,8.108924,9.231515,7.90581,4.812184,0.00801,1,1.0,3
3,71739,715,3827,4542,270321,1047,0.016802,3747,55,0.01097,...,6.954639,6.57368,8.250098,8.421343,8.228978,4.025352,0.01091,1,1.0,4
4,42692,2953,1955,4908,176281,12,0.027842,3928,35,0.011569,...,2.564949,7.990915,7.578657,8.498826,8.27614,3.583519,0.011502,1,1.0,5



Sample of edges for this campaign:


Unnamed: 0,src_user_id,dst_user_id,interact_type
0,201456,201455,comment
1,201504,201455,comment
2,201505,201455,comment
3,201506,201455,comment
4,201507,201455,reposts


In [31]:
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from pyvis.network import Network

# -------------------------------------------------------------------
# Static network visual for ONE campaign.
#
# Loads the final scores and the cleaned edge list, keeps the TOP_N
# accounts with the highest model score, links them by how often they
# interacted, and writes a PyVis HTML file whose node positions are
# computed once (physics switched off).
# -------------------------------------------------------------------

# -------------------------------------------------------------------
# Settings
# -------------------------------------------------------------------
CAMPAIGN = "abc_reading"   # change to another campaign name if you like
TOP_N = 60                 # how many top accounts to show
MAX_EDGES = 1000           # cap on number of strongest (directed) pairs to draw
N_HIGHLIGHT = 10           # how many of the best-scored accounts are drawn in red

PROC_DIR = Path("data/processed")
INTERIM_DIR = Path("data/interim")

# -------------------------------------------------------------------
# Load scores + edges for this campaign
# -------------------------------------------------------------------
scores_path = PROC_DIR / "model_final_scores_all_campaigns.parquet"
scores = pd.read_parquet(scores_path)

edges_path = INTERIM_DIR / f"{CAMPAIGN}_edges_clean.parquet"
edges = pd.read_parquet(edges_path)

scores_c = scores[scores["product_name"] == CAMPAIGN].copy()

print(f"Building STATIC visual for {CAMPAIGN}")
print(f"Accounts in campaign: {len(scores_c):,}")
print(f"Edges in cleaned graph: {len(edges):,}")

# -------------------------------------------------------------------
# Select top-N accounts by final model score
# -------------------------------------------------------------------
# A stable sort keeps tied scores in their original row order, so the
# selection does not depend on the internals of the sorting algorithm.
top_nodes = (
    scores_c.sort_values("score_logistic_full_final", ascending=False, kind="mergesort")
            .head(TOP_N)
            .copy()
)
top_ids = set(top_nodes["user_id"].astype(int))

# For highlighting: top_nodes is already sorted best-first, so its first
# rows are the highlighted set. Re-sorting could pick a different subset
# when scores are tied.
top10_ids = set(top_nodes["user_id"].head(N_HIGHLIGHT).astype(int))

# -------------------------------------------------------------------
# Filter edges where BOTH ends are in the top set, then keep strongest
# -------------------------------------------------------------------
edges_sub = edges[
    edges["src_user_id"].isin(top_ids) & edges["dst_user_id"].isin(top_ids)
].copy()

# One row per directed (src, dst) pair; weight = number of interactions.
edge_counts = (
    edges_sub
    .value_counts(["src_user_id", "dst_user_id"])
    .reset_index(name="weight")
    .sort_values("weight", ascending=False)
    .head(MAX_EDGES)
)

print(f"Nodes (top-N): {len(top_nodes):,}, edges after filtering: {len(edge_counts):,}")

# -------------------------------------------------------------------
# Build NetworkX graph
# -------------------------------------------------------------------
G = nx.Graph()

for _, row in top_nodes.iterrows():
    uid = int(row["user_id"])
    G.add_node(
        uid,
        followers=int(row["user_followers"]),
        engagement=int(row["total_engagement"]),
        in_degree=int(row["in_degree"]),
        pagerank=float(row["pagerank"]),
        score=float(row["score_logistic_full_final"]),
        # NOTE(review): bool(NaN) is True - assumes the flag has no missing values; confirm.
        official=bool(row["is_official_influencer"]),
    )

# The graph is undirected, so A->B and B->A land on the same edge.
# Calling add_edge() on an existing edge overwrites its attributes, so the
# two directions are summed explicitly instead of letting the later row win.
for src, dst, w in edge_counts[["src_user_id", "dst_user_id", "weight"]].itertuples(
    index=False, name=None
):
    src, dst, w = int(src), int(dst), int(w)
    if src not in G or dst not in G:
        continue
    if G.has_edge(src, dst):
        G[src][dst]["weight"] += w
    else:
        G.add_edge(src, dst, weight=w)

print(f"Subgraph: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")

# -------------------------------------------------------------------
# Static layout: compute node positions once
# -------------------------------------------------------------------
pos = nx.spring_layout(G, k=0.4, iterations=80, seed=42)

# -------------------------------------------------------------------
# PyVis network setup (physics OFF for static view)
# -------------------------------------------------------------------
net = Network(
    height="750px",
    width="100%",
    bgcolor="#ffffff",
    font_color="#000000",
    notebook=True,
)

net.toggle_physics(False)

# Normalise PageRank to [0, 1) for sizing. The size check avoids calling
# .max() on an empty array when no account was selected.
pr_values = np.array([attrs["pagerank"] for _, attrs in G.nodes(data=True)])
if pr_values.size and pr_values.max() > 0:
    pr_norm = (pr_values - pr_values.min()) / (pr_values.max() - pr_values.min() + 1e-9)
else:
    pr_norm = pr_values

# Add nodes with plain-text tooltip (NO model score, NO official flag)
for (node_id, attrs), pr in zip(G.nodes(data=True), pr_norm):
    followers = attrs["followers"]
    engagement = attrs["engagement"]
    indeg = attrs["in_degree"]
    pr_val = attrs["pagerank"]
    official = attrs["official"]    # used only for colouring

    # Bigger nodes for more central accounts
    size = 10 + 30 * pr

    # Colours:
    #   - red: highlighted accounts (best N_HIGHLIGHT by model score)
    #   - blue: official influencers (not highlighted)
    #   - orange: other non-officials
    if node_id in top10_ids:
        color = "#e41a1c"  # red
    elif official:
        color = "#377eb8"  # blue
    else:
        color = "#fd8d3c"  # orange

    # Plain-text tooltip (no model score, no official flag)
    title = (
        f"User {node_id}\n"
        f"Followers: {followers:,}\n"
        f"Engagement (comments+reposts): {engagement:,}\n"
        f"In-degree (people who engaged): {indeg:,}\n"
        f"PageRank: {pr_val:.4f}"
    )

    # Layout coordinates are scaled by 1000 before being handed to PyVis.
    x, y = pos[node_id]
    net.add_node(
        node_id,
        label=str(node_id),
        title=title,
        size=size,
        color=color,
        x=float(x * 1000),
        y=float(y * 1000),
        physics=False,
    )

# Add edges; the summed interaction count is passed as the edge `value`.
for u, v, attrs in G.edges(data=True):
    w = attrs.get("weight", 1)
    net.add_edge(u, v, value=w)

# -------------------------------------------------------------------
# Save HTML into reports/graphs
# -------------------------------------------------------------------
out_dir = Path("reports/graphs")
out_dir.mkdir(parents=True, exist_ok=True)

html_path = out_dir / f"{CAMPAIGN}_network_top{TOP_N}_static.html"
net.show(str(html_path))

print(f"\nSaved static interactive network to: {html_path}")
print("Open this file in your browser to explore (zoom, drag, hover).")

Building STATIC visual for abc_reading
Accounts in campaign: 6,869
Edges in cleaned graph: 439,620
Nodes (top-N): 60, edges after filtering: 390
Subgraph: 60 nodes, 226 edges
reports/graphs/abc_reading_network_top60_static.html

Saved static interactive network to: reports/graphs/abc_reading_network_top60_static.html
Open this file in your browser to explore (zoom, drag, hover).


In [32]:
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from pyvis.network import Network

# -------------------------------------------------------------------
# Tunables: raise or lower these to get denser or sparser maps
# -------------------------------------------------------------------
TOP_N, MAX_EDGES = 60, 1000   # accounts shown per campaign / strongest edges drawn per campaign

# Input folders, plus the folder that receives the generated HTML graphs.
PROC_DIR = Path("data/processed")
INTERIM_DIR = Path("data/interim")
OUT_DIR = Path("reports/graphs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------------------------------------------------------
# Final scores of every campaign, read from a single parquet file
# -------------------------------------------------------------------
scores_path = PROC_DIR / "model_final_scores_all_campaigns.parquet"
scores_all = pd.read_parquet(scores_path)

# Distinct campaign names in ascending order.
campaigns = list(scores_all["product_name"].unique())
campaigns.sort()
print("Campaigns found:", campaigns)

# -------------------------------------------------------------------
# Helper: build and save one campaign graph
# -------------------------------------------------------------------
def build_static_graph_for_campaign(campaign: str, n_highlight: int = 10) -> None:
    """Build and save the static PyVis network for one campaign.

    Reads the module-level ``scores_all`` table and the campaign's cleaned
    edge file under ``INTERIM_DIR``, keeps the ``TOP_N`` accounts with the
    highest ``score_logistic_full_final``, links them using the (at most
    ``MAX_EDGES``) most frequent directed interaction pairs among them, and
    writes ``<campaign>_network_top<TOP_N>_static.html`` into ``OUT_DIR``.

    Node size follows min-max normalised PageRank. Node colour is red for
    the ``n_highlight`` best-scored accounts, blue for the remaining accounts
    flagged ``is_official_influencer`` and orange otherwise. Tooltips show
    followers, engagement, in-degree and PageRank only (no model score).

    Parameters
    ----------
    campaign : str
        Value of ``product_name`` to plot; also the prefix of the edge file name.
    n_highlight : int, default 10
        Number of best-scored accounts drawn in red.

    Returns
    -------
    None
        Progress is printed. The function returns early, writing nothing,
        when the edge file is missing or when no account is selected.
    """
    print(f"\n=== Building static graph for campaign: {campaign} ===")

    # Filter scores for this campaign
    scores_c = scores_all[scores_all["product_name"] == campaign].copy()
    print(f"Accounts in campaign: {len(scores_c):,}")

    # Load edges for this campaign
    edges_path = INTERIM_DIR / f"{campaign}_edges_clean.parquet"
    if not edges_path.exists():
        print(f"  WARNING: edges file not found: {edges_path}, skipping.")
        return

    edges = pd.read_parquet(edges_path)
    print(f"Edges in cleaned graph: {len(edges):,}")

    # Top-N accounts by final model score. A stable sort keeps tied scores in
    # their original row order, so the selection does not depend on the
    # internals of the sorting algorithm.
    top_nodes = (
        scores_c.sort_values("score_logistic_full_final", ascending=False, kind="mergesort")
                .head(TOP_N)
                .copy()
    )
    top_ids = set(top_nodes["user_id"].astype(int))

    # Highlighted (red) accounts: top_nodes is already sorted best-first, so
    # its first rows are the highlighted set. Re-sorting could pick a
    # different subset when scores are tied.
    top10_ids = set(top_nodes["user_id"].head(n_highlight).astype(int))

    # Restrict edges to those between top-N nodes
    edges_sub = edges[
        edges["src_user_id"].isin(top_ids) &
        edges["dst_user_id"].isin(top_ids)
    ].copy()

    # Aggregate multiple interactions to a single weighted row per directed pair
    edge_counts = (
        edges_sub
        .value_counts(["src_user_id", "dst_user_id"])
        .reset_index(name="weight")
        .sort_values("weight", ascending=False)
        .head(MAX_EDGES)
    )

    print(f"Nodes (top-N): {len(top_nodes):,}, edges after filtering: {len(edge_counts):,}")

    # Build NetworkX graph
    G = nx.Graph()

    for _, row in top_nodes.iterrows():
        uid = int(row["user_id"])
        G.add_node(
            uid,
            followers=int(row["user_followers"]),
            engagement=int(row["total_engagement"]),
            in_degree=int(row["in_degree"]),
            pagerank=float(row["pagerank"]),
            score=float(row["score_logistic_full_final"]),   # kept but not shown
            # NOTE(review): bool(NaN) is True - assumes the flag has no missing values; confirm.
            official=bool(row["is_official_influencer"]),    # used only for colour
        )

    # The graph is undirected, so A->B and B->A land on the same edge.
    # Calling add_edge() on an existing edge overwrites its attributes, so the
    # two directions are summed explicitly instead of letting the later row win.
    directed_pairs = edge_counts[["src_user_id", "dst_user_id", "weight"]]
    for src, dst, w in directed_pairs.itertuples(index=False, name=None):
        src, dst, w = int(src), int(dst), int(w)
        if src not in G or dst not in G:
            continue
        if G.has_edge(src, dst):
            G[src][dst]["weight"] += w
        else:
            G.add_edge(src, dst, weight=w)

    print(f"Subgraph: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")

    if G.number_of_nodes() == 0:
        print("  No nodes to plot, skipping.")
        return

    # Static layout (fixed seed, so positions are reproducible)
    pos = nx.spring_layout(G, k=0.4, iterations=80, seed=42)

    # PyVis network (physics off for static display)
    net = Network(
        height="750px",
        width="100%",
        bgcolor="#ffffff",
        font_color="#000000",
        notebook=True,
    )
    net.toggle_physics(False)

    # Normalise PageRank to [0, 1) for node sizes
    pr_values = np.array([attrs["pagerank"] for _, attrs in G.nodes(data=True)])
    if pr_values.max() > 0:
        pr_norm = (pr_values - pr_values.min()) / (pr_values.max() - pr_values.min() + 1e-9)
    else:
        pr_norm = pr_values

    # Add nodes with plain-text tooltip (NO model score, NO official flag text)
    for (node_id, attrs), pr in zip(G.nodes(data=True), pr_norm):
        followers = attrs["followers"]
        engagement = attrs["engagement"]
        indeg = attrs["in_degree"]
        pr_val = attrs["pagerank"]
        official = attrs["official"]   # used only for colour

        # Node size: more central accounts appear larger
        size = 10 + 30 * pr

        # Colours:
        #   - red: highlighted accounts (best n_highlight by model score)
        #   - blue: official influencers (not highlighted)
        #   - orange: other non-officials
        if node_id in top10_ids:
            color = "#e41a1c"  # red
        elif official:
            color = "#377eb8"  # blue
        else:
            color = "#fd8d3c"  # orange

        # Tooltip: business-facing fields only
        title = (
            f"User {node_id}\n"
            f"Followers: {followers:,}\n"
            f"Engagement (comments+reposts): {engagement:,}\n"
            f"In-degree (people who engaged): {indeg:,}\n"
            f"PageRank: {pr_val:.4f}"
        )

        # Layout coordinates are scaled by 1000 before being handed to PyVis.
        x, y = pos[node_id]
        net.add_node(
            node_id,
            label=str(node_id),
            title=title,
            size=size,
            color=color,
            x=float(x * 1000),
            y=float(y * 1000),
            physics=False,
        )

    # Add edges; the summed interaction count is passed as the edge `value`.
    for u, v, attrs in G.edges(data=True):
        w = attrs.get("weight", 1)
        net.add_edge(u, v, value=w)

    # Save HTML for this campaign
    html_path = OUT_DIR / f"{campaign}_network_top{TOP_N}_static.html"
    net.show(str(html_path))
    print(f"Saved static interactive network to: {html_path}")

# -------------------------------------------------------------------
# Produce one HTML network per campaign found in the scores table
# -------------------------------------------------------------------
for campaign_name in campaigns:
    build_static_graph_for_campaign(campaign_name)

print("\nDone. Open the HTML files in reports/graphs/ in your browser to explore.")

Campaigns found: ['abc_reading', 'electric_toothbrush', 'intelligent_floor_scrubber', 'ruby_face_cream', 'spark_thinking', 'supor_boosted_showerhead']

=== Building static graph for campaign: abc_reading ===
Accounts in campaign: 6,869
Edges in cleaned graph: 439,620
Nodes (top-N): 60, edges after filtering: 390
Subgraph: 60 nodes, 226 edges
reports/graphs/abc_reading_network_top60_static.html
Saved static interactive network to: reports/graphs/abc_reading_network_top60_static.html

=== Building static graph for campaign: electric_toothbrush ===
Accounts in campaign: 1,561
Edges in cleaned graph: 161,551
Nodes (top-N): 60, edges after filtering: 554
Subgraph: 60 nodes, 380 edges
reports/graphs/electric_toothbrush_network_top60_static.html
Saved static interactive network to: reports/graphs/electric_toothbrush_network_top60_static.html

=== Building static graph for campaign: intelligent_floor_scrubber ===
Accounts in campaign: 5,959
Edges in cleaned graph: 356,506
Nodes (top-N): 60, ed

In [33]:
from pathlib import Path
import pandas as pd

# -------------------------------------------------------------------
# Export the TOP_K best-scored accounts of every campaign as a CSV of
# business-facing columns, plus one summary CSV that counts official
# vs. non-official accounts among each campaign's recommendations.
# -------------------------------------------------------------------
PROC_DIR = Path("data/processed")
OUT_DIR = Path("reports/tables")
OUT_DIR.mkdir(parents=True, exist_ok=True)

scores_path = PROC_DIR / "model_final_scores_all_campaigns.parquet"
scores_all = pd.read_parquet(scores_path)

TOP_K = 50

# keep only business-facing columns (no model score, no top-20% label)
cols_keep = [
    "product_name",
    "user_id",
    "is_official_influencer",
    "user_followers",
    "total_engagement",
    "in_degree",
    "pagerank",
    # we intentionally do NOT keep:
    # "score_logistic_full_final"
    # "label_high_engagement"
]

# nicer column names for the exported tables
nice_names = {
    "is_official_influencer": "official_influencer",
    "user_followers": "followers",
    "total_engagement": "engagement_comments_plus_reposts",
    "in_degree": "people_who_engaged",
    "pagerank": "pagerank_centrality",
}

campaigns = sorted(scores_all["product_name"].unique())
print("Campaigns:", campaigns)

summary_rows = []

for c in campaigns:
    df_c = scores_all[scores_all["product_name"] == c]
    print(f"\n=== {c} ===")
    print(f"Total accounts in campaign: {len(df_c):,}")

    # top-K by final logistic_full score (we sort by it, but we do not export
    # the score itself). The sort is stable, so accounts with tied scores keep
    # their original row order and the exported rank is well defined.
    topk = (
        df_c.sort_values("score_logistic_full_final", ascending=False, kind="mergesort")
            .head(TOP_K)
            .loc[:, cols_keep]
            .rename(columns=nice_names)
    )

    # add rank column with the ORIGINAL name expected by the app
    # (1 = best score; inserted right after product_name)
    topk.insert(1, "rank_by_model_score", range(1, len(topk) + 1))

    # save per-campaign CSV
    out_path = OUT_DIR / f"{c}_top{TOP_K}_recommended_accounts.csv"
    topk.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")

    # summary row (counts use the official_influencer flag)
    n_official = int(topk["official_influencer"].sum())
    n_non_official = len(topk) - n_official

    summary_rows.append({
        "product_name": c,
        "recommended_total": len(topk),
        "recommended_official": n_official,
        "recommended_non_official": n_non_official,
    })

# overall summary table (one row per campaign)
summary_df = pd.DataFrame(summary_rows)
summary_path = OUT_DIR / f"summary_top{TOP_K}_recommendations.csv"
summary_df.to_csv(summary_path, index=False)

print("\nSummary of recommendations per campaign:")
display(summary_df)
print(f"\nSaved summary to: {summary_path}")

Campaigns: ['abc_reading', 'electric_toothbrush', 'intelligent_floor_scrubber', 'ruby_face_cream', 'spark_thinking', 'supor_boosted_showerhead']

=== abc_reading ===
Total accounts in campaign: 6,869
Saved: reports/tables/abc_reading_top50_recommended_accounts.csv

=== electric_toothbrush ===
Total accounts in campaign: 1,561
Saved: reports/tables/electric_toothbrush_top50_recommended_accounts.csv

=== intelligent_floor_scrubber ===
Total accounts in campaign: 5,959
Saved: reports/tables/intelligent_floor_scrubber_top50_recommended_accounts.csv

=== ruby_face_cream ===
Total accounts in campaign: 3,059
Saved: reports/tables/ruby_face_cream_top50_recommended_accounts.csv

=== spark_thinking ===
Total accounts in campaign: 7,138
Saved: reports/tables/spark_thinking_top50_recommended_accounts.csv

=== supor_boosted_showerhead ===
Total accounts in campaign: 5,503
Saved: reports/tables/supor_boosted_showerhead_top50_recommended_accounts.csv

Summary of recommendations per campaign:


Unnamed: 0,product_name,recommended_total,recommended_official,recommended_non_official
0,abc_reading,50,1,49
1,electric_toothbrush,50,8,42
2,intelligent_floor_scrubber,50,4,46
3,ruby_face_cream,50,10,40
4,spark_thinking,50,5,45
5,supor_boosted_showerhead,50,4,46



Saved summary to: reports/tables/summary_top50_recommendations.csv
