In [2]:
from pathlib import Path

import pandas as pd
import networkx as nx
from pyvis.network import Network

# Folders with processed scores and cleaned edges
PROC_DIR = Path("data/processed")
INTERIM_DIR = Path("data/interim")

# Load final scores for all campaigns (trained on all data)
scores_path = PROC_DIR / "model_final_scores_all_campaigns.parquet"
scores = pd.read_parquet(scores_path)

print(f"Scores table: {scores.shape[0]:,} rows, {scores.shape[1]} columns")
print("Campaigns in scores:", sorted(scores["product_name"].unique()))

# Pick one campaign for the first network visual
CAMPAIGN = "abc_reading"  # we can change this later if needed

scores_c = scores[scores["product_name"] == CAMPAIGN].copy()
print(f"\nCampaign: {CAMPAIGN}")
print(f"Accounts in campaign: {len(scores_c):,}")

# Load the cleaned edges for this campaign
edges_path = INTERIM_DIR / f"{CAMPAIGN}_edges_clean.parquet"
edges = pd.read_parquet(edges_path)

print(f"\nEdges table for {CAMPAIGN}: {edges.shape[0]:,} rows, {edges.shape[1]} columns")
print("Edge columns:", list(edges.columns))

print("\nSample of scores for this campaign:")
display(scores_c.head())

print("\nSample of edges for this campaign:")
display(edges.head())

Scores table: 30,089 rows, 24 columns
Campaigns in scores: ['abc_reading', 'electric_toothbrush', 'intelligent_floor_scrubber', 'ruby_face_cream', 'spark_thinking', 'supor_boosted_showerhead']

Campaign: abc_reading
Accounts in campaign: 6,869

Edges table for abc_reading: 439,620 rows, 3 columns
Edge columns: ['src_user_id', 'dst_user_id', 'interact_type']

Sample of scores for this campaign:


Unnamed: 0,user_id,comment,reposts,total_engagement,user_followers,user_friends,engagement_per_follower,in_degree,out_degree,pagerank,...,log1p_user_friends,log1p_comment,log1p_reposts,log1p_total_engagement,log1p_in_degree,log1p_out_degree,log1p_pagerank,label_high_engagement,score_logistic_full_final,rank_by_score_final
0,185,2500,5825,8325,16153761,1324,0.000515,4700,6,0.00984,...,7.189168,7.824446,8.670086,9.027138,8.455531,1.94591,0.009792,1,1.0,1
1,16820,244,710,954,1853564,2395,0.000515,728,87,0.001706,...,7.781556,5.501258,6.566672,6.861711,6.591674,4.477337,0.001705,1,1.0,2
2,42692,2953,1955,4908,176281,12,0.027842,3928,35,0.011569,...,2.564949,7.990915,7.578657,8.498826,8.27614,3.583519,0.011502,1,1.0,3
3,42852,429,0,429,146,1283,2.938356,48,90,0.000286,...,7.157735,6.063785,0.0,6.063785,3.89182,4.51086,0.000286,1,1.0,4
4,45225,322,105,427,10072677,481,4.2e-05,253,28,0.00234,...,6.177944,5.777652,4.663439,6.059123,5.537334,3.367296,0.002338,1,1.0,5



Sample of edges for this campaign:


Unnamed: 0,src_user_id,dst_user_id,interact_type
0,201456,201455,comment
1,201504,201455,comment
2,201505,201455,comment
3,201506,201455,comment
4,201507,201455,reposts


In [8]:
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from pyvis.network import Network

# -------------------------------------------------------------------
# Settings
# -------------------------------------------------------------------
CAMPAIGN = "abc_reading"   # change to another campaign name if you like
TOP_N = 60                 # how many top accounts to show
MAX_EDGES = 1000           # cap on number of strongest edges to draw

PROC_DIR = Path("data/processed")
INTERIM_DIR = Path("data/interim")

# -------------------------------------------------------------------
# Load scores + edges for this campaign
# -------------------------------------------------------------------
scores_path = PROC_DIR / "model_final_scores_all_campaigns.parquet"
scores = pd.read_parquet(scores_path)

edges_path = INTERIM_DIR / f"{CAMPAIGN}_edges_clean.parquet"
edges = pd.read_parquet(edges_path)

scores_c = scores[scores["product_name"] == CAMPAIGN].copy()

print(f"Building STATIC visual for {CAMPAIGN}")
print(f"Accounts in campaign: {len(scores_c):,}")
print(f"Edges in cleaned graph: {len(edges):,}")

# -------------------------------------------------------------------
# Select top-N accounts by final model score
# -------------------------------------------------------------------
top_nodes = (
    scores_c.sort_values("score_logistic_full_final", ascending=False)
            .head(TOP_N)
            .copy()
)
top_ids = set(top_nodes["user_id"].astype(int))

# For highlighting: top 10 by score
top10_ids = set(
    top_nodes.sort_values("score_logistic_full_final", ascending=False)
             .head(10)["user_id"].astype(int)
)

# -------------------------------------------------------------------
# Filter edges where BOTH ends are in the top set, then keep strongest
# -------------------------------------------------------------------
edges_sub = edges[
    edges["src_user_id"].isin(top_ids) & edges["dst_user_id"].isin(top_ids)
].copy()

edge_counts = (
    edges_sub
    .value_counts(["src_user_id", "dst_user_id"])
    .reset_index(name="weight")
    .sort_values("weight", ascending=False)
    .head(MAX_EDGES)
)

print(f"Nodes (top-N): {len(top_nodes):,}, edges after filtering: {len(edge_counts):,}")

# -------------------------------------------------------------------
# Build NetworkX graph
# -------------------------------------------------------------------
G = nx.Graph()

for _, row in top_nodes.iterrows():
    uid = int(row["user_id"])
    G.add_node(
        uid,
        followers=int(row["user_followers"]),
        engagement=int(row["total_engagement"]),
        in_degree=int(row["in_degree"]),
        pagerank=float(row["pagerank"]),
        score=float(row["score_logistic_full_final"]),
        official=bool(row["is_official_influencer"]),
    )

for _, row in edge_counts.iterrows():
    src = int(row["src_user_id"])
    dst = int(row["dst_user_id"])
    w   = int(row["weight"])
    if src in G and dst in G:
        G.add_edge(src, dst, weight=w)

print(f"Subgraph: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")

# -------------------------------------------------------------------
# Static layout: compute node positions once
# -------------------------------------------------------------------
pos = nx.spring_layout(G, k=0.4, iterations=80, seed=42)

# -------------------------------------------------------------------
# PyVis network setup (physics OFF for static view)
# -------------------------------------------------------------------
net = Network(
    height="750px",
    width="100%",
    bgcolor="#ffffff",
    font_color="#000000",
    notebook=True,
)

net.toggle_physics(False)

# Normalise PageRank for sizing
pr_values = np.array([attrs["pagerank"] for _, attrs in G.nodes(data=True)])
if pr_values.max() > 0:
    pr_norm = (pr_values - pr_values.min()) / (pr_values.max() - pr_values.min() + 1e-9)
else:
    pr_norm = pr_values

# Add nodes with plain-text tooltip
for (node_id, attrs), pr in zip(G.nodes(data=True), pr_norm):
    followers = attrs["followers"]
    engagement = attrs["engagement"]
    indeg = attrs["in_degree"]
    pr_val = attrs["pagerank"]
    score = attrs["score"]
    official = attrs["official"]

    # Bigger nodes for more central accounts
    size = 10 + 30 * pr

    # Colours:
    #   - red: top 10 by model score
    #   - blue: official influencers (not in top 10)
    #   - orange: other non-officials
    if node_id in top10_ids:
        color = "#e41a1c"  # red
    elif official:
        color = "#377eb8"  # blue
    else:
        color = "#fd8d3c"  # orange

    # Plain-text tooltip (no HTML)
    title = (
        f"User {node_id}\n"
        f"Followers: {followers:,}\n"
        f"Engagement (comments+reposts): {engagement:,}\n"
        f"In-degree (people who engaged): {indeg:,}\n"
        f"PageRank: {pr_val:.4f}\n"
        f"Model score: {score:.3f}\n"
        f"Official influencer: {official}"
    )

    x, y = pos[node_id]
    net.add_node(
        node_id,
        label=str(node_id),
        title=title,
        size=size,
        color=color,
        x=float(x * 1000),
        y=float(y * 1000),
        physics=False,
    )

# Add edges
for u, v, attrs in G.edges(data=True):
    w = attrs.get("weight", 1)
    net.add_edge(u, v, value=w)

# -------------------------------------------------------------------
# Save HTML into reports/graphs
# -------------------------------------------------------------------
out_dir = Path("reports/graphs")
out_dir.mkdir(parents=True, exist_ok=True)

html_path = out_dir / f"{CAMPAIGN}_network_top{TOP_N}_static.html"
net.show(str(html_path))

print(f"\nSaved static interactive network to: {html_path}")
print("Open this file in your browser to explore (zoom, drag, hover).")

Building STATIC visual for abc_reading
Accounts in campaign: 6,869
Edges in cleaned graph: 439,620
Nodes (top-N): 60, edges after filtering: 307
Subgraph: 60 nodes, 207 edges
reports/graphs/abc_reading_network_top60_static.html

Saved static interactive network to: reports/graphs/abc_reading_network_top60_static.html
Open this file in your browser to explore (zoom, drag, hover).
