In [13]:
# pip install SPARQLWrapper networkx pyvis
import os
import re
import pickle
from collections import deque

from SPARQLWrapper import SPARQLWrapper, JSON
import networkx as nx
from pyvis.network import Network

# URL of the SPARQL endpoint that get_person_name and get_coauthor_info
# hand to SPARQLWrapper when they run their queries.
SPARQL_ENDPOINT = "https://sparql.dblp.org/sparql"

In [14]:
# URI of the author the main cell builds the co-author network around.
root_person_uri = "https://dblp.org/pid/82/7468"
# Maximum BFS depth the main cell passes to build_coauthor_network.
max_depth = 2

In [15]:
def sanitize_uri(uri: str) -> str:
    """
    Turn a URI into an identifier made only of ASCII letters, digits and '_'.

    Every run of characters outside [a-zA-Z0-9] collapses into a single
    underscore, and the result is prefixed with 'n_' so that the identifier
    never starts with a digit.

    Note: the mapping is not injective; URIs that differ only in their
    non-alphanumeric characters (e.g. "a/b" and "a-b") yield the same ID.
    """
    # Collapse each run of non-alphanumeric characters into one underscore.
    collapsed = re.sub(r"[^a-zA-Z0-9]+", "_", uri)
    # The 'n_' prefix was added so the ID satisfies Graphviz's identifier rules.
    return "n_" + collapsed


def sanitize_nodes_and_edges(nodes, edges):
    """
    Re-key a URI-keyed node/edge collection by sanitized ("safe") IDs.

    Parameters:
      nodes: dict, key = URI, value = info dict that must contain "name".
      edges: dict, key = (uri1, uri2), value = info dict; a missing "weight"
             counts as 1. Both endpoints must be keys of `nodes`, otherwise
             a KeyError is raised.

    Returns a (safe_nodes, safe_edges, uri_mapping) tuple:
      - safe_nodes: dict
          key = safe ID produced by sanitize_uri,
          value = {"name": display name, "original_uri": source URI}
      - safe_edges: dict
          key = (safe ID 1, safe ID 2) with the smaller ID first,
          value = {"weight": summed weight of every input edge that maps
                   onto this pair}
      - uri_mapping: dict
          key = original URI, value = safe ID (for reverse look-ups elsewhere)
    """
    safe_nodes, uri_mapping = {}, {}

    # Step 1: assign a safe ID to every node and keep the URI for reference.
    for uri, info in nodes.items():
        safe_id = uri_mapping[uri] = sanitize_uri(uri)
        safe_nodes[safe_id] = {"name": info["name"], "original_uri": uri}

    # Step 2: translate each edge's endpoints through the same mapping.
    safe_edges = {}
    for (src_uri, dst_uri), edge_info in edges.items():
        weight = edge_info.get("weight", 1)

        # Canonical (smaller ID first) ordering, so that (u, v) and (v, u)
        # land on the same undirected key.
        pair = tuple(sorted((uri_mapping[src_uri], uri_mapping[dst_uri])))

        existing = safe_edges.get(pair)
        if existing is None:
            safe_edges[pair] = {"weight": weight}
        else:
            # The pair was already recorded: accumulate the weights.
            existing["weight"] += weight

    return safe_nodes, safe_edges, uri_mapping

In [16]:
def get_person_name(person_uri: str) -> str:
    """
    Look up the dblp:creatorName of `person_uri` on the SPARQL endpoint.

    Returns the name bound in the first result row. When no name is bound,
    or when anything inside the query/parsing step raises, the literal
    string "Unknown" is returned (failures are also printed).

    NOTE(review): earlier comments on this function described falling back
    to the last segment of the URI; the code returns "Unknown" instead.
    Confirm which behaviour is intended.
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setReturnFormat(JSON)

    query = f"""
    PREFIX dblp: <https://dblp.org/rdf/schema#>
    SELECT ?name
    WHERE {{
        OPTIONAL {{ <{person_uri}> dblp:creatorName ?name . }}
    }}
    LIMIT 1
    """
    client.setQuery(query)
    try:
        rows = client.query().convert()["results"]["bindings"]
        # The OPTIONAL pattern means a row can come back without ?name bound.
        name = rows[0].get("name", {}).get("value") if rows else None
        # No usable name: fall back to the placeholder.
        return name if name else "Unknown"
    except Exception as err:
        print(f"[ERROR] SPARQL query failed when fetching name for {person_uri}: {err}")
        return "Unknown"

In [17]:
# [Debug] Quick check: look up the display name for one person URI.
get_person_name("https://dblp.org/pid/82/7468")

'Zied Bouyahia'

In [18]:
def get_coauthor_info(person_uri):
    """
    Fetch every co-author of `person_uri` together with the number of
    publications they share.

    Returns: List[ (coauthor_uri, coauthor_name, pub_count) ]
    - coauthor_uri   : URI of the co-author
    - coauthor_name  : name of the co-author ("" when the row has none)
    - pub_count      : number of publications shared with person_uri
                       (0 when the row has no count)

    If the SPARQL query raises, the error is printed and [] is returned.
    """
    client = SPARQLWrapper(SPARQL_ENDPOINT)
    client.setReturnFormat(JSON)

    # Query outline:
    #   1. match every publication (?pub) authored by person_uri
    #   2. that is also authored by some ?coauthor
    #   3. group by ?coauthor and count the distinct publications
    #   4. filter out person_uri itself
    query = f"""
    PREFIX dblp: <https://dblp.org/rdf/schema#>

    SELECT ?coauthor (SAMPLE(?coauthorName) AS ?name) (COUNT(DISTINCT ?pub) AS ?pubCount)
    WHERE {{
        ?pub dblp:authoredBy <{person_uri}> ;
             dblp:authoredBy ?coauthor .
        FILTER (?coauthor != <{person_uri}>)

        OPTIONAL {{ ?coauthor dblp:creatorName ?coauthorName . }}
    }}
    GROUP BY ?coauthor
    """

    client.setQuery(query)

    try:
        payload = client.query().convert()
    except Exception as err:
        print(f"[ERROR] SPARQL query failed for {person_uri}: {err}")
        return []

    # One tuple per result row; optional columns get neutral defaults.
    return [
        (
            binding["coauthor"]["value"],
            binding["name"]["value"] if "name" in binding else "",
            int(binding["pubCount"]["value"]) if "pubCount" in binding else 0,
        )
        for binding in payload["results"]["bindings"]
    ]

In [19]:
# [Debug] Quick check: list the co-authors (URI, name, shared publication count) of one person.
get_coauthor_info("https://dblp.org/pid/82/7468")

[('https://dblp.org/pid/03/6572', 'Hedi Haddad', 12),
 ('https://dblp.org/pid/05/5769', 'Ahmed Nait-Sidi-Moh', 1),
 ('https://dblp.org/pid/08/565', 'Khaled Ghédira', 4),
 ('https://dblp.org/pid/17/848', 'Nafaâ Jabeur', 9),
 ('https://dblp.org/pid/184/9350', 'Leila Horchani', 2),
 ('https://dblp.org/pid/206/1503', 'Hana Gharrad', 1),
 ('https://dblp.org/pid/246/1303', 'Shafique A. Chaudhry', 1),
 ('https://dblp.org/pid/349/4118', 'Mahmoud Mastouri', 1),
 ('https://dblp.org/pid/70/7470', 'Monia Bellalouna', 4),
 ('https://dblp.org/pid/76/6334', 'Stéphane Derrode', 5),
 ('https://dblp.org/pid/81/3284', 'Wojciech Pieczynski', 4),
 ('https://dblp.org/pid/82/2493', 'Ansar Yasar', 2),
 ('https://dblp.org/pid/88/7260', 'Patrick Jaillet', 2),
 ('https://dblp.org/pid/88/7651', 'Fatma Outay', 1)]

In [20]:
def build_coauthor_network(root_person_uri, max_depth=1):
    """
    Build a co-author network around `root_person_uri` with a breadth-first
    traversal that expands authors at depth < max_depth.

    Returns (nodes, edges), both already passed through
    sanitize_nodes_and_edges:
      nodes: dict,
        key = sanitized author ID,
        value = {"name": author name or "Unknown", "original_uri": author URI}
      edges: dict,
        key = (id1, id2) sorted tuple (stable for an undirected graph)
        value = {"weight": shared publication count}

    The weight of a pair is the count reported the first time the pair is
    seen; later sightings of the same pair are ignored.
    """
    # Seed the traversal with the root author at depth 0.
    frontier = deque([(root_person_uri, 0)])
    seen = {root_person_uri}

    # The root's name needs its own lookup; co-author names arrive with the
    # co-author query results.
    nodes = {root_person_uri: {"name": get_person_name(root_person_uri)}}
    edges = {}

    while frontier:
        author_uri, level = frontier.popleft()

        # Authors at (or beyond) the depth limit are kept as nodes but are
        # not expanded any further.
        if level >= max_depth:
            continue

        for co_uri, co_name, pub_count in get_coauthor_info(author_uri):
            # First sighting of this co-author: remember their name.
            nodes.setdefault(co_uri, {"name": co_name if co_name else "Unknown"})

            # Undirected edge key: endpoints stored in sorted order.
            pair = tuple(sorted([author_uri, co_uri]))

            # Keep the weight from the first sighting only (no accumulation).
            edges.setdefault(pair, {"weight": pub_count})

            # Schedule the co-author for expansion one level deeper.
            if co_uri not in seen:
                seen.add(co_uri)
                frontier.append((co_uri, level + 1))

    nodes, edges, _ = sanitize_nodes_and_edges(nodes, edges)

    return nodes, edges

In [21]:
def build_adjacency_list(nodes, edges):
    """
    Convert a node dict and an undirected edge dict into an adjacency list.

    Parameters:
      nodes: dict keyed by node ID (values are not read here).
      edges: dict, key = (id1, id2), value = info dict; a missing "weight"
             counts as 1. Both endpoints must be keys of `nodes`, otherwise
             a KeyError is raised.

    Returns: dict, key = node ID, value = list of (neighbour ID, weight)
    tuples. Every edge is recorded on both of its endpoints.
    """
    # Every node gets an entry, even when it has no edges.
    neighbours = {node_id: [] for node_id in nodes}

    for (left, right), info in edges.items():
        w = info.get("weight", 1)
        # Undirected: record the edge in both directions.
        neighbours[left].append((right, w))
        neighbours[right].append((left, w))

    return neighbours

In [22]:
# [Debug] Build a depth-1 network around one author and display its adjacency list.
nodes, edges = build_coauthor_network("https://dblp.org/pid/82/7468", max_depth=1)

build_adjacency_list(nodes, edges)

{'n_https_dblp_org_pid_82_7468': [('n_https_dblp_org_pid_03_6572', 12),
  ('n_https_dblp_org_pid_05_5769', 1),
  ('n_https_dblp_org_pid_08_565', 4),
  ('n_https_dblp_org_pid_17_848', 9),
  ('n_https_dblp_org_pid_184_9350', 2),
  ('n_https_dblp_org_pid_206_1503', 1),
  ('n_https_dblp_org_pid_246_1303', 1),
  ('n_https_dblp_org_pid_349_4118', 1),
  ('n_https_dblp_org_pid_70_7470', 4),
  ('n_https_dblp_org_pid_76_6334', 5),
  ('n_https_dblp_org_pid_81_3284', 4),
  ('n_https_dblp_org_pid_82_2493', 2),
  ('n_https_dblp_org_pid_88_7260', 2),
  ('n_https_dblp_org_pid_88_7651', 1)],
 'n_https_dblp_org_pid_03_6572': [('n_https_dblp_org_pid_82_7468', 12)],
 'n_https_dblp_org_pid_05_5769': [('n_https_dblp_org_pid_82_7468', 1)],
 'n_https_dblp_org_pid_08_565': [('n_https_dblp_org_pid_82_7468', 4)],
 'n_https_dblp_org_pid_17_848': [('n_https_dblp_org_pid_82_7468', 9)],
 'n_https_dblp_org_pid_184_9350': [('n_https_dblp_org_pid_82_7468', 2)],
 'n_https_dblp_org_pid_206_1503': [('n_https_dblp_org_pid_

In [23]:
def visualize_interactive_pyvis(
    nodes, edges, root_id, output_filename="coauthor_network_interactive"
):
    """
    Render the network as an interactive pyvis HTML page and pickle the
    underlying networkx graph.

    Parameters:
      nodes: dict, key = node ID, value = info dict whose "name" becomes the
             node label.
      edges: dict, key = (id1, id2), value = info dict whose "weight" is
             stored on the edge and also shown as the edge label.
      root_id: ID of the node to highlight (red, larger font).
      output_filename: name of the directory created under "output/"; despite
             the parameter name it is used as a directory, not a file.

    Side effects: creates output/<output_filename>/ if needed and writes
    graph.html (pyvis page) and graph.pkl (pickled nx.Graph) into it.
    Returns None.
    """
    G = nx.Graph()
    for node_id, info in nodes.items():
        if node_id == root_id:
            # Highlight the root author.
            G.add_node(node_id, label=info["name"], color="#ED3B3E", font={"size": 24})
        else:
            G.add_node(node_id, label=info["name"], font={"size": 18})
    for (u, v), edge_info in edges.items():
        G.add_edge(
            u,
            v,
            weight=edge_info["weight"],
            label=f"{edge_info['weight']}",
            font={"size": 12},
        )

    os.makedirs(f"output/{output_filename}", exist_ok=True)

    net = Network(height="100vh", width="100vw")
    # pyvis' from_nx() writes rendering attributes back into the graph it is
    # handed (it adds a node "size" and moves edge "weight" to "width" --
    # TODO confirm for the installed pyvis version). Give it a copy so the
    # graph pickled below still carries exactly the attributes set above.
    net.from_nx(G.copy())
    net.save_graph(f"output/{output_filename}/graph.html")

    with open(f"output/{output_filename}/graph.pkl", "wb") as f:
        pickle.dump(G, f)

In [24]:
# [Debug] Render the `nodes`/`edges` currently bound in the notebook into
# output/coauthor_network_interactive_test/, highlighting the given root.
visualize_interactive_pyvis(
    nodes,
    edges,
    root_id=sanitize_uri("https://dblp.org/pid/82/7468"),
    output_filename="coauthor_network_interactive_test",
)

In [25]:
if __name__ == "__main__":
    # Entry point: build the network for the configured root author and
    # depth, report its size, then render it.
    print(
        f"=== Building co-author network for {root_person_uri}, up to depth={max_depth} ==="
    )
    # Rebinds the notebook-level `nodes` and `edges` names.
    nodes, edges = build_coauthor_network(root_person_uri, max_depth=max_depth)

    print(f"Total authors found: {len(nodes)}")
    print(f"Total edges found: {len(edges)}")

    # Visualize the graph. The node keys are sanitized IDs, so the root URI
    # is sanitized the same way; the output directory name embeds the root
    # author's name (looked up again via get_person_name) and the depth.
    visualize_interactive_pyvis(
        nodes,
        edges,
        root_id=sanitize_uri(root_person_uri),
        output_filename=f"Coauthor Network of {get_person_name(root_person_uri)} with Depth {max_depth}",
    )

    # Log: finished
    print("[INFO] Done.")

=== Building co-author network for https://dblp.org/pid/82/7468, up to depth=2 ===
Total authors found: 759
Total edges found: 878
[INFO] Done.
