In [4]:
import networkx as nx
import pandas as pd

In [5]:

def load_and_clean_graph(input_file, output_file, unwanted_types):
    """Remove nodes of unwanted types from a GraphML file and save the result.

    Reads the graph from ``input_file``, drops every node whose ``type``
    attribute is listed in ``unwanted_types`` (together with every edge that
    touches a dropped node), and writes what is left to ``output_file`` as
    GraphML. Nodes that have no ``type`` attribute are kept.

    The filtering is done directly on the NetworkX graph rather than through
    pandas DataFrames. A DataFrame round-trip pads each node with NaN for
    every attribute it does not carry, and those NaN values would then be
    written out as if they were real attributes. Working on a subgraph also
    keeps the graph class returned by ``nx.read_graphml`` (directed or
    undirected, simple or multigraph) and the graph-level attributes, instead
    of forcing everything into a fresh ``nx.DiGraph``.

    Args:
        input_file: GraphML source, passed unchanged to ``nx.read_graphml``.
        output_file: Destination, passed unchanged to ``nx.write_graphml``.
        unwanted_types: Iterable of ``type`` values whose nodes are removed.
            A single string is treated as one type name, not as an iterable
            of characters.

    Returns:
        None. The cleaned graph is written to ``output_file`` and a
        confirmation line is printed.
    """
    graph = nx.read_graphml(input_file)

    # Guard against set("abc") silently becoming {"a", "b", "c"}.
    if isinstance(unwanted_types, str):
        unwanted_types = [unwanted_types]
    unwanted = set(unwanted_types)

    # .get() keeps nodes without a "type" attribute and avoids a KeyError
    # when no node in the graph defines one (or the graph is empty).
    kept_nodes = [
        node_id
        for node_id, attrs in graph.nodes(data=True)
        if attrs.get("type") not in unwanted
    ]

    # subgraph() retains only edges whose both endpoints were kept; copy()
    # detaches the result from the view so it is an independent graph.
    G_cleaned = graph.subgraph(kept_nodes).copy()

    nx.write_graphml(G_cleaned, output_file)
    print(f"Cleaned graph saved to {output_file}")

In [None]:

# Source graph to clean and the location the filtered copy is written to.
input_graph_file = "data/10_58200__cleaned_graph.graphml"
output_graph_file = "data/test.graphml"

# Node "type" values to strip from the graph. "mimeType" is commented out,
# so nodes of that type are currently left in the output.
unwanted_types = [
    "detectedLanguage",
    # "mimeType",
    "ancestors",
    "processingAgent",
    "processingStatus",
    "processedAt",
]

load_and_clean_graph(
    input_file=input_graph_file,
    output_file=output_graph_file,
    unwanted_types=unwanted_types,
)


Cleaned graph saved to data/test.graphml
