<a href="https://colab.research.google.com/github/selgebali/Colabs/blob/main/the_motherload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
import json
import pandas as pd
import networkx as nx
import plotly.graph_objs as go
import plotly.io as pio

# Custom color palette. visualize_resource_plotly picks entries by index:
#   [0] central resource node, [1] creator nodes, [2] contributor nodes,
#   [3] publisher node, [4] related-item nodes.
# Index [5] is not referenced by the code visible in this file.
custom_colors = ['#243B54', '#00B1E2', '#5B88B9', '#46BCAB', '#90D7CD', '#BC2B66']

# Function to fetch data from the API endpoint with dynamic resource type
def fetch_api_data(url, resource_type, limit=None):
    """Fetch resources of one type from a paginated JSON API.

    Pages are followed through ``links.next`` until the API reports no
    further page or, when *limit* is set, until at least *limit* matching
    resources have been collected.

    Args:
        url: URL of the first page. An empty/None URL performs no request.
        resource_type: Resource type passed on to ``parse_resources`` to
            filter each page.
        limit: Maximum number of resources to return. ``None`` (or any
            falsy value) means "no limit": every page is fetched.

    Returns:
        A list of parsed resource dicts, truncated to *limit* when given.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If a page does not answer within the timeout.
    """
    resources = []
    # Stays 'Unknown' when no page is fetched or the API omits the total.
    total_resources = 'Unknown'
    while url:
        print(f"Fetching data from URL: {url}")  # Debug log: API URL being fetched
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an error if the request was unsuccessful
        data = response.json()  # Parse the JSON response
        print(f"API response status code: {response.status_code}")  # Debug log: Status code of response
        resources.extend(parse_resources(data.get('data') or [], resource_type))
        total_resources = (data.get('meta') or {}).get('total', total_resources)
        if limit and len(resources) >= limit:
            resources = resources[:limit]  # Drop any surplus from the last page
            break
        # Keep paging: a single page may hold fewer matches than the limit.
        url = (data.get('links') or {}).get('next')
    print(f"Total number of resources available: {total_resources}")  # Print the total number of resources available
    print(f"Number of resources fetched: {len(resources)}")  # Debug log: Number of resources fetched
    return resources

# Helper to pull the first name identifier out of a creator/contributor entry
def _first_name_identifier(person):
    """Return the first ``nameIdentifier`` of *person*, or ``'No identifier'``.

    Tolerates a missing, null or empty ``nameIdentifiers`` list as well as a
    first entry that carries no ``nameIdentifier`` key.
    """
    identifiers = person.get('nameIdentifiers') or []
    if not identifiers:
        return 'No identifier'
    return identifiers[0].get('nameIdentifier', 'No identifier')


# Function to parse resources
def parse_resources(data, resource_type):
    """Convert raw API items into flat resource dicts, filtered by type.

    Args:
        data: Iterable of API items; each item is a dict with an
            ``'attributes'`` dict.
        resource_type: Wanted ``resourceTypeGeneral``. The comparison ignores
            hyphens and letter case (``"physical-object"`` matches
            ``"PhysicalObject"``).

    Returns:
        A list of dicts with the keys ``doi``, ``resourceTypeGeneral``,
        ``resourceType``, ``schemaOrg``, ``creators``, ``contributors``,
        ``publishers`` and ``relatedItems``. List-valued attributes that are
        missing or null in the input become empty lists.
    """
    # Normalize the wanted type once instead of on every iteration.
    wanted_type = resource_type.replace('-', '').lower()
    resources = []
    for item in data:
        attributes = item['attributes']
        types = attributes.get('types') or {}
        # `or ''` guards against an explicit JSON null for the type.
        general_type = types.get('resourceTypeGeneral') or ''
        if general_type.replace('-', '').lower() != wanted_type:
            continue

        resource_data = {
            "doi": attributes.get('doi', 'No DOI available'),
            "resourceTypeGeneral": types.get('resourceTypeGeneral', 'No resourceTypeGeneral available'),
            "resourceType": types.get('resourceType', 'No resource type available'),
            "schemaOrg": types.get('schemaOrg', 'No schemaOrg available'),
            "creators": [
                {
                    "name": creator.get('name', 'No name available'),
                    "identifier": _first_name_identifier(creator),
                }
                for creator in attributes.get('creators') or []
            ],
            "contributors": [
                {
                    "name": contributor.get('name', 'No name available'),
                    "type": contributor.get('contributorType', 'No type available'),
                    "identifier": _first_name_identifier(contributor),
                }
                for contributor in attributes.get('contributors') or []
            ],
            "publishers": attributes.get('publisher', 'No publisher information'),
            "relatedItems": [
                {
                    "identifier": related.get('relatedIdentifier', 'No identifier available'),
                    "relationType": related.get('relationType', 'No relation type available'),
                }
                for related in attributes.get('relatedIdentifiers') or []
            ],
        }
        resources.append(resource_data)  # Append parsed resource data to the list
    print(f"Parsed {len(resources)} resources.")  # Debug log: Number of resources parsed
    return resources

# Helper to append a DataFrame to a CSV file without repeating the header
def _append_frame_to_csv(frame, csv_path):
    """Append *frame* to *csv_path*, writing the header only for a new/empty file.

    Writing the header on every append would scatter header rows through the
    merged CSV, so it is emitted only when the file does not hold data yet.
    """
    import os  # Local import: the file's import block does not provide os

    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    frame.to_csv(csv_path, mode='a', index=False, header=needs_header, encoding='utf-8')


# Function to visualize the resource network and append results
def visualize_resource_plotly(resource, html_file, nodes_csv, edges_csv):
    """Draw one resource as a star-shaped graph and append it to the outputs.

    The resource is the central node; creators, contributors, the publisher
    and related items are attached to it with directed edges. The figure is
    shown, appended to *html_file* as an HTML fragment, and the graph's nodes
    and edges are appended to *nodes_csv* and *edges_csv*.

    Args:
        resource: Dict with the keys ``doi``, ``resourceTypeGeneral``,
            ``resourceType``, ``schemaOrg``, ``creators``, ``contributors``,
            ``publishers`` and ``relatedItems`` (the shape built by
            ``parse_resources``).
        html_file: Path of the HTML file the figure fragment is appended to.
        nodes_csv: Path of the CSV file node rows are appended to.
        edges_csv: Path of the CSV file edge rows are appended to.

    Returns:
        None.
    """
    G = nx.DiGraph()  # Create a directed graph

    # Central node carries the resource's identifying information
    resource_node_label = f"{resource['resourceTypeGeneral']}:<br>DOI: {resource['doi']}<br>Resource Type: {resource['resourceType']}<br>SchemaOrg: {resource['schemaOrg']}"
    G.add_node(resource_node_label, label=resource_node_label, size=300, color=custom_colors[0])
    print(f"Added central node: {resource_node_label}")  # Debug log: Central node added

    # Add creators
    for creator in resource["creators"]:
        creator_label = f"Creator: {creator['name']}<br>ID: {creator['identifier']}"
        G.add_node(creator_label, label=creator_label, size=300, color=custom_colors[1])
        G.add_edge(resource_node_label, creator_label, relationType='Created by')
        print(f"Added creator node and edge: {creator_label}")  # Debug log: Creator node and edge added

    # Add contributors
    for contributor in resource["contributors"]:
        contributor_label = f"Contributor: {contributor['name']}<br>Type: {contributor['type']}<br>ID: {contributor['identifier']}"
        G.add_node(contributor_label, label=contributor_label, size=300, color=custom_colors[2])
        G.add_edge(resource_node_label, contributor_label, relationType='Contributed by')
        print(f"Added contributor node and edge: {contributor_label}")  # Debug log: Contributor node and edge added

    # Add publisher
    publisher_label = f"Publisher: <br>{resource['publishers']}"
    G.add_node(publisher_label, label=publisher_label, size=300, color=custom_colors[3])
    G.add_edge(resource_node_label, publisher_label, relationType='Published by')
    print(f"Added publisher node and edge: {publisher_label}")  # Debug log: Publisher node and edge added

    # Add related items
    for item in resource["relatedItems"]:
        item_node_label = f"{item['relationType']}: {item['identifier']}"
        G.add_node(item_node_label, label=item_node_label, size=300, color=custom_colors[4])
        G.add_edge(resource_node_label, item_node_label, relationType=item["relationType"])
        print(f"Added related item node and edge: {item_node_label}")  # Debug log: Related item node and edge added

    # Generate positions for the nodes (fixed seed keeps the layout reproducible)
    pos = nx.spring_layout(G, seed=42)

    # Collect per-node plotting attributes
    node_x, node_y, node_text, node_sizes, node_colors = [], [], [], [], []
    for node_id, attrs in G.nodes(data=True):
        x, y = pos[node_id]
        node_x.append(x)
        node_y.append(y)
        node_text.append(attrs['label'])
        node_sizes.append(attrs['size'])
        node_colors.append(attrs['color'])

    # Collect edge segments; the None entries break the line between edges
    edge_x, edge_y = [], []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Create edge trace
    edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=3, color='gray'), hoverinfo='none', mode='lines')

    # Create node trace (hover text mirrors the visible label)
    node_trace = go.Scatter(
        x=node_x, y=node_y, text=node_text, hovertext=list(node_text), mode='markers+text',
        textposition='middle center', marker=dict(size=node_sizes, color=node_colors)
    )

    # Create the Plotly figure. The title font is set through title.font
    # because the legacy `titlefont` layout attribute is deprecated.
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title=dict(text=f"Knowledge Graph of {resource['doi']}", font=dict(size=16)),
            showlegend=False, hovermode='closest', margin=dict(b=20, l=5, r=5, t=40),
            width=1500, height=1000, xaxis=dict(hidden_axis), yaxis=dict(hidden_axis),
            plot_bgcolor='white',  # Set plot background color here
            paper_bgcolor='white'  # Set overall figure background color
        )
    )
    fig.show()
    print(f"Displayed knowledge graph for resource: {resource['doi']}")  # Debug log: Graph displayed

    # Append to HTML file. full_html=False emits only a <div> fragment, so the
    # merged page does not end up with nested <html> documents; utf-8 keeps
    # non-ASCII names intact regardless of the platform default encoding.
    with open(html_file, 'a', encoding='utf-8') as f:
        f.write(pio.to_html(fig, include_plotlyjs='cdn', full_html=False))
        print(f"Appended graph to HTML file: {html_file}")  # Debug log: Graph appended to HTML

    # Append nodes to CSV
    node_data = [
        {"Node": node_id, "Label": attrs['label'], "Size": attrs['size'], "Color": attrs['color']}
        for node_id, attrs in G.nodes(data=True)
    ]
    nodes_df = pd.DataFrame(node_data, columns=["Node", "Label", "Size", "Color"])
    _append_frame_to_csv(nodes_df, nodes_csv)
    print(f"Appended nodes to CSV file: {nodes_csv}")  # Debug log: Nodes appended to CSV

    # Append edges to CSV
    edge_data = [
        {"Source": source, "Target": target, "RelationType": attrs.get('relationType', 'Related to')}
        for source, target, attrs in G.edges(data=True)
    ]
    edges_df = pd.DataFrame(edge_data, columns=["Source", "Target", "RelationType"])
    _append_frame_to_csv(edges_df, edges_csv)
    print(f"Appended edges to CSV file: {edges_csv}")  # Debug log: Edges appended to CSV

# Example usage
resource_type = "physical-object"
limit = 20  # Set limit to None to fetch all available resources
# Request exactly `limit` items per page when possible, but never more than
# 1000 (assumed DataCite page[size] maximum — confirm against the API docs).
page_size = min(limit, 1000) if limit else 100
api_url = f"https://api.datacite.org/dois?resource-type-id={resource_type.lower()}&page[size]={page_size}"

html_file = "merged_knowledge_graphs.html"
nodes_csv = "merged_nodes.csv"
edges_csv = "merged_edges.csv"

# Initialize HTML file with basic structure. The charset declaration matches
# the utf-8 file encoding so non-ASCII names render correctly in a browser.
with open(html_file, 'w', encoding='utf-8') as f:
    f.write('<html><head><meta charset="utf-8"><title>Knowledge Graphs</title></head><body>')
    print(f"Initialized HTML file: {html_file}")  # Debug log: HTML file initialized

# Start from empty CSV files: the visualizer only ever appends, so without
# this reset every re-run would pile new rows onto the previous run's output.
for csv_path in (nodes_csv, edges_csv):
    with open(csv_path, 'w', encoding='utf-8'):
        pass

resources = fetch_api_data(api_url, resource_type, limit=limit)
print(f"Number of resources to visualize: {len(resources)}")  # Debug log: Number of resources to visualize
for resource in resources:
    visualize_resource_plotly(resource, html_file, nodes_csv, edges_csv)

# Close the HTML file after writing all content
with open(html_file, 'a', encoding='utf-8') as f:
    f.write("</body></html>")
    print(f"Closed HTML file: {html_file}")  # Debug log: HTML file closed

print(f"Graphs saved in {html_file}, nodes in {nodes_csv}, and edges in {edges_csv}")

Initialized HTML file: merged_knowledge_graphs.html
Fetching data from URL: https://api.datacite.org/dois?resource-type-id=physical-object&page[size]=20
API response status code: 200
Parsed 20 resources.
Total number of resources available: 13464190
Number of resources fetched: 20
Number of resources to visualize: 20
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dfc70zy<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dfc70zy
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6df7yb97<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6df7yb97
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6df58b2k<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6df58b2k
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6df1wn05<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: Floyer, E.A.<br>ID: No identifier
Added contributor node and edge: Contributor: REDON, Bérangère<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6df1wn05
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6ded8v5n<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6ded8v5n
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6ded2ji2<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6ded2ji2
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6deblf8u<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6deblf8u
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6de87ckq<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6de87ckq
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6de2r9h7<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6de2r9h7
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dd40485<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dd40485
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dd04l95<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: Ancient World Mapping Center, AWMC<br>ID: No identifier
Added contributor node and edge: Contributor: Droux, Jean-Philippe<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dd04l95
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dcdj74r<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dcdj74r
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dccmry7<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added creator node and edge: Creator: Parker<br>ID: No identifier
Added contributor node and edge: Contributor: REDON, Bérangère<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dccmry7
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dcc337t<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dcc337t
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dbb0lh1<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dbb0lh1
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6db9wa09<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6db9wa09
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6db9ea2k<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6db9ea2k
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6daf1m9c<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6daf1m9c
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6dad35sw<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Suarez, Romain<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6dad35sw
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Added central node: PhysicalObject:<br>DOI: 10.34847/nkl.6daajoy7<br>Resource Type: No resource type available<br>SchemaOrg: CreativeWork
Added creator node and edge: Creator: :unkn<br>ID: No identifier
Added contributor node and edge: Contributor: Arrif, Abdelmajid<br>Type: ContactPerson<br>ID: No identifier
Added publisher node and edge: Publisher: <br>NAKALA - https://nakala.fr (Huma-Num - CNRS)


Displayed knowledge graph for resource: 10.34847/nkl.6daajoy7
Appended graph to HTML file: merged_knowledge_graphs.html
Appended nodes to CSV file: merged_nodes.csv
Appended edges to CSV file: merged_edges.csv
Closed HTML file: merged_knowledge_graphs.html
Graphs saved in merged_knowledge_graphs.html, nodes in merged_nodes.csv, and edges in merged_edges.csv
