<a href="https://colab.research.google.com/github/selgebali/Colabs/blob/main/Scripts_Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import pandas as pd
import plotly.graph_objects as go

# DataCite REST endpoint for DOIs and the client (repository) whose DOIs are analysed
base_url = "https://api.datacite.org/dois"
client_id = "zypi.vnulkh"

# Metadata properties whose presence is counted, one query per property
metadata_fields = [
    "creators",
    "titles",
    "publisher",
    "publicationYear",
    "resourceTypeGeneral",
    "subjects",
    "contributors",
    "dates",
    "descriptions",
    "geoLocations",
    "language",
    "relatedIdentifiers",
    "sizes",
    "formats",
    "version",
    "rightsList",
    "fundingReferences",
    "relatedItems",
    "identifiers",
]

# Column-oriented accumulator: one (initially empty) list per output column
results = {
    column: []
    for column in ("Field", "DOICount", "Missing", "PresentPercentage", "MissingPercentage")
}

# Function to get the total count of DOIs for a specific query
def get_doi_count(query_url):
    """Return the ``meta.total`` value reported for a DOI query.

    Sends a GET request to ``query_url`` and reads ``["meta"]["total"]`` from
    the JSON body.

    Args:
        query_url: Fully built query URL.

    Returns:
        The total reported by the API, or 0 when the request fails, the
        status code is not 200, or the body lacks the expected structure.
        Every failure is reported with ``print`` rather than raised.
    """
    try:
        # A timeout keeps a stalled connection from blocking the notebook forever
        response = requests.get(query_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error querying {query_url}: {exc}")
        return 0

    if response.status_code != 200:
        print(f"Error querying {query_url}: {response.status_code}")
        return 0

    try:
        return response.json()["meta"]["total"]
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected JSON shape
        print(f"Error querying {query_url}: unexpected response body ({exc!r})")
        return 0

# Get the total number of DOIs for the client
total_dois_url = f"{base_url}?client-id={client_id}&page[size]=0"
total_dois = get_doi_count(total_dois_url)

print(f"Total DOIs: {total_dois}")

# Loop over each metadata field and get the DOI count
for field in metadata_fields:
    # "<field>:*" matches DOIs where the field has any value
    field_url = f"{base_url}?client-id={client_id}&query={field}:*&page[size]=0"
    field_count = get_doi_count(field_url)

    # Calculate missing as total DOIs minus the count for the field
    missing_count = total_dois - field_count

    # Calculate percentages. get_doi_count() returns 0 when a request fails,
    # so guard the division instead of raising ZeroDivisionError.
    if total_dois:
        present_percentage = (field_count / total_dois) * 100
        missing_percentage = (missing_count / total_dois) * 100
    else:
        present_percentage = 0.0
        missing_percentage = 0.0

    # Store the results
    results["Field"].append(field)
    results["DOICount"].append(field_count)
    results["Missing"].append(missing_count)
    results["PresentPercentage"].append(present_percentage)
    results["MissingPercentage"].append(missing_percentage)

# Convert the results into a pandas DataFrame
df = pd.DataFrame(results)

# Sort the DataFrame by 'PresentPercentage' in ascending order for better visualization
df_sorted = df.sort_values(by='PresentPercentage', ascending=True)

# Function to assign properties to categories
def assign_categories():
    """Return the property names grouped by category.

    Returns:
        tuple: three lists of property names, in the order
        ``(mandatory, recommended, optional)``.
    """
    categories = {
        'mandatory': [
            'creators', 'titles', 'publisher', 'publicationYear', 'resourceTypeGeneral',
        ],
        'recommended': [
            'subjects', 'contributors', 'dates', 'relatedIdentifiers', 'descriptions',
            'geoLocations',
        ],
        'optional': [
            'language', 'identifiers', 'sizes', 'formats', 'version', 'rightsList',
            'fundingReferences', 'relatedItems',
        ],
    }
    return categories['mandatory'], categories['recommended'], categories['optional']

# Create the stacked horizontal bar chart with Plotly
def plot_stacked_bar_chart(df):
    """Plots a horizontal stacked bar chart of Present vs Missing for each property.

    Args:
        df: DataFrame with the columns ``Field``, ``PresentPercentage`` and
            ``MissingPercentage``. Rows are drawn in the order given.

    The y-axis label of each property is coloured by the category returned
    from ``assign_categories()``; a property found in none of the categories
    gets a neutral grey label. The figure is displayed with ``fig.show()``;
    nothing is returned.
    """

    # Extract the properties, present, and missing values in row order
    sorted_properties = df['Field'].values
    sorted_present = df['PresentPercentage'].values
    sorted_missing = df['MissingPercentage'].values

    # Build a property -> colour lookup from the categories. setdefault keeps
    # the first match, so mandatory wins over recommended over optional.
    mandatory_properties, recommended_properties, optional_properties = assign_categories()
    category_colors = (
        (mandatory_properties, '#243b54'),    # Mandatory category color
        (recommended_properties, '#BC2B66'),  # Recommended category color
        (optional_properties, '#0D60D4'),     # Optional category color
    )
    color_by_property = {}
    for members, color in category_colors:
        for name in members:
            color_by_property.setdefault(name, color)

    # Every property must get exactly one colour: a shorter colour list would
    # make zip() below shift colours onto the wrong labels and drop labels.
    uncategorized_color = '#7f7f7f'
    property_colors = [
        color_by_property.get(prop, uncategorized_color) for prop in sorted_properties
    ]

    # Create the stacked horizontal bar chart with Plotly
    fig = go.Figure()

    # Add Present bars
    fig.add_trace(go.Bar(
        y=sorted_properties,  # Set y-axis to properties (categories) for horizontal bars
        x=sorted_present,
        name='Present',
        orientation='h',  # Horizontal bars
        marker=dict(color='#243b54'),
        text=[f'<b>{p:.2f}%</b>' for p in sorted_present],  # Add percentage labels for Present
        textposition='inside',  # Place text inside the bars for better readability
        insidetextanchor='end',  # Ensures text is well-placed for horizontal bars
    ))

    # Add Missing bars
    fig.add_trace(go.Bar(
        y=sorted_properties,  # Set y-axis to properties (categories) for horizontal bars
        x=sorted_missing,
        name='Missing',
        orientation='h',  # Horizontal bars
        marker=dict(color='#00B1E2'),
        base=sorted_present,  # Start each Missing bar where the Present bar ends
        text=[f'<b>{m:.2f}%</b>' for m in sorted_missing],  # Add percentage labels for Missing
        textposition='none',  # Hide text for Missing (optional)
    ))

    # Update layout with colored y-axis labels
    fig.update_layout(
        barmode='stack',
        title='Metadata Completeness by Property (Horizontal, Sorted by Present)',
        xaxis_title='Percentage',
        font=dict(size=16),  # Font size for the chart
        hovermode='y unified',
        width=1200,  # Chart width
        height=900,  # Chart height
        plot_bgcolor='white',  # Set plot background to white
        paper_bgcolor='white',  # Set entire figure background to white
        bargap=0.1,  # Small gap between bars
        margin=dict(l=120, r=100, t=40, b=40),  # Adjust margins
        showlegend=True,  # Show legend
        yaxis=dict(
            tickmode='array',  # Custom tick values and text for y-axis
            tickvals=sorted_properties,  # Y-axis labels (properties)
            ticktext=[f"<span style='color:{color};'>{prop}</span>" for prop, color in zip(sorted_properties, property_colors)],  # Apply colors to properties
        )
    )

    # Show the interactive plotly figure
    fig.show()

# Plot the results; df_sorted is ordered by ascending PresentPercentage,
# and the chart draws the rows in that order.
plot_stacked_bar_chart(df_sorted)


Total DOIs: 306291


In [3]:
import pandas as pd
import requests
import plotly.graph_objs as go
import networkx as nx
import numpy as np

# Part 1: Querying DataCite GraphQL API
def query_datacite():
    """Fetch entity and connection counts from the DataCite GraphQL API.

    Posts a single GraphQL query asking for the ``totalCount`` of six entity
    types and for the ``*ConnectionCount`` fields between them, then appends
    a fixed set of hard-coded extra nodes and edges.

    Returns:
        tuple: ``(nodes, edges)`` on success, where ``nodes`` is a DataFrame
        with columns ``id``, ``label``, ``count`` and ``edges`` is a DataFrame
        with columns ``source``, ``target``, ``count`` (counts are integers;
        missing or null counts become 0). On any failure the problem is
        printed and ``(None, None)`` is returned, so callers can always
        unpack the result into two names.
    """
    # Define the GraphQL query
    query = '''
    {
      publications {
        totalCount
        datasetConnectionCount
        softwareConnectionCount
        personConnectionCount
        organizationConnectionCount
        funderConnectionCount
      }
      datasets {
        totalCount
        softwareConnectionCount
        personConnectionCount
        organizationConnectionCount
        funderConnectionCount
      }
      softwares {
        totalCount
        personConnectionCount
        organizationConnectionCount
        funderConnectionCount
      }
      people(query: "*") {
        totalCount
        organizationConnectionCount
      }
      organizations {
        totalCount
      }
      funders {
        totalCount
      }
    }
    '''

    # Send the request to the DataCite GraphQL API
    url = "https://api.datacite.org/graphql"
    try:
        # A timeout keeps a stalled connection from blocking forever
        response = requests.post(url, json={'query': query}, timeout=60)
    except requests.RequestException as exc:
        print("Error:", exc)
        return None, None

    # Check for errors in the response
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None, None

    try:
        data = response.json()
    except ValueError as exc:
        print("Error: response body is not valid JSON:", exc)
        return None, None

    if "errors" in data:
        print("GraphQL Errors:", data["errors"])
        return None, None

    # "data" or one of its sections may be absent or null; treat both as empty
    payload = data.get('data') or {}

    def count_of(section, field):
        """Return payload[section][field], or 0 if any level is missing or null."""
        return (payload.get(section) or {}).get(field) or 0

    # (node id/label, GraphQL section that holds its totalCount)
    node_sections = [
        ('Publication', 'publications'),
        ('Dataset', 'datasets'),
        ('Software', 'softwares'),
        ('Person', 'people'),
        ('Organization', 'organizations'),
        ('Funder', 'funders'),
    ]
    section_of = dict(node_sections)

    # For each source node, the targets whose "<target>ConnectionCount" field
    # is requested on the source's section in the query above
    edge_targets = {
        'Publication': ['Dataset', 'Software', 'Person', 'Organization', 'Funder'],
        'Dataset': ['Software', 'Person', 'Organization', 'Funder'],
        'Software': ['Person', 'Organization', 'Funder'],
        'Person': ['Organization'],
    }

    # Create DataFrames for nodes and edges
    nodes = pd.DataFrame([
        {'id': name, 'label': name, 'count': count_of(section, 'totalCount')}
        for name, section in node_sections
    ])
    edges = pd.DataFrame([
        {
            'source': source,
            'target': target,
            'count': count_of(section_of[source], f"{target.lower()}ConnectionCount"),
        }
        for source, targets in edge_targets.items()
        for target in targets
    ])

    # Hard-coded additional nodes and edges (static values, not fetched here)
    additional_nodes = [
        {'id': 'Dissertations', 'label': 'Dissertations', 'count': 465664},
        {'id': 'Repositories', 'label': 'Repositories', 'count': 2822}
    ]
    additional_edges = [
        {'source': 'Dissertations', 'target': 'Person', 'count': 21144},
        {'source': 'Dissertations', 'target': 'Organization', 'count': 91},
        {'source': 'Dissertations', 'target': 'Funder', 'count': 592},
        {'source': 'Dissertations', 'target': 'Repositories', 'count': 257}
    ]

    # Append additional nodes and edges
    nodes = pd.concat([nodes, pd.DataFrame(additional_nodes)], ignore_index=True)
    edges = pd.concat([edges, pd.DataFrame(additional_edges)], ignore_index=True)

    # Ensure no NaN values are present in either DataFrame's count column
    nodes['count'] = nodes['count'].fillna(0).astype(int)
    edges['count'] = edges['count'].fillna(0).astype(int)

    return nodes, edges

# Part 2: Plotting the Graph

def plot_network_graph(nodes, edges):
    """Draw the entity network as an interactive Plotly figure.

    Args:
        nodes: DataFrame with the columns ``id``, ``label`` and ``count``.
        edges: DataFrame with the columns ``source``, ``target`` and
            ``count``. A missing (None/NaN) count is drawn as weight 0.

    Nodes are placed on a circle, sized relative to the largest count and
    coloured by label. Each edge is a slightly curved line whose width is
    scaled between the smallest and largest edge weight, annotated with its
    weight. The figure is displayed with ``fig.show()``; nothing is returned.
    """

    def edge_weight(value):
        # int(NaN) raises ValueError, so map missing counts to 0 first
        return 0 if pd.isna(value) else int(value)

    # Create a NetworkX graph
    G = nx.Graph()

    # Add nodes with attributes
    for node in nodes.itertuples():
        G.add_node(node.id, label=node.label, size=node.count)

    # Add edges with attributes
    for edge in edges.itertuples():
        G.add_edge(edge.source, edge.target, weight=edge_weight(edge.count))

    # Define node positions using circular layout for better spacing
    pos = nx.circular_layout(G)

    # Extract node properties
    node_x = [pos[node][0] for node in G.nodes]
    node_y = [pos[node][1] for node in G.nodes]
    node_size = [G.nodes[node]['size'] for node in G.nodes]
    node_text = [f"{G.nodes[node]['label']} <br>({G.nodes[node]['size']})" for node in G.nodes]
    node_hovertext = [f"{G.nodes[node]['label']}<br>Count: {G.nodes[node]['size']}" for node in G.nodes]

    # Replace non-positive (and NaN, which fails the comparison) sizes with 10
    node_size = [size if size > 0 else 10 for size in node_size]

    # Normalize node sizes to the 150-210 range relative to the largest node
    max_size = max(node_size) if node_size else 1
    node_size = [((size / max_size) * 60) + 150 for size in node_size]

    # Define distinct colors for node types to differentiate
    color_map = {
        'Publication': '#243B54',
        'Dataset': '#00B1E2',
        'Software': '#5B88B9',
        'Person': '#46BCAB',
        'Organization': '#90D7CD',
        'Funder': '#BC2B66',
        'Dissertations': '#F07C73',
        'Repositories': '#AEC7E8'
    }
    # Labels without an entry in color_map fall back to grey
    node_color = [color_map.get(G.nodes[node]['label'], '#7f7f7f') for node in G.nodes]

    # Create node traces with text labels
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers+text',
        text=node_text,
        hovertext=node_hovertext,
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color=node_color,
            size=node_size,
            line_width=2,
            opacity=1.0
        ),
        textposition='middle center',
        textfont=dict(
            size=18,
            family='Arial',
            color='White',
            weight='bold'
        )
    )

    # Create edge traces with curved lines based on edge weight
    edge_traces = []
    edge_annotations = []
    edge_weights = [e[2]['weight'] for e in G.edges(data=True)]
    max_weight = max(edge_weights) if edge_weights else 1
    min_weight = min(edge_weights) if edge_weights else 0

    # Curve parameter, identical for every edge, so computed once
    t = np.linspace(0, 1, 100)

    # Use a more visible color for edges
    edge_color = '#888'

    for edge in G.edges(data=True):
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        weight = edge[2]['weight']

        # Scale edge thickness linearly between the lightest and heaviest edge;
        # when all weights are equal there is no range to scale over
        if max_weight != min_weight:
            width = ((weight - min_weight) / (max_weight - min_weight)) * 20 + 12
        else:
            width = 2

        # Quadratic Bezier curve whose control point is the segment midpoint
        # raised by 0.1 in y, giving each edge a slight bend
        x_mid = (x0 + x1) / 2
        y_mid = (y0 + y1) / 2 + 0.1
        x_values = (1 - t) ** 2 * x0 + 2 * (1 - t) * t * x_mid + t ** 2 * x1
        y_values = (1 - t) ** 2 * y0 + 2 * (1 - t) * t * y_mid + t ** 2 * y1

        # Create edge trace
        edge_trace = go.Scatter(
            x=x_values,
            y=y_values,
            line=dict(width=width, color=edge_color),
            hoverinfo='text',
            hovertext=f"{G.nodes[edge[0]]['label']} - {G.nodes[edge[1]]['label']}<br>Connections: {weight}",
            mode='lines'
        )
        edge_traces.append(edge_trace)

        # Add edge annotations for displaying counts on edges
        # (placed at the straight-line midpoint between the two nodes)
        edge_annotations.append(dict(
            x=(x0 + x1) / 2,
            y=(y0 + y1) / 2,
            text=str(weight),
            showarrow=False,
            font=dict(size=14, color='black'),
            align='center'
        ))

    # Create the figure; edges are listed first so nodes are drawn on top
    fig = go.Figure(data=edge_traces + [node_trace],
                    layout=go.Layout(
                        title=dict(
                            text='Merged Network Graph: Number of Nodes and Connections',
                            font=dict(size=20),
                            x=0.5, y=0.98,
                            xanchor='center', yanchor='top'
                        ),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        annotations=edge_annotations,
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        width=1500, height=1000,
                        plot_bgcolor='white',
                        paper_bgcolor='white'
                    ))

    # Show the figure
    fig.show()

# Entry point: fetch the graph data first, then draw it
if __name__ == "__main__":
    nodes, edges = query_datacite()
    if nodes is None or edges is None:
        # Either frame missing means the query step did not produce usable data
        print("Failed to retrieve data from DataCite API.")
    else:
        plot_network_graph(nodes, edges)