<a href="https://colab.research.google.com/github/rafabarros95/DS_LOD_and_Knowledge_Graphs_2024_Rafael_Barros_and_Habiba_Naeem/blob/main/OpenAlex_Core_Relationships_Extracted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests pandas



## Importing some Dependencies

In [None]:
import requests
import pandas as pd
import time
from google.colab import files

## Function to fetch 5000 works from OpenAlex API

In [None]:
def get_works(page=1, per_page=200):
    """
    Retrieves a batch of Works from the OpenAlex API for the given page.
    Returns the 'results' list from the API response.
    """
    base_url = "https://api.openalex.org/works"
    params = {
        "page": page,
        "per_page": per_page
    }
    try:
        resp = requests.get(base_url, params=params)
        resp.raise_for_status()
        data = resp.json()
        return data.get("results", [])
    except Exception as e:
        print(f"Error fetching page {page}: {str(e)}")
        return []


## Retrieving the Data

In [None]:
all_works = []
num_pages = 25    # ~5,000 works, since 25 x 200 = 5,000
per_page = 200

for p in range(1, num_pages + 1):
    works_batch = get_works(page=p, per_page=per_page)
    if not works_batch:
        break  # stop if there's an error or no data
    all_works.extend(works_batch)
    time.sleep(0.2)

print(f"Total works retrieved: {len(all_works)}")


Total works retrieved: 5000


## Core Relationships extracted

In [None]:
triples = []  # will hold tuples of (subject, relationship, object)

for work in all_works:
    work_id = work.get("id")  # e.g. "https://openalex.org/W123456789"
    if not work_id:
        continue

    # 1. Work CITES Work ( Core Relationship )
    referenced_works = work.get("referenced_works", [])
    for ref_id in referenced_works:
        triples.append((work_id, "CITES", ref_id))

    # 2. Work AUTHORED_BY Author ( Core Relationship )
    authorships = work.get("authorships", [])
    for auth in authorships:
        author_id = auth.get("author", {}).get("id")
        if author_id:
            triples.append((work_id, "AUTHORED_BY", author_id))

        # 3. Work AFFILIATED_WITH Institution ( Core Relationship )
        institutions = auth.get("institutions", [])
        for inst in institutions:
            inst_id = inst.get("id")
            if inst_id:
                triples.append((work_id, "AFFILIATED_WITH", inst_id))

    # 4. Work PUBLISHED_IN Venue ( Core Relationship )
    host_venue = work.get("host_venue", {})
    venue_id = host_venue.get("id")
    if venue_id:
        triples.append((work_id, "PUBLISHED_IN", venue_id))

    # 5. Work HAS_CONCEPT Concept ( Core Relationship )
    concepts = work.get("concepts", [])
    for concept in concepts:
        concept_id = concept.get("id")
        if concept_id:
            triples.append((work_id, "HAS_CONCEPT", concept_id))


## Cleaning and Exporting the Data

In [None]:
# we have covered as triple: (subject-relationship-object)

# Build a DataFrame
df_triples = pd.DataFrame(triples, columns=["subject", "relationship", "object"])

# Remove duplicates
df_triples.drop_duplicates(inplace=True)

# Drop rows with None/NaN
df_triples.dropna(subset=["subject", "relationship", "object"], inplace=True)

# Show summary
print("Number of relationship records:", len(df_triples))
df_triples.head(10)


Number of relationship records: 348565


Unnamed: 0,subject,relationship,object
0,https://openalex.org/W1775749144,CITES,https://openalex.org/W1507976594
1,https://openalex.org/W1775749144,CITES,https://openalex.org/W1515052776
2,https://openalex.org/W1775749144,CITES,https://openalex.org/W1535643256
3,https://openalex.org/W1775749144,CITES,https://openalex.org/W1540384225
4,https://openalex.org/W1775749144,CITES,https://openalex.org/W1557790601
5,https://openalex.org/W1775749144,CITES,https://openalex.org/W1573055659
6,https://openalex.org/W1775749144,CITES,https://openalex.org/W1576013682
7,https://openalex.org/W1775749144,CITES,https://openalex.org/W195762992
8,https://openalex.org/W1775749144,CITES,https://openalex.org/W1979011732
9,https://openalex.org/W1775749144,CITES,https://openalex.org/W1979607356


## Saving File

In [None]:
# Save to CSV
csv_filename = "openalex_kg_relationships.csv"
df_triples.to_csv(csv_filename, index=False)

# Download in Colab
files.download(csv_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Next Step: Generate Nodes.csv and Edges.csv - Gephi Visualization

In [3]:
import pandas as pd
from google.colab import files

# 1. Read the existing relationships CSV
df = pd.read_csv("/content/sample_data/openalex_kg_relationships.csv")

# Quick check
print("Number of relationships:", len(df))
df.head()


Number of relationships: 348565


Unnamed: 0,subject,relationship,object
0,https://openalex.org/W1775749144,CITES,https://openalex.org/W1507976594
1,https://openalex.org/W1775749144,CITES,https://openalex.org/W1515052776
2,https://openalex.org/W1775749144,CITES,https://openalex.org/W1535643256
3,https://openalex.org/W1775749144,CITES,https://openalex.org/W1540384225
4,https://openalex.org/W1775749144,CITES,https://openalex.org/W1557790601


## Nodes Table

In [4]:
# Get all unique node IDs
unique_node_ids = set(df["subject"]).union(set(df["object"]))
print("Total unique node IDs:", len(unique_node_ids))

def get_node_type(openalex_id):
    """
    Based on the suffix after 'https://openalex.org/',
    return a type (Work, Author, Institution, Venue, Concept).
    """
    # e.g., "https://openalex.org/W123456" -> "W123456" -> first char = "W"
    # fallback type = "Other" if not recognized
    suffix = openalex_id.split('/')[-1]  # e.g. "W123456"
    prefix = suffix[0]                  # e.g. "W"
    if prefix == "W":
        return "Work"
    elif prefix == "A":
        return "Author"
    elif prefix == "I":
        return "Institution"
    elif prefix == "V":
        return "Venue"
    elif prefix == "C":
        return "Concept"
    else:
        return "Other"

# Create a list of dicts for each node
nodes_data = []
for node_id in unique_node_ids:
    node_type = get_node_type(node_id)
    nodes_data.append({
        "Id": node_id,         # Node's unique ID (for Gephi)
        "Label": node_id,
        "Type": node_type
    })

# Convert to DataFrame
df_nodes = pd.DataFrame(nodes_data, columns=["Id", "Label", "Type"])
print("Nodes DataFrame shape:", df_nodes.shape)
df_nodes.head()


Total unique node IDs: 224579
Nodes DataFrame shape: (224579, 3)


Unnamed: 0,Id,Label,Type
0,https://openalex.org/W2086472796,https://openalex.org/W2086472796,Work
1,https://openalex.org/W2050199024,https://openalex.org/W2050199024,Work
2,https://openalex.org/W2069525186,https://openalex.org/W2069525186,Work
3,https://openalex.org/A5073904173,https://openalex.org/A5073904173,Author
4,https://openalex.org/W2092594657,https://openalex.org/W2092594657,Work


## Build Edges Table

In [5]:
# Build edges DataFrame
# We rename columns to match Gephi's expected naming
df_edges = df.rename(columns={
    "subject": "Source",
    "object": "Target",
    "relationship": "Label"
})

# We define the edge type for Gephi.
df_edges["Type"] = "Directed"

# Reorder columns for clarity
df_edges = df_edges[["Source", "Target", "Type", "Label"]]

print("Edges DataFrame shape:", df_edges.shape)
df_edges.head()


Edges DataFrame shape: (348565, 4)


Unnamed: 0,Source,Target,Type,Label
0,https://openalex.org/W1775749144,https://openalex.org/W1507976594,Directed,CITES
1,https://openalex.org/W1775749144,https://openalex.org/W1515052776,Directed,CITES
2,https://openalex.org/W1775749144,https://openalex.org/W1535643256,Directed,CITES
3,https://openalex.org/W1775749144,https://openalex.org/W1540384225,Directed,CITES
4,https://openalex.org/W1775749144,https://openalex.org/W1557790601,Directed,CITES


## Exporting and Download

In [6]:
# Save nodes
nodes_filename = "nodes.csv"
df_nodes.to_csv(nodes_filename, index=False)
files.download(nodes_filename)

# Save edges
edges_filename = "edges.csv"
df_edges.to_csv(edges_filename, index=False)
files.download(edges_filename)

print("Done! Downloaded nodes.csv and edges.csv for Gephi.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done! Downloaded nodes.csv and edges.csv for Gephi.
