<a href="https://colab.research.google.com/github/rafabarros95/DS_LOD_and_Knowledge_Graphs_2024_Rafael_Barros_and_Habiba_Naeem/blob/main/LOD_OpenAlex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Knowledge Graph with OpenAlex

## Extra necessary packages

In [None]:
!pip install pandas requests tqdm




## Imports

In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import json
import math

In [4]:
# Defining constants
API_URL = "https://api.openalex.org/works"
TOTAL_WORKS = 5000
BATCH_SIZE = 200  # Maximum allowed by OpenAlex per request
USER_AGENT = "OpenAlex-KG-Example/1.0 (mailto:your_email@example.com)" # just used the suggestion provided but no real email


## Fetching Data through API

In [6]:
def fetch_openalex_works(total_works=5000, batch_size=200):
    """
    Fetches 'total_works' number of Works from OpenAlex using pagination.
    Returns a list of Work JSON objects.
    """
    all_works = []
    pages_needed = math.ceil(total_works / batch_size)

    for page in tqdm(range(1, pages_needed + 1), desc="Fetching Works"):
        # Calculate how many items to fetch in this batch
        per_page = batch_size if page < pages_needed else (total_works % batch_size or batch_size)

        params = {
            'page': page,
            'per-page': per_page
        }
        try:
            r = requests.get(API_URL, headers={"User-Agent": USER_AGENT}, params=params)
            r.raise_for_status()
            data = r.json()
            all_works.extend(data.get('results', []))
        except requests.exceptions.RequestException as e:
            print(f"Error at page {page}: {e}")
            break

    return all_works

## Data Extraction - Relevant Relationships Triples


In [5]:
def build_relationship_triples(works):
    """
    Given a list of Work objects from OpenAlex, build a list of relationship triples.

    Each triple is a dict with:
       subject_id, subject_type, subject_name,
       relationship,
       object_id,  object_type,  object_name
    """
    triples = []

    for w in works:
        work_id = w.get('id')  # e.g. "https://openalex.org/W12345"
        work_title = w.get('title', 'Unknown Title')

        # -------------------------
        # 1. (Work) authored_by -> (Author)
        # 2. (Author) affiliated_with -> (Institution)
        # -------------------------
        authorships = w.get('authorships', [])
        for auth in authorships:
            author_info = auth.get('author', {})

            # (Work) authored_by -> (Author)
            author_id = author_info.get('id')        # e.g. "https://openalex.org/A98765"
            author_name = author_info.get('display_name', 'Unknown Author')
            if author_id:
                triples.append({
                    "subject_id":   work_id,
                    "subject_type": "Work",
                    "subject_name": work_title,
                    "relationship": "authored_by",
                    "object_id":    author_id,
                    "object_type":  "Author",
                    "object_name":  author_name
                })

            # (Author) affiliated_with -> (Institution)
            # Some authors have multiple affiliations
            affiliations = author_info.get('affiliations', [])
            for aff in affiliations:
                inst_id = aff.get('id')  # e.g. "https://openalex.org/I12345"
                inst_name = aff.get('display_name', 'Unknown Institution')
                if inst_id:
                    triples.append({
                        "subject_id":   author_id,
                        "subject_type": "Author",
                        "subject_name": author_name,
                        "relationship": "affiliated_with",
                        "object_id":    inst_id,
                        "object_type":  "Institution",
                        "object_name":  inst_name
                    })

        # -------------------------
        # 3. (Work) published_in -> (Venue)
        # -------------------------
        host_venue = w.get('host_venue', {})
        venue_id = host_venue.get('id')  # e.g. "https://openalex.org/V12345"
        venue_name = host_venue.get('display_name', None)
        if venue_id and venue_name:
            triples.append({
                "subject_id":   work_id,
                "subject_type": "Work",
                "subject_name": work_title,
                "relationship": "published_in",
                "object_id":    venue_id,
                "object_type":  "Venue",
                "object_name":  venue_name
            })

        # -------------------------
        # 4. (Work) has_topic -> (Concept)
        #    Also handle domain, subfield, field
        # -------------------------
        concepts = w.get('concepts', [])
        for concept in concepts:
            concept_id = concept.get('id')  # e.g. "https://openalex.org/C2778791732"
            concept_name = concept.get('display_name', 'Unknown Concept')
            if not concept_id:
                continue

            # has_topic
            triples.append({
                "subject_id":   work_id,
                "subject_type": "Work",
                "subject_name": work_title,
                "relationship": "has_topic",
                "object_id":    concept_id,
                "object_type":  "Concept",
                "object_name":  concept_name
            })

            # belongs_to_domain, belongs_to_subfield, belongs_to_field
            # Based on concept["level"]: 0=domain, 1=subfield, 2=field
            level = concept.get('level')
            if level == 0:
                triples.append({
                    "subject_id":   work_id,
                    "subject_type": "Work",
                    "subject_name": work_title,
                    "relationship": "belongs_to_domain",
                    "object_id":    concept_id,
                    "object_type":  "Domain",
                    "object_name":  concept_name
                })
            elif level == 1:
                triples.append({
                    "subject_id":   work_id,
                    "subject_type": "Work",
                    "subject_name": work_title,
                    "relationship": "belongs_to_subfield",
                    "object_id":    concept_id,
                    "object_type":  "Subfield",
                    "object_name":  concept_name
                })
            elif level == 2:
                triples.append({
                    "subject_id":   work_id,
                    "subject_type": "Work",
                    "subject_name": work_title,
                    "relationship": "belongs_to_field",
                    "object_id":    concept_id,
                    "object_type":  "Field",
                    "object_name":  concept_name
                })

    return triples

## Loading the Data into Dataframe

In [7]:
works_data = fetch_openalex_works(TOTAL_WORKS, BATCH_SIZE)
print(f"Fetched {len(works_data)} works from OpenAlex.")

# Build the triple-based relationships
relationship_triples = build_relationship_triples(works_data)
print(f"Built {len(relationship_triples)} relationship triples.")

# Convert to DataFrame
df_triples = pd.DataFrame(relationship_triples)

#  cleaning:
#  Dropping exact duplicates (if the same triple was generated more than once)
df_triples.drop_duplicates(
    subset=[
        "subject_id",
        "relationship",
        "object_id"
    ],
    inplace=True
)

# 2) Replacing empty strings with NaN for clarity from pandas
df_triples.replace('', pd.NA, inplace=True)

# Showing a small sample
df_triples.sample(5)

Fetching Works: 100%|██████████| 25/25 [01:13<00:00,  2.94s/it]


Fetched 5000 works from OpenAlex.
Built 145027 relationship triples.


Unnamed: 0,subject_id,subject_type,subject_name,relationship,object_id,object_type,object_name
77806,https://openalex.org/W2071128523,Work,Analysis of a complex of statistical variables...,belongs_to_subfield,https://openalex.org/C145420912,Subfield,Mathematics education
19454,https://openalex.org/W2010094123,Work,Free radicals in biology and medicine,belongs_to_field,https://openalex.org/C161790260,Field,Catalysis
46465,https://openalex.org/W3147181238,Work,Business Research Methods,has_topic,https://openalex.org/C194943564,Concept,Participant observation
20591,https://openalex.org/W2774486220,Work,<b>lmerTest</b> Package: Tests in Linear Mixed...,has_topic,https://openalex.org/C41008148,Concept,Computer science
7370,https://openalex.org/W2035266068,Work,Molecular dynamics with coupling to an externa...,has_topic,https://openalex.org/C121864883,Concept,Statistical physics


## Importing into CSV File

In [None]:
csv_filename = "openalex_kg_relationships.csv"
df_triples.to_csv(csv_filename, index=False)

print(f"\nCSV file '{csv_filename}' created with {len(df_triples)} unique relationships.\n")

#
from google.colab import files
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>