<a href="https://colab.research.google.com/github/rafabarros95/DS_LOD_and_Knowledge_Graphs_2024_Rafael_Barros_and_Habiba_Naeem/blob/main/LOD_OpenAlex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Knowledge Graph with OpenAlex

## Installing required Libraries if needed

In [14]:
!pip install requests neo4j

Collecting neo4j
  Downloading neo4j-5.27.0-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.27.0-py3-none-any.whl (301 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.27.0


## Imports

In [15]:
import requests
import pandas as pd
import networkx as nx
import time
from math import ceil
from tqdm import tqdm


## Define Parameters for Data Exploration

In [1]:
# OpenAlex API - URL
BASE_URL = 'https://api.openalex.org/works'

# Parameters
PER_PAGE = 200  # Max per_page allowed by OpenAlex
TOTAL_WORKS = 10000  # Total number of works to fetch
MAX_PAGE = TOTAL_WORKS // PER_PAGE


## Fetching Data

In [16]:
import requests
import json

base_url = "https://api.openalex.org/works"
per_page = 200
total_needed = 10000

works_data = []

# Starting cursor. '*' means the first page.
params = {
    "per-page": per_page,
    "cursor": "*",
}

while len(works_data) < total_needed:
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # raise an error if not 200
        data = response.json()

        new_results = data.get("results", [])
        if not new_results:
            print("No more results. Stopping.")
            break

        works_data.extend(new_results)

        # Get the next_cursor from the meta block to paginate
        next_cursor = data["meta"].get("next_cursor", None)
        if not next_cursor:
            print("No next_cursor found. Possibly end of data.")
            break

        params["cursor"] = next_cursor  # Update cursor
        print(f"Collected so far: {len(works_data)}")

    except Exception as e:
        print(f"Error: {e}")
        break

# Trim if over 10k
if len(works_data) > total_needed:
    works_data = works_data[:total_needed]

print(f"Total works collected: {len(works_data)}")


Collected so far: 200
Collected so far: 400
Collected so far: 600
Collected so far: 800
Collected so far: 1000
Collected so far: 1200
Collected so far: 1400
Collected so far: 1600
Collected so far: 1800
Collected so far: 2000
Collected so far: 2200
Collected so far: 2400
Collected so far: 2600
Collected so far: 2800
Collected so far: 3000
Collected so far: 3200
Collected so far: 3400
Collected so far: 3600
Collected so far: 3800
Collected so far: 4000
Collected so far: 4200
Collected so far: 4400
Collected so far: 4600
Collected so far: 4800
Collected so far: 5000
Collected so far: 5200
Collected so far: 5400
Collected so far: 5600
Collected so far: 5800
Collected so far: 6000
Collected so far: 6200
Collected so far: 6400
Collected so far: 6600
Collected so far: 6800
Collected so far: 7000
Collected so far: 7200
Collected so far: 7400
Collected so far: 7600
Collected so far: 7800
Collected so far: 8000
Collected so far: 8200
Collected so far: 8400
Collected so far: 8600
Collected so fa

## Data Inspection


In [17]:
# Inspect one item to see the structure
if len(works_data) > 0:
    print(json.dumps(works_data[0], indent=2))
else:
    print("No data collected.")




{
  "id": "https://openalex.org/W1775749144",
  "doi": "https://doi.org/10.1016/s0021-9258(19)52451-6",
  "title": "PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT",
  "display_name": "PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT",
  "publication_year": 1951,
  "publication_date": "1951-11-01",
  "ids": {
    "openalex": "https://openalex.org/W1775749144",
    "doi": "https://doi.org/10.1016/s0021-9258(19)52451-6",
    "mag": "1775749144",
    "pmid": "https://pubmed.ncbi.nlm.nih.gov/14907713"
  },
  "language": "en",
  "primary_location": {
    "is_oa": true,
    "landing_page_url": "https://doi.org/10.1016/s0021-9258(19)52451-6",
    "pdf_url": null,
    "source": {
      "id": "https://openalex.org/S140251998",
      "display_name": "Journal of Biological Chemistry",
      "issn_l": "0021-9258",
      "issn": [
        "0021-9258",
        "1067-8816",
        "1083-351X"
      ],
      "is_oa": true,
      "is_in_doaj": true,
      "is_core": true,
      "host_organization":

## Loading the Data into Pandas Dataframe

In [20]:
import pandas as pd

# Prepare a list of records where each record is a flat dict
records = []

for work in works_data:
    work_id = work["id"]
    work_title = work.get("title", "No Title")
    publication_year = work.get("publication_year", None)

    # Concepts may be a list, we'll be creating pipe-separated strings
    concepts = work.get("concepts", [])
    concept_ids = "|".join([c["id"] for c in concepts])
    concept_names = "|".join([c["display_name"] for c in concepts])

    record = {
        "id": work_id,
        "title": work_title,
        "publication_year": publication_year,
        "concept_ids": concept_ids,
        "concept_names": concept_names
    }
    records.append(record)

df = pd.DataFrame(records)


## Checking for Inconsistency

NameError: name 'head' is not defined

In [23]:

df.isnull().sum()



Unnamed: 0,0
id,0
title,27
publication_year,0
concept_ids,0
concept_names,0


## Importing into CSV File

In [26]:
import csv

output_filename = "openalex_works_10000.csv"

# Open a CSV writer
with open(output_filename, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    # Write header row
    writer.writerow(["id", "title", "publication_year", "concept_ids", "concept_names"])

    for work in works_data:
        work_id = work["id"]
        work_title = work.get("title", "No Title")
        publication_year = work.get("publication_year", None)

        # Extract concept info
        concepts = work.get("concepts", [])
        concept_ids = "|".join([c["id"] for c in concepts])               # e.g. "https://openalex.org/C123|https://openalex.org/C456"
        concept_names = "|".join([c["display_name"] for c in concepts])   # e.g. "Machine Learning|Artificial Intelligence"

        # Write row
        writer.writerow([work_id, work_title, publication_year, concept_ids, concept_names])

print(f"CSV file '{output_filename}' created successfully!")


CSV file 'openalex_works_10000.csv' created successfully!


## Downloading





In [27]:
# Downloading the csv file
from google.colab import files
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Checking the Authorships

In [30]:
# Print authors for the first 5 Works as an example
for i, work in enumerate(works_data[:5]):
    print(f"=== Work {i+1} ===")
    print("Work ID:", work.get("id", "No ID"))
    print("Work Title:", work.get("title", "No Title"))

    authorships = work.get("authorships", [])
    if not authorships:
        print("  No authors listed.")
        continue

    for a in authorships:
        author_data = a.get("author", {})
        author_id = author_data.get("id", "No ID")
        author_name = author_data.get("display_name", "No Name")
        print(f"  Author ID: {author_id}, Name: {author_name}")
    print()


=== Work 1 ===
Work ID: https://openalex.org/W1775749144
Work Title: PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT
  Author ID: https://openalex.org/A5110127684, Name: OliverH. Lowry
  Author ID: https://openalex.org/A5032482932, Name: NiraJ. Rosebrough
  Author ID: https://openalex.org/A5004071084, Name: A. Farr
  Author ID: https://openalex.org/A5109894513, Name: RoseJ. Randall

=== Work 2 ===
Work ID: https://openalex.org/W2582743722
Work Title: R: A language and environment for statistical computing.
  Author ID: https://openalex.org/A5087294524, Name: R Core Team

=== Work 3 ===
Work ID: https://openalex.org/W2100837269
Work Title: Cleavage of Structural Proteins during the Assembly of the Head of Bacteriophage T4
  Author ID: https://openalex.org/A5030122660, Name: Ulrich K. Laemmli

=== Work 4 ===
Work ID: https://openalex.org/W2128635872
Work Title: A Rapid and Sensitive Method for the Quantitation of Microgram Quantities of Protein Utilizing the Principle of Protein-Dye Bi

## Pandas Dataframe for Authors

In [31]:
import pandas as pd

author_records = []

for work in works_data:
    work_id = work.get("id", "No Work ID")
    work_title = work.get("title", "No Title")

    authorships = work.get("authorships", [])
    for auth in authorships:
        author_data = auth.get("author", {})
        author_id = author_data.get("id", "No Author ID")
        author_name = author_data.get("display_name", "Unknown Author")

        # Some works also have "author_position" in the authorship object
        author_position = auth.get("author_position", None)

        # Build a row (dictionary)
        row = {
            "work_id": work_id,
            "work_title": work_title,
            "author_id": author_id,
            "author_name": author_name,
            "author_position": author_position
        }
        author_records.append(row)

# Convert to a DataFrame
df_authors = pd.DataFrame(author_records)
df_authors.head(10)


Unnamed: 0,work_id,work_title,author_id,author_name,author_position
0,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5110127684,OliverH. Lowry,first
1,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5032482932,NiraJ. Rosebrough,middle
2,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5004071084,A. Farr,middle
3,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5109894513,RoseJ. Randall,last
4,https://openalex.org/W2582743722,R: A language and environment for statistical ...,https://openalex.org/A5087294524,R Core Team,first
5,https://openalex.org/W2100837269,Cleavage of Structural Proteins during the Ass...,https://openalex.org/A5030122660,Ulrich K. Laemmli,first
6,https://openalex.org/W2128635872,A Rapid and Sensitive Method for the Quantitat...,https://openalex.org/A5047160510,Mark A. Bradford,first
7,https://openalex.org/W4293247451,A rapid and sensitive method for the quantitat...,https://openalex.org/A5021181975,Marion M. Bradford,first
8,https://openalex.org/W2194775991,Deep Residual Learning for Image Recognition,https://openalex.org/A5100700361,Kaiming He,first
9,https://openalex.org/W2194775991,Deep Residual Learning for Image Recognition,https://openalex.org/A5100362465,Xiangyu Zhang,middle


## Inspecting the df.head()

In [32]:
df_authors.info()
df_authors.head(5)  # see the top 5 rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50529 entries, 0 to 50528
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   work_id          50529 non-null  object
 1   work_title       50465 non-null  object
 2   author_id        50529 non-null  object
 3   author_name      50529 non-null  object
 4   author_position  50529 non-null  object
dtypes: object(5)
memory usage: 1.9+ MB


Unnamed: 0,work_id,work_title,author_id,author_name,author_position
0,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5110127684,OliverH. Lowry,first
1,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5032482932,NiraJ. Rosebrough,middle
2,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5004071084,A. Farr,middle
3,https://openalex.org/W1775749144,PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT,https://openalex.org/A5109894513,RoseJ. Randall,last
4,https://openalex.org/W2582743722,R: A language and environment for statistical ...,https://openalex.org/A5087294524,R Core Team,first


## Exporting Authors to CSV file

In [33]:
df_authors.to_csv("openalex_work_authors.csv", index=False, encoding="utf-8")
print("CSV file 'openalex_work_authors.csv' has been saved!")


CSV file 'openalex_work_authors.csv' has been saved!


## Downloading

In [34]:
from google.colab import files
files.download("openalex_work_authors.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>