<a href="https://colab.research.google.com/github/rafabarros95/DS_LOD_and_Knowledge_Graphs_2024_Rafael_Barros_and_Habiba_Naeem/blob/main/LOD_OpenAlex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Knowledge Graph with OpenAlex

## Extra necessary packages

In [22]:
!pip install pandas requests tqdm




## Imports

In [23]:
import requests
import pandas as pd
from tqdm import tqdm
import json
from urllib.parse import urlencode

In [31]:
# Configuration for the OpenAlex download
API_URL = "https://api.openalex.org/works"  # endpoint the works are requested from
TOTAL_WORKS = 5_000  # how many works to collect in total
BATCH_SIZE = 200  # works per request (noted as the maximum OpenAlex allows per request)
HEADERS = {}  # extra HTTP headers sent with every request; none are set

## Fetching Data through API

In [25]:
# Function to fetch works from OpenAlex, one page per request
def fetch_openalex_works(total_works=5000, batch_size=200, timeout=30):
    """Fetch up to ``total_works`` work records from the OpenAlex works endpoint.

    Pages are requested sequentially from ``API_URL`` (with ``HEADERS``) using
    the ``page`` / ``per-page`` query parameters.

    Args:
        total_works: Maximum number of works to return.
        batch_size: Number of works requested per page.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A list of work dicts (the concatenated ``results`` of every page), at
        most ``total_works`` long. It is shorter when a request returns a
        non-200 status or the API has no more results.
    """
    works = []
    # Ceiling division: a trailing partial batch still needs one more request.
    total_pages = -(-total_works // batch_size)

    for page in tqdm(range(total_pages), desc="Fetching works"):
        # Every page is requested with the SAME page size. Shrinking `per-page`
        # for the last request would change what that page number refers to and
        # return works that were already fetched; the surplus is trimmed below.
        params = {
            'per-page': batch_size,
            'page': page + 1
        }
        response = requests.get(API_URL, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch page {page + 1}: Status code {response.status_code}")
            break

        results = response.json().get('results') or []
        if not results:
            # Nothing left on the server side; further pages would be empty too.
            break
        works.extend(results)

    # Drop the surplus from the final full-size page.
    return works[:total_works]

## Data Extraction - Relevant Relationships


In [26]:
# Helpers and entry point that flatten OpenAlex works into relationship columns
def _join_names(names, unique=False):
    """Join the truthy entries of ``names`` with '; ' (None if nothing is left).

    With ``unique=True`` duplicates are dropped while keeping first-seen order,
    so the result is identical on every run (joining a ``set`` is not).
    """
    kept = [name for name in names if name]
    if unique:
        kept = list(dict.fromkeys(kept))
    return '; '.join(kept) if kept else None


def extract_relationships(works):
    """Flatten OpenAlex work records into one DataFrame row per work.

    Args:
        works: Iterable of work dicts as returned in the API's ``results`` list.

    Returns:
        A pandas DataFrame with the columns ``work_id``, ``title``,
        ``authored_by``, ``has_topic``, ``affiliated_with``,
        ``belongs_to_domain``, ``belongs_to_subfield``, ``belongs_to_field``
        and ``published_in``. Multi-valued cells are '; '-joined strings;
        missing values are None.
    """
    columns = (
        'work_id', 'title', 'authored_by', 'has_topic', 'affiliated_with',
        'belongs_to_domain', 'belongs_to_subfield', 'belongs_to_field',
        'published_in',
    )
    data = {column: [] for column in columns}

    for work in works:
        data['work_id'].append(work.get('id'))
        data['title'].append(work.get('title'))

        # `or` (instead of a .get default) also covers keys that are present
        # but null in the JSON, which would otherwise raise on the next access.
        authorships = work.get('authorships') or []
        concepts = work.get('concepts') or []

        # authored_by
        data['authored_by'].append(_join_names(
            (entry.get('author') or {}).get('display_name') for entry in authorships
        ))

        # has_topic
        data['has_topic'].append(_join_names(
            concept.get('display_name') for concept in concepts
        ))

        # affiliated_with
        # Institutions are listed on the authorship entry itself
        # (`authorships[].institutions`). The nested `author.affiliations`
        # path the notebook used before is still read as a fallback.
        institution_names = []
        for entry in authorships:
            linked = list(entry.get('institutions') or [])
            linked += (entry.get('author') or {}).get('affiliations') or []
            institution_names.extend(inst.get('display_name') for inst in linked)
        data['affiliated_with'].append(_join_names(institution_names, unique=True))

        # belongs_to_domain, subfield, field
        # Concept level 0 -> domain, level 1 -> subfield, level 2 -> field.
        # NOTE(review): OpenAlex orders its hierarchy domain > field > subfield,
        # so the level 1 / level 2 naming here may be inverted - confirm before
        # relying on these two columns.
        names_by_level = {0: [], 1: [], 2: []}
        for concept in concepts:
            level = concept.get('level')
            if level in names_by_level:
                names_by_level[level].append(concept.get('display_name'))
        data['belongs_to_domain'].append(_join_names(names_by_level[0], unique=True))
        data['belongs_to_subfield'].append(_join_names(names_by_level[1], unique=True))
        data['belongs_to_field'].append(_join_names(names_by_level[2], unique=True))

        # published_in
        # `host_venue` is deprecated in OpenAlex; when it is absent or empty,
        # fall back to the source of the work's primary location.
        venue_name = (work.get('host_venue') or {}).get('display_name')
        if not venue_name:
            source = (work.get('primary_location') or {}).get('source') or {}
            venue_name = source.get('display_name')
        data['published_in'].append(venue_name if venue_name else None)

    return pd.DataFrame(data)

## Loading the Data into Dataframe

In [32]:
# Download the configured number of works from the OpenAlex API
works = fetch_openalex_works(total_works=TOTAL_WORKS, batch_size=BATCH_SIZE)

Fetching works: 100%|██████████| 25/25 [00:23<00:00,  1.05it/s]


## Checking for Inconsistencies and Cleaning the Data

In [33]:
# Always (re)build `df`, even when nothing was fetched. Otherwise a failed fetch
# leaves `df` undefined - or, worse, still holding the frame from an earlier run -
# and the export cell below would crash or silently write stale data.
# Cleaning: empty strings become missing values and repeated work ids are dropped
# (the OpenAlex JSON is fairly clean already, so little else is needed).
df = (
    extract_relationships(works or [])
    .replace('', pd.NA)
    .drop_duplicates(subset=['work_id'])
)

if df.empty:
    print("No works were fetched. Please check the API parameters and try again.")
else:
    # Display first few rows
    print(df.head())

                            work_id  \
0  https://openalex.org/W1775749144   
1  https://openalex.org/W2582743722   
2  https://openalex.org/W2100837269   
3  https://openalex.org/W2128635872   
4  https://openalex.org/W4293247451   

                                               title  \
0  PROTEIN MEASUREMENT WITH THE FOLIN PHENOL REAGENT   
1  R: A language and environment for statistical ...   
2  Cleavage of Structural Proteins during the Ass...   
3  A Rapid and Sensitive Method for the Quantitat...   
4  A rapid and sensitive method for the quantitat...   

                                         authored_by  \
0  OliverH. Lowry; NiraJ. Rosebrough; A. Farr; Ro...   
1                                        R Core Team   
2                                  Ulrich K. Laemmli   
3                                   Mark A. Bradford   
4                                 Marion M. Bradford   

                                           has_topic affiliated_with  \
0  Reagent; Chemist

## Exporting to a CSV File

In [34]:
# Save to CSV
csv_filename = 'openalex_works_relationships.csv'
df.to_csv(csv_filename, index=False)

# Download link
# `google.colab` only exists inside Google Colab. The import is kept local and
# guarded so the notebook still runs to completion in any other Jupyter setup,
# where the CSV simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {csv_filename}; the browser download is only available in Google Colab.")
else:
    files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>