# Collecter des données OpenAlex

## Découvrir l'API

Commencer par consulter la documentation de l'API : https://docs.openalex.org/how-to-use-the-api/api-overview

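Il faut un token (clé d'API) OpenAlex ; il est lu ci-dessous depuis le fichier `creds.yaml`, sous la clé `token-openalex`.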

In [1]:
import yaml

# Read the API credentials from the local YAML file `creds.yaml`.
# safe_load only constructs plain Python types (dict, list, str, ...), which
# is all a credentials file needs, and rejects arbitrary object tags.
with open('creds.yaml', 'r') as f:
    config = yaml.safe_load(f)

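Exemple de recherche dans l'interface web d'OpenAlex (expression exacte dans le titre et le résumé) : https://openalex.org/works?page=1&filter=title_and_abstract.search:%22computational+social+science%22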

In [2]:
import requests

# Endpoint listing OpenAlex "works".
url = "https://api.openalex.org/works"

# The token read from creds.yaml is sent as a bearer token.
# NOTE(review): OpenAlex documents the `api_key` query parameter for
# authentication — confirm that the Authorization header is honoured.
headers = {
    "Authorization": f"Bearer {config['token-openalex']}"
}

# Full-text search on list endpoints uses the `search` parameter
# (`q` belongs to the /autocomplete endpoints).
params = {
    "search": "computational social science",
    "per-page": 25,
    "page": 1,
}

# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, params=params, timeout=30)
r.raise_for_status()  # raise on HTTP 4xx/5xx instead of parsing an error body
data = r.json()
# data["results"] holds the list of works returned for this page.


Récupérer toutes les données (pagination par curseur)

In [3]:
import requests
import time


keyword = "computational social science"
per_page = 25          # number of works requested per call
max_results = None     # cap on collected works; None means "fetch everything"
request_timeout = 30   # seconds before a stalled request is abandoned

url = "https://api.openalex.org/works"

headers = {
    "Authorization": f"Bearer {config['token-openalex']}"
}

# Cursor paging: the first request uses "*", each response then provides the
# cursor for the following page in meta.next_cursor.
cursor = "*"
all_works = []
n = 0

while True:
    params = {
        # Quoted keyword inside the title/abstract search filter.
        "filter": f'title_and_abstract.search:"{keyword}"',
        "per-page": per_page,
        # Only request the fields used downstream.
        "select": "id,type,primary_location,title,abstract_inverted_index,publication_year,publication_date,open_access,relevance_score",
        "cursor": cursor
    }

    # The timeout prevents one stalled connection from blocking the whole loop.
    r = requests.get(url, headers=headers, params=params, timeout=request_timeout)
    r.raise_for_status()
    data = r.json()

    results = data.get("results", [])
    total = data["meta"]["count"]

    # An empty page means there is nothing left to collect.
    if not results:
        break

    all_works.extend(results)
    n += len(results)

    print(f"Fetched {n} / {total} (cursor={cursor})")

    if max_results is not None and n >= max_results:
        # The last page may overshoot the cap: keep exactly max_results works.
        del all_works[max_results:]
        n = len(all_works)
        break

    cursor = data["meta"].get("next_cursor")
    if not cursor:
        break

    # Short pause between consecutive requests.
    time.sleep(0.2)

print(f"\nDone. Collected {len(all_works)} works total.\n")


Fetched 25 / 2102 (cursor=*)
Fetched 50 / 2102 (cursor=IlsxOTIuODc5NzUsIDE2MDMyMzg0MDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XMzA5NDQyMzA2OCddIg==)
Fetched 75 / 2102 (cursor=IlsxMjYuMjQwMjcsIDE0MjU5NDU2MDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XMjAwMzc2MTkyMCddIg==)
Fetched 100 / 2102 (cursor=Ils5OS4xMDIzNiwgMTM4ODUzNDQwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1cyNDg2NDU0NTg4J10i)
Fetched 125 / 2102 (cursor=Ils4Mi4zNjYyOCwgMTQ3NDkzNDQwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1czMjE3MzM0NDg0J10i)
Fetched 150 / 2102 (cursor=Ils2Ny42NjQ5OSwgMTY0NDg4MzIwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1c0MjEzMzc5MzY3J10i)
Fetched 175 / 2102 (cursor=Ils1OC4wMTMwOTYsIDEzNzQwMTkyMDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XMjI3ODIxNjIxMSddIg==)
Fetched 200 / 2102 (cursor=Ils1MS43MzgwNDUsIDE3MDQwNjcyMDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XNDM5ODE5OTIxMSddIg==)
Fetched 225 / 2102 (cursor=Ils0NS4yNjA0OCwgMTY2MTk5MDQwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1c0Mzc5NDEwNzc3J10i)
Fetched 250 / 2102 (cursor=Ils0Mi4wNTAyLCAxNTA4MzcxMj

Sauvegarder les données

In [7]:
import os
import pickle

# Make sure the output folder exists, then dump the raw list of works so the
# API does not have to be queried again.
os.makedirs("data", exist_ok=True)

with open("data/all_works.pkl", "wb") as pickle_file:
    pickle.dump(all_works, pickle_file)

## Mettre sous la forme d'un dataframe

In [21]:
def reconstruct_abstract(inv_index):
    """Rebuild a plain-text abstract from an inverted index.

    Args:
        inv_index: Dict mapping each word to the list of integer positions at
            which it occurs (the shape of the ``abstract_inverted_index``
            field requested from the API). May be ``None``, NaN or empty.

    Returns:
        The words joined by single spaces in ascending position order, or
        ``None`` when there is no usable index (missing value, non-dict
        input, or no position at all).
    """
    # pandas represents missing cells as NaN, a truthy float, so check the
    # type instead of relying on truthiness alone.
    if not isinstance(inv_index, dict) or not inv_index:
        return None

    # Map every position to its word. If two words claim the same position,
    # the one iterated last wins.
    word_at = {
        pos: word
        for word, positions in inv_index.items()
        for pos in positions or ()
    }
    if not word_at:
        return None

    # Sorting the occupied positions skips gaps without allocating a list
    # sized by the largest position.
    return " ".join(word_at[pos] for pos in sorted(word_at))

def extract_journal_name(primary_location):
    """Return the display name of a work's primary source, if available.

    Args:
        primary_location: The ``primary_location`` value of a work; expected
            to be a dict holding a nested ``source`` dict, but it may be
            ``None`` or lack those keys.

    Returns:
        ``primary_location["source"]["display_name"]``, or ``None`` when a
        level is missing or not subscriptable.
    """
    try:
        return primary_location["source"]["display_name"]
    except (KeyError, TypeError):
        # KeyError: a key is absent. TypeError: a level is None/NaN or another
        # non-subscriptable value. Anything else (e.g. KeyboardInterrupt)
        # is deliberately left to propagate.
        return None

# reconstruct_abstract(all_works[200]["abstract_inverted_index"])

In [22]:
import pandas as pd

# One row per collected work, plus two derived columns (readable abstract and
# journal name), exported to CSV without the index.
df = pd.DataFrame(all_works).assign(
    abstract=lambda works: works["abstract_inverted_index"].apply(reconstruct_abstract),
    journal=lambda works: works["primary_location"].apply(extract_journal_name),
)
df.to_csv("data/css_openalex_26022026.csv", index=False)