# Collecter des données OpenAlex

## Découvrir l'API

Jeter un coup d'œil à la documentation de l'API : https://docs.openalex.org/how-to-use-the-api/api-overview

Il faut un token. Pour cela, on crée un compte sur OpenAlex, on y récupère le token, puis on crée un fichier `creds.yaml` qui contient :

````yaml
token-openalex: TOKEN
````

Ouvrir le fichier pour récupérer le token

In [1]:
import yaml
# Read the OpenAlex credentials from the local YAML file.
# safe_load only constructs plain Python types (dict, list, str, ...), which is
# all a credentials file needs, so the more permissive FullLoader is avoided.
with open('creds.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

Faire un test sur une requête

https://openalex.org/works?page=1&filter=title_and_abstract.search:%22computational+social+science%22

In [3]:
import requests

url = "https://api.openalex.org/works"

# The token read from creds.yaml is sent as a bearer token with the request.
headers = {
    "Authorization": f"Bearer {config['token-openalex']}"
}

# NOTE(review): the example URL in the text above and the collection cell below
# both use `filter=title_and_abstract.search:...`, whereas this test sends `q`.
# Confirm that `q` is the intended parameter for the /works endpoint.
params = {
    "q": "computational social science",
    "per-page": 25,
    "page": 1,
}

# requests applies no timeout by default, so an unresponsive server would block
# the cell forever; fail after 30 seconds instead.
r = requests.get(url, headers=headers, params=params, timeout=30)
r.raise_for_status()  # stop here on any HTTP error status
data = r.json()
#print(data["results"])


Récupérer toutes les données

In [4]:
import requests
import time


# Query settings.
keyword = "computational social science"
per_page = 25
max_results = None  # None = fetch everything; otherwise keep at most this many works
request_timeout = 30  # seconds; requests would otherwise wait indefinitely

url = "https://api.openalex.org/works"

# The token read from creds.yaml is sent as a bearer token on every request.
headers = {
    "Authorization": f"Bearer {config['token-openalex']}"
}

# Cursor paging: the first request uses "*", every following request uses the
# `next_cursor` value returned in the previous response's `meta` block.
cursor = "*"
all_works = []
n = 0

while True:
    params = {
        "filter": f'title_and_abstract.search:"{keyword}"',
        "per-page": per_page,
        "select": "id,type,primary_location,title,abstract_inverted_index,publication_year,publication_date,open_access,relevance_score",
        "cursor": cursor
    }

    r = requests.get(url, headers=headers, params=params, timeout=request_timeout)
    r.raise_for_status()  # abort the collection on any HTTP error status
    data = r.json()

    results = data.get("results", [])
    total = data["meta"]["count"]

    # An empty page means there is nothing left to fetch.
    if not results:
        break

    all_works.extend(results)
    n += len(results)

    print(f"Fetched {n} / {total} (cursor={cursor})")

    if max_results is not None and n >= max_results:
        # The last page can overshoot the requested limit; drop the surplus so
        # the collected list never holds more than max_results works.
        del all_works[max_results:]
        n = len(all_works)
        break

    # No next cursor means the last page has been reached.
    cursor = data["meta"].get("next_cursor")
    if not cursor:
        break

    # Short pause between requests to stay gentle on the API.
    time.sleep(0.2)

print(f"\nDone. Collected {len(all_works)} works total.\n")


Fetched 25 / 2102 (cursor=*)
Fetched 50 / 2102 (cursor=IlsxOTIuODc5NzUsIDE2MDMyMzg0MDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XMzA5NDQyMzA2OCddIg==)
Fetched 75 / 2102 (cursor=IlsxMjYuMjM4LCAxNDI1OTQ1NjAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzIwMDM3NjE5MjAnXSI=)
Fetched 100 / 2102 (cursor=Ils5OS4xMDIzNiwgMTM4ODUzNDQwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1cyNDg2NDU0NTg4J10i)
Fetched 125 / 2102 (cursor=Ils4Mi4zNjYyOCwgMTQ3NDkzNDQwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1czMjE3MzM0NDg0J10i)
Fetched 150 / 2102 (cursor=Ils2Ny42NjQ5OSwgMTY0NDg4MzIwMDAwMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1c0MjEzMzc5MzY3J10i)
Fetched 175 / 2102 (cursor=Ils1OC4wMTUzLCAxMzc0MDE5MjAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzIyNzgyMTYyMTEnXSI=)
Fetched 200 / 2102 (cursor=Ils1MS43MzgwNDUsIDE3MDQwNjcyMDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XNDM5ODE5OTIxMSddIg==)
Fetched 225 / 2102 (cursor=Ils0NS4xNzgyNzYsIDE2NjE5OTA0MDAwMDAsICdodHRwczovL29wZW5hbGV4Lm9yZy9XNDM3OTQxMDc3NyddIg==)
Fetched 250 / 2102 (cursor=Ils0Mi4wMTgxNjYsIDE2MzY1MDI0MD

Sauvegarder les données

In [5]:
import pickle
import os

# Where the raw API records are stored so they can be reloaded without
# querying the API again.
target_dir = "data"
target_file = "data/all_works.pkl"

# Create the output folder on first run; do nothing if it already exists.
os.makedirs(target_dir, exist_ok=True)

with open(target_file, mode="wb") as sink:
    pickle.dump(all_works, sink)

In [6]:
# Sanity check: show the container type of the collected records.
type(all_works)

list

In [7]:
# Number of works collected by the pagination loop.
len(all_works)

2128

In [8]:
# Peek at the first record to see which fields were returned.
all_works[0]

{'id': 'https://openalex.org/W2159397589',
 'type': 'article',
 'primary_location': {'id': 'doi:10.1126/science.1167742',
  'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1126/science.1167742',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S3880285',
   'display_name': 'Science',
   'issn_l': '0036-8075',
   'issn': ['0036-8075', '1095-9203'],
   'is_oa': False,
   'is_in_doaj': False,
   'is_core': True,
   'host_organization': 'https://openalex.org/P4310315823',
   'host_organization_name': 'American Association for the Advancement of Science',
   'host_organization_lineage': ['https://openalex.org/P4310315823'],
   'host_organization_lineage_names': ['American Association for the Advancement of Science'],
   'type': 'journal'},
  'license': None,
  'license_id': None,
  'version': 'publishedVersion',
  'is_accepted': True,
  'is_published': True,
  'raw_source_name': 'Science',
  'raw_type': 'journal-article'},
 'title': 'Computational Social Science',
 'abs

## Mettre sous la forme d'un dataframe

In [9]:
def reconstruct_abstract(inv_index):
    """Rebuild an abstract's plain text from its inverted index.

    Parameters
    ----------
    inv_index : mapping of str to list of int, or None
        Maps each word to the positions at which it occurs in the abstract,
        as stored in the ``abstract_inverted_index`` field of the collected
        works. Missing abstracts may arrive as ``None``, an empty mapping, or
        a non-mapping placeholder such as NaN.

    Returns
    -------
    str or None
        The words joined by single spaces in ascending position order, or
        ``None`` when there is no usable index (missing value, empty mapping,
        or no positions at all).
    """
    # Check for a mapping first: NaN is truthy and has no .items(), so a plain
    # truthiness test alone would let it through and crash further down.
    if not hasattr(inv_index, "items") or not inv_index:
        return None

    # position -> word. If two words claim the same position, the one seen
    # last wins. A dict avoids allocating a list sized by the largest position.
    word_at = {}
    for word, positions in inv_index.items():
        for pos in positions:
            word_at[pos] = word

    # Every word had an empty position list: nothing to rebuild.
    if not word_at:
        return None

    # Positions nobody claimed are simply skipped.
    return " ".join(word_at[pos] for pos in sorted(word_at))

def extract_journal_name(primary_location):
    """Return the source's display name from a ``primary_location`` record.

    Parameters
    ----------
    primary_location : dict or None
        Record expected to hold a nested ``source`` dict with a
        ``display_name`` key. Either level may be missing or ``None``.

    Returns
    -------
    str or None
        ``primary_location["source"]["display_name"]``, or ``None`` when the
        record or any of the nested keys is absent.
    """
    try:
        return primary_location["source"]["display_name"]
    except (KeyError, TypeError):
        # KeyError: a key is missing. TypeError: the record or its "source" is
        # None (or another non-subscriptable value). Anything else, such as
        # KeyboardInterrupt, is deliberately allowed to propagate.
        return None

# reconstruct_abstract(all_works[200]["abstract_inverted_index"])

In [10]:
import pandas as pd
# One row per collected work, plus two derived text columns appended in this
# order: the rebuilt abstract, then the journal (source) name.
df = pd.DataFrame(all_works).assign(
    abstract=lambda frame: frame["abstract_inverted_index"].apply(reconstruct_abstract),
    journal=lambda frame: frame["primary_location"].apply(extract_journal_name),
)

# Export the table without the row index.
df.to_csv("data/css_openalex_26022026.csv", index=False)

In [11]:
# Show the resulting DataFrame as the cell output.
df

Unnamed: 0,id,type,primary_location,title,abstract_inverted_index,publication_year,publication_date,open_access,relevance_score,abstract,journal
0,https://openalex.org/W2159397589,article,"{'id': 'doi:10.1126/science.1167742', 'is_oa':...",Computational Social Science,"{'A': [0], 'field': [1], 'is': [2], 'emerging'...",2009.0,2009-02-06,"{'is_oa': True, 'oa_status': 'green', 'oa_url'...",1360.754000,A field is emerging that leverages the capacit...,Science
1,https://openalex.org/W2070907364,article,"{'id': 'doi:10.1140/epjst/e2012-01697-8', 'is_...",Manifesto of computational social science,,2012.0,2012-11-01,"{'is_oa': True, 'oa_status': 'hybrid', 'oa_url...",498.112580,,The European Physical Journal Special Topics
2,https://openalex.org/W3081158114,article,"{'id': 'doi:10.1126/science.aaz8170', 'is_oa':...",Computational social science: Obstacles and op...,"{'Data': [0], 'sharing,': [1], 'research': [2]...",2020.0,2020-08-28,"{'is_oa': True, 'oa_status': 'green', 'oa_url'...",438.540130,"Data sharing, research ethics, and incentives ...",Science
3,https://openalex.org/W3022499311,article,{'id': 'doi:10.1146/annurev-soc-121919-054621'...,Computational Social Science and Sociology,"{'The': [0], 'integration': [1], 'of': [2, 16,...",2020.0,2020-04-28,"{'is_oa': True, 'oa_status': 'hybrid', 'oa_url...",413.034240,The integration of social science with compute...,Annual Review of Sociology
4,https://openalex.org/W3174174150,article,"{'id': 'doi:10.1038/s41586-021-03659-0', 'is_o...",Integrating explanation and prediction in comp...,,2021.0,2021-06-30,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",408.089800,,Nature
...,...,...,...,...,...,...,...,...,...,...,...
2123,https://openalex.org/W4245458837,article,"{'id': 'doi:10.1002/cplx.20383', 'is_oa': Fals...",Complexity at large,"{'The': [0, 155, 250, 320, 367, 466, 497, 529,...",2011.0,2011-08-26,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",0.095437,The following news item is taken in part from ...,Complexity
2124,https://openalex.org/W4413141860,article,{'id': 'doi:10.22541/au.175494733.30349065/v1'...,AI-Driven Behavioral and Sociocultural Analysi...,"{'Author:': [0], 'Khan': [1], 'Tahsin': [2], '...",2025.0,2025-08-11,"{'is_oa': True, 'oa_status': 'gold', 'oa_url':...",0.087923,Author: Khan Tahsin AbrarAffiliation: Independ...,
2125,https://openalex.org/W4231832138,paratext,"{'id': 'doi:10.1108/s1746-979120200000016018',...",Index,"{'Citation': [0], '(2020),': [1], '""Index"",': ...",2020.0,2020-10-06,"{'is_oa': True, 'oa_status': 'bronze', 'oa_url...",0.067887,"Citation (2020), ""Index"", Härtel, C.E.J., Zerb...",Research on emotion in organizations
2126,https://openalex.org/W4405505218,book-chapter,"{'id': 'doi:10.70593/978-81-982935-8-9_4', 'is...",Publishing academic books in emerging fields: ...,"{'Publishing': [0, 240, 261, 325, 561, 3442, 3...",2024.0,2024-12-05,"{'is_oa': True, 'oa_status': 'gold', 'oa_url':...",0.046431,Publishing academic books in emerging fields p...,
