# Extraire des données de HAL

Extraire les données

In [None]:
import requests
import json

rows = 1000
url = "https://api.archives-ouvertes.fr/search/"

params = {
    "q": "collCode_s:CREST",
    "fl": "*",
    "rows": rows,
    "wt": "json"
}

all_docs = []
start = 0

while True:
    params["start"] = start
    params["rows"] = rows

    r = requests.get(url, params=params)
    r.raise_for_status()

    resp = r.json()
    docs = resp["response"]["docs"]

    if not docs:
        break

    all_docs.extend(docs)
    start += rows
    print(f"Fetched {len(docs)} documents, total so far: {len(all_docs)}")

with open("hal_crest_data.json", "w", encoding="utf-8") as f:
    json.dump(all_docs, f, ensure_ascii=False, indent=4)

print(f"Total documents fetched: {len(all_docs)}")

Fetched 1000 documents, total so far: 1000
Fetched 1000 documents, total so far: 2000
Fetched 891 documents, total so far: 2891
Total documents fetched: 2891


Regarder un document

In [1]:
#all_docs[0]

## Extraire l'information utile

Faire une fonction qui : 
- prend un élément
- renvoie le titre, l'abstract, la date de publication, le type de document et la liste des auteurs

Charger le jeu de données

In [38]:
import json
with open("hal_crest_data.json", "r", encoding = "utf-8") as f:
    data = json.load(f)
len(data)

2891

Un élément

In [44]:
#data[0]

Définir une fonction pour extraire les données

In [39]:
def extract_info(doc):
    """
    Extraire les données
    """
    return {
        "docid": doc.get("docid"),
        "title": doc.get("title_s", ""),
        "abstract": doc.get("abstract_s", ""),
        "abstract_eng": doc.get("en_abstract_s", ""),
        "publicationDate_tdate": doc.get("publicationDate_tdate", ""),
        "document_type": doc.get("primaryDomain_s", ""),
        "authors": doc.get("authFullName_s", []),
    }

In [14]:
# extract_info(data[2])

Construire le corpus

In [17]:
corpus = []
for doc in data:
    corpus += [extract_info(doc)]

In [19]:
len(corpus)

2891

In [21]:
import pandas as pd

# extraire les données
corpus = [extract_info(doc) for doc in data]

# mettre sous un format dataframe
df = pd.DataFrame(corpus)
df.set_index("docid", inplace=True)
df.head()

Unnamed: 0_level_0,title,abstract,abstract_eng,publicationDate_tdate,document_type,authors
docid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3707033,[Unemployment and Online Labor - Evidence from...,,,2022-06-13T00:00:00Z,shs.eco,"[Ulrich Laitenberger, Steffen Viete, Olga Sliv..."
3026577,[The effect of price magnitude on analysts' fo...,[Recent research in finance shows that the mag...,[Recent research in finance shows that the mag...,2021-01-01T00:00:00Z,shs.eco,"[Tristan Roger, Wael Bousselmi, Patrick Roger,..."
4358212,[Model-based Clustering with Missing Not At Ra...,"[Model-based unsupervised learning, as any lea...","[Model-based unsupervised learning, as any lea...",2024-06-18T00:00:00Z,stat.ml,"[Aude Sportisse, Matthieu Marbac, Fabien Lapor..."
5444020,[Stationarity and ergodic properties for some ...,,,2023-12-01T00:00:00Z,phys.phys.phys-data-an,"[Paul Doukhan, Lionel Truquet, Michael H. Neum..."
4996040,[Inferring the parameters of Taylor’s power la...,[Taylor’s power law of fluctuation scaling has...,[Taylor’s power law of fluctuation scaling has...,2025-11-26T00:00:00Z,stat,"[Lionel Truquet, Joel E. Cohen, Paul Doukhan]"


In [42]:
df = pd.DataFrame([extract_info(doc) for doc in data]).set_index("docid")

Mettre en forme 
- ne garder que les éléments contenant un abstract
- mettre en forme les dates

In [43]:
# enlever les lignes sans abstract
df_ss = df[df["abstract"].notna() & (df["abstract"] != "")]

# mettre la date en format date
df_ss["date"] = pd.to_datetime(df_ss["publicationDate_tdate"], errors='coerce').dt.date

# écrire tout dans un fichier excel
df_ss.to_excel("data_hal_crest.xlsx")
df_ss.shape

(2308, 7)