In [1]:
import requests
import csv
import xml.etree.ElementTree as ET

def get_arxiv_papers(query, max_results=1000, page_size=200, timeout=30, delay_seconds=3.0):
    """Collect PDF links of arXiv articles matching ``query`` via the arXiv API.

    Results are fetched page by page until ``max_results`` links have been
    gathered, the API returns a page without entries, or a request answers
    with a non-200 status (in which case the links gathered so far are
    returned).

    Args:
        query: arXiv ``search_query`` expression, e.g. ``"cat:cs.LG"``.
        max_results: Maximum number of links to return.
        page_size: Maximum number of entries requested per API call.
        timeout: Timeout in seconds passed to ``requests.get``.
        delay_seconds: Pause between consecutive API calls; pass 0 to disable.

    Returns:
        A list of at most ``max_results`` PDF URLs, in API order.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
        requests.RequestException: Propagated from ``requests.get``
            (connection failure, timeout, ...).
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    import time  # local import: only needed for the pause between pages

    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    base_url = "http://export.arxiv.org/api/query"
    atom = "{http://www.w3.org/2005/Atom}"
    all_links = []
    start = 0

    while len(all_links) < max_results:
        # Be polite to the API: wait between successive calls, but not before the first.
        if start and delay_seconds > 0:
            time.sleep(delay_seconds)

        # Never ask for more entries than are still needed.
        batch = min(page_size, max_results - len(all_links))
        # Passing params lets requests URL-encode the query instead of
        # splicing it raw into the URL.
        response = requests.get(
            base_url,
            params={"search_query": query, "start": start, "max_results": batch},
            timeout=timeout,
        )

        if response.status_code != 200:
            print(f"Erreur HTTP {response.status_code} lors de la requête à l'API ArXiv.")
            break

        # Parse the raw bytes so the XML parser applies the encoding declared
        # in the document rather than a guessed one.
        root = ET.fromstring(response.content)
        entries = root.findall(f"{atom}entry")

        if not entries:
            break  # no more results

        for entry in entries:
            # First <link title="pdf"> of the entry, if it has one.
            pdf_link = next(
                (
                    link.attrib["href"]
                    for link in entry.findall(f"{atom}link")
                    if link.attrib.get("title") == "pdf"
                ),
                None,
            )

            if pdf_link:
                print(pdf_link)  # show each PDF link as it is found
                all_links.append(pdf_link)

        start += batch

    return all_links[:max_results]


def save_links_to_csv(links, filename="DataMongo.csv"):
    """Write the links to a one-column CSV file.

    The file is overwritten, encoded as UTF-8, and starts with a ``Link``
    header row followed by one row per link.

    Args:
        links: Iterable of link strings.
        filename: Path of the CSV file to create.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as out_file:
        link_writer = csv.writer(out_file)
        link_writer.writerow(["Link"])
        for url in links:
            link_writer.writerow([url])


if __name__ == "__main__":
    # arXiv category label -> API search query used to harvest it.
    fields = {"cs.LG": "cat:cs.LG"}  # machine learning

    # Gather the PDF links of every configured category into one list.
    all_links = []
    for category, query in fields.items():
        print(f"Récupération des articles pour {category}...")
        links = get_arxiv_papers(query, max_results=1000)
        all_links += links

    # Written to the default output file of save_links_to_csv (DataMongo.csv).
    save_links_to_csv(all_links)
    print("Liens enregistrés dans DataMongo.csv")


Récupération des articles pour cs.LG...
http://arxiv.org/pdf/cs/9905014v1
http://arxiv.org/pdf/cs/9905015v1
http://arxiv.org/pdf/cs/0001004v1
http://arxiv.org/pdf/cs/0002006v1
http://arxiv.org/pdf/cs/0009001v3
http://arxiv.org/pdf/cs/0009007v1
http://arxiv.org/pdf/cs/0011032v1
http://arxiv.org/pdf/cs/0011044v1
http://arxiv.org/pdf/cs/0103003v1
http://arxiv.org/pdf/cs/0110036v1
http://arxiv.org/pdf/cs/0211003v1
http://arxiv.org/pdf/cs/0211007v1
http://arxiv.org/pdf/cs/0309015v1
http://arxiv.org/pdf/cs/0311042v1
http://arxiv.org/pdf/cs/0312004v1
http://arxiv.org/pdf/cs/0401005v1
http://arxiv.org/pdf/cs/0412003v1
http://arxiv.org/pdf/cs/0502016v1
http://arxiv.org/pdf/cs/0504001v1
http://arxiv.org/pdf/cs/0506004v4
http://arxiv.org/pdf/cs/0506007v2
http://arxiv.org/pdf/cs/0506057v2
http://arxiv.org/pdf/cs/0506085v1
http://arxiv.org/pdf/cs/0507033v2
http://arxiv.org/pdf/cs/0507044v1
http://arxiv.org/pdf/cs/0507062v1
http://arxiv.org/pdf/cs/0509055v1
http://arxiv.org/pdf/cs/0510038v4
http://a

In [11]:
import requests
import json
import xml.etree.ElementTree as ET

def get_arxiv_papers(query, max_results=1000, page_size=200, timeout=30, delay_seconds=3.0):
    """Fetch metadata of arXiv articles matching ``query`` via the arXiv API.

    Results are fetched page by page until ``max_results`` papers have been
    gathered, the API returns a page without entries, or a request answers
    with a non-200 status (in which case the papers gathered so far are
    returned). Entries without a link titled ``pdf`` are skipped.

    Args:
        query: arXiv ``search_query`` expression, e.g. ``"cat:stat.ML"``.
        max_results: Maximum number of papers to return.
        page_size: Maximum number of entries requested per API call.
        timeout: Timeout in seconds passed to ``requests.get``.
        delay_seconds: Pause between consecutive API calls; pass 0 to disable.

    Returns:
        A list of at most ``max_results`` dicts with the keys ``id``,
        ``title``, ``summary``, ``published`` and ``pdf_link``. A metadata
        value is ``None`` when the corresponding element is absent from the
        entry.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
        requests.RequestException: Propagated from ``requests.get``
            (connection failure, timeout, ...).
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    import time  # local import: only needed for the pause between pages

    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    base_url = "http://export.arxiv.org/api/query"
    atom = "{http://www.w3.org/2005/Atom}"
    papers = []
    start = 0

    while len(papers) < max_results:
        # Be polite to the API: wait between successive calls, but not before the first.
        if start and delay_seconds > 0:
            time.sleep(delay_seconds)

        # Never ask for more entries than are still needed.
        batch = min(page_size, max_results - len(papers))
        # Passing params lets requests URL-encode the query instead of
        # splicing it raw into the URL.
        response = requests.get(
            base_url,
            params={"search_query": query, "start": start, "max_results": batch},
            timeout=timeout,
        )

        if response.status_code != 200:
            print(f"Erreur HTTP {response.status_code} lors de la requête à l'API ArXiv.")
            break

        # Parse the raw bytes so the XML parser applies the encoding declared
        # in the document rather than a guessed one.
        root = ET.fromstring(response.content)
        entries = root.findall(f"{atom}entry")

        if not entries:
            break  # no more results

        for entry in entries:
            paper_data = {}
            for field in ("id", "title", "summary", "published"):
                node = entry.find(f"{atom}{field}")
                # A missing element yields None instead of an AttributeError.
                paper_data[field] = node.text if node is not None else None

            # First <link title="pdf"> of the entry, if it has one.
            pdf_link = next(
                (
                    link.attrib["href"]
                    for link in entry.findall(f"{atom}link")
                    if link.attrib.get("title") == "pdf"
                ),
                None,
            )

            if pdf_link:
                paper_data['pdf_link'] = pdf_link
                papers.append(paper_data)

        start += batch

    return papers[:max_results]

def save_papers_to_json(papers, filename="stat_ML"):
    """Serialize the paper records to a JSON file.

    The file is overwritten and encoded as UTF-8; non-ASCII characters are
    written as-is and the document is indented by four spaces. Note that the
    default ``filename`` carries no ``.json`` extension.

    Args:
        papers: JSON-serializable collection of paper records.
        filename: Path of the file to create.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        serialized = json.dumps(papers, ensure_ascii=False, indent=4)
        out_file.write(serialized)

if __name__ == "__main__":
    # arXiv category label -> API search query used to harvest it.
    fields = {
        # "cs.LG": "cat:cs.LG",  # machine learning
        # "cs.AI": "cat:cs.AI",  # artificial intelligence
        "stat.ML": "cat:stat.ML",
    }
    # Pass the output path explicitly: the default of save_papers_to_json is
    # "stat_ML" (no extension), which did not match the file name announced
    # in the final message.
    output_path = "stat_ML.json"
    all_papers = []

    for category, query in fields.items():
        print(f"Récupération des articles pour {category}...")
        papers = get_arxiv_papers(query, max_results=1000)
        all_papers.extend(papers)

    save_papers_to_json(all_papers, filename=output_path)
    print(f"Données des articles enregistrées dans {output_path}")

Récupération des articles pour stat.ML...
Données des articles enregistrées dans stat_ML.json
