In [1]:
def preview_file(filename, n=3):
    with open(filename, 'r', encoding='utf-8') as f:
        for _ in range(n):
            line = f.readline()
            print(line[:500])  # affiche 500 premiers caractères

preview_file("arxiv-metadata-oai-snapshot.json")


{"id":"0704.0001","submitter":"Pavel Nadolsky","authors":"C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan","title":"Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies","comments":"37 pages, 15 figures; published version","journal-ref":"Phys.Rev.D76:013009,2007","doi":"10.1103/PhysRevD.76.013009","report-no":"ANL-HEP-PR-07-12","categories":"hep-ph","license":null,"abstract":"  A fully differential calculation in perturbative quantum chromodynamics is\n
{"id":"0704.0002","submitter":"Louis Theran","authors":"Ileana Streinu and Louis Theran","title":"Sparsity-certifying Graph Decompositions","comments":"To appear in Graphs and Combinatorics","journal-ref":null,"doi":null,"report-no":null,"categories":"math.CO cs.CG","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","abstract":"  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors, and use\nit obtain a characterization of the family of $(k,\\ell)$-sparse graphs and

In [3]:
import json
import pandas as pd
import re

def extract_year(article_id):
    # id au format YYMM.XXXX, on convertit YY en 2000+YY si < 50 sinon 1900+YY
    match = re.match(r'(\d{2})\d{2}\.\d+', article_id)
    if match:
        yy = int(match.group(1))
        year = 2000 + yy if yy < 50 else 1900 + yy
        return year
    return None

def process_arxiv_file_balanced(input_filepath, min_year=2020, max_year=2025, max_articles=3000, max_per_year=700):
    articles = []
    authors_list = []
    counts_per_year = {year: 0 for year in range(min_year, max_year + 1)}
    total_count = 0

    with open(input_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                article = json.loads(line)
            except json.JSONDecodeError:
                continue

            year = extract_year(article.get('id', ''))
            if year is None or year < min_year or year > max_year:
                continue

            cats = article.get('categories', '')
            if not any(c.strip().startswith('cs') for c in cats.split()):
                continue

            # Stopper la collecte si quota max atteint pour cette année
            if counts_per_year[year] >= max_per_year:
                continue

            # Nettoyage basique des champs texte
            for field in ['title', 'abstract', 'comments', 'journal-ref', 'doi', 'submitter']:
                if field in article and article[field]:
                    article[field] = article[field].replace('\n', ' ').strip()
                else:
                    article[field] = ''

            article['year'] = year
            article['categories'] = cats

            articles.append(article)
            counts_per_year[year] += 1
            total_count += 1

            authors_str = article.get('authors', '')
            authors = [a.strip() for a in authors_str.split(',') if a.strip()]
            for auth in authors:
                authors_list.append({
                    'article_id': article['id'],
                    'author_name': auth
                })

            # Arrêter si on a atteint le max total d’articles
            if total_count >= max_articles:
                break

            # Arrêter si toutes les années ont atteint leur quota max
            if all(count >= max_per_year for count in counts_per_year.values()):
                break

    df_articles = pd.DataFrame(articles)
    df_authors = pd.DataFrame(authors_list).drop_duplicates()

    print("Répartition des articles par année :", counts_per_year)
    print("Nombre total d'articles :", total_count)

    return df_articles, df_authors

# Exemple d'utilisation :
df_articles, df_authors = process_arxiv_file_balanced("arxiv-metadata-oai-snapshot.json")


Répartition des articles par année : {2020: 700, 2021: 700, 2022: 700, 2023: 700, 2024: 200, 2025: 0}
Nombre total d'articles : 3000


In [4]:

# 4. Sauvegarder résultats
df_articles.to_csv('arxiv_cs_2020_2025_articles.csv', index=False, encoding='utf-8-sig')
df_authors.to_csv('arxiv_cs_2020_2025_authors.csv', index=False, encoding='utf-8-sig')

pretraitement

In [5]:
import pandas as pd

# Chemins vers les fichiers CSV
ARTICLES_FILE = "arxiv_cs_2020_2025_articles.csv"
AUTHORS_FILE = "arxiv_cs_2020_2025_authors.csv"

# Chargement
df_articles = pd.read_csv(ARTICLES_FILE, encoding='utf-8-sig')
df_authors = pd.read_csv(AUTHORS_FILE, encoding='utf-8-sig')

print("=== Aperçu des articles ===")
print(df_articles.head(3))
print("\nColonnes articles:", df_articles.columns.tolist())
print("\nTypes articles:")
print(df_articles.dtypes)

print("\nNombre d'articles :", len(df_articles))
print("Articles par année:")
print(df_articles['year'].value_counts().sort_index())

print("\nCatégories (extraits):")
print(df_articles['categories'].value_counts().head(10))

print("\nValeurs manquantes par colonne (articles):")
print(df_articles.isna().sum())

print("\n=== Aperçu des auteurs ===")
print(df_authors.head(3))
print("\nColonnes auteurs:", df_authors.columns.tolist())
print("\nTypes auteurs:")
print(df_authors.dtypes)

print("\nNombre total d'auteurs extraits:", len(df_authors))
print("Nombre d'auteurs uniques :", df_authors['author_name'].nunique())

print("\nValeurs manquantes par colonne (auteurs):")
print(df_authors.isna().sum())

print("\nDoublons dans auteurs (article_id + author_name):", df_authors.duplicated(subset=['article_id', 'author_name']).sum())


=== Aperçu des articles ===
           id       submitter  \
0  2001.00001   Maria Mannone   
1  2001.00003  Chengyue Jiang   
2  2001.00004  Rakesh Mohanty   

                                             authors  \
0  Maria Mannone, Federico Favali, Balandino Di D...   
1  Chengyue Jiang, Zhonglin Nian, Kaihao Guo, Sha...   
2  Rakesh Mohanty, Debasis Dwibedy, Shreeya Swaga...   

                                               title  \
0  Quantum GestART: Identifying and Applying Corr...   
1                        Learning Numeral Embeddings   
2  New Competitive Analysis Results of Online Lis...   

                                            comments  \
0  Accepted for publication, Journal of Mathemati...   
1                                                NaN   
2  9 pages, In Proceeding of the 14th Annual ADMA...   

                              journal-ref                            doi  \
0  Journal of Mathematics and Music, 2020  10.1080/17459737.2020.1726691   
1           

In [6]:
import pandas as pd
import re

# Fichier source
ARTICLES_FILE = "arxiv_cs_2020_2025_articles.csv"

# Chargement
df_articles = pd.read_csv(ARTICLES_FILE, encoding='utf-8-sig')

# 1. Nettoyage général des articles ----------------------

# Suppression doublons selon id
df_articles = df_articles.drop_duplicates(subset=['id']).reset_index(drop=True)

# Suppression colonne 'comments' trop incomplète
if 'comments' in df_articles.columns:
    df_articles.drop(columns=['comments'], inplace=True)

# Colonnes texte où remplacer NaN par chaîne vide
text_cols = ['journal-ref', 'doi', 'report-no', 'license', 'abstract', 'title', 'categories', 'submitter']
for col in text_cols:
    if col in df_articles.columns:
        df_articles[col] = df_articles[col].fillna('')

# Nettoyage retours à la ligne et espaces dans 'title' et 'abstract'
for col in ['title', 'abstract']:
    df_articles[col] = df_articles[col].str.replace('\n', ' ', regex=True).str.strip()

# 2. Extraction + nettoyage auteurs ----------------------

def clean_latex(name):
    """Simplifie certains accents LaTeX courants dans les noms."""
    name = name.replace("\\'", "'")      # accent aigu
    name = re.sub(r"\\v\{([a-zA-Z])\}", r"\1", name)  # accent caron
    name = name.replace("\\\"", '"')     # guillemets
    name = name.replace("~", " ")        # espace insécable
    return name.strip()

def split_authors(authors_str):
    """Sépare la chaîne 'authors' sur ' and ' en liste d'auteurs nettoyés."""
    if pd.isna(authors_str) or authors_str.strip() == '':
        return []
    authors = authors_str.split(' and ')
    return [clean_latex(a.strip()) for a in authors]

# Extraire auteurs par article et exploser en lignes séparées
df_authors = df_articles[['id', 'authors']].copy()
df_authors['author_name'] = df_authors['authors'].apply(split_authors)
df_authors = df_authors.explode('author_name').reset_index(drop=True)
df_authors['author_name'] = df_authors['author_name'].fillna('').str.strip()

# Supprimer doublons article+auteur
df_authors = df_authors.drop_duplicates(subset=['id', 'author_name']).reset_index(drop=True)

# Renommer colonne id pour cohérence
df_authors.rename(columns={'id': 'article_id'}, inplace=True)

# 3. Sauvegarde ------------------------------------------

df_articles.to_csv("arxiv_cs_2020_2025_articles_clean.csv", index=False, encoding='utf-8-sig')
df_authors.to_csv("arxiv_cs_2020_2025_authors_clean.csv", index=False, encoding='utf-8-sig')

print("✅ Nettoyage terminé.")
print(f"Articles nettoyés: {len(df_articles)}")
print(f"Auteurs extraits (lignes): {len(df_authors)}")
print(f"Auteurs uniques: {df_authors['author_name'].nunique()}")


✅ Nettoyage terminé.
Articles nettoyés: 3000
Auteurs extraits (lignes): 4320
Auteurs uniques: 4189


In [7]:
articles_par_year = df_articles.groupby('year').size().reset_index(name='count')
print(articles_par_year)


   year  count
0  2020    700
1  2021    700
2  2022    700
3  2023    700
4  2024    200


In [7]:
import pandas as pd

ARTICLES_FILE = "arxiv_cs_2020_2025_articles_clean.csv"
AUTHORS_FILE = "arxiv_cs_2020_2025_authors_clean.csv"

# Chargement
df_articles = pd.read_csv(ARTICLES_FILE, encoding='utf-8-sig')
df_authors = pd.read_csv(AUTHORS_FILE, encoding='utf-8-sig')


print("=== Aperçu des articles ===")
print(df_articles.head(3))  



=== Aperçu des articles ===
           id       submitter  \
0  2001.00001   Maria Mannone   
1  2001.00003  Chengyue Jiang   
2  2001.00004  Rakesh Mohanty   

                                             authors  \
0  Maria Mannone, Federico Favali, Balandino Di D...   
1  Chengyue Jiang, Zhonglin Nian, Kaihao Guo, Sha...   
2  Rakesh Mohanty, Debasis Dwibedy, Shreeya Swaga...   

                                               title  \
0  Quantum GestART: Identifying and Applying Corr...   
1                        Learning Numeral Embeddings   
2  New Competitive Analysis Results of Online Lis...   

                              journal-ref                            doi  \
0  Journal of Mathematics and Music, 2020  10.1080/17459737.2020.1726691   
1                                     NaN                            NaN   
2                                     NaN                            NaN   

  report-no     categories                                            license  \
0  

In [8]:
print("aperçu des auteurs === ")    
print(df_authors.head(3))

aperçu des auteurs === 
   article_id                                            authors  \
0  2001.00001  Maria Mannone, Federico Favali, Balandino Di D...   
1  2001.00003  Chengyue Jiang, Zhonglin Nian, Kaihao Guo, Sha...   
2  2001.00004  Rakesh Mohanty, Debasis Dwibedy, Shreeya Swaga...   

                                         author_name  
0  Maria Mannone, Federico Favali, Balandino Di D...  
1  Chengyue Jiang, Zhonglin Nian, Kaihao Guo, Sha...  
2  Rakesh Mohanty, Debasis Dwibedy, Shreeya Swaga...  


In [None]:
ARTICLES_FILE = "arxiv_cs_2020_2025_articles_clean.csv"
AUTHORS_FILE = "arxiv_cs_2020_2025_authors_clean.csv"

# Chargement
df_articles = pd.read_csv(ARTICLES_FILE, encoding='utf-8-sig')
df_authors = pd.read_csv(AUTHORS_FILE, encoding='utf-8-sig')



In [25]:
import pandas as pd

# Charger le fichier existant
df_authors = pd.read_csv("arxiv_cs_2020_2025_authors_clean.csv", encoding='utf-8-sig')

# Nettoyage et explosion des auteurs
df_authors_exp = df_authors[['article_id', 'authors']].copy()

# Remplacer " and " par ", " pour uniformiser la séparation
df_authors_exp['author_name'] = df_authors_exp['authors'].str.replace(" and ", ", ")

# Split en liste et explode
df_authors_exp['author_name'] = df_authors_exp['author_name'].str.split(',')

df_authors_exp = df_authors_exp.explode('author_name')

# Nettoyer espaces
df_authors_exp['author_name'] = df_authors_exp['author_name'].str.strip()

# Supprimer colonne 'authors' initiale
df_authors_exp = df_authors_exp.drop(columns=['authors'])

# Supprimer doublons éventuels
df_authors_exp = df_authors_exp.drop_duplicates(subset=['article_id', 'author_name']).reset_index(drop=True)

# Sauvegarder dans un nouveau fichier CSV
df_authors_exp.to_csv("arxiv_cs_2020_2025_authors_clean_expanded.csv", index=False, encoding='utf-8-sig')

print("✅ Nettoyage, explosion auteurs et sauvegarde terminés.")
print(f"Nombre de lignes dans le fichier nettoyé : {len(df_authors_exp)}")


✅ Nettoyage, explosion auteurs et sauvegarde terminés.
Nombre de lignes dans le fichier nettoyé : 11967


In [28]:
ARTICLES_FILE = "arxiv_cs_2020_2025_articles_clean.csv"
AUTHORS_FILE = "arxiv_cs_2020_2025_authors_clean_expanded.csv"

# Chargement
df_articles = pd.read_csv(ARTICLES_FILE, encoding='utf-8-sig')
df_authors = pd.read_csv(AUTHORS_FILE, encoding='utf-8-sig')

print(df_articles.head(3))
print(df_authors.head(3))

           id       submitter  \
0  2001.00001   Maria Mannone   
1  2001.00003  Chengyue Jiang   
2  2001.00004  Rakesh Mohanty   

                                             authors  \
0  Maria Mannone, Federico Favali, Balandino Di D...   
1  Chengyue Jiang, Zhonglin Nian, Kaihao Guo, Sha...   
2  Rakesh Mohanty, Debasis Dwibedy, Shreeya Swaga...   

                                               title  \
0  Quantum GestART: Identifying and Applying Corr...   
1                        Learning Numeral Embeddings   
2  New Competitive Analysis Results of Online Lis...   

                              journal-ref                            doi  \
0  Journal of Mathematics and Music, 2020  10.1080/17459737.2020.1726691   
1                                     NaN                            NaN   
2                                     NaN                            NaN   

  report-no     categories                                            license  \
0       NaN  math.HO cs.MM  htt