In [1]:
import pandas as pd
from tqdm import tqdm
import json
import glob

In [2]:
!curl -L -o /content/arxiv.zip https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1464M  100 1464M    0     0   185M      0  0:00:07  0:00:07 --:--:--  163M


In [3]:
!unzip "/content/arxiv.zip"

Archive:  /content/arxiv.zip
replace arxiv-metadata-oai-snapshot.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
total_papers = 0
with open('/content/arxiv-metadata-oai-snapshot.json', 'r') as f:
    for _ in tqdm(f, desc="Counting papers"):
        total_papers += 1

Counting papers: 2720631it [00:05, 492358.58it/s]


In [5]:
chunks = []
chunk_size = 100000

with open('/content/arxiv-metadata-oai-snapshot.json', 'r') as f:
    for i, line in enumerate(tqdm(f, total=total_papers)):
        paper = json.loads(line)
        # Extract ONLY what you need
        chunks.append({
            'id': paper['id'],
            'abstract': paper.get('abstract', ''),
            'categories': paper.get('categories', '').split(),  # Primary category
            'year': paper.get('update_date', '')[:4]  # Just the year
        })

        # Save memory by dumping chunks to disk periodically
        if (i + 1) % chunk_size == 0:
            pd.DataFrame(chunks).to_parquet(f'chunk_{i}.parquet')  # Parquet is compressed
            chunks = []  # Reset
if chunks:
    pd.DataFrame(chunks).to_parquet(f'chunk_{i}_final.parquet')
# Merge all chunks at the end (if needed)
df = pd.concat([pd.read_parquet(f) for f in glob.glob('chunk_*.parquet')])

100%|██████████| 2720631/2720631 [01:42<00:00, 26519.61it/s]


In [6]:
df.head()

Unnamed: 0,id,abstract,categories,year
0,812.3874,No-scale supersymmetry or gaugino mediation ...,[hep-ph],2014
1,812.3875,Numerous authors have referred to room-tempe...,[cond-mat.mtrl-sci],2009
2,812.3876,A comprehensive number of integrals emerging...,[hep-lat],2009
3,812.3877,Isoscalar and isovector particle densities a...,[nucl-th],2009
4,812.3878,We study the propagation of a color singlet ...,"[nucl-th, hep-ph]",2010


In [7]:
df.columns

Index(['id', 'abstract', 'categories', 'year'], dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2720631 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   id          object
 1   abstract    object
 2   categories  object
 3   year        object
dtypes: object(4)
memory usage: 103.8+ MB


In [9]:
df['categories']

Unnamed: 0,categories
0,[hep-ph]
1,[cond-mat.mtrl-sci]
2,[hep-lat]
3,[nucl-th]
4,"[nucl-th, hep-ph]"
...,...
99995,[gr-qc]
99996,[math.OC]
99997,[math.AC]
99998,"[cs.LG, cs.RO]"


In [11]:

import re

# Check if all IDs match the expected format (####.####)
def is_valid_id(paper_id):
    return bool(re.match(r'^\d{4}\.\d{4}$', paper_id))

# Test on a sample of IDs (or the entire DataFrame if small)
sample_ids = df['id'].sample(10, random_state=42)
print("Sample IDs and their validity:")
for paper_id in sample_ids:
    print(f"{paper_id} → Valid: {is_valid_id(paper_id)}")

# Check if any ID does NOT match the format
invalid_ids = df[~df['id'].str.match(r'^\d{4}\.\d{4}$', na=False)]
print(f"\nNumber of invalid IDs: {len(invalid_ids)}")
if len(invalid_ids) > 0:
    print("Examples of invalid IDs:")
    print(invalid_ids['id'].head())

Sample IDs and their validity:
hep-ex/0505035 → Valid: False
hep-ph/0112317 → Valid: False
1404.5617 → Valid: True
cs/0606051 → Valid: False
astro-ph/0609648 → Valid: False
0902.4863 → Valid: True
2207.09640 → Valid: False
2110.09179 → Valid: False
2403.06655 → Valid: False
hep-ph/0702048 → Valid: False

Number of invalid IDs: 2133882
Examples of invalid IDs:
0    quant-ph/0011094
1    quant-ph/0011095
2    quant-ph/0011096
3    quant-ph/0011097
4    quant-ph/0011098
Name: id, dtype: object


In [12]:
df[df['id'] == 'quant-ph/0011094']

Unnamed: 0,id,abstract,categories,year
0,quant-ph/0011094,We have tested complementarity for the ensem...,[quant-ph],2007


In [13]:
def extract_id(paper_id):
    # For old format: strip subject prefix
    if '/' in paper_id:
        return paper_id.split('/')[-1]
    # For new/modern: keep as-is
    return paper_id

df['id'] = df['id'].apply(extract_id)

In [14]:
df

Unnamed: 0,id,abstract,categories,year
0,0812.3874,No-scale supersymmetry or gaugino mediation ...,[hep-ph],2014
1,0812.3875,Numerous authors have referred to room-tempe...,[cond-mat.mtrl-sci],2009
2,0812.3876,A comprehensive number of integrals emerging...,[hep-lat],2009
3,0812.3877,Isoscalar and isovector particle densities a...,[nucl-th],2009
4,0812.3878,We study the propagation of a color singlet ...,"[nucl-th, hep-ph]",2010
...,...,...,...,...
99995,1710.04612,A promising theory in modifying general rela...,[gr-qc],2018
99996,1710.04613,We consider an $\ell_0$-minimization problem...,[math.OC],2019
99997,1710.04614,"Given an ideal I in a polynomial ring, we co...",[math.AC],2017
99998,1710.04615,Imitation learning is a powerful paradigm fo...,"[cs.LG, cs.RO]",2018


In [16]:
# Count papers with multiple categories
multi_cat = df['categories'].apply(len) > 1
print(f"{multi_cat.sum()} papers have multiple categories.")

# Show an example
if multi_cat.any():
    print("\nExample multi-category paper:")
    print(df[multi_cat].iloc[0])

1284561 papers have multiple categories.

Example multi-category paper:
id                                                    0812.3878
abstract        We study the propagation of a color singlet ...
categories                                    [nucl-th, hep-ph]
year                                                       2010
Name: 4, dtype: object


In [19]:
df

Unnamed: 0,id,abstract,categories,year
0,0812.3874,No-scale supersymmetry or gaugino mediation ...,[hep-ph],2014
1,0812.3875,Numerous authors have referred to room-tempe...,[cond-mat.mtrl-sci],2009
2,0812.3876,A comprehensive number of integrals emerging...,[hep-lat],2009
3,0812.3877,Isoscalar and isovector particle densities a...,[nucl-th],2009
4,0812.3878,We study the propagation of a color singlet ...,"[nucl-th, hep-ph]",2010
...,...,...,...,...
99995,1710.04612,A promising theory in modifying general rela...,[gr-qc],2018
99996,1710.04613,We consider an $\ell_0$-minimization problem...,[math.OC],2019
99997,1710.04614,"Given an ideal I in a polynomial ring, we co...",[math.AC],2017
99998,1710.04615,Imitation learning is a powerful paradigm fo...,"[cs.LG, cs.RO]",2018


#Abstract Cleaning

In [28]:
def clean_abstract(text):
    """
    Clean abstracts for embedding/clustering:
    1. Lowercase
    2. Replace LaTeX symbols (e.g., \ell_0 → L0, \alpha_1 → alpha1)
    3. Remove LaTeX equations (e.g., \frac{1}{2})
    4. Remove URLs, arXiv IDs, emails
    5. Keep letters, numbers, basic punctuation
    6. Remove extra whitespace
    """
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Step 1: Replace LaTeX symbols with subscripts (e.g., \ell_0 → L0)
    latex_symbols = {
        r'\\ell': 'L',       # \ell → L
        r'\\alpha': 'alpha',
        r'\\beta': 'beta',
        r'\\gamma': 'gamma',
        r'\\theta': 'theta',
        r'\\lambda': 'lambda',
        r'\\sigma': 'sigma',
        r'\\mu': 'mu',
        r'\\epsilon': 'epsilon'
    }

    # Handle subscripts (e.g., \ell_0 → L0, \alpha_{1} → alpha1)
    for pattern, replacement in latex_symbols.items():
        # Case 1: \symbol_{number} → symbolnumber (e.g., \alpha_{1} → alpha1)
        text = re.sub(pattern + r'_\{(\d+)\}', replacement + r'\1', text)
        # Case 2: \symbol_number → symbolnumber (e.g., \alpha_1 → alpha1)
        text = re.sub(pattern + r'_(\d+)', replacement + r'\1', text)
        # Case 3: Standalone symbols (e.g., \alpha → alpha)
        text = re.sub(pattern, replacement, text)

    # Step 2: Remove remaining LaTeX commands (e.g., \frac{1}{2})
    text = re.sub(r'\\[a-zA-Z]+\{.*?\}', ' ', text)

    # Step 3: Remove URLs, arXiv IDs, emails
    text = re.sub(r'http\S+|www\S+|arxiv:\d+\.\d+|@\S+', ' ', text)

    # Step 4: Keep letters, numbers, and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,;:!?\'"-]', ' ', text)

    # Step 5: Collapse whitespace and trim
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [29]:
tqdm.pandas(desc="Cleaning abstracts")
df['cleaned_abstract'] = df['abstract'].progress_apply(clean_abstract)

Cleaning abstracts: 100%|██████████| 2720631/2720631 [08:51<00:00, 5119.49it/s]


In [30]:
df

Unnamed: 0,id,abstract,categories,year,cleaned_abstract
0,0812.3874,No-scale supersymmetry or gaugino mediation ...,[hep-ph],2014,no-scale supersymmetry or gaugino mediation au...
1,0812.3875,Numerous authors have referred to room-tempe...,[cond-mat.mtrl-sci],2009,numerous authors have referred to room-tempera...
2,0812.3876,A comprehensive number of integrals emerging...,[hep-lat],2009,a comprehensive number of integrals emerging i...
3,0812.3877,Isoscalar and isovector particle densities a...,[nucl-th],2009,isoscalar and isovector particle densities are...
4,0812.3878,We study the propagation of a color singlet ...,"[nucl-th, hep-ph]",2010,we study the propagation of a color singlet q ...
...,...,...,...,...,...
99995,1710.04612,A promising theory in modifying general rela...,[gr-qc],2018,a promising theory in modifying general relati...
99996,1710.04613,We consider an $\ell_0$-minimization problem...,[math.OC],2019,we consider an L0 -minimization problem where ...
99997,1710.04614,"Given an ideal I in a polynomial ring, we co...",[math.AC],2017,"given an ideal i in a polynomial ring, we cons..."
99998,1710.04615,Imitation learning is a powerful paradigm fo...,"[cs.LG, cs.RO]",2018,imitation learning is a powerful paradigm for ...


In [31]:
df[df['id']=="1710.04613"]

Unnamed: 0,id,abstract,categories,year,cleaned_abstract
99996,1710.04613,We consider an $\ell_0$-minimization problem...,[math.OC],2019,we consider an L0 -minimization problem where ...


In [32]:
df.to_csv("/content/arxiv.csv")