In [1]:
import json

def load_local(fn):
    """Load a JSON Lines file into a list.

    Args:
        fn: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Plain read mode: nothing is written here, and "r+" would needlessly
    # require write permission (failing on read-only files).
    with open(fn, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records

def print_json(d, fn):
    """Write every item of ``d`` to ``fn`` as one JSON document per line.

    The file is created (or truncated) and written as UTF-8; non-ASCII
    characters are emitted verbatim rather than as ``\\uXXXX`` escapes.

    Args:
        d: Iterable of JSON-serialisable objects.
        fn: Destination path.
    """
    with open(fn, "w+", encoding="utf-8") as out:
        out.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in d)

def yield_local(fn):
    """Lazily iterate over the records of a JSON Lines file.

    Blank lines and lines that are not valid JSON are skipped (best-effort
    reading of partially corrupted files).

    Args:
        fn: Path of a UTF-8 encoded file holding one JSON document per line.

    Yields:
        The parsed object of each valid, non-blank line, in file order.
    """
    with open(fn, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Deliberately tolerant: a malformed line is dropped, not fatal.
                continue
            # Yield outside the try block so that GeneratorExit (raised by
            # close()) and exceptions thrown in by the consumer are never
            # swallowed by the parse-error handler.
            yield record

In [None]:
import sys
# Make the local latex_parser checkout importable before the import below.
# NOTE(review): machine-specific absolute Windows path — will not resolve elsewhere.
sys.path.append("P:\\AI4S\\survey_eval\\latex_parser\\")
from latex_parser.tex_parser import LatexPaperParser, process_input_commands
# Disabled experiment: expand the input commands of one paper's main .tex file,
# parse it with LatexPaperParser and print how many items
# map_citations_to_sentence() returns.
# base = "P:\\AI4S\\survey_eval\\crawled_papers\\cs\\2501.00842"
# with open(f"{base}\\elsarticle-template-num.tex", encoding='utf-8') as f:
#     content = process_input_commands(f.read(), base)
# parser = LatexPaperParser(content, base)
# paper = parser.parse()
# all_citations = paper.map_citations_to_sentence()
# print(len(all_citations))
from pylatexenc.latexwalker import LatexWalker
from pylatexenc.latex2text import LatexNodes2Text
# Probe input: one \href with both arguments followed by one \href with only a URL argument.
content = "\\href{https://www.itu.int/rec/R-REC-M.2160/en}{M.2160}\\href{https://www.itu.int/rec/R-REC-M.2160/en}"
import re

# NOTE(review): the output recorded under this cell does not start with this
# string, so it appears to come from an earlier version of `content` — re-run to refresh.
print(content)
# Repeats the LatexNodes2Text import made a few lines above.
from pylatexenc.latex2text import LatexNodes2Text

# get_latex_nodes() is unpacked as a 3-tuple; only its first element (the node list) is used.
nodes, _, _ = LatexWalker(content).get_latex_nodes()
converter = LatexNodes2Text(
    # presumably leaves math segments untouched in the text output — confirm in the pylatexenc docs
    math_mode="verbatim",
)
# Show the parsed node list, then the plain text produced from it.
print(nodes)
print(converter.nodelist_to_text(nodes))

M.2160
[LatexCharsNode(parsing_state=<parsing state 2310916137072>, pos=0, len=6, chars='M.2160')]
M.2160


In [None]:
import bibtexparser

def parse_bib_file(filepath: str):
    """Read a BibTeX file and index its entries by citation key.

    Entries whose ``ID`` is missing or empty are skipped. When the same key
    occurs more than once, the first entry wins.

    Args:
        filepath: Path of a UTF-8 encoded ``.bib`` file.

    Returns:
        A dict mapping each citation key to the entry parsed for it.
    """
    with open(filepath, 'r', encoding='utf-8') as bib_file:
        entries = bibtexparser.load(bib_file).entries
    by_key = {}
    for entry in entries:
        key = entry.get('ID', '')
        if not key:
            continue
        # setdefault keeps the earliest entry for a repeated key.
        by_key.setdefault(key, entry)
    return by_key

# Ad-hoc check of parse_bib_file on one crawled paper's bibliography; as the
# cell's last expression, the returned dict is what the notebook displays.
# NOTE(review): machine-specific absolute Windows path.
parse_bib_file("P:\\AI4S\\survey_eval\\crawled_papers\\cs\\2502.15573\\anthology.bib")

In [5]:
import re
# Four digits, a dot, then four or five digits, not embedded in a longer digit
# run — the shape of a new-style arXiv identifier such as 2005.14165.
arxiv_pattern = re.compile(r"(?<![0-9])[0-9]{4}\.[0-9]{4,5}(?![0-9])")


def _last_arxiv_id(entry):
    """Return the last arXiv id found in ``entry['journal']``, else in ``entry['volume']``, else ``None``."""
    for field in ('journal', 'volume'):
        matches = arxiv_pattern.findall(entry.get(field) or "")
        if matches:
            return matches[-1]
    return None


def get_arxiv_cites(f):
    """Tag the arXiv citations of a citations JSONL file.

    Every record of the file must have a ``'citation'`` dict mapping citation
    keys to bibliography entries. An entry whose ``journal`` (checked first) or
    ``volume`` contains an arXiv identifier is rewritten in place to
    ``journal='arXiv'`` and ``volume=<identifier>``.

    Args:
        f: Path of the JSONL file; it is read with ``load_local``.

    Returns:
        A tuple ``(citations, arxiv_index, no_arxiv_titles)``: the loaded
        records (with rewritten entries), the set of arXiv identifiers found,
        and the set of titles of entries without an identifier.
    """
    arxiv_index, no_arxiv_titles = set(), set()
    citations = load_local(f)
    for record in citations:
        for entry in record['citation'].values():
            arxiv_key = _last_arxiv_id(entry)
            if arxiv_key:
                entry['journal'] = 'arXiv'
                entry['volume'] = arxiv_key
                arxiv_index.add(arxiv_key)
            elif entry.get('title') is not None:
                # Entries without a title cannot be looked up by title later,
                # so they are skipped instead of raising KeyError.
                no_arxiv_titles.add(entry['title'])
    return citations, arxiv_index, no_arxiv_titles

In [4]:
import arxiv
import time

def search_arxiv_by_title(title, max_results=3, retry=3):
    """Search arXiv for an exact title phrase and return the top hit's id.

    Up to ``retry`` attempts are made. Attempt ``i`` (counting from 0) first
    sleeps ``3 ** i`` seconds, so even the first attempt waits one second.

    Args:
        title: Title to search for; it is sent as the quoted ``ti:"..."`` query.
        max_results: Maximum number of results requested from the API.
        retry: Number of attempts before giving up.

    Returns:
        The last ``/``-separated segment of the first result's ``entry_id``,
        or ``None`` when the search returns nothing or every attempt raised.
    """
    for attempt in range(retry):
        remaining = retry - attempt
        try:
            time.sleep(3 ** attempt)
            hits = list(
                arxiv.Client().results(
                    arxiv.Search(
                        query=f'ti:"{title}"',
                        max_results=max_results,
                        sort_by=arxiv.SortCriterion.Relevance,
                    )
                )
            )
            if not hits:
                return None
            return hits[0].entry_id.split("/")[-1]
        except Exception as e:
            print(f"Error: {e}, Retry: {remaining}")
    return None

In [100]:
import requests
import time
# Probe the Semantic Scholar paper-search endpoint with one known title.
url = "https://api.semanticscholar.org/graph/v1/paper/search"
title = "Recipe1m+: A dataset for learning cross-modal embeddings for cooking recipes and food images"
params = {'query': title, 'limit': 3, 'fields': 'paperId,title,openAccessPdf,url,citationCount'}
# with self.session.with_api_key(self.api_key[self.times_429 % len(self.api_key)]) as temp_session:
max_attempts = 10
for attempt in range(1, max_attempts + 1):
    response = requests.get(url, params=params, timeout=60)
    if response.status_code == 200:
        break
    # Back off a little longer after each failure, capped at 30 seconds.
    time.sleep(min(2 ** attempt, 30))
else:
    # The loop used to be unbounded, so a persistent non-200 answer
    # (e.g. 400/403) would spin forever; fail loudly instead.
    raise RuntimeError(
        f"Semantic Scholar search still returned HTTP {response.status_code} after {max_attempts} attempts"
    )
print(response.json())

{'total': 2, 'offset': 0, 'data': [{'paperId': '048d133e2ec513ce385c8e736df715d8ff496e17', 'url': 'https://www.semanticscholar.org/paper/048d133e2ec513ce385c8e736df715d8ff496e17', 'title': 'Recipe1M: A Dataset for Learning Cross-Modal Embeddings for Cooking Recipes and Food Images', 'citationCount': 344, 'openAccessPdf': {'url': '', 'status': None, 'license': None}}, {'paperId': '02c009f41b66d2f977fb663f3cb69329f0f03d3f', 'url': 'https://www.semanticscholar.org/paper/02c009f41b66d2f977fb663f3cb69329f0f03d3f', 'title': 'Recipe1M+: A Dataset for Learning Cross-Modal Embeddings for Cooking Recipes and Food Images', 'citationCount': 141, 'openAccessPdf': {'url': 'https://dspace.mit.edu/bitstream/1721.1/130340/2/tpami19.pdf', 'status': 'GREEN', 'license': 'CCBYNCSA'}}]}


In [82]:
import tqdm, glob
# Four digits, a dot, then four or five digits, not embedded in a longer digit run.
arxiv_pattern = re.compile(r"(?<![0-9])[0-9]{4}\.[0-9]{4,5}(?![0-9])")
# One (independent) list of citation entries per source label.
paper_dataset = {label: [] for label in ("inline arXiv", "arXiv", "s2", "Network Error", "")}
# Volumes seen on arXiv-sourced citations.
arxiv_set = set()
# Lookup table used while resolving 7-character arXiv volumes.
arxiv_set_7 = {}

def fetch_arxiv_id_7(title):
    """Look a paper up on arXiv by title and return the tail of its entry id.

    Args:
        title: Paper title; it is sent as the ``ti:<title>`` query.

    Returns:
        The last two ``/``-separated segments of the first result's
        ``entry_id`` joined by ``/``, or ``None`` when there is no result.
        (Presumably of the form ``cs/0112017v1`` for old-style ids — confirm
        against real ``entry_id`` values.)
    """
    # Build the query from the argument; the previous version read the global
    # `cite` here and silently ignored what the caller passed in.
    query = f"ti:{title}"
    client = arxiv.Client(num_retries=5)
    search = arxiv.Search(query=query, max_results=3, sort_by=arxiv.SortCriterion.Relevance)
    for result in client.results(search):
        segments = result.entry_id.split("/")
        return f"{segments[-2]}/{segments[-1]}"
    return None


def _resolve_by_title(cite):
    """Return the arXiv id found for ``cite['title']``, or ``None`` if the lookup finds nothing.

    Successful lookups are memoised in ``arxiv_set_7`` keyed by title. The old
    code wrote ``cite['volume'] = arxiv_set_7[cite['volume']] = arxiv_id``;
    chained-assignment targets are assigned left to right, so the cache key was
    the already-overwritten volume and the cache could never hit. The title is
    what the lookup actually depends on, so it is the key used here.
    """
    title = cite['title']
    if title not in arxiv_set_7:
        resolved = fetch_arxiv_id_7(title)
        if not resolved:
            return None
        arxiv_set_7[title] = resolved
    return arxiv_set_7[title]


# Sort every citation of every crawled paper into paper_dataset by source.
for f in tqdm.tqdm(glob.glob("crawled_papers/cs/*/citations-clean.jsonl")):
    # glob returns backslash-separated paths on Windows; normalise before
    # taking the name of the directory that holds the file.
    paper_id = f.replace("\\", "/").split("/")[-2]
    for i, x in enumerate(yield_local(f)):
        for m, cite in x['citation'].items():
            cite['paper'] = paper_id
            cite['sentence'] = i
            cite['key'] = m

            if "source" in cite:
                # The entry was already labelled with a source.
                if "arXiv" in cite['source']:
                    # presumably a 7-character volume is an old-style arXiv
                    # number without its archive prefix — TODO confirm
                    if len(cite['volume']) == 7:
                        resolved = _resolve_by_title(cite)
                        if not resolved:
                            continue  # unresolved entries are dropped
                        cite['volume'] = resolved
                    else:
                        arxiv_set.add(cite['volume'])
                paper_dataset[cite['source']].append(cite)
                continue

            # No source label: look for an arXiv id inside the journal field,
            # then inside the volume field.
            for field in ("journal", "volume"):
                if field not in cite:
                    continue
                new_style = arxiv_pattern.findall(cite[field])
                seven_digit = [] if new_style else re.findall(r"(?<![0-9])[0-9]{7}(?![0-9])", cite[field])
                if not (new_style or seven_digit):
                    continue
                cite['source'] = "inline arXiv"
                resolved = new_style[0] if new_style else _resolve_by_title(cite)
                if resolved:
                    cite['volume'] = resolved
                    paper_dataset[cite['source']].append(cite)
                # An id-like token was found: stop here whether or not it resolved.
                break
            else:
                # Neither field holds anything that looks like an arXiv id.
                paper_dataset[''].append(cite)
print({k: len(v) for k, v in paper_dataset.items()})

  3%|▎         | 24/724 [00:17<05:45,  2.02it/s]Bozo feed; consider handling: document declared as utf-8, but parsed as MacRoman
100%|██████████| 724/724 [07:56<00:00,  1.52it/s]

{'inline arXiv': 62130, 'arXiv': 1460, 's2': 2471, 'Network Error': 29, '': 45660}





In [90]:
# Spot check: is this arXiv id among the values collected in arxiv_set?
print("2005.14165" in arxiv_set)

True


In [98]:
# Filter paper_dataset into paper_dataset_f, keeping one entry per title that
# has a known location (arXiv volume or URL).
paper_dataset_f = {"inline arXiv": [], "arXiv": [], "s2": [], "Network Error": [], "": [], "no title": []}
title_to_url = {}
for bucket, entries in paper_dataset.items():
    for entry in entries:
        if 'title' not in entry:
            if 'info' not in entry:
                paper_dataset_f['no title'].append(entry)
                continue
            # Fall back to the 'info' field when there is no title.
            entry['title'] = entry['info']
        title = entry['title']
        if title in title_to_url:
            # A location is already recorded for this title.
            continue
        target = bucket
        if 'arXiv' in bucket:
            title_to_url[title] = entry['volume']
        elif bucket == 's2':
            title_to_url[title] = entry['url']
        elif 'url' in entry:
            # An entry from another bucket that carries a URL is re-filed under 's2'.
            entry['source'] = 's2'
            title_to_url[title] = entry['url']
            target = 's2'
        else:
            # No location known: drop any source label; the title is not recorded.
            entry.pop('source', None)
        paper_dataset_f[target].append(entry)
print({k: len(v) for k, v in paper_dataset_f.items()})

{'inline arXiv': 49199, 'arXiv': 813, 's2': 4592, 'Network Error': 18, '': 10048, 'no title': 174}


In [99]:
# Inspect the first entry of the '' bucket of paper_dataset_f.
print(paper_dataset_f[''][0])

{'year': '2021', 'pages': '187--203', 'number': '1', 'volume': '43', 'journal': 'IEEE Transactions on Pattern Analysis and Machine Intelligence', 'author': 'Mar{\\i}n, Javier and Biswas, Aritro and Ofli, Ferda and Hynes, Nicholas and Salvador, Amaia and Aytar, Yusuf and Weber, Ingmar and Torralba, Antonio', 'title': 'Recipe1m+: A dataset for learning cross-modal embeddings for cooking recipes and food images', 'ENTRYTYPE': 'article', 'ID': 'marin2021recipe1m+', 'paper': '2501.01958', 'sentence': 9, 'key': 'marin2021recipe1m+'}


In [103]:
import os

# print_json([x for p in paper_dataset for x in paper_dataset[p]], "crawled_papers/citations/all.jsonl")
print({k: len(v) for k, v in paper_dataset_f.items()})
# Fold the 'Network Error' entries into the '' bucket. pop() with a default
# (instead of += followed by del) keeps the cell re-runnable: a second run
# finds no 'Network Error' key and appends nothing.
paper_dataset_f[''] += paper_dataset_f.pop('Network Error', [])
# print_json does not create directories, so make sure the target exists.
os.makedirs("crawled_papers/citations", exist_ok=True)
# One JSONL file per bucket: spaces become underscores, the '' bucket is named "null".
for p in paper_dataset_f:
    print_json(paper_dataset_f[p], f'crawled_papers/citations/{p.replace(" ", "_") if p else "null"}.jsonl')

{'inline arXiv': 49199, 'arXiv': 813, 's2': 4592, 'Network Error': 18, '': 10048, 'no title': 174}


In [None]:
import tqdm, glob
# Rewrite, in place, every arXiv-sourced citation whose volume is 7 characters
# long with the id that a title search on arXiv returns.
# NOTE(review): this rebinds `arxiv_set` (a set in an earlier cell) to a dict.
arxiv_set = {}  # title -> resolved arXiv id, so each title is searched only once
max_attempts = 5
client = arxiv.Client(num_retries=5)
for f in tqdm.tqdm(glob.glob("crawled_papers/cs/*/citations-clean.jsonl")):
    d = load_local(f)
    for x in d:
        for cite in x['citation'].values():
            if not ("source" in cite and "arXiv" in cite['source'] and len(cite['volume']) == 7):
                continue
            title = cite['title']
            if title in arxiv_set:
                cite['volume'] = arxiv_set[title]
                continue
            search = arxiv.Search(query=f"ti:{title}", max_results=3, sort_by=arxiv.SortCriterion.Relevance)
            # Bounded retries: the previous `while True` retried forever, and
            # because it referenced the undefined names `retry` and `result`,
            # every iteration raised NameError and the loop never ended.
            for attempt in range(1, max_attempts + 1):
                try:
                    first = next(iter(client.results(search)), None)
                except Exception as e:
                    print(f"{e} (attempt {attempt}/{max_attempts})")
                    continue
                if first is not None:
                    text = first.entry_id.split("/")
                    # `title` is a local, so the cache key cannot be clobbered
                    # by the assignment to cite['volume'].
                    cite['volume'] = arxiv_set[title] = f"{text[-2]}/{text[-1]}"
                break
    # NOTE(review): overwrites the input file with the updated records.
    print_json(d, f)

In [124]:
# Reload the exported buckets and build a title -> URL table from them.
sources = ['inline_arXiv', 'arXiv', 's2']
paper_dataset = {k: load_local(f"crawled_papers/citations/{k}.jsonl") for k in sources}
urls = {}
for k, entries in paper_dataset.items():
    for x in entries:
        if k == "s2":
            # s2 entries carry their own URL.
            link = x['url']
        else:
            # The other buckets store an arXiv id in 'volume'.
            link = f"https://arxiv.org/pdf/{x['volume']}"
        # A title seen again later overwrites the earlier URL.
        urls[x['title']] = link
print({k: len(v) for k, v in paper_dataset.items()}, len(urls))

{'inline_arXiv': 49199, 'arXiv': 916, 's2': 4661} 54773


In [123]:
# Persist the title -> URL table as a single JSON document.
with open("crawled_papers/citations/urls.json", "w+") as f:
    json.dump(urls, f)

In [56]:
# Ask the Semantic Scholar datasets API which release is the latest one.
release_reply = requests.get("https://api.semanticscholar.org/datasets/v1/release/latest", timeout=60)
# Fail on an HTTP error status instead of trying to read a release id out of an error body.
release_reply.raise_for_status()
response = release_reply.json()
print(response['release_id'])

2025-10-21


In [104]:
import datasets
# Load the "alienai/peS2O" dataset through the `datasets` library.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run on this machine — confirm in the `datasets` docs and review that
# script (or pin a revision) before running this cell.
s2orc = datasets.load_dataset("alienai/peS2O", trust_remote_code=True)