In [45]:
import json

def load_local(fn):
    with open(fn, "r+", encoding="utf-8") as f:
        d = [json.loads(line.strip()) for line in f if line.strip()]
    return d

def print_json(d, fn):
    with open(fn, "w+", encoding="utf-8") as f:
        for x in d:
            f.write(json.dumps(x, ensure_ascii=False) + "\n")

def yield_local(fn):
    with open(fn, "r+", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                try:
                    yield json.loads(line)
                except:
                    pass

In [None]:
import sys
sys.path.append("P:\\AI4S\\survey_eval\\latex_parser\\")
from latex_parser.tex_parser import LatexPaperParser, process_input_commands
# base = "P:\\AI4S\\survey_eval\\crawled_papers\\cs\\2501.00842"
# with open(f"{base}\\elsarticle-template-num.tex", encoding='utf-8') as f:
#     content = process_input_commands(f.read(), base)
# parser = LatexPaperParser(content, base)
# paper = parser.parse()
# all_citations = paper.map_citations_to_sentence()
# print(len(all_citations))
from pylatexenc.latexwalker import LatexWalker
from pylatexenc.latex2text import LatexNodes2Text
content = "\\href{https://www.itu.int/rec/R-REC-M.2160/en}{M.2160}\\href{https://www.itu.int/rec/R-REC-M.2160/en}"
import re

print(content)
from pylatexenc.latex2text import LatexNodes2Text

nodes, _, _ = LatexWalker(content).get_latex_nodes()
converter = LatexNodes2Text(
    math_mode="verbatim",
)
print(nodes)
print(converter.nodelist_to_text(nodes))

M.2160
[LatexCharsNode(parsing_state=<parsing state 2310916137072>, pos=0, len=6, chars='M.2160')]
M.2160


In [None]:
import bibtexparser

def parse_bib_file(filepath: str):
    with open(filepath, 'r', encoding='utf-8') as f:
        bib_database = bibtexparser.load(f)
    citations = {}
    for entry in bib_database.entries:
        citation_key = entry.get('ID', '')
        if citation_key and citation_key not in citations:
            citations[citation_key] = entry
    return citations

parse_bib_file("P:\\AI4S\\survey_eval\\crawled_papers\\cs\\2502.15573\\anthology.bib")

In [5]:
import re
arxiv_pattern = re.compile(r"(?<![0-9])[0-9]{4}\.[0-9]{4,5}(?![0-9])")

def get_arxiv_cites(f):
    arxiv_index, no_arxiv_titles = set(), set()
    citations = load_local(f)
    for x in citations:
        cite = x['citation']
        for k in cite:
            arxiv_key = ""
            if 'journal' in cite[k]:
                arxiv_key = arxiv_pattern.findall(cite[k]['journal'])
                if arxiv_key: arxiv_key = arxiv_key[-1]
            if not arxiv_key and 'volume' in cite[k]:
                arxiv_key = arxiv_pattern.findall(cite[k]['volume'])
                if arxiv_key: arxiv_key = arxiv_key[-1]
            if arxiv_key:
                cite[k]['journal'] = 'arXiv'
                cite[k]['volume'] = arxiv_key
                arxiv_index.add(arxiv_key)
            else:
                no_arxiv_titles.add(cite[k]['title'])
    return citations, arxiv_index, no_arxiv_titles

In [4]:
import arxiv
import time

def search_arxiv_by_title(title, max_results=3, retry=3):
    max_retry = retry
    while retry > 0:
        try:
            time.sleep(3 ** (max_retry - retry))
            client = arxiv.Client()
            search = arxiv.Search(query=f'ti:"{title}"', max_results=max_results, sort_by=arxiv.SortCriterion.Relevance)
            results = list(client.results(search))
            return results[0].entry_id.split("/")[-1] if results else None
        except Exception as e:
            print(f"Error: {e}, Retry: {retry}")
            retry -= 1

In [100]:
import requests
import time
while True:
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    title = "Recipe1m+: A dataset for learning cross-modal embeddings for cooking recipes and food images"
    params = {'query': title, 'limit': 3, 'fields': 'paperId,title,openAccessPdf,url,citationCount'}
    # with self.session.with_api_key(self.api_key[self.times_429 % len(self.api_key)]) as temp_session:                
    response = requests.get(url, params=params, timeout=60)
    if response.status_code == 200: break
    time.sleep(1)
print(response.json())

{'total': 2, 'offset': 0, 'data': [{'paperId': '048d133e2ec513ce385c8e736df715d8ff496e17', 'url': 'https://www.semanticscholar.org/paper/048d133e2ec513ce385c8e736df715d8ff496e17', 'title': 'Recipe1M: A Dataset for Learning Cross-Modal Embeddings for Cooking Recipes and Food Images', 'citationCount': 344, 'openAccessPdf': {'url': '', 'status': None, 'license': None}}, {'paperId': '02c009f41b66d2f977fb663f3cb69329f0f03d3f', 'url': 'https://www.semanticscholar.org/paper/02c009f41b66d2f977fb663f3cb69329f0f03d3f', 'title': 'Recipe1M+: A Dataset for Learning Cross-Modal Embeddings for Cooking Recipes and Food Images', 'citationCount': 141, 'openAccessPdf': {'url': 'https://dspace.mit.edu/bitstream/1721.1/130340/2/tpami19.pdf', 'status': 'GREEN', 'license': 'CCBYNCSA'}}]}


In [82]:
import tqdm, glob
arxiv_pattern = re.compile(r"(?<![0-9])[0-9]{4}\.[0-9]{4,5}(?![0-9])")
paper_dataset = {"inline arXiv": [], "arXiv": [], "s2": [], "Network Error": [], "": []}
arxiv_set, arxiv_set_7 = set(), {}

def fetch_arxiv_id_7(title):
    title = f'ti:{cite['title']}'
    client = arxiv.Client(num_retries=5)
    search = arxiv.Search(query=title, max_results=3, sort_by=arxiv.SortCriterion.Relevance)
    for result in client.results(search):
        text = result.entry_id.split("/")
        return f"{text[-2]}/{text[-1]}"


for f in tqdm.tqdm(glob.glob("crawled_papers/cs/*/citations-clean.jsonl")):
    for i, x in enumerate(yield_local(f)):
        for m in x['citation']:
            cite = x['citation'][m]
            cite['paper'] = f.split("/")[-2]
            cite['sentence'] = i
            cite['key'] = m
            if "source" in cite: 
                if "arXiv" in cite['source']:
                    if len(cite['volume']) == 7:
                        if cite['volume'] in arxiv_set_7: cite['volume'] = arxiv_set_7[cite['volume']]
                        else: 
                            arxiv_id = fetch_arxiv_id_7(cite['title'])
                            if arxiv_id: cite['volume'] = arxiv_set_7[cite['volume']] = arxiv_id
                            else: continue
                    else:
                        arxiv_set.add(cite['volume'])
                paper_dataset[cite['source']].append(cite)
            else:
                if "journal" in cite:
                    arxiv_id = arxiv_pattern.findall(cite['journal'])
                    if arxiv_id:
                        cite['source'] = "inline arXiv"
                        cite['volume'] = arxiv_id[0]
                        paper_dataset[cite['source']].append(cite)
                        continue
                    arxiv_id = re.findall(r"(?<![0-9])[0-9]{7}(?![0-9])", cite['journal'])
                    if arxiv_id:
                        cite['source'] = "inline arXiv"
                        if arxiv_id[0] in arxiv_set_7: cite['volume'] = arxiv_set_7[arxiv_id[0]]
                        else:
                            arxiv_id = fetch_arxiv_id_7(cite['title'])
                            if arxiv_id: cite['volume'] = arxiv_set_7[cite['volume']] = arxiv_id
                            else: continue
                        paper_dataset[cite['source']].append(cite)
                        continue
                if "volume" in cite:
                    arxiv_id = arxiv_pattern.findall(cite['volume'])
                    if arxiv_id:
                        cite['source'] = "inline arXiv"
                        cite['volume'] = arxiv_id[0]
                        paper_dataset[cite['source']].append(cite)
                        continue
                    arxiv_id = re.findall(r"(?<![0-9])[0-9]{7}(?![0-9])", cite['volume'])
                    if arxiv_id:
                        cite['source'] = "inline arXiv"
                        if arxiv_id[0] in arxiv_set_7: cite['volume'] = arxiv_set_7[arxiv_id[0]]
                        else:
                            arxiv_id = fetch_arxiv_id_7(cite['title'])
                            if arxiv_id: cite['volume'] = arxiv_set_7[cite['volume']] = arxiv_id
                            else: continue
                        paper_dataset[cite['source']].append(cite)
                        continue
                paper_dataset[''].append(cite)
print({k: len(v) for k, v in paper_dataset.items()})

  3%|▎         | 24/724 [00:17<05:45,  2.02it/s]Bozo feed; consider handling: document declared as utf-8, but parsed as MacRoman
100%|██████████| 724/724 [07:56<00:00,  1.52it/s]

{'inline arXiv': 62130, 'arXiv': 1460, 's2': 2471, 'Network Error': 29, '': 45660}





In [98]:
paper_dataset_f = {"inline arXiv": [], "arXiv": [], "s2": [], "Network Error": [], "": [], "no title": []}
title_to_url = {}
for p in paper_dataset:
    for x in paper_dataset[p]:
        if 'title' not in x: 
            if 'info' not in x: 
                paper_dataset_f['no title'].append(x)
                continue
            else: x['title'] = x['info']
        if x['title'] not in title_to_url:
            if 'arXiv' in p:
                title_to_url[x['title']] = x['volume']
            elif p == 's2':
                title_to_url[x['title']] = x['url']
            elif 'url' in x:
                x['source'] = 's2'
                title_to_url[x['title']] = x['url']
                paper_dataset_f['s2'].append(x)
                continue
            else:
                if 'source' in x: del x['source']
            paper_dataset_f[p].append(x)
print({k: len(v) for k, v in paper_dataset_f.items()})

{'inline arXiv': 49199, 'arXiv': 813, 's2': 4592, 'Network Error': 18, '': 10048, 'no title': 174}


In [99]:
print(paper_dataset_f[''][0])

{'year': '2021', 'pages': '187--203', 'number': '1', 'volume': '43', 'journal': 'IEEE Transactions on Pattern Analysis and Machine Intelligence', 'author': 'Mar{\\i}n, Javier and Biswas, Aritro and Ofli, Ferda and Hynes, Nicholas and Salvador, Amaia and Aytar, Yusuf and Weber, Ingmar and Torralba, Antonio', 'title': 'Recipe1m+: A dataset for learning cross-modal embeddings for cooking recipes and food images', 'ENTRYTYPE': 'article', 'ID': 'marin2021recipe1m+', 'paper': '2501.01958', 'sentence': 9, 'key': 'marin2021recipe1m+'}


In [103]:
# print_json([x for p in paper_dataset for x in paper_dataset[p]], "crawled_papers/citations/all.jsonl")
print({k: len(v) for k, v in paper_dataset_f.items()})
paper_dataset_f[''] += paper_dataset_f['Network Error']
del paper_dataset_f['Network Error']
for p in paper_dataset_f:
    print_json(paper_dataset_f[p], f'crawled_papers/citations/{p.replace(" ", "_") if p else "null"}.jsonl')

{'inline arXiv': 49199, 'arXiv': 813, 's2': 4592, 'Network Error': 18, '': 10048, 'no title': 174}


In [None]:
import tqdm, glob
arxiv_set = {}
for f in tqdm.tqdm(glob.glob("crawled_papers/cs/*/citations-clean.jsonl")):
    d = load_local(f)
    for x in d:
        for m in x['citation']:
            cite = x['citation'][m]
            if "source" in cite and "arXiv" in cite['source'] and len(cite['volume']) == 7:
                if cite['volume'] in arxiv_set:
                    cite['volume'] = arxiv_set[cite['volume']]
                    continue
                while True:
                    try:
                        title = f'ti:{cite['title']}'
                        client = arxiv.Client(num_retries=retry)
                        search = arxiv.Search(query=title, max_results=3, sort_by=arxiv.SortCriterion.Relevance)
                        for r in client.results(search):
                            text = result.entry_id.split("/")
                            cite['volume'] = arxiv_set[cite['volume']] = f"{text[-2]}/{text[-1]}"
                            break
                        break
                    except Exception as e:
                        print(e)
                        pass
    print_json(d, f)

In [56]:
response = requests.get("https://api.semanticscholar.org/datasets/v1/release/latest").json()
print(response['release_id'])

2025-10-21


In [127]:
with open("paper_to_query/seeds.json") as f:
    papers = json.load(f)
data = [{"id": k, "title": v['title'], "abstract": v['abstract']} for k, v in papers.items()]
print_json(data, "../agenteval/inputs/seeds.jsonl")

In [181]:
with open("paper_to_query/easy_neg.json") as f:
    hard = json.load(f)
d = [{
    "positive": {
        "id": v['related_id'], 
        "title": papers[v['related_id']]['title'], 
        "abstract": papers[v['related_id']]['abstract']
    }, 
    "negative": {
        "id": k, 
        "title": v['title'], 
        "abstract": v['abstract']
    }
} for k, v in hard.items()]
print(len(d))
print_json(d, "../agenteval/inputs/easy_neg.jsonl")

50025


In [135]:
with open("crawled_papers/citations/urls.json") as f:
    url_map = json.load(f)
import glob
pdf_files = [x.replace("crawled_papers/pdf/", "").replace(".pdf", "") for x in glob.glob("crawled_papers/pdf/*.pdf")]
urls = set()
for title, v in url_map.items():
    if "arxiv.org" in v: title = v.replace("https://arxiv.org/pdf/", "")
    title = title.replace(" ", "+").replace(":", "--")
    urls.add(title)
diffa = urls - set(pdf_files)
diffb = set(pdf_files) - urls
print(len(url_map), len(set(pdf_files)), len(diffa), len(diffb))

54604 42248 3115 0


In [142]:
sources = ['inline_arXiv']
paper_dataset = {k: load_local(f"crawled_papers/citations/{k}.jsonl") for k in sources}
urls_map = {}
for k in paper_dataset:
    for x in paper_dataset[k]:
        # urls_map[x['title']] = x['url'] if k == "s2" else f"https://arxiv.org/pdf/{x['volume']}"
        urls_map[x['title']] = f"https://arxiv.org/pdf/{x['volume']}"
print({k: len(v) for k, v in paper_dataset.items()}, len(urls_map))
with open("crawled_papers/citations/urls_inlinearxiv.json", "w+") as f: json.dump(urls_map, f)

{'inline_arXiv': 49199} 49199


In [147]:
import os
urls_map_exists, urls_map_missing = {}, {}
for k in paper_dataset:
    for x in paper_dataset[k]:
        title = x['title']
        url = x['url'] if k == "s2" else f"https://arxiv.org/pdf/{x['volume']}"
        # urls_map[x['title']] = f"https://arxiv.org/pdf/{x['volume']}"
        path = url.replace("https://arxiv.org/pdf/", "") if "arxiv.org" in url else title
        path = path.replace(" ", "+").replace(":", "--")
        if os.path.exists(f'crawled_papers/pdf/{path}.pdf'): urls_map_exists[title] = path
        else: urls_map_missing[title] = url
print({k: len(v) for k, v in paper_dataset.items()}, len(urls_map_exists), len(urls_map_missing))
with open("crawled_papers/citations/urls_inlinearXiv.json", "w+") as f: json.dump(urls_map_exists, f)
with open("crawled_papers/citations/inlinearXiv_redownload.json", "w+") as f: json.dump(urls_map_missing, f)

{'inline_arXiv': 49199} 49025 174


In [149]:
# with open("crawled_papers/citations/find.json") as f: 
#     urls_map_missing.update(json.load(f))
null = load_local("crawled_papers/citations/null.jsonl")
os_source = {}
for k in ['s2', 'arXiv']:
    for x in yield_local(f"crawled_papers/citations/{k}.jsonl"):
        os_source[x['title']] = x['url'] if k == "s2" else f"https://arxiv.org/pdf/{x['volume']}"
old_urls = {}
for x in null:
    if x in os_source:
        url = os_source[x].replace("arxiv.org/abs/", "arxiv.org/pdf/").replace("openreview.net/forum", "openreview.net/pdf")
        if "pdf" not in url: url += ".pdf"
        old_urls[x] = url
print(len(old_urls))
with open("crawled_papers/citations/old_urls.json", "w+") as f: json.dump(old_urls, f)

2400


In [176]:
# 一共四个文件：
# urls_inlinearXiv.json: 初次下载成功的inline arxiv。 -- urls_map_exists
# with open("crawled_papers/citations/urls_inlinearXiv.json") as f: urls_map_exists = json.load(f)
# inlinearXiv_redownload.json: 下载失败的inline arxiv。 -- urls_map_missing
# with open("crawled_papers/citations/inlinearXiv_redownload.json") as f: urls_map_missing = json.load(f)
# find.json: s2api请求到的文献以及请求失败的文献中，用openalex再次找到的路径。 -- find
# with open("crawled_papers/citations/find.json") as f: find = json.load(f)
# old_urls.json: openalex请求失败了但曾经被s2api请求成功的文献。 -- old_urls
# with open("crawled_papers/citations/old_urls.json") as f: old_urls = json.load(f)

def construct_title(title, url):
    if "arxiv.org" in url: title = url.replace("https://arxiv.org/pdf/", "")
    return title.replace(" ", "+").replace(":", "--").split("/")[-1].replace("{", "").replace("}", "").replace(".pdf", "")
    
pdf_files = set(x.split("/")[-1].replace("{", "").replace("}", "").replace(".pdf", "") for x in os.listdir("crawled_papers/pdf"))
parse_success = set(x[:-4] for x in os.listdir("crawled_papers/paper_info"))
inline_titles = set(construct_title(k, v) for k, v in urls_map_exists.items())
redownload_titles = set(construct_title(k, v) for k, v in urls_map_missing.items())
find_titles = set(construct_title(k, v) for k, v in find.items())
old_titles = set(construct_title(k, v) for k, v in old_urls.items())
broken_pdfs = inline_titles - parse_success
new_pdfs = pdf_files - parse_success - broken_pdfs
print(len(find), len(pdf_files), len(parse_success), len(inline_titles), len(redownload_titles), 
      len(find_titles), len(old_titles), len(broken_pdfs), len(new_pdfs))

5006 43011 40813 48654 160 4653 2346 48615 2008


In [180]:
for f in glob.glob("crawled_papers/pdf/*.pdf"):
    filename = f.split("/")[-1]
    newf = re.sub(r"[\[\]\{\}\(\)\n]", "", filename)
    newf = "/".join(f.split("/")[:-1] + [newf])
    if f != newf: 
        os.rename(f, newf)

In [163]:
pd = os.listdir("crawled_papers/paper_info")
print(len(pd))

40813


In [194]:
def normalize_title(title: str, url: str) -> tuple[str, str]:
    if "https://arxiv.org/pdf/" in url: title = url.replace("https://arxiv.org/pdf/", "")
    elif "https://arxiv.org/abs/" in url: 
        title = url.replace("https://arxiv.org/abs/", "")
        url = url.replace("abs", "pdf")
    if "/" in title: title = title.split("/")[-1]
    title = title.replace(" ", "+").replace(":", "--")
    title = re.sub(r"[\{\}\[\]\(\)\n]", "", title)
    return title, url
    
pdfs = set(x[:-4] for x in os.listdir("crawled_papers/pdf"))
xmls = set(x[:-4] for x in os.listdir("crawled_papers/papers_full"))
diffs = pdfs - xmls
redownload = {}
for d in [urls_map_exists, urls_map_missing, find, old_urls]:
    for k, v in d.items():
        if len(d) == 49025 and "http" not in v:
            v = f"https://arxiv.org/pdf/{v}"
        title, url = normalize_title(k, v)
        if title in diffs: redownload[title] = url
print(len(pdfs), len(xmls), len(diffs), len(redownload))
with open("crawled_papers/citations/redownload.json", "w+") as f: json.dump(redownload, f)

43011 41930 1081 616


In [195]:
print(len(set(redownload.values())))

609


In [50]:
final = load_local("agent/final.jsonl")

In [54]:
import random
all_examples, no_positive = [], []
for x in final:
    positive = [y['features'] for y in x['citations'] if y['query'] is not None]
    if not positive: 
        no_positive.append(x['query'])
        continue
    positive_ids = set(y['id'] for y in x['citations'] if y['query'] is not None)
    #  and y['features'][3] > 0.001 and y['features'][0] > 0.4
    negative = [y['features'] for i, y in x['oracle'].items() if i not in positive_ids]
    if len(negative) > 8 * len(positive):
        negative = random.sample(negative, 8 * len(positive))
    for x in positive: x[3] *= 100
    for x in negative: x[3] *= 100
    all_examples.append({"chosen": positive, "rejected": negative})
print(len(all_examples), len(no_positive))

4237 5


In [55]:
print_json(all_examples, "train_unfiltered.jsonl")

In [47]:
for i in range(10): print(n4[i])
print()
for i in range(10): print(oracles[i])

[0.5802272093214645, 0.6951921335227317, 0.9200738986424688, 0.005647254731297929, 0.2634244151202588]
[0.4782271881188792, 0.6697714662520363, 0.5970057169885848, 0.0014639512639257279, 0.2793163962788106]
[0.5282662710935353, 0.5075860226891465, 0.5848955292849121, 0.0008251858097541986, 0.19789319473796205]
[0.2820138724558999, 0.39636830908366794, 0.0, 0.0003648024571642535, 0.2462774773860048]
[0.5820143310951718, 0.44224436735918976, 0.2433336164139801, 0.0004170158729369283, 0.2272686473082467]
[0.37203137140012726, 0.42418842747308594, 0.3650004246209701, 0.0004627312004300431, 0.20399151357263628]
[0.4879600489543501, 0.42928524072714397, 0.4361709450044923, 0.0006402247470968661, 0.1727570895927895]
[0.39342385826511284, 0.5602162994458415, 0.341561912870932, 0.0005187760248957936, 0.21227894262058866]
[0.5090515687739146, 0.5373698828284396, 0.0, 0.0003648024571642535, 0.1896679909247869]
[0.4468559640265191, 0.7091222426616733, 0.45022068946860055, 0.0008631726986542585, 0.

In [53]:
for x in final:
    del x['hard_negatives'], x['easy_negatives']
print_json(final, "agent/final.jsonl")

In [43]:
for y in x['citations']:
    if not y['query']:
        print(y)
        break

{'id': 'W2920708523', 'title': 'Prevalence, Risk Factors, and Fetomaternal Outcomes of Gestational Diabetes Mellitus in Kuwait: A Cross-Sectional Study', 'features': [0.5308709851132949, 0.29138664379250934, 0.0, 0, 0.21019103299036604], 'query': None}


In [44]:
print(x['title'])

Effects of Nutritional Strategies on Glucose Homeostasis in Gestational Diabetes Mellitus: A Systematic Review and Network Meta-Analysis
