# Data D3



### Data format
The data format is a .pkl file (pickle) containing a dataframe with at least these columns:
- id, a_text, b_id, b_text, c_id, c_text, research_type, y_true

In [1]:
import pandas as pd
import ast
import json

all_papers = pd.read_csv("./data/temp/papers_2025.csv", dtype={id: str})
all_papers.categories = all_papers.categories.apply(lambda x: ast.literal_eval(x))
print(all_papers.shape, "->", all_papers.columns)

ai4science_references = pd.read_json("./data/temp/ai4science_references_complete.json")
print(ai4science_references.shape, "->", ai4science_references.columns)

with open('./data/temp/arxiv_subfields_map.json', 'r') as file:
    arxiv_subfields_map = json.load(file)


(58444, 6) -> Index(['id', 'title', 'abstract', 'categories', 'authors', 'date'], dtype='object')
(9, 4) -> Index(['arxiv_id', 'research_type', 'explanation', 'ref_list'], dtype='object')


In [2]:
positives = ai4science_references.copy()

print(positives.shape, "->", positives.columns)

(9, 4) -> Index(['arxiv_id', 'research_type', 'explanation', 'ref_list'], dtype='object')


In [20]:
import random

def concatTitleAbstract(obj):
    if obj.get('abstract', None):
        return f"{obj['title']}; {obj['abstract']}"
    return obj['title']

def getKeyByTitle(title):
    return next((key for key, value in arxiv_subfields_map.items() if value['title'] == title), None)

data = pd.DataFrame(columns=("id", "a_text", "research_type", "main_id", "main_text", "main_categories", "target_id", "target_text", "target_categories", "list", "list_true"))

def addLine(df, arxiv_id, research_type, main_id, main_text, main_categories, target_id, target_text, target_categories, list, list_true):
    return pd.concat([df, pd.DataFrame([{
        'id': arxiv_id,
        'a_text': "",
        'research_type': research_type,
        'main_id': main_id,
        'main_text': main_text,
        'main_categories': main_categories,
        'target_id': target_id,
        'target_text': target_text,
        'target_categories': target_categories,
        'list': list,
        'list_true': list_true
    }])], ignore_index=True)

for row in positives.itertuples():
    rowI = row.Index
    arxiv_id = row.arxiv_id
    research_type = row.research_type
    paper_1, paper_2 = row.ref_list
    
    paper_1_text = concatTitleAbstract(paper_1)
    paper_1_categories = [getKeyByTitle(sf_title) for sf_title in paper_1["subfield"]]
    
    paper_2_text = concatTitleAbstract(paper_2)
    paper_2_categories = [getKeyByTitle(sf_title) for sf_title in paper_2["subfield"]]
        
    random99_1 = [concatTitleAbstract(paper) for i, paper in all_papers.sample(99, random_state=1+rowI).reset_index(drop=True).iterrows()] + [paper_2_text]
    random.shuffle(random99_1)
    true_1 = random99_1.index(paper_2_text) + 1
    random99_1_str = "; ".join([f"{i+1}) {subrow}" for i, subrow in enumerate(random99_1)])
    
    data = addLine(data, arxiv_id, research_type, f"{arxiv_id}-pos1-{rowI}", paper_1_text, paper_1_categories, f"{arxiv_id}-pos2-{rowI}", paper_2_text, paper_2_categories, random99_1_str, true_1)
    
    
    random99_2 = [concatTitleAbstract(paper) for i, paper in all_papers.sample(99, random_state=2+rowI).reset_index(drop=True).iterrows()] + [paper_1_text]
    random.shuffle(random99_2)
    true_2 = random99_2.index(paper_1_text) + 1
    random99_2_str = "; ".join([f"{i+1}) {subrow}" for i, subrow in enumerate(random99_2)])
    
    data = addLine(data, arxiv_id, research_type, f"{arxiv_id}-pos2-{rowI}", paper_2_text, paper_2_categories, f"{arxiv_id}-pos1-{rowI}", paper_1_text, paper_1_categories, random99_2_str, true_2)
    
    

print(data.shape, "->", data.columns)

  return pd.concat([df, pd.DataFrame([{


(18, 11) -> Index(['id', 'a_text', 'research_type', 'main_id', 'main_text',
       'main_categories', 'target_id', 'target_text', 'target_categories',
       'list', 'list_true'],
      dtype='object')


In [21]:
data.to_pickle("./data/data_D5.pkl")
data = pd.read_pickle("./data/data_D5.pkl")

data

Unnamed: 0,id,a_text,research_type,main_id,main_text,main_categories,target_id,target_text,target_categories,list,list_true
0,2411.01019,,applied,2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",2411.01019-pos2-0,Anterior mediastinal nodular lesion segmentati...,[cs.AI],1) Non-extendablity of Shelukhin's quasimorphi...,50
1,2411.01019,,applied,2411.01019-pos2-0,Anterior mediastinal nodular lesion segmentati...,[cs.AI],2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",1) SigN: SIMBox Activity Detection Through Lat...,70
2,2412.11084,,applied,2412.11084-pos1-1,BarcodeBERT: Transformers for Biodiversity Ana...,[cs.AI],2412.11084-pos2-1,Biological identifications through DNA barcode...,[q-bio.GN],1) SigN: SIMBox Activity Detection Through Lat...,49
3,2412.11084,,applied,2412.11084-pos2-1,Biological identifications through DNA barcode...,[q-bio.GN],2412.11084-pos1-1,BarcodeBERT: Transformers for Biodiversity Ana...,[cs.AI],1) Efficiently Solving Discounted MDPs with Pr...,47
4,2412.00036,,applied,2412.00036-pos1-2,Quant GANs: deep generation of financial time ...,[cs.AI],2412.00036-pos2-2,On the Distribution of the Two-Sample Cramer-v...,[q-fin.GN],1) Universal machine learning interatomic pote...,82
5,2412.00036,,applied,2412.00036-pos2-2,On the Distribution of the Two-Sample Cramer-v...,[q-fin.GN],2412.00036-pos1-2,Quant GANs: deep generation of financial time ...,[cs.AI],1) GOD model: Privacy Preserved AI School for ...,43
6,2411.0064,,applied,2411.0064-pos1-3,The Llama 3 Herd of Models; Modern artificial ...,[cs.AI],2411.0064-pos2-3,Quantifying Variance in Evaluation Benchmarks;...,[stat.ME],1) A structure-preserving parametric finite el...,60
7,2411.0064,,applied,2411.0064-pos2-3,Quantifying Variance in Evaluation Benchmarks;...,[stat.ME],2411.0064-pos1-3,The Llama 3 Herd of Models; Modern artificial ...,[cs.AI],1) Entropic force and bouncing behaviour in $\...,41
8,2411.00609,,applied,2411.00609-pos1-4,Improving Pediatric Low-Grade Neuroepithelial ...,[cs.CV],2411.00609-pos2-4,Pediatric low-grade glioma: State-of-the-art a...,[q-bio.NC],1) Will Systems of LLM Agents Cooperate: An In...,74
9,2411.00609,,applied,2411.00609-pos2-4,Pediatric low-grade glioma: State-of-the-art a...,[q-bio.NC],2411.00609-pos1-4,Improving Pediatric Low-Grade Neuroepithelial ...,[cs.CV],1) On the Role of Pre-trained Embeddings in Bi...,13
