# Data D3



### Data format
The data format is a .pkl file (pickle) containing a dataframe with at least these columns:
- id, a_text, b_id, b_text, c_id, c_text, research_type, y_true

In [66]:
import pandas as pd
import ast
import json

all_papers = pd.read_csv("./data/temp/papers_2025.csv", dtype={id: str})
all_papers.categories = all_papers.categories.apply(lambda x: ast.literal_eval(x))
print(all_papers.shape, "->", all_papers.columns)

all_papers_1s = pd.read_csv("./data/temp/papers_1subj_2025.csv", dtype={id: str})
all_papers_1s.categories = all_papers_1s.categories.apply(lambda x: ast.literal_eval(x))
print(all_papers_1s.shape, "->", all_papers_1s.columns)

ai4science_references = pd.read_json("./data/temp/ai4science_references_complete.json")
print(ai4science_references.shape, "->", ai4science_references.columns)

with open('./data/temp/arxiv_subfields_map.json', 'r') as file:
    arxiv_subfields_map = json.load(file)


(58444, 6) -> Index(['id', 'title', 'abstract', 'categories', 'authors', 'date'], dtype='object')
(26203, 6) -> Index(['id', 'title', 'abstract', 'categories', 'authors', 'date'], dtype='object')
(9, 4) -> Index(['arxiv_id', 'research_type', 'explanation', 'ref_list'], dtype='object')


In [67]:
positives = ai4science_references.copy()

print(positives.shape, "->", positives.columns)

(9, 4) -> Index(['arxiv_id', 'research_type', 'explanation', 'ref_list'], dtype='object')


In [72]:
# Getting a single string for "paper B" and a single string for "paper C"

def concatTitleAbstract(obj):
    if obj.get('abstract', None):
        return f"{obj['title']}; {obj['abstract']}"
    return obj['title']

def getKeyByTitle(title):
    return next((key for key, value in arxiv_subfields_map.items() if value['title'] == title), None)

data = pd.DataFrame(columns=("id", "a_text", "b_id", "b_text", "b_categories", "c_id", "c_text", "c_categories"))

def addLine(df, arxiv_id, paper_1_id, paper_1_text, paper_1_categories, paper_2_id, paper_2_text, paper_2_categories, research_type, y_true):
    return pd.concat([df, pd.DataFrame([{
        'id': arxiv_id,
        'a_text': "",
        'b_id': paper_1_id,
        'b_text': paper_1_text,
        'b_categories': paper_1_categories,
        'c_id': paper_2_id,
        'c_text': paper_2_text,
        'c_categories': paper_2_categories,
        'research_type': research_type,
        'y_true': y_true
    }])], ignore_index=True)

for row in positives.itertuples():
    rowI = row.Index
    arxiv_id = row.arxiv_id
    research_type = row.research_type
    paper_1, paper_2 = row.ref_list
    
    paper_1_text = concatTitleAbstract(paper_1)
    paper_1_categories = [getKeyByTitle(sf_title) for sf_title in paper_1["subfield"]]
    
    paper_2_text = concatTitleAbstract(paper_2)
    paper_2_categories = [getKeyByTitle(sf_title) for sf_title in paper_2["subfield"]]
        
    data = addLine(data, arxiv_id, f"{arxiv_id}-pos1-{rowI}", paper_1_text, paper_1_categories, f"{arxiv_id}-pos2-{rowI}", paper_2_text, paper_2_categories, research_type, True)
    
    paper_1_subj_random5 = all_papers_1s.loc[all_papers_1s.categories.apply(lambda x: x[0] in paper_1_categories)].sample(5, random_state=1+rowI).reset_index(drop=True)
    paper_2_subj_random5 = all_papers_1s.loc[all_papers_1s.categories.apply(lambda x: x[0] in paper_2_categories)].sample(5, random_state=2+rowI).reset_index(drop=True)
    paper_1_nsubj_random5 = all_papers.loc[~all_papers.categories.apply(lambda x: x[0] in (paper_1_categories + paper_2_categories))].sample(5, random_state=3+rowI).reset_index(drop=True)
    paper_2_nsubj_random5 = all_papers.loc[~all_papers.categories.apply(lambda x: x[0] in (paper_1_categories + paper_2_categories))].sample(5, random_state=4+rowI).reset_index(drop=True)

    for i, subrow in paper_2_subj_random5.iterrows():
        random_id = subrow.id
        random_text = concatTitleAbstract(subrow)
        random_category = subrow.categories
        data = addLine(data, arxiv_id, f"{arxiv_id}-pos1-{rowI}", paper_1_text, paper_1_categories, random_id, random_text, random_category, research_type, False)
    
    for i, subrow in paper_1_subj_random5.iterrows():
        random_id = subrow.id
        random_text = concatTitleAbstract(subrow)
        random_category = subrow.categories
        data = addLine(data, arxiv_id, f"{arxiv_id}-pos2-{rowI}", paper_2_text, paper_2_categories, random_id, random_text, random_category, research_type, False)
        
    for i, subrow in paper_1_nsubj_random5.iterrows():
        random_id = subrow.id
        random_text = concatTitleAbstract(subrow)
        random_category = subrow.categories
        data = addLine(data, arxiv_id, f"{arxiv_id}-pos1-{rowI}", paper_1_text, paper_1_categories, random_id, random_text, random_category, research_type, False)

    for i, subrow in paper_2_nsubj_random5.iterrows():
        random_id = subrow.id
        random_text = concatTitleAbstract(subrow)
        random_category = subrow.categories
        data = addLine(data, arxiv_id, f"{arxiv_id}-pos2-{rowI}", paper_2_text, paper_2_categories, random_id, random_text, random_category, research_type, False)

print(data.shape, "->", data.columns)

  return pd.concat([df, pd.DataFrame([{


(189, 10) -> Index(['id', 'a_text', 'b_id', 'b_text', 'b_categories', 'c_id', 'c_text',
       'c_categories', 'research_type', 'y_true'],
      dtype='object')


In [73]:
data.to_pickle("./data/data_D3.pkl")
data = pd.read_pickle("./data/data_D3.pkl")

data

Unnamed: 0,id,a_text,b_id,b_text,b_categories,c_id,c_text,c_categories,research_type,y_true
0,2411.01019,,2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",2411.01019-pos2-0,Anterior mediastinal nodular lesion segmentati...,[cs.AI],applied,True
1,2411.01019,,2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",2502.02377,A Minimax Approach to Ad Hoc Teamwork; We prop...,[cs.AI],applied,False
2,2411.01019,,2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",2502.1961,Program Synthesis Dialog Agents for Interactiv...,[cs.AI],applied,False
3,2411.01019,,2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",2502.1062,ProMRVL-CAD: Proactive Dialogue System with Mu...,[cs.AI],applied,False
4,2411.01019,,2411.01019-pos1-0,Incidental Anterior Mediastinal Nodular Lesion...,"[q-bio.TO, q-bio.CB]",2502.14491,Statistical Scenario Modelling and Lookalike D...,[cs.AI],applied,False
...,...,...,...,...,...,...,...,...,...,...
184,2411.00561,,2411.00561-pos2-8,Retrieval and classification of shape-based ob...,[cs.AI],2502.04492,Multi-Agent Reinforcement Learning with Focal ...,[cs.CL],applied,False
185,2411.00561,,2411.00561-pos2-8,Retrieval and classification of shape-based ob...,[cs.AI],2503.04422,PDX: A Data Layout for Vector Similarity Searc...,"[cs.DB, cs.AI]",applied,False
186,2411.00561,,2411.00561-pos2-8,Retrieval and classification of shape-based ob...,[cs.AI],2502.17914,"Upper Mid-Band Spectrum for 6G: Vision, Opport...",[eess.SP],applied,False
187,2411.00561,,2411.00561-pos2-8,Retrieval and classification of shape-based ob...,[cs.AI],2502.05898,"Rising Marginal Costs, Rising Prices?; We pres...","[econ.GN, q-fin.EC]",applied,False
