# DATA

### Data format
The output of this notebook is a pickle (.pkl) file containing a pandas DataFrame with at least these columns:
- id, a_text, b_id, b_text, c_id, c_text

In [None]:
# Reading the raw data

import pandas as pd

# Load the raw JSON records into a DataFrame and report its size and columns.
raw_json_path = "./data/many_subject_negative_part_1.json"
data = pd.read_json(raw_json_path)

print(data.shape, "->", data.columns)

In [None]:
# Getting a single string for "paper B" and a single string for "paper C"

def concatTitleAbstract(obj):
    """Build a single text string from a paper record.

    Args:
        obj: Mapping with a 'title' key and an optional 'abstract' key.

    Returns:
        "<title>; <abstract>" when the abstract is present and truthy,
        otherwise the title on its own.
    """
    abstract = obj.get('abstract')
    if not abstract:
        # Missing, None or empty abstract: the title is all we have.
        return obj['title']
    return "{}; {}".format(obj['title'], abstract)

# Create the derived columns up front so each row can be filled in place.
for column in ("id", "a_text", "b_id", "b_text", "c_id", "c_text"):
    data[column] = None

for idx, record in data.iterrows():
    # Columns 0 and 1 of the raw frame hold the two paper records of the pair.
    paper_b = record[0]
    paper_c = record[1]
    text_b = concatTitleAbstract(paper_b)
    text_c = concatTitleAbstract(paper_c)

    derived = {
        "id": f"neg-1s-{idx}",
        "a_text": "",  # these rows carry no "paper A" text
        "b_id": paper_b["paper_id"],
        "b_text": text_b,
        "c_id": paper_c["paper_id"],
        "c_text": text_c,
    }
    for column, value in derived.items():
        data.loc[idx, column] = value

print(data.shape, "->", data.columns)

(100, 8) -> Index([0, 1, 'id', 'a_text', 'b_id', 'b_text', 'c_id', 'c_text'], dtype='object')


In [None]:
# embeddings A B C
from transformers import AutoTokenizer, AutoModel

# The tokenizer and the encoder are loaded from the same SciBERT checkpoint.
scibert_checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(scibert_checkpoint)
model = AutoModel.from_pretrained(scibert_checkpoint)  # optionally: .to("cpu")

def scibertEncode(string):
    """Encode text with the module-level SciBERT tokenizer and model.

    Args:
        string: Text handed to the tokenizer. Inputs are padded and
            truncated to at most 512 tokens.

    Returns:
        The element at index 1 of the model output (for BERT-style models
        this is presumably the pooled output; confirm against the model
        docs). It is computed without autograd tracking, so the tensor
        does not require grad and holds no reference to a backward graph.
    """
    # Local import: torch is not imported elsewhere in this notebook, and the
    # PyTorch-backed model used here already depends on it.
    import torch

    inputs = tokenizer(
        string,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Inference only. Without no_grad every returned embedding would keep the
    # whole forward graph alive for as long as it is stored in the DataFrame.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs[1]

# Each text column gets a matching embedding column.
text_to_embedding = {
    "a_text": "a_scibert",
    "b_text": "b_scibert",
    "c_text": "c_scibert",
}

for embedding_col in text_to_embedding.values():
    data[embedding_col] = None

for idx, record in data.iterrows():
    # Encode all three texts of the row first, then store the results.
    encoded = {
        embedding_col: scibertEncode(record[text_col])
        for text_col, embedding_col in text_to_embedding.items()
    }
    for embedding_col, embedding in encoded.items():
        data.at[idx, embedding_col] = embedding

print(data.shape, "->", data.columns)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(100, 11) -> Index([          0,           1,        'id',    'a_text',      'b_id',
          'b_text',      'c_id',    'c_text', 'a_scibert', 'b_scibert',
       'c_scibert'],
      dtype='object')


In [None]:
# Constant columns applied to every row of this dataset.
for column, default in (("y_true", False), ("research_type", None)):
    data[column] = default

In [None]:
# Persist the frame, then reload it from disk so the displayed value is
# exactly what was written.
pickle_path = "./data/many_subject_negative_part_1.pkl"
data.to_pickle(pickle_path)
data = pd.read_pickle(pickle_path)

data

Unnamed: 0,0,1,id,a_text,b_id,b_text,c_id,c_text,a_scibert,b_scibert,c_scibert,y_true,research_type
0,"{'paper_id': '1206.2966v2', 'title': 'Panel Da...","{'paper_id': '2202.03234v1', 'title': 'General...",neg-1s-0,,1206.2966v2,Panel Data Models with Nonadditive Unobserved ...,2202.03234v1,Generalised norm resolvent convergence: compar...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.4359, grad_fn=<UnbindBackward0>), ...","[[tensor(0.0731, grad_fn=<UnbindBackward0>), t...",False,
1,"{'paper_id': '1506.05620v2', 'title': 'A param...","{'paper_id': '0901.1400v1', 'title': 'Variatio...",neg-1s-1,,1506.05620v2,A parameterized approximation algorithm for th...,0901.1400v1,Variation of quasiconformal mappings on lines;...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.1688, grad_fn=<UnbindBackward0>), ...","[[tensor(0.1216, grad_fn=<UnbindBackward0>), t...",False,
2,"{'paper_id': '1710.01236v6', 'title': 'netgwas...","{'paper_id': '2103.05504v1', 'title': 'Status ...",neg-1s-2,,1710.01236v6,netgwas: An R Package for Network-Based Genome...,2103.05504v1,Status of the wave function of Quantum Mechani...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.4274, grad_fn=<UnbindBackward0>), ...","[[tensor(0.0461, grad_fn=<UnbindBackward0>), t...",False,
3,"{'paper_id': '1011.6268v1', 'title': 'Quantita...","{'paper_id': '1905.10982v1', 'title': 'An Inte...",neg-1s-3,,1011.6268v1,Quantitative Analysis of Bloggers Collective B...,1905.10982v1,An Intelligent Monitoring System of Vehicles o...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.3497, grad_fn=<UnbindBackward0>), ...","[[tensor(0.0673, grad_fn=<UnbindBackward0>), t...",False,
4,"{'paper_id': '1911.00431v2', 'title': 'Composi...","{'paper_id': '1604.05350v1', 'title': 'Countin...",neg-1s-4,,1911.00431v2,Composition of Bhargava's Cubes over Number Fi...,1604.05350v1,Counting and Enumerating Crossing-free Geometr...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(0.3390, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.5561, grad_fn=<UnbindBackward0>), ...",False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"{'paper_id': '2003.09300v1', 'title': 'Graham'...","{'paper_id': '2411.01710v1', 'title': 'SPES: S...",neg-1s-95,,2003.09300v1,Graham's Formula for Valuing Growth Stocks; Be...,2411.01710v1,SPES: Spectrogram Perturbation for Explainable...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.0840, grad_fn=<UnbindBackward0>), ...","[[tensor(-0.4594, grad_fn=<UnbindBackward0>), ...",False,
96,"{'paper_id': '2305.03818v3', 'title': 'The Gen...","{'paper_id': '2305.14131v2', 'title': 'Tempora...",neg-1s-96,,2305.03818v3,The Generalized Makeev Problem Revisited; Base...,2305.14131v2,Temporally Causal Discovery Tests for Discrete...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.3008, grad_fn=<UnbindBackward0>), ...","[[tensor(-0.4231, grad_fn=<UnbindBackward0>), ...",False,
97,"{'paper_id': '1807.05786v4', 'title': 'MIDV-50...","{'paper_id': '2111.14281v1', 'title': 'Passive...",neg-1s-97,,1807.05786v4,MIDV-500: A Dataset for Identity Documents Ana...,2111.14281v1,Passive Indoor Localization with WiFi Fingerpr...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.3237, grad_fn=<UnbindBackward0>), ...","[[tensor(-0.0788, grad_fn=<UnbindBackward0>), ...",False,
98,"{'paper_id': '2407.13220v3', 'title': 'MEDIC: ...","{'paper_id': '2502.01640v1', 'title': 'Study o...",neg-1s-98,,2407.13220v3,MEDIC: Zero-shot Music Editing with Disentangl...,2502.01640v1,Study on the impact of trade policy uncertaint...,"[[tensor(0.2526, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.6015, grad_fn=<UnbindBackward0>), ...","[[tensor(-0.5077, grad_fn=<UnbindBackward0>), ...",False,
