In [1]:
import os
import json
import numpy as np
from tqdm import tqdm
import pandas as pd

import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Task configuration: one JSON object per line describing an evaluation task.
tasks_config = 'config/Doc2Vec/load.jsonl'

# Doc2Vec artefacts: the vector matrix and the file listing one id per row.
embedding_loc = 'data/s2ag/processed/d2v'
embedding_name = 'd2v_64d_20e.pkl.dv.vectors.npy'
ids_name = 'ids_en_core_web_lg_1950-2024.csv'

# Output file: same stem as the vectors file, with a .jsonl extension.
save_path = 'embeddings/Doc2Vec/' + embedding_name.replace(".pkl.dv.vectors.npy", ".jsonl")

In [3]:
# Read the id file as a 1-D array of strings (one entry per line of the file).
ids_path = os.path.join(embedding_loc, ids_name)
ids = np.loadtxt(ids_path, dtype=str)
ids

array(['246361431', '246361431', '246361431', ..., '4227846', '4227846',
       '25048087'], dtype='<U9')

In [4]:
# Memory-map the vector matrix instead of reading it into RAM: the file holds
# tens of millions of rows, while only the rows matched to task papers are
# read further down. A read-only memmap still supports `.shape` and integer
# row indexing, so the cells below work unchanged.
embds = np.load(os.path.join(embedding_loc, embedding_name), mmap_mode='r')
embds.shape

(49974674, 64)

In [5]:
# Inspect the splits/features of the 'relish' configuration.
datasets.load_dataset(path='allenai/scirepeval', name='relish')

DatasetDict({
    evaluation: Dataset({
        features: ['query', 'candidates'],
        num_rows: 3190
    })
})

In [6]:
# Inspect the splits/features of the 'pub_year' configuration.
datasets.load_dataset(path='allenai/scirepeval', name='pub_year')

DatasetDict({
    evaluation: Dataset({
        features: ['doc_id', 'corpus_id', 'title', 'abstract', 'year', 'venue', 'norm_year', 'scaled_year', 'n_authors', 'norm_authors'],
        num_rows: 30000
    })
    train: Dataset({
        features: ['doc_id', 'corpus_id', 'title', 'abstract', 'year', 'venue', 'norm_year', 'scaled_year', 'n_authors', 'norm_authors'],
        num_rows: 198995
    })
    validation: Dataset({
        features: ['doc_id', 'corpus_id', 'title', 'abstract', 'year', 'venue', 'norm_year', 'scaled_year', 'n_authors', 'norm_authors'],
        num_rows: 19869
    })
})

In [20]:
def _dataset_ref(spec):
    """Normalise a dataset spec from the task config to a ``(name, config)`` pair.

    Accepts a dict with ``"name"`` and ``"config"`` keys, a bare dataset name
    (string, config becomes ``None``), or an already-built two-item sequence.
    """
    if isinstance(spec, dict):
        return spec["name"], spec["config"]
    if isinstance(spec, str):
        # Indexing a string with [0]/[1] would hand single characters to
        # load_dataset, so a bare name is paired with "no config" instead.
        return spec, None
    return spec[0], spec[1]


def _id_strings(series):
    """Return the non-null values of ``series`` converted to strings."""
    values = series.dropna()
    if pd.api.types.is_float_dtype(values) and (values % 1 == 0).all():
        # Integer ids stored in a float column would stringify as '123.0'
        # and never match the id file; restore the integer form first.
        values = values.astype('int64')
    return values.astype(str)


# Collect the test (and, when present, train) split of every configured task.
maps = []
with open(tasks_config, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
            continue
        task = json.loads(line)
        print(task)
        task_name = task.get("name", "<unnamed>")
        kwargs = {}
        task_data = task["data"]

        metadata = task_data.get("meta")
        if not metadata:
            raise ValueError(f"Task {task_name} has no test metadata")
        kwargs["meta_dataset"] = _dataset_ref(metadata)
        data = datasets.load_dataset(*kwargs["meta_dataset"])
        if 'evaluation' in data:
            # Preview only; the metadata split is not used for the id list.
            print(data['evaluation'].to_pandas().head())

        testdata = task_data.get("test")
        if testdata:
            kwargs["test_dataset"] = _dataset_ref(testdata)
        elif isinstance(metadata, dict):
            # No explicit test dataset: fall back to the metadata dataset.
            kwargs["test_dataset"] = (metadata["name"], metadata["config"])
        else:
            raise ValueError(f"Task {task_name} has no test data")

        data = datasets.load_dataset(*kwargs["test_dataset"])
        maps.append(data["test"].to_pandas())
        if 'train' in data:
            maps.append(data["train"].to_pandas())

# Gather every paper id referenced by the tasks. Ids are stringified per frame,
# before concatenation, so that columns missing from some frames cannot upcast
# integer ids to float. Order matches a column-wise scan of the stacked frames:
# all query ids, then candidate ids, then paper ids.
id_parts = [
    _id_strings(frame[column])
    for column in ('query_id', 'cand_id', 'paper_id')
    for frame in maps
    if column in frame.columns
]
# NOTE: `map` shadows the builtin; the name is kept because later cells use it.
map = ((pd.concat(id_parts) if id_parts else pd.Series(dtype=str))
        .drop_duplicates()
        .to_frame('paper_id')
        .reset_index(drop=True))
map.head()

{'name': 'RELISH', 'type': 'proximity', 'data': {'meta': {'name': 'allenai/scirepeval', 'config': 'relish'}, 'test': {'name': 'allenai/scirepeval_test', 'config': 'relish'}}, 'embeddings': {'load': 'embeddings/Doc2Vec/d2v_64d_20e.jsonl'}, 'metrics': ['ndcg']}
                                               query  \
0  {'doc_id': '22569528', 'title': 'ERK1/2 MAP ki...   
1  {'doc_id': '23613754', 'title': 'ERK2 suppress...   
2  {'doc_id': '29409062', 'title': 'Novel overlap...   
3  {'doc_id': '29360039', 'title': 'Crk proteins ...   
4  {'doc_id': '27461729', 'title': 'Systematic id...   

                                          candidates  
0  [{'doc_id': '17928366', 'title': 'MEK1 and MEK...  
1  [{'doc_id': '18818436', 'title': 'Analysis of ...  
2  [{'doc_id': '18443018', 'title': 'PEPITO: impr...  
3  [{'doc_id': '17901128', 'title': 'Fibroblast g...  
4  [{'doc_id': '19136617', 'title': 'Genome-wide ...  
{'name': 'NFCorpus', 'type': 'adhoc_search', 'data': {'meta': {'name': 'a

Unnamed: 0,paper_id
0,22569528
1,23613754
2,29409062
3,29360039
4,27461729


In [14]:
# Lookup table from id string to its position in `ids`: after reset_index the
# 'index' column holds the position and 'id' holds the id string.
ids_df = pd.Series(ids.astype(str), name='id').to_frame().reset_index()
ids_df.head()

Unnamed: 0,index,id
0,0,246361431
1,1,246361431
2,2,246361431
3,3,52960022
4,4,245707429


In [15]:
# Inner-join the task paper ids with the id lookup table.
# The id file contains repeated ids, so merging against it as-is would emit
# several rows (and later several output lines) for one paper. Keep a single
# row per id - the first occurrence.
# NOTE(review): which of the duplicated rows carries the preferred vector
# cannot be determined here - confirm that "first" is the right choice.
ids_unique = ids_df.drop_duplicates(subset='id')
merged = map.merge(ids_unique, left_on='paper_id', right_on='id')
merged.head()

Unnamed: 0,paper_id,index,id
0,22569528,40794419,22569528
1,23613754,45656758,23613754
2,29360039,37693165,29360039
3,25704641,30908110,25704641
4,29329231,4193973,29329231


In [16]:
# Share of task papers that have an embedding. Distinct matched papers are
# counted rather than merged rows, because an id that occurs several times in
# the id file yields several merged rows and would inflate the ratio.
merged['paper_id'].nunique() / map.shape[0]

0.4680604282469799

In [17]:
# Write one JSON object per matched paper: {"doc_id": ..., "embedding": [...]}.
# Make sure the output directory exists; open() does not create it.
out_dir = os.path.dirname(save_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

with open(save_path, 'w', encoding='utf-8') as fout:
    # Iterate the two needed columns directly; this avoids building a Series
    # per row as iterrows() does.
    pairs = zip(merged['paper_id'], merged['index'])
    for paper_id, row_ix in tqdm(pairs, total=merged.shape[0]):
        record = {"doc_id": str(paper_id), "embedding": embds[row_ix].tolist()}
        fout.write(json.dumps(record) + '\n')

100%|██████████| 211861/211861 [00:07<00:00, 29338.55it/s]
