In [1]:
from pathlib import Path
import pandas as pd
import elinker

CONFIG_FILE_PATH: /home/wkt406/miniconda3/envs/torch-env/lib/python3.10/site-packages/elinker/data/config.yml


# Read in Data

In [2]:
# Root directory of the MedMentions data, relative to the notebook location.
base_path = Path("..") / "data" / "MedMentions"

In [3]:
# Corpus variant to process. The value is interpolated into the input corpus
# path and into the names of the parquet files written later in the notebook.
# Swap the active line with the commented one to process the other variant.
corpus = "full"
# corpus = "st21pv"

In [4]:
# Parse the PubTator-format corpus file for the selected corpus variant.
corpus_file = base_path / corpus / "data" / "corpus_pubtator.txt"
med_mentions = elinker.datasets.med_mentions(corpus_file)

# Element 0 of the result is bound to the document frame, element 1 to the
# mention frame.
doc_df, mention_df = med_mentions[0], med_mentions[1]

In [5]:
# Show the (rows, columns) dimensions of the document frame.
doc_df.shape

(4392, 3)

In [6]:
# Show the (rows, columns) dimensions of the mention frame.
mention_df.shape

(352496, 6)

In [7]:
# Count the distinct values in the entity_id column of the mention frame.
len(set(mention_df['entity_id']))

34724

# Assign each PMID to the train, dev, or test split

In [8]:
def _read_pmids(path):
    """Return the PMIDs listed in a text file, one per line.

    Args:
        path: Location of a text file holding one PMID per line.

    Returns:
        list[str]: Every non-blank line with trailing whitespace (including
        the newline) removed, in file order.
    """
    with open(path, encoding="utf-8") as file:
        stripped = (line.rstrip() for line in file)
        # Skip blank lines so a trailing empty line never becomes an empty PMID.
        return [pmid for pmid in stripped if pmid]


# The PMID lists are read from the "full" corpus directory regardless of the
# `corpus` setting.
pmid_dir = base_path / "full" / "data"

all_pmids = _read_pmids(pmid_dir / "corpus_pubtator_pmids_all.txt")
dev_pmids = _read_pmids(pmid_dir / "corpus_pubtator_pmids_dev.txt")
test_pmids = _read_pmids(pmid_dir / "corpus_pubtator_pmids_test.txt")
train_pmids = _read_pmids(pmid_dir / "corpus_pubtator_pmids_trng.txt")

In [9]:
# Label every document with the split its PMID belongs to. Documents whose
# PMID appears in none of the lists keep the initial None.
doc_df['split'] = None

# Applied in this order, so a PMID present in more than one list ends up with
# the label of the last list that contains it.
split_assignments = (
    ('train', train_pmids),
    ('dev', dev_pmids),
    ('test', test_pmids),
)
for split_label, split_pmids in split_assignments:
    in_split = doc_df.pmid.isin(split_pmids)
    doc_df.loc[in_split, 'split'] = split_label

# Show how many documents landed in each split.
doc_df['split'].value_counts()

train    2635
test      879
dev       878
Name: split, dtype: int64

In [10]:
# Persist both frames as parquet files named after the selected corpus,
# without writing the index.
doc_parquet_out = base_path / f"doc_{corpus}.parquet"
mention_parquet_out = base_path / f"mention_{corpus}.parquet"

doc_df.to_parquet(doc_parquet_out, index=False)
mention_df.to_parquet(mention_parquet_out, index=False)

# Read Parquet Files

In [11]:
# Reload the document and mention frames from the parquet files for the
# selected corpus.
doc_parquet_in = base_path / f"doc_{corpus}.parquet"
mention_parquet_in = base_path / f"mention_{corpus}.parquet"

doc_df2 = pd.read_parquet(doc_parquet_in)
mention_df2 = pd.read_parquet(mention_parquet_in)

In [12]:
# Preview the first rows of the reloaded document frame.
doc_df2.head()

Unnamed: 0,pmid,title,abstract,split
0,25763772,DCTN4 as a modifier of chronic Pseudomonas aer...,Pseudomonas aeruginosa (Pa) infection in cysti...,train
1,25847295,Nonylphenol diethoxylate inhibits apoptosis in...,Nonylphenol and short-chain nonylphenol ethoxy...,test
2,26316050,Prevascularized silicon membranes for the enha...,Recent advances in drug delivery and sensing d...,train
3,26406200,Seated maximum flexion: An alternative to stan...,The flexion - relaxation phenomenon (FRP) in s...,train
4,26424709,The Relationship Between Distance and Post-ope...,"To date, there is no research on voluntary med...",train


In [13]:
# Show the (rows, columns) dimensions of the reloaded document frame.
doc_df2.shape

(4392, 4)

In [14]:
# Preview the first rows of the reloaded mention frame.
mention_df2.head()

Unnamed: 0,pmid,start_index,end_index,text_segment,sem_type_id,entity_id
0,25763772,0,5,DCTN4,"T116,T123",C4308010
1,25763772,23,63,chronic Pseudomonas aeruginosa infection,T047,C0854135
2,25763772,67,82,cystic fibrosis,T047,C0010674
3,25763772,83,120,Pseudomonas aeruginosa (Pa) infection,T047,C0854135
4,25763772,124,139,cystic fibrosis,T047,C0010674


In [15]:
# Show the (rows, columns) dimensions of the reloaded mention frame.
mention_df2.shape

(352496, 6)

In [30]:
# Select the mentions whose semantic type is exactly T191.
# The mask is built from mention_df2 itself, so the filter does not depend on
# the separate in-memory mention_df being present and index-aligned.
# NOTE(review): this is an exact string match, so a sem_type_id that lists
# several types in one value (e.g. "T116,T123") is not selected. Confirm that
# is intended.
t191_mentions = mention_df2[mention_df2['sem_type_id'] == 'T191']

In [32]:
# Distinct PMIDs of the documents that contain at least one selected mention.
pmids = set(t191_mentions["pmid"])

In [35]:
# Keep the document rows whose PMID is in `pmids`. Despite its name,
# t191_pmids holds rows of doc_df2 (whole documents), not bare PMIDs.
has_selected_mention = doc_df2["pmid"].isin(pmids)
t191_pmids = doc_df2.loc[has_selected_mention]

In [38]:
# Display the selected documents that belong to the training split.
t191_pmids.loc[t191_pmids["split"].eq("train")]

Unnamed: 0,pmid,title,abstract,split
13,26944725,Impact of totally laparoscopic combined manage...,Thanks to widespread diffusion of minimally in...,train
17,27086366,The Role of TRAF4 and B3GAT1 Gene Expression i...,Mastocytosis is an uncommon disease classified...,train
47,27237979,MiR-211 is epigenetically regulated by DNMT1 m...,MiR-211 has strong inhibitive effects on melan...,train
70,27246120,Assessment of laparoscopic stomach preserving ...,Along with the marked increase in early gastri...,train
87,27252416,Heterogeneous Mechanisms of Primary and Acquir...,To identify novel mechanisms of resistance to ...,train
...,...,...,...,...
4366,28544601,The impact of noninvasive follicular thyroid n...,A recent revision in thyroid tumor nomenclatur...,train
4370,28545358,Molecular Imaging of Tumor Angiogenesis and Th...,Angiogenesis is critical for the growth of tum...,train
4372,28545416,The role of Indonesian patients' health behavi...,"With an estimated 13,000 newly diagnosed patie...",train
4379,28548949,Dietary grape seed proanthocyanidins inactivat...,Ultraviolet B (UVB) radiation induces regulato...,train
