# Loading the dataset

In [1]:

from datasets import Dataset, DatasetDict, load_dataset

# Read the raw JSON Lines export. Passing split="train" yields a single
# Dataset (see the repr below) instead of a mapping of splits.
dataset = load_dataset("json", data_files="./dataset/raw.jsonl", split="train")

# Bare expression: display the dataset summary (features and row count).
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['type', 'pmid', 'is_selected', 'title', 'journal', 'abstract', 'authors'],
    num_rows: 93560
})

# Filtering the short abstracts (< 30 words)

In [2]:
def abstract_has_30_words(x: dict) -> bool:
  """Tell whether a record's abstract holds at least 30 words.

  `x["abstract"]` is treated as a sequence of text pieces; the pieces are
  joined with spaces and split on whitespace to count words. An empty
  sequence is rejected outright.
  """
  pieces = x["abstract"]
  if len(pieces) == 0:
    return False

  word_count = len(" ".join(pieces).split())
  return 30 <= word_count

# Keep only the rows for which abstract_has_30_words returns True.
# NOTE: this rebinds `dataset`, so re-running the cell filters the already
# filtered data (harmless here, since the predicate is stable).
dataset = dataset.filter(abstract_has_30_words)

# Display the summary to check the remaining row count.
dataset

Filter:   0%|          | 0/93560 [00:00<?, ? examples/s]

Dataset({
    features: ['type', 'pmid', 'is_selected', 'title', 'journal', 'abstract', 'authors'],
    num_rows: 92719
})

# Adding text and labels columns

In [3]:
def get_text_and_labels (x: dict) -> dict:
    """Build the `text` and `labels` fields for one record.

    `text` is the title, a newline, then the abstract pieces joined by
    newlines. `labels` is a two-element list: the first entry is set when the
    record is selected and its type is "hh" or "vh", the second when the
    record is not selected.
    """
    heading = x["title"] + "\n"
    body = "\n".join(x["abstract"])

    selected = x["is_selected"]
    # `and` short-circuits, so `type` is only consulted for selected records.
    positive = selected and x["type"] in ("hh", "vh")
    negative = not selected

    return {"text": heading + body, "labels": [positive, negative]}

# Run get_text_and_labels on every row; the keys it returns ("text" and
# "labels") show up as new columns in the summary below.
dataset = dataset.map(get_text_and_labels)

# Display the summary to confirm the two added columns.
dataset

Map:   0%|          | 0/92719 [00:00<?, ? examples/s]

Dataset({
    features: ['type', 'pmid', 'is_selected', 'title', 'journal', 'abstract', 'authors', 'text', 'labels'],
    num_rows: 92719
})

# Removing useless columns

In [4]:
# Drop title/abstract (already merged into `text`) together with pmid,
# journal and authors. `type` and `is_selected` are kept; `is_selected` is
# read by the sampling step further down.
dataset = dataset.remove_columns(["pmid", "title", "journal", "abstract", "authors"])

# Display the summary to confirm the remaining columns.
dataset

Dataset({
    features: ['type', 'is_selected', 'text', 'labels'],
    num_rows: 92719
})

# Randomly sample balanced datasets

In [5]:
def sample_balanced_dataset(
  dataset: "Dataset",
  num_negative_examples: int = 15000,
  seed: "int | None" = None,
) -> "Dataset":
  """Shuffle `dataset` and cap the number of non-selected (negative) rows.

  Every row whose `is_selected` is truthy is kept. Of the remaining rows, only
  the first `num_negative_examples` met after shuffling are kept.

  Args:
    dataset: Dataset exposing an `is_selected` column.
    num_negative_examples: Maximum number of non-selected rows to keep.
    seed: Optional seed forwarded to `shuffle` so the same sample can be drawn
      again. `None` (the default) keeps the previous unseeded behaviour.

  Returns:
    The shuffled dataset restricted to the kept rows.
  """
  count_negative_examples = 0

  def filter_negative_examples(x: dict) -> bool:
    """Keep every selected row and only the first few non-selected ones."""
    nonlocal count_negative_examples

    if x["is_selected"]:
      return True

    # Stateful predicate: negatives are counted in visiting order, so this
    # relies on `filter` calling it row by row in a single process (no num_proc).
    count_negative_examples += 1

    return count_negative_examples <= num_negative_examples

  return dataset.shuffle(seed=seed).filter(filter_negative_examples)

In [6]:
# Draw five samples, each followed by a train/test split with a 0.1 test
# fraction. `datasetI` is overwritten on every pass, so only the last split
# remains after the loop.
# NOTE(review): neither the sampling nor the split is seeded here, so the
# printed rows differ between runs.
for i in range(5):
  datasetI = sample_balanced_dataset(dataset).train_test_split(0.1)
  # Indexing with range(3) returns the first three test rows as a dict of columns.
  print(datasetI["test"][range(3)])

  

Filter:   0%|          | 0/92719 [00:00<?, ? examples/s]

{'type': ['vh', 'vh', 'vh'], 'is_selected': [False, True, True], 'text': ["Inhibition Potencies of Phytochemicals Derived from Sesame Against SARS-CoV-2 Main Protease: A Molecular Docking and Simulation Study.\nThe ongoing COVID-19 pandemic, caused by SARS-CoV-2, has now spread across the nations with high mortality rates and multifaceted impact on human life. The proper treatment methods to overcome this contagious disease are still limited. The main protease enzyme (M<sup>pro</sup>, also called 3CL<sup>pro</sup>) is essential for viral replication and has been considered as one of the potent drug targets for treating COVID-19. In this study, virtual screening was performed to find out the molecular interactions between 36 natural compounds derived from sesame and the M<sup>pro</sup> of COVID-19. Four natural metabolites, namely, sesamin, sesaminol, sesamolin, and sesamolinol have been ranked as the top interacting molecules to M<sup>pro</sup> based on the affinity of molecular dockin

Filter:   0%|          | 0/92719 [00:00<?, ? examples/s]

{'type': ['vh', 'hh', 'hh'], 'is_selected': [False, True, True], 'text': ['Cytomegalovirus and HIV: A Dangerous Pas de Deux.\nHuman immunodeficiency virus (HIV)-infected adults who take stable antiretroviral therapy (ART) are at risk for early onset of age-related diseases. This is likely due to a complex interaction between traditional risk factors, HIV infection itself, and other factors, such as underlying immune dysfunction and persistent inflammation. HIV disrupts the balance between the host and coinfecting microbes, worsening control of these potential pathogens. For example, HIV-infected adults are more likely than the general population to have subclinical bursts of cytomegalovirus (CMV) replication at mucosal sites. Production of antigens can activate the immune system and stimulate HIV replication, and it could contribute to the pathogenesis of adverse outcomes of aging, like cardiovascular disease and neurocognitive impairment. Further investigation of the relationships bet

Filter:   0%|          | 0/92719 [00:00<?, ? examples/s]

{'type': ['vh', 'vh', 'hh'], 'is_selected': [False, False, True], 'text': ["KSHV MicroRNAs Repress Tropomyosin 1 and Increase Anchorage-Independent Growth and Endothelial Tube Formation.\nKaposi's sarcoma (KS) is characterized by highly vascularized spindle-cell tumors induced after infection of endothelial cells by Kaposi's sarcoma-associated herpesvirus (KSHV). In KS tumors, KSHV expresses only a few latent proteins together with 12 pre-microRNAs. Previous microarray and proteomic studies predicted that multiple splice variants of the tumor suppressor protein tropomyosin 1 (TPM1) were targets of KSHV microRNAs. Here we show that at least two microRNAs of KSHV, miR-K2 and miR-K5, repress protein levels of specific isoforms of TPM1. We identified a functional miR-K5 binding site in the 3' untranslated region (UTR) of one TPM1 isoform. Furthermore, the inhibition or loss of miR-K2 or miR-K5 restores expression of TPM1 in KSHV-infected cells. TPM1 protein levels were also repressed in KS

Filter:   0%|          | 0/92719 [00:00<?, ? examples/s]

{'type': ['hh', 'vh', 'vh'], 'is_selected': [True, False, True], 'text': ['Crystal structure of a soluble decoy receptor IL-22BP bound to interleukin-22.\nInterleukin-22 (IL-22) plays an important role in the regulation of immune and inflammatory responses in mammals. The IL-22 binding protein (IL-22BP), a soluble receptor that specifically binds IL-22, prevents the IL-22/interleukin-22 receptor 1 (IL-22R1)/interleukin-10 receptor 2 (IL-10R2) complex assembly and blocks IL-22 biological activity. Here we present the crystal structure of the IL-22/IL-22BP complex at 2.75 A resolution. The structure reveals IL-22BP residues critical for IL-22 binding, which were confirmed by site-directed mutagenesis and functional studies. Comparison of IL-22/IL-22BP and IL-22/IL-22R1 crystal structures shows that both receptors display an overlapping IL-22 binding surface, which is consistent with the inhibitory role played by IL-22 binding protein.', 'The role of viruses in the inception of sinusitis.

Filter:   0%|          | 0/92719 [00:00<?, ? examples/s]

{'type': ['vh', 'vh', 'vh'], 'is_selected': [True, False, False], 'text': ["SARS-CoV-2 spike engagement of ACE2 primes S2' site cleavage and fusion initiation.\nThe COVID-19 pandemic caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infection has resulted in tremendous loss worldwide. Although viral spike (S) protein binding of angiotensin-converting enzyme 2 (ACE2) has been established, the functional consequences of the initial receptor binding and the stepwise fusion process are not clear. By utilizing a cell-cell fusion system, in complement with a pseudoviral infection model, we found that the spike engagement of ACE2 primed the generation of S2' fragments in target cells, a key proteolytic event coupled with spike-mediated membrane fusion. Mutagenesis of an S2' cleavage site at the arginine (R) 815, but not an S2 cleavage site at arginine 685, was sufficient to prevent subsequent syncytia formation and infection in a variety of cell lines and primary cells is