# Loading the dataset

In [None]:

from typing import Optional

from datasets import DatasetDict, load_dataset

# Read the raw JSON Lines file with the "json" loader, requesting the "train"
# split so a single dataset object is bound to `dataset`.
dataset = load_dataset("json", data_files="./dataset/raw.jsonl", split="train")

# Bare expression: echoes the dataset when run as a notebook cell.
dataset

# Filtering out short abstracts (< 30 words)

In [None]:
def abstract_has_30_words(x: dict) -> bool:
    """Return True when the example's abstract contains at least 30 words.

    ``x["abstract"]`` is joined with spaces before counting, so it is expected
    to be a sequence of text fragments (presumably one string per paragraph or
    section -- confirm against the raw data). Words are whitespace-separated
    tokens counted across all fragments.

    A ``None`` abstract is treated like an empty one (too short) instead of
    raising ``TypeError``, so the function can be used directly as a filter
    predicate on records with a missing abstract.

    Args:
        x: A single example; must contain the key ``"abstract"``.

    Returns:
        True if the abstract has 30 or more words, False otherwise.
    """
    fragments = x["abstract"]

    # Truthiness covers both None and an empty sequence.
    if not fragments:
        return False

    word_count = len(" ".join(fragments).split())
    return word_count >= 30

# Keep only the rows for which abstract_has_30_words returns True and rebind
# `dataset` to the filtered result.
dataset = dataset.filter(abstract_has_30_words)

# Bare expression: echoes the filtered dataset when run as a notebook cell.
dataset

# Adding text and labels columns

In [None]:
def get_text_and_labels(x: dict) -> dict:
    """Build the ``text`` and ``labels`` fields for a single example.

    ``text`` is the title, a newline, then the abstract fragments joined with
    newlines. ``labels`` is a two-element list ``[positive, negative]``:

    * ``positive`` is set when the example is selected and its type is
      ``"hh"`` or ``"vh"``;
    * ``negative`` is set when the example is not selected.

    A selected example of any other type therefore gets both entries False.

    Args:
        x: A single example with the keys ``"title"``, ``"abstract"`` and
            ``"is_selected"``; ``"type"`` is only read for selected examples.

    Returns:
        A dict with the keys ``"text"`` and ``"labels"``.
    """
    title = x["title"]
    abstract_body = "\n".join(x["abstract"])
    text = title + "\n" + abstract_body

    # `and` short-circuits, so "type" is only looked up for selected examples.
    positive = x["is_selected"] and x["type"] in ("hh", "vh")
    negative = not x["is_selected"]

    return {"text": text, "labels": [positive, negative]}

# Run get_text_and_labels on every row; the "text" and "labels" entries it
# returns are what the following cells rely on.
dataset = dataset.map(get_text_and_labels)

# Bare expression: echoes the mapped dataset when run as a notebook cell.
dataset

# Removing useless columns

In [None]:
# Drop the raw fields listed below. "is_selected" is deliberately not in the
# list: sample_balanced_dataset still reads it from each row.
dataset = dataset.remove_columns(["pmid", "title", "journal", "abstract", "authors"])

# Bare expression: echoes the trimmed dataset when run as a notebook cell.
dataset

# Randomly sampling balanced datasets

In [None]:
def sample_balanced_dataset(
    dataset: DatasetDict,
    num_negative_examples: int = 15000,
    seed: Optional[int] = None,
) -> DatasetDict:
    """Shuffle ``dataset`` and cap the number of non-selected rows it keeps.

    Every row whose ``is_selected`` field is truthy is kept. Rows where it is
    falsy are kept only while fewer than ``num_negative_examples`` of them
    have been seen in the shuffled order; later ones are dropped. The result
    is only balanced to the extent that ``num_negative_examples`` matches the
    number of selected rows.

    NOTE(review): the parameter and return are annotated as ``DatasetDict``,
    but the visible caller passes the object returned by
    ``load_dataset(..., split="train")`` and calls ``.train_test_split`` on
    the result -- the annotation probably should be ``Dataset``; confirm.

    Args:
        dataset: Dataset whose rows expose an ``is_selected`` field.
        num_negative_examples: Maximum number of non-selected rows to keep.
        seed: Optional seed forwarded to ``shuffle`` so a sample can be
            reproduced. The default ``None`` leaves the shuffle unseeded,
            exactly as before this parameter existed.

    Returns:
        The shuffled dataset restricted to the kept rows.
    """
    negatives_seen = 0

    def keep_example(x: dict) -> bool:
        """Keep selected rows always, and the first N non-selected rows."""
        nonlocal negatives_seen

        if x["is_selected"]:
            return True

        negatives_seen += 1
        return negatives_seen <= num_negative_examples

    # The predicate is stateful: it depends on rows being visited one by one
    # in a single process, so filter() is called with its default settings.
    return dataset.shuffle(seed=seed).filter(keep_example)

In [None]:
# Draw five samples as a quick visual sanity check of the sampling step.
for i in range(5):
  # 0.1 is passed as the first positional argument of train_test_split
  # (the test-set size). `datasetI` is rebound on every iteration, so only
  # the last split is still available after the loop.
  datasetI = sample_balanced_dataset(dataset).train_test_split(0.1)
  # Print the first three rows of the held-out split.
  print(datasetI["test"][range(3)])

  