# Loading the raw dataset

In [None]:
from datasets import load_dataset

# Read the raw JSON Lines file with the generic "json" loader, requesting the
# "train" split so that a single dataset object is bound to `dataset_raw`.
dataset_raw = load_dataset("json", data_files="./datasets/raw.jsonl", split="train")

# Bare expression: shown as the cell output.
dataset_raw

# Remove hh examples and type column

The HH examples were not manually curated; they are exclusively positive examples extracted from high-throughput publications.

In [None]:
# Keep only the examples whose type is "vh", then drop the "type" column, which
# holds a single value once the filter has been applied.
dataset_formated = (
    dataset_raw
    .filter(lambda example: example["type"] == "vh")
    .remove_columns(["type"])
)

dataset_formated

# Format the examples

Several preprocessing steps are applied to the data:

- Abstract paragraphs are joined into a single paragraph.
- HTML character entities are unescaped into Unicode characters (e.g., Greek letters in biomedical terms), and HTML tags are removed.
- The number of words in the title, abstract, and the total word count are added for convenience.

In [None]:
import html
from bs4 import BeautifulSoup


def format_text(x):
    """Return `x` as plain text: HTML entities decoded and HTML tags removed.

    Args:
        x: the text to clean (passed to `html.unescape`).

    Returns:
        The text content extracted by BeautifulSoup's "html.parser" parser.
    """
    # Entities are decoded *before* parsing, so escaped markup such as
    # "&lt;i&gt;" is turned into a real tag and stripped as well.
    return BeautifulSoup(html.unescape(x), "html.parser").get_text()


def format_exemples(x):
    """Clean the title and abstract of one example and attach word counts.

    The title is cleaned with `format_text`. The abstract, a list of parts, is
    joined with single spaces into one string and cleaned the same way. Word
    counts are the number of whitespace-separated tokens of the cleaned texts.

    The example `x` is updated in place and returned, with "title" and
    "abstract" replaced and "title_num_words", "abstract_num_words" and
    "total_num_words" added.
    """
    clean_title = format_text(x["title"])
    clean_abstract = format_text(" ".join(x["abstract"]))

    n_title = len(clean_title.split())
    n_abstract = len(clean_abstract.split())

    x["title"] = clean_title
    x["abstract"] = clean_abstract
    x["title_num_words"] = n_title
    x["abstract_num_words"] = n_abstract
    x["total_num_words"] = n_title + n_abstract
    return x


# Apply `format_exemples` to every example of the filtered dataset.
# Note the spelling: the input is `dataset_formated`, the result `dataset_formatted`.
dataset_formatted = dataset_formated.map(format_exemples)

# Bare expression: shown as the cell output.
dataset_formatted

# Filter the dataset by title and abstract size

Only articles with a non-empty title and an abstract of at least 30 words are retained.

In [None]:
def filter_examples(x):
    """Tell whether an example has a usable title and a long enough abstract.

    Returns True only when "title_num_words" is not 0 and "abstract_num_words"
    is at least 30; the abstract count is not read when the title is empty.
    """
    has_title = x["title_num_words"] != 0
    return bool(has_title and x["abstract_num_words"] >= 30)


# Drop the examples rejected by `filter_examples`. The name is rebound, so after
# this cell `dataset_formatted` refers to the filtered dataset.
dataset_formatted = dataset_formatted.filter(filter_examples)

# Bare expression: shown as the cell output.
dataset_formatted

# Inspect balance

The dataset is highly unbalanced, with nearly 20 times more negative examples than positive ones.

In [None]:
import matplotlib.pyplot as plt

# filters for each label.
def pos_filter(x):
    """Return the example's "is_selected" value (truthy for a positive example)."""
    return x["is_selected"]


def neg_filter(x):
    """Return True when the example's "is_selected" value is falsy (negative example)."""
    return not x["is_selected"]

# Separate the formatted dataset by label.
pos = dataset_formatted.filter(pos_filter)
neg = dataset_formatted.filter(neg_filter)

# Count each label once; the counts feed both the wedge sizes and the labels.
num_pos, num_neg = len(pos), len(neg)

fig, ax = plt.subplots()
ax.pie([num_pos, num_neg], labels=[f"pos {num_pos}", f"neg {num_neg}"])
ax.set_title("Number of examples per label")

plt.tight_layout()

# Create train, eval and test splits

- Evaluation and Test Splits: 10% of the dataset is randomly sampled twice to create evaluation and test splits. The original class imbalance is intentionally preserved to ensure that evaluations reflect real-world conditions.

- Training Split: To fully utilize the negative examples, a synthetic balanced training split is created by repeating the positive examples approximately 20 times to match the number of negative examples. Training will be conducted over a single epoch, where each negative example appears once, while positive examples are repeated multiple times.

In [None]:
from datasets import DatasetDict, Dataset, concatenate_datasets

def split_dataset(d: Dataset, n: int | float, seed=42) -> tuple[Dataset, Dataset]:
    """Split `d` in two with `train_test_split` and return (train, test).

    `n` is forwarded as the test size: an int is an absolute number of examples,
    a float is a fraction of the dataset. `seed` is forwarded to the split.
    """
    parts = d.train_test_split(test_size=n, seed=seed)
    train_part, test_part = parts["train"], parts["test"]
    return train_part, test_part


# helper function to sample a dataset into train, eval and test datasets.
def sample_dataset(dataset: Dataset, test_ratio: float = 0.1, seed=42) -> DatasetDict:
    """Split `dataset` into a balanced train split and imbalanced eval/test splits.

    Positive and negative examples (selected with `pos_filter` / `neg_filter`)
    are split separately: `int(count * test_ratio)` examples of each label go to
    the test split and the same number to the eval split, so both keep the
    original class ratio. The remaining positive examples are then repeated a
    whole number of times so that the train split contains roughly as many
    positive as negative examples.

    Args:
        dataset: the examples to split.
        test_ratio: fraction of each label held out for the test split, and
            again for the eval split.
        seed: seed used for every split and every shuffle.

    Returns:
        A DatasetDict with the "train", "eval" and "test" splits.

    Raises:
        ValueError: if `test_ratio` leaves no positive or no negative example
            for the held-out splits.
    """
    # get the examples of each label.
    pos_train = dataset.filter(pos_filter)
    neg_train = dataset.filter(neg_filter)

    # number of examples of each label held out for test (and again for eval).
    pos_num_test = int(len(pos_train) * test_ratio)
    neg_num_test = int(len(neg_train) * test_ratio)

    # fail early with a clear message instead of an opaque error further down.
    if pos_num_test == 0 or neg_num_test == 0:
        raise ValueError(
            f"test_ratio={test_ratio} yields an empty held-out split "
            f"({pos_num_test} positive, {neg_num_test} negative examples)."
        )

    # hold out the test examples.
    pos_train, pos_test = split_dataset(pos_train, pos_num_test, seed)
    neg_train, neg_test = split_dataset(neg_train, neg_num_test, seed)

    # hold out the same absolute number of examples for evaluation.
    pos_train, pos_eval = split_dataset(pos_train, pos_num_test, seed)
    neg_train, neg_eval = split_dataset(neg_train, neg_num_test, seed)

    # number of times the positive train examples are repeated. It is computed
    # from the actual train splits (not from the truncated held-out counts) so
    # that the balance holds for small datasets too, and it is at least 1 so
    # that the positive examples are never dropped from the train split.
    repeats = max(1, len(neg_train) // len(pos_train))
    dataset_train = concatenate_datasets([pos_train] * repeats + [neg_train]).shuffle(
        seed=seed
    )

    # eval and test keep the original class imbalance.
    dataset_eval = concatenate_datasets([pos_eval, neg_eval]).shuffle(seed=seed)
    dataset_test = concatenate_datasets([pos_test, neg_test]).shuffle(seed=seed)

    # return a single dataset.
    return DatasetDict(
        {
            "train": dataset_train,
            "eval": dataset_eval,
            "test": dataset_test,
        }
    )
# sample train, eval and test datasets, using the defaults of `sample_dataset`
# (test_ratio=0.1, seed=42).
dataset = sample_dataset(dataset_formatted)

# Inspect the dataset

In [None]:
# Separate every split by label (positive first, then negative).
pos_train, neg_train = (dataset["train"].filter(f) for f in (pos_filter, neg_filter))
pos_eval, neg_eval = (dataset["eval"].filter(f) for f in (pos_filter, neg_filter))
pos_test, neg_test = (dataset["test"].filter(f) for f in (pos_filter, neg_filter))

# For each split, print the size of both labels followed by the first ten
# examples of each label.
for pos_split, neg_split in (
    (pos_train, neg_train),
    (pos_eval, neg_eval),
    (pos_test, neg_test),
):
    print(len(pos_split), len(neg_split))
    print(pos_split[range(10)])
    print(neg_split[range(10)])

# Visualize the dataset

The dataset splits align with expectations, and the word count distribution is similar across categories. Additionally, the mean word count is around 280 for each category, which is well-suited for language models handling sequences up to 512 tokens.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def _plot_label_counts(ax, title, pos_split, neg_split):
    """Draw on `ax` a pie chart of the positive / negative example counts."""
    num_pos, num_neg = len(pos_split), len(neg_split)
    ax.set_title(title)
    ax.pie([num_pos, num_neg], labels=[f"pos {num_pos}", f"neg {num_neg}"])


def _plot_word_counts(ax, title, split, y_max):
    """Draw on `ax` the histogram of `split["total_num_words"]` with its mean.

    The histogram covers 0 to 500 words in 50 bins, the y axis is fixed to
    (0, y_max), and the mean is shown as a dashed red line with its value
    written next to it at 80% of the axis height.
    """
    ax.set_ylim(0, y_max)
    ax.set_title(title)

    word_counts = split["total_num_words"]
    mean = np.mean(word_counts)
    ax.hist(word_counts, histtype="bar", range=(0, 500), bins=50)
    ax.axvline(mean, color="r", linestyle="--")
    # place the label slightly to the right of the mean line.
    ax.text(mean + mean / 20, y_max * 0.8, f"{mean:.2f}", color="r")


# Grid: label balance on the first row, word counts of the positive examples
# on the second row and of the negative examples on the third row.
fig, (pies, hists1, hists2) = plt.subplots(3, 3, figsize=(15, 7))

# One column per split:
# (name, positive examples, negative examples, y limit pos, y limit neg)
panels = [
    ("train", pos_train, neg_train, 6000, 6000),
    ("eval", pos_eval, neg_eval, 60, 700),
    ("test", pos_test, neg_test, 60, 700),
]

for col, (name, pos_split, neg_split, pos_y_max, neg_y_max) in enumerate(panels):
    _plot_label_counts(pies[col], name.capitalize(), pos_split, neg_split)
    _plot_word_counts(hists1[col], f"Num words {name} pos", pos_split, pos_y_max)
    _plot_word_counts(hists2[col], f"Num words {name} neg", neg_split, neg_y_max)

fig.tight_layout()

# Save the dataset to disk

In [None]:
# Write the train / eval / test splits under ./datasets/abstracts.hf.
dataset.save_to_disk("./datasets/abstracts.hf")