For the manipulation of Dataset and DatasetDict objects, we'll be using the Drug Review Dataset, which contains patient reviews on various drugs, along with the condition being treated and a 10-star rating of the patient's satisfaction.

To download and extract the data:

!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [None]:
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
#A small random sample of the dataset

drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]


The response of the request is:

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than an elevated blood pressure.  I had severe knee and ankle pain which completely went away after taking Mobic.  I attempted to stop the medication however pain returned after a few days."'],
 'rating': [9.0, 3.0, 10.0],
 'date': ['September 2, 2015', 'November 7, 2011', 'June 5, 2013'],
 'usefulCount': [36, 13, 128]}


In [None]:
# To test the patient-ID hypothesis for the "Unnamed: 0" column, we can use the
# Dataset.unique() function to verify that the number of IDs matches the number
# of rows in each split.
# NOTE: the column name contains a space after the colon ("Unnamed: 0"), exactly
# as it appears in the sample printed from the dataset.

for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

# Since it confirms our hypothesis, we rename that column to something more interpretable

drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


In [None]:
# To normalize all the condition labels, first we need to drop the entries in the
# condition column that are None (calling .lower() on None would raise).

drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

# Now we can normalize the column using Dataset.map()


def lowercase_condition(example):
    """Return the example's condition label converted to lowercase.

    Args:
        example: A single row with a non-None "condition" string.

    Returns:
        A dict with the "condition" key holding the lowercased label; because
        the key matches an existing column, Dataset.map() overwrites it.
    """
    return {"condition": example["condition"].lower()}


drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

In [None]:
#A simple function that counts the number of words in each review
def compute_review_length(example):
    """Count the whitespace-separated words in a single review.

    Args:
        example: A mapping with a "review" string.

    Returns:
        A dict with one key, "review_length", holding the word count.
    """
    words = example["review"].split()
    word_count = len(words)
    return {"review_length": word_count}

#It returns a dictionary whose key does not correspond to any of the column names in the dataset

In [None]:
# Add the review_length column first: the filter below reads it, so it must
# exist in the dataset before filtering.
drug_dataset = drug_dataset.map(compute_review_length)
# We'll use the Dataset.filter() function to remove short reviews, keeping only
# those with more than 30 words
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

In [None]:
#We have to unescape html character codes in our reviews

import html

# "&#039;" is the HTML numeric character reference for an apostrophe
text = "I&#039;m feeling better now"
print(html.unescape(text))

#The response should be "I'm feeling better now"


#We'll use Dataset.map() to unescape all the HTML characters in our corpus

drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})


In [None]:
#Here is another way to unescape all HTML characters, but using batched=True

new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched = True
)

#This command executes way faster than the previous one because list comprehensions are usually
# faster than executing the same code in a for loop, and we also gain some performance by accessing
# lots of elements at the same time instead of one by one

# Using Dataset.map() with batched=True will be essential to unlock the speed of the "fast" tokenizer

In [None]:
# To tokenize all the drug reviews with a fast tokenizer, we could use a function like this:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "review" field with the module-level `tokenizer`.

    Args:
        examples: A mapping whose "review" entry is passed straight to the
            tokenizer.

    Returns:
        Whatever the tokenizer returns when called with truncation enabled.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [None]:
# To enable multiprocessing, use the num_proc argument and specify the number of processes to use
# in your call to dataset.map()
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    """Tokenize the "review" field with the module-level `slow_tokenizer`.

    Args:
        examples: A mapping whose "review" entry is passed straight to the
            tokenizer.

    Returns:
        Whatever the slow tokenizer returns when called with truncation enabled.
    """
    reviews = examples["review"]
    return slow_tokenizer(reviews, truncation=True)


tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

In [None]:
# Here we will tokenize our examples and truncate them to a maximum length of 128, but we will ask the tokenizer 
# to return all the chunks of the texts instead of just the first one. This can be done with return_overflowing_tokens=True:

def tokenize_and_split(examples):
    """Tokenize reviews into chunks of at most 128 tokens, keeping every chunk.

    Args:
        examples: A mapping whose "review" entry is passed to the module-level
            `tokenizer`.

    Returns:
        The tokenizer output; with return_overflowing_tokens=True the text
        beyond the first 128 tokens is returned as extra chunks rather than
        being dropped.
    """
    chunking_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples["review"], **chunking_options)

result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

The result of the test on one example is [128, 49].
When we run it on the whole dataset, we get an error because the lengths of the columns do not match: the drug_dataset columns have a certain number of examples, but the tokenized_dataset we are building will have more. That doesn't work for a dataset, so we need to either remove the columns from the old dataset or make them the same size as they are in the new dataset.
We can do the former with the remove_columns argument

In [None]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

# Now this works without error. 
#  We can check that our new dataset has many more elements than the original dataset by comparing the lengths

print(len(tokenized_dataset["train"]), len(drug_dataset["train"]))

# Prints: 206772 138514


In [None]:
#Another way to deal with the mismatched length problem is by making the old columns the same size as the new ones
# To do this we will need the overflow_to_sample_mapping field the tokenizer returns when we set return_overflowing_tokens=True

def tokenize_and_split(examples):
    """Tokenize reviews into 128-token chunks and repeat the old columns per chunk.

    Args:
        examples: A batch mapping column names to lists of values; the
            "review" column is passed to the module-level `tokenizer`.

    Returns:
        The tokenizer output, extended with every column of `examples`
        expanded so that each chunk carries the values of the row it came from.
    """
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # For each produced chunk, the index of the original row it was cut from
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        expanded = [column_values[row] for row in source_rows]
        encoded[column] = expanded
    return encoded

#We can see it works with Dataset.map() without us needing to remove the old columns:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'condition', 'date', 'drugName', 'input_ids', 'patient_id', 'rating', 'review', 'review_length', 'token_type_ids', 'usefulCount'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['attention_mask', 'condition', 'date', 'drugName', 'input_ids', 'patient_id', 'rating', 'review', 'review_length', 'token_type_ids', 'usefulCount'],
        num_rows: 68876
    })
})

To convert a dataset to a Pandas DataFrame, we can set its output format:

--> drug_dataset.set_format("pandas")

And when we access elements of the dataset we get a pandas.DataFrame instead of a dictionary
We can create a pandas.DataFrame for the whole training set by selecting all the elements of drug_dataset["train"]:

--> train_df = drug_dataset["train"][:]

From here we can use all the Pandas functionality that we want

In [None]:
#Once we're done with our Pandas analysis, we can always create a new Dataset object by using the Dataset.from_pandas() function as:

from datasets import Dataset

# `frequencies` is a pandas.DataFrame with the number of reviews per condition.
# It is built from the training split while the dataset is in "pandas" format
# (set_format is repeated here so this cell does not depend on earlier state).
drug_dataset.set_format("pandas")
frequencies = (
    drug_dataset["train"][:]["condition"]
    .value_counts()
    # Name the index and the counts explicitly so the resulting columns are
    # "condition" and "frequency" regardless of the pandas version.
    .rename_axis("condition")
    .reset_index(name="frequency")
)

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

#To reset the output format of drug_dataset from "pandas" to "arrow":

drug_dataset.reset_format()



In [None]:
#Creating a validation set: to split our training set into train and
# validation splits (we set the seed argument for reproducibility):

drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

To store the dataset somewhere, we can use:
    - In Arrow format: Dataset.save_to_disk()
    - in CSV format: Dataset.to_csv()
    - in JSON format: Dataset.to_json()

ex: drug_dataset_clean.save_to_disk("drug-reviews")

In [None]:
# Once the dataset is saved, we can load it by using the load_from_disk() function as follows:

from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

#DatasetDict({
#    train: Dataset({
#        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
#        num_rows: 110811
#    })
#    validation: Dataset({
#        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
#        num_rows: 27703
#    })
#    test: Dataset({
#        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
#        num_rows: 46108
#    })
#})

In [None]:
# For csv and json format, we have to store each split as a separate file. One way to do this is by iterating over the keys
# and values in the DatasetDict object

for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

# To load the JSON Lines files back, pass them to load_dataset() with the "json" loader:
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

I've just installed zstandard (pip install zstandard), which is needed to decompress The Pile's .jsonl.zst files
Now I can load the dataset

In [None]:
from datasets import load_dataset

# This takes a few minutes to run, so go grab a tea or coffee while you wait :)
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset

#Dataset({
#    features: ['meta', 'text'],
#    num_rows: 15518009
#})

A simple way to measure memory usage in Python is with the psutil library

pip install psutil

import psutil

# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

To manage memory when working with big datasets, Hugging Face's Datasets library provides a streaming feature that allows us to download and access elements on the fly, without needing to download the whole dataset

In [None]:
#To enable dataset streaming just pass the streaming=True argument to the load_dataset() function
pubmed_dataset_streamed = load_dataset(
    "json", data_files=data_files, split="train", streaming=True
)

#What it returns is an IterableDataset. We can access the first element of our streamed dataset as:
next(iter(pubmed_dataset_streamed))

#The elements from a streamed dataset can be processed on the fly using IterableDataset.map(), which is 
# useful during training if you need to tokenize the inputs

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = pubmed_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

#OUTPUT
#{'input_ids': [101, 4958, 5178, 4328, 6779, ...], 'attention_mask': [1, 1, 1, 1, 1, ...]}

#To speed up tokenization with streaming you can pass batched=True, which would process the examples batch by batch


Elements from a streamed dataset can be selected using the IterableDataset.take() and IterableDataset.skip() functions, which act in a similar way to Dataset.select(). Example:

dataset_head = pubmed_dataset_streamed.take(5)
list(dataset_head)

OUTPUT:
[{'meta': {'pmid': 11409574, 'language': 'eng'},
  'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'},
 {'meta': {'pmid': 11409575, 'language': 'eng'},
  'text': 'Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...'},
 {'meta': {'pmid': 11409576, 'language': 'eng'},
  'text': "Hypoxaemia in children with severe pneumonia in Papua New Guinea ..."},
 {'meta': {'pmid': 11409577, 'language': 'eng'},
  'text': 'Oxygen concentrators and cylinders ...'},
 {'meta': {'pmid': 11409578, 'language': 'eng'},
  'text': 'Oxygen supply in rural africa: a personal experience ...'}]

Similarly, we can use the IterableDataset.skip() function to create training and validation splits from a shuffled dataset as follows:

# Shuffle the stream first; buffer_size is the number of examples held in the
# shuffle buffer, and seed makes the order reproducible
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

When we need to combine large datasets, we can use the interleave_datasets() function, which converts a list of IterableDataset objects into a single IterableDataset, where the elements of the new dataset are obtained by alternating among the source examples

In [None]:
from itertools import islice
from datasets import interleave_datasets

# Second streamed source to interleave with PubMed: the FreeLaw subset of the Pile.
# NOTE(review): URL follows the same host/layout as the PubMed file — confirm it is still reachable.
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)

combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])
# islice takes the first two examples without consuming the whole stream
list(islice(combined_dataset, 2))

#OUTPUT
#[{'meta': {'pmid': 11409574, 'language': 'eng'},
#  'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'},
# {'meta': {'case_ID': '110921.json',
#   'case_jurisdiction': 'scotus.tar.gz',
#   'date_created': '2010-04-28T17:12:49Z'},
#  'text': '\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued January 19, 1983.
# \nDecided April 26, 1983.\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\n*239 Michael A. Lilly, First Deputy
# Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}]

In [None]:
#If you want to stream the dataset in its 825 GB entirety, you can grab all the prepared files:

base_url = "https://the-eye.eu/public/AI/pile/"
# The training split is sharded into 30 files, train/00.jsonl.zst through
# train/29.jsonl.zst; validation and test are single files.
data_files = dict(
    train=[f"{base_url}train/{idx:02d}.jsonl.zst" for idx in range(30)],
    validation=f"{base_url}val.jsonl.zst",
    test=f"{base_url}test.jsonl.zst",
)
pile_dataset = load_dataset("json", data_files=data_files, streaming=True)
next(iter(pile_dataset["train"]))

#OUTPUT
#{'meta': {'pile_set_name': 'Pile-CC'},
# 'text': 'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'}