In [21]:
from pprint import pprint

In [22]:
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

In [23]:
# Load the IMDB dataset identified by the Hub repo id "stanfordnlp/imdb".
# No split is requested, so the result holds every available split.
imdb = load_dataset("stanfordnlp/imdb")

In [24]:
# Printing the DatasetDict lists each split with its columns and row count.
print(imdb)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [25]:
# Look at a single training example, addressed by its integer position.
# Indexing a split with an int returns one row as a plain dict.
idx = 1000
example = imdb["train"][idx]
pprint(example)

{'label': 0,
 'text': 'Although I have to admit I laughed more watching this movie than the '
         'last few comedies I saw.<br /><br />The budget must have consisted '
         'of pocket change from the actors. The production values are so low '
         'that they actual made it kind of fun to watch. Reminds me of the '
         'Robot Monster made up of a guy in a gorilla suit with a cardboard '
         'diving helmet on.<br /><br />In one scene a hapless victim gets '
         'their arm and leg cut off. Geez, hard to believe but the Black '
         'Knight scene from Holy Grail was more realistic. I kept wondering '
         'why the victim didn\'t start shouting " None Shall Pass" and " It\'s '
         'only a flesh wound, I\'ve had worse". It was one of the funniest '
         'scenes I\'ve seen in the past year.<br /><br />The "gladiator/demon" '
         'was a stitch too. Between the horribly cheap costume and the geeky '
         'look of the guy in it the end result

In [26]:
# Keep every other row among the first 100 training examples (50 rows).
# `select` returns a new Dataset restricted to the given row indices.
idx = range(0, 100, 2)
list_of_examples = imdb["train"].select(idx)
list_of_examples

Dataset({
    features: ['text', 'label'],
    num_rows: 50
})

In [27]:
# configs and splits
# A Hub dataset can publish several configurations; for WMT14 each
# configuration is a language pair. List the ones that are available.
print(get_dataset_config_names("wmt/wmt14"))

['cs-en', 'de-en', 'fr-en', 'hi-en', 'ru-en']


In [28]:
# List the splits that exist for the Hindi-English configuration.
print(get_dataset_split_names("wmt/wmt14", config_name="hi-en"))

['train', 'validation', 'test']


In [29]:
# Load every split of the Hindi-English configuration of WMT14.
translation_dataset = load_dataset("wmt/wmt14", "hi-en")
translation_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 32863
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [30]:
# Joining split names with "+" makes load_dataset return one Dataset
# containing the rows of all the named splits.
combined_dataset = load_dataset(
    "wmt/wmt14",
    name="hi-en",
    split="train+test+validation",
)
# Show the merged dataset summary followed by its total row count.
print(combined_dataset, len(combined_dataset), sep="\n")

Dataset({
    features: ['translation'],
    num_rows: 35890
})
35890


### Features
Features define the internal structure of a dataset: the name and type of each column. They are also used to specify the underlying serialization format. The features of a dataset vary with the type of data it holds.

In [31]:
# Show the schema (column name -> feature type) of the train split.
translation_dataset['train'].features

{'translation': Translation(languages=['hi', 'en'], id=None)}

In the above example, the dataset has a single column, `translation`, whose feature type is `Translation` with the languages `hi` and `en`.

In [32]:
# Load the training split of MRPC (Microsoft Research Paraphrase Corpus),
# one of the configurations of the GLUE benchmark.
# The namespaced Hub id "nyu-mll/glue" is used instead of the legacy bare
# name "glue", matching the other datasets loaded in this notebook.
mrpc = load_dataset("nyu-mll/glue", "mrpc", split="train")
mrpc

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [33]:
# Show the MRPC schema; `label` is a ClassLabel, so its integer values map to class names.
mrpc.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}