In [64]:
import multiprocessing
multiprocessing.cpu_count() # number of logical CPUs (hardware threads) in the system, not physical and logical cores added together

8

In [65]:
from pprint import pprint
from datasets import load_dataset, concatenate_datasets, interleave_datasets

In [66]:
# No `split` argument is given, so every split comes back together in a DatasetDict
imdb = load_dataset('stanfordnlp/imdb')

In [67]:
# Banner and rule in a single print call, then the DatasetDict as the cell's displayed value
print('Before Filtering', 35*'-', sep='\n')
imdb

Before Filtering
-----------------------------------


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [68]:
# NOTE: despite its name, `example` holds the entire train split, not a single row
example = imdb['train']
print(len(example)) # number of observations in the train split

25000


In [69]:
# Confirm which class a single split is an instance of
print(type(example))

<class 'datasets.arrow_dataset.Dataset'>


### Filter

In [70]:
# Filtering
num_words = 100
# keep only the observations whose text has at least `num_words` tokens (>=, not strictly greater);
# tokens come from splitting on a single space, so runs of spaces add empty tokens to the count
imdb_filtered = imdb.filter(lambda example: len(example['text'].split(' ')) >= num_words)

In [71]:
# Banner, rule and the filtered DatasetDict emitted by one print call, one item per line
print("After Filtering", 35*'-', imdb_filtered, sep='\n')

After Filtering
-----------------------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22074
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21909
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 44095
    })
})


### MAP

In [72]:
# using function
def add_prefix(example):
    """Return a copy of `example` with "IMDB: " prepended to its "text" value.

    Args:
        example: a mapping with a string stored under the "text" key.
            Every other key is carried over unchanged.

    Returns:
        A new dict with the same keys in the same order. The input mapping
        is left untouched, so calling this on a row you still hold (for
        example while experimenting in a notebook cell) cannot prefix it twice.

    Raises:
        KeyError: if `example` has no "text" key.
    """
    # Build a fresh dict instead of assigning into `example`, which would
    # silently modify the caller's object.
    return {**example, "text": "IMDB: " + example["text"]}

In [73]:
# Try the function on one row (index 100 of the train split) before mapping it over the dataset
pprint(add_prefix(example[100]))

{'label': 0,
 'text': 'IMDB: Terrible movie. Nuff Said.<br /><br />These Lines are Just '
         "Filler. The movie was bad. Why I have to expand on that I don't "
         'know. This is already a waste of my time. I just wanted to warn '
         'others. Avoid this movie. The acting sucks and the writing is just '
         'moronic. Bad in every way. The only nice thing about the movie are '
         "Deniz Akkaya's breasts. Even that was ruined though by a terrible "
         'and unneeded rape scene. The movie is a poorly contrived and totally '
         'unbelievable piece of garbage.<br /><br />OK now I am just going to '
         'rag on IMDb for this stupid rule of 10 lines of text minimum. First '
         'I waste my time watching this offal. Then feeling compelled to warn '
         'others I create an account with IMDb only to discover that I have to '
         'write a friggen essay on the film just to express how bad I think it '
         'is. Totally unnecessary.'}


In [74]:
# using map: apply add_prefix to the rows of every split; the result is bound to a
# new name (`imdb_mapped`) and `imdb` itself is not reassigned
imdb_mapped = imdb.map(add_prefix)
imdb_mapped

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [75]:
# Spot-check one row of the mapped train split: its text should now start with "IMDB: "
pprint(imdb_mapped['train'][200])

{'label': 0,
 'text': 'IMDB: This is an action Western. James Steart leads an all star cast '
         'in the scenic Northwest, which is filmed in great splendor. The '
         'scenery and costumes are great. There is action and adventure. '
         'Stewart plays a wealthy cattleman who runs afoul of a crooked '
         'government in the old Nothwest.<br /><br />The main drawback is the '
         'stereotypical cynic that Hollywood has always made into a hero. Even '
         'when this movie was made, the cynic was the stereotypical hero, and '
         'the one Stewart portrays really has few saving graces. He is kind to '
         'his two partners, and that does give him an extra dimension of '
         'credibility and likability.<br /><br />However, he is so piggish to '
         'everyone else, it is hard to really care for him, or to accept him. '
         'He is much like the one dimensional spaghetti Western characters '
         '(cut not that bad).<br /><br />Still,

### Concatenate Datasets
We often need to combine two or more datasets. The only requirement is that the datasets must have the same features and the same number of splits.

In [76]:
# split='train+test' loads both splits merged into one Dataset (not a DatasetDict)
imdb_dataset_whole = load_dataset("stanfordnlp/imdb", split='train+test')
print(imdb_dataset_whole)
# Print the features too, so they can be compared with the dataset that will be concatenated
print(imdb_dataset_whole.features)

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [77]:
# split='all' returns a single Dataset rather than one Dataset per split
rotten_tomato_whole = load_dataset("cornell-movie-review-data/rotten_tomatoes", split='all')
print(rotten_tomato_whole)
# The features should match the IMDB ones for the concatenation below to work
print(rotten_tomato_whole.features)

Dataset({
    features: ['text', 'label'],
    num_rows: 10662
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [78]:
# axis=0 stacks the datasets row-wise, so the result has the rows of both inputs
concat_dataset = concatenate_datasets([imdb_dataset_whole, rotten_tomato_whole], axis=0)
concat_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 60662
})

### Interleaving Datasets

- Often we have n skewed datasets, i.e., the number of samples in each dataset may differ drastically.
- So we can build a new dataset with proportions according to our choice. Let's say 60% and 40%.

- By default, `interleave_datasets` uses `stopping_strategy="first_exhausted"` (construction of the new dataset stops as soon as one of the datasets runs out of samples).

In [89]:
# Draw each row from IMDB with probability 0.6 and from Rotten Tomatoes with 0.4.
# The draw is random, so a fixed seed is passed to make the sampled order
# (and therefore the resulting row count) reproducible across runs.
inter_datasets = interleave_datasets(
    [imdb_dataset_whole, rotten_tomato_whole],
    probabilities=[0.6, 0.4],
    seed=42,
)
inter_datasets

Dataset({
    features: ['text', 'label'],
    num_rows: 26502
})

### Iterable Dataset
Suitable for loading samples from large datasets iteratively without writing anything to local disk.

In [92]:
# streaming=True returns an IterableDataset that is consumed by iterating over it
imdb_iter_dataset = load_dataset("stanfordnlp/imdb", split='train', streaming=True)
imdb_iter_dataset

IterableDataset({
    features: ['text', 'label'],
    num_shards: 1
})

In [98]:
# Peek at the first 10 streamed rows; `take(10)` ends the iteration after 10 items.
# The loop variable is called `row` so that it does not overwrite the `example`
# variable (the train split) that earlier cells rely on.
for row in imdb_iter_dataset.take(10):
    print(row)

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

### Questions to Ponder
1. Can we load a dataset directly from external links?
2. Can we load a dataset directly from a zipped file?