In [1]:
from datasets import interleave_datasets, load_dataset, get_dataset_config_names, get_dataset_split_names
from datasets import load_from_disk
from pprint import pprint
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the IMDB dataset without naming a split; the result is a DatasetDict
# keyed by split name.
imdb_dataset = load_dataset('stanfordnlp/imdb')
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
# Index the DatasetDict by split name to get that split as a single Dataset.
imdb_train_split = imdb_dataset['train']
print(imdb_train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [4]:
# Drop the 'unsupervised' split in place. The None default keeps this cell
# re-runnable: on a second run the key is already gone, and pop() returns
# None instead of raising KeyError.
_ = imdb_dataset.pop('unsupervised', None)
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [5]:
# Passing split='train' loads just that split and returns a Dataset
# instead of a DatasetDict.
train_split = load_dataset('stanfordnlp/imdb', split='train')
print(train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


We can train on the train split of one dataset and test on the test split of a different dataset.


#### Train test split

In [6]:
# Hold out 20% of the training split as a test set. The rows are shuffled
# before splitting, so fix the seed to get the same partition on every run.
small_ds = train_split.train_test_split(test_size=0.2, seed=42)
print(small_ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})


In [7]:
# Import local CSV files; together they hold both the train and the test data.
# NOTE: with data_files given as a plain list, every file ends up in a single
# 'train' split (see the printed DatasetDict below).
data_files =['data/train.csv', 'data/test.csv']
local_dataset = load_dataset('csv', data_files=data_files)
print(local_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11
    })
})


In [8]:
# Split the single 'train' split 50/50 into train and test. The seed makes the
# shuffled partition reproducible across runs.
train_test_split = local_dataset["train"].train_test_split(test_size=0.5, seed=42)

If it's a larger dataset, save it to disk in Arrow (`.arrow`) format with `save_to_disk`, so it can be reloaded later with `load_from_disk`.

In [9]:
# Write both splits to a local directory; the next cell reloads them from there.
train_test_split.save_to_disk('pyarrow_dataset/movie_review')

Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 833.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6/6 [00:00<00:00, 2966.62 examples/s]


In [10]:
# Reload the DatasetDict that was written by save_to_disk.
raw_dataset_from_disk = load_from_disk('pyarrow_dataset/movie_review')
print(raw_dataset_from_disk)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 6
    })
})


In [11]:
# Integer indexing returns one example as a dict with a 'label' and a 'text' entry.
idx = 1000
example = imdb_dataset['train'][idx]
pprint(example)

# select() picks rows by index; here the first two rows of the train split.
subset = imdb_dataset['train'].select([0, 1])
print(subset)

{'label': 0,
 'text': 'Although I have to admit I laughed more watching this movie than the '
         'last few comedies I saw.<br /><br />The budget must have consisted '
         'of pocket change from the actors. The production values are so low '
         'that they actual made it kind of fun to watch. Reminds me of the '
         'Robot Monster made up of a guy in a gorilla suit with a cardboard '
         'diving helmet on.<br /><br />In one scene a hapless victim gets '
         'their arm and leg cut off. Geez, hard to believe but the Black '
         'Knight scene from Holy Grail was more realistic. I kept wondering '
         'why the victim didn\'t start shouting " None Shall Pass" and " It\'s '
         'only a flesh wound, I\'ve had worse". It was one of the funniest '
         'scenes I\'ve seen in the past year.<br /><br />The "gladiator/demon" '
         'was a stitch too. Between the horribly cheap costume and the geeky '
         'look of the guy in it the end result

we can use these subsets for testing the code

In [12]:
# Every second row out of the first 100, i.e. a 50-row subset.
idx = range(0, 100, 2)
subset = imdb_dataset['train'].select(idx)
print(subset)

Dataset({
    features: ['text', 'label'],
    num_rows: 50
})


### WMT14

In [13]:
# List the configs available for wmt14, then the splits of its 'hi-en' config.
print(get_dataset_config_names('wmt/wmt14'))
print(get_dataset_split_names('wmt/wmt14', 'hi-en'))

['cs-en', 'de-en', 'fr-en', 'hi-en', 'ru-en']
['train', 'validation', 'test']


In [14]:
# Load every split of the 'hi-en' config as a DatasetDict.
translation_dataset = load_dataset(path = 'wmt/wmt14', name='hi-en')
print(translation_dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 32863
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


In [15]:
# Joining split names with '+' returns one Dataset holding the rows of all
# three splits (32863 + 2507 + 520 = 35890).
raw_dataset = load_dataset(path = 'wmt/wmt14', name='hi-en', split='train+test+validation')
print(raw_dataset)
print(len(raw_dataset))

Dataset({
    features: ['translation'],
    num_rows: 35890
})
35890


In [16]:
# Inspect the column schema of the train split.
pprint(translation_dataset['train'].features)

{'translation': Translation(languages=['hi', 'en'], id=None)}


### Glue

In [17]:
# Load only the train split of the 'mrpc' config of GLUE.
mrpc_dataset = load_dataset('glue', 'mrpc', split='train')
print(mrpc_dataset)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})


In [18]:
# Inspect the column schema: 'label' is a ClassLabel, the other columns are plain values.
pprint(mrpc_dataset.features)

{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}


In [19]:
# Number of CPUs reported for this machine.
import multiprocessing
print(multiprocessing.cpu_count())

16


In [20]:
# Show the dataset as it stands before any filtering: a heading, a 20-dash
# rule, then the DatasetDict itself, one per line.
print('before filtering', 20*'-', imdb_dataset, sep='\n')

before filtering
--------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


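`filter` checks the condition on every sample. Passing `num_proc=16` would split the samples across the 16 CPUs and check the condition in parallel; the call below does not set `num_proc`.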

**filter**

In [21]:
# Minimum number of space-separated pieces a review needs in order to be kept.
num_words = 100
imdb_filtered_dataset = imdb_dataset.filter(
    lambda example: len(example['text'].split(' ')) >= num_words
)
# Heading, 20-dash rule, then the filtered DatasetDict, one per line.
print('after filtering', 20*'-', imdb_filtered_dataset, sep='\n')

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter: 100%|██████████| 25000/25000 [00:00<00:00, 77647.33 examples/s]
Filter: 100%|██████████| 25000/25000 [00:00<00:00, 81921.22 examples/s]

after filtering
--------------------
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22074
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21909
    })
})





**map**

In [22]:
def add_prefix(example):
    """Return a copy of ``example`` whose 'text' value is prefixed with "IMDB:".

    Args:
        example: Mapping with a string under the 'text' key. Any other keys
            are carried over unchanged.

    Returns:
        A new dict with the same keys as ``example`` and the prefixed text.
        The input mapping itself is left unmodified.
    """
    # Build a new dict rather than assigning into `example`, so the caller's
    # object is not mutated as a side effect.
    return {**example, 'text': "IMDB:" + example["text"]}

# Apply add_prefix to every example of every split.
imdb_prefixed_dataset = imdb_dataset.map(add_prefix)
print(imdb_prefixed_dataset)
# Index the row first, then the column: this reads a single example instead of
# materialising the whole 'text' column just to pick one entry from it.
pprint(imdb_prefixed_dataset['train'][100]['text'])

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})
('IMDB:Terrible movie. Nuff Said.<br /><br />These Lines are Just Filler. The '
 "movie was bad. Why I have to expand on that I don't know. This is already a "
 'waste of my time. I just wanted to warn others. Avoid this movie. The acting '
 'sucks and the writing is just moronic. Bad in every way. The only nice thing '
 "about the movie are Deniz Akkaya's breasts. Even that was ruined though by a "
 'terrible and unneeded rape scene. The movie is a poorly contrived and '
 'totally unbelievable piece of garbage.<br /><br />OK now I am just going to '
 'rag on IMDb for this stupid rule of 10 lines of text minimum. First I waste '
 'my time watching this offal. Then feeling compelled to warn others I create '
 'an account with IMDb only to discover that I have to write a friggen essay '
 'on the f

**concatenate**

In [23]:
# 'train+test' returns one Dataset holding both splits (25000 + 25000 rows).
imdb_dataset_whole = load_dataset('stanfordnlp/imdb', split='train+test')
print(imdb_dataset_whole)
print(imdb_dataset_whole.features)

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [24]:
# split='all' returns a single Dataset rather than a DatasetDict.
rt_dataset_whole = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='all')
print(rt_dataset_whole)
print(rt_dataset_whole.features)

Dataset({
    features: ['text', 'label'],
    num_rows: 10662
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [25]:
# Stack the two datasets row-wise (axis=0); both have the same 'text'/'label'
# features, and the result has 50000 + 10662 rows.
concat_dataset = datasets.concatenate_datasets([imdb_dataset_whole, rt_dataset_whole], axis=0)
print(concat_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 60662
})


**interleaving datasets**

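Each example is drawn from imdb with probability 0.6 and from rotten_tomatoes with probability 0.4. With the default stopping strategy (`first_exhausted`), sampling stops as soon as one dataset runs out — here rotten_tomatoes, so all of it is used while only part of imdb is.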

In [26]:
# Randomly alternate between the two datasets, picking the next example from
# imdb with probability 0.6 and from rotten_tomatoes with probability 0.4.
# The seed makes the sampled order (and hence the row count) reproducible.
inter_datasets = interleave_datasets(
    [imdb_dataset_whole, rt_dataset_whole], probabilities=[0.6, 0.4], seed=42
)
print(inter_datasets)

Dataset({
    features: ['text', 'label'],
    num_rows: 26577
})


**iterable dataset**

stream one sample at a time 

In [27]:
# streaming=True returns an IterableDataset instead of a Dataset.
imdb_itr_dataset = load_dataset('stanfordnlp/imdb', split='train', streaming=True)
print(imdb_itr_dataset)

IterableDataset({
    features: ['text', 'label'],
    num_shards: 1
})


In [28]:
# Pretty-print the first two streamed examples, then stop iterating.
for counter, example in enumerate(imdb_itr_dataset, start=1):
    pprint(example)
    if counter == 2:
        break

{'label': 0,
 'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the '
         'controversy that surrounded it when it was first released in 1967. I '
         'also heard that at first it was seized by U.S. customs if it ever '
         'tried to enter this country, therefore being a fan of films '
         'considered "controversial" I really had to see this for myself.<br '
         '/><br />The plot is centered around a young Swedish drama student '
         'named Lena who wants to learn everything she can about life. In '
         'particular she wants to focus her attentions to making some sort of '
         'documentary on what the average Swede thought about certain '
         'political issues such as the Vietnam War and race issues in the '
         'United States. In between asking politicians and ordinary denizens '
         'of Stockholm about their opinions on politics, she has sex with her '
         'drama teacher, classmates, and married men.<br

In [29]:
# map() can also be applied to the streamed dataset; print only the first
# transformed example and stop.
for example in imdb_itr_dataset.map(add_prefix):
    pprint(example) 
    break

{'label': 0,
 'text': 'IMDB:I rented I AM CURIOUS-YELLOW from my video store because of all '
         'the controversy that surrounded it when it was first released in '
         '1967. I also heard that at first it was seized by U.S. customs if it '
         'ever tried to enter this country, therefore being a fan of films '
         'considered "controversial" I really had to see this for myself.<br '
         '/><br />The plot is centered around a young Swedish drama student '
         'named Lena who wants to learn everything she can about life. In '
         'particular she wants to focus her attentions to making some sort of '
         'documentary on what the average Swede thought about certain '
         'political issues such as the Vietnam War and race issues in the '
         'United States. In between asking politicians and ordinary denizens '
         'of Stockholm about their opinions on politics, she has sex with her '
         'drama teacher, classmates, and married me

### Test

In [30]:
# Resolve the config names once and reuse the list, instead of performing the
# same lookup twice.
naamapadam_configs = get_dataset_config_names('ai4bharat/naamapadam')
print(len(naamapadam_configs))
print(naamapadam_configs)

11
['as', 'bn', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']


In [31]:
# Load every split of the 'hi' config.
test_dataset = load_dataset('ai4bharat/naamapadam', 'hi')
print(test_dataset)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 985787
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 867
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 13460
    })
})


In [66]:
# Load every split of the 'ta' config; the following cells work on this DatasetDict.
ds = load_dataset('ai4bharat/naamapadam', 'ta')
print(ds)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2795
    })
})


In [67]:
# Paths of the .arrow cache files backing each split.
ds.cache_files

{'train': [{'filename': 'C:\\Users\\ghora\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-train.arrow'}],
 'test': [{'filename': 'C:\\Users\\ghora\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-test.arrow'}],
 'validation': [{'filename': 'C:\\Users\\ghora\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-validation.arrow'}]}

In [68]:
import os

# Collect the path of every cache file of every split. Reading only entry [0]
# of three hard-coded splits would under-count a split backed by several files
# and would miss any split with a different name.
cache_paths = [
    cache_file['filename']
    for split_files in ds.cache_files.values()
    for cache_file in split_files
]

total_size = sum(os.path.getsize(path) for path in cache_paths)

# Convert bytes to MiB.
print(total_size/(1024*1024))

179.28880310058594


In [69]:
# Peek at the first training example: a list of tokens and one integer tag per token.
ds['train'][0]

{'tokens': ['பைரவருக்கு',
  'தேய்பிறை',
  'அஷ்டமியில்',
  'விசேஷ',
  'அபிஷேக',
  'ஆராதனைகள்',
  'நடைபெறுகின்றன',
  '.'],
 'ner_tags': [1, 0, 0, 0, 0, 0, 0, 0]}

In [70]:
def compute_num_tokens(example):
    """Count the entries of ``example['tokens']``.

    Returns:
        A one-key dict, ``{"num_tokens": <count>}``.
    """
    token_count = len(example['tokens'])
    return {"num_tokens": token_count}

# Add the per-example token count as a 'num_tokens' column in every split.
ds = ds.map(compute_num_tokens)

# Add up the counts of the three splits.
total_tokens = sum(
    sum(ds[split_name]['num_tokens'])
    for split_name in ('train', 'test', 'validation')
)
print(total_tokens)

6001876


In [71]:
# Same example after the map: it now also carries the 'num_tokens' field.
ds['train'][0]

{'tokens': ['பைரவருக்கு',
  'தேய்பிறை',
  'அஷ்டமியில்',
  'விசேஷ',
  'அபிஷேக',
  'ஆராதனைகள்',
  'நடைபெறுகின்றன',
  '.'],
 'ner_tags': [1, 0, 0, 0, 0, 0, 0, 0],
 'num_tokens': 8}

In [72]:
# Re-measure the on-disk cache size now that the 'num_tokens' column was added.
# Collect every cache file of every split: reading only entry [0] of three
# hard-coded splits would under-count a split backed by several files.
cache_paths = [
    cache_file['filename']
    for split_files in ds.cache_files.values()
    for cache_file in split_files
]

total_size = sum(os.path.getsize(path) for path in cache_paths)

# Convert bytes to MiB.
print(total_size/(1024*1024))

183.1378173828125


In [73]:
# Merge the three splits into one Dataset.
ds_combined = datasets.concatenate_datasets([ds['train'], ds['test'], ds['validation']])

def combine_tokens(example):
    """Join ``example["tokens"]`` into one string, separated by single spaces.

    Returns:
        A one-key dict, ``{"text": <joined string>}``.
    """
    separator = " "
    return {"text": separator.join(example["tokens"])}

# Add the joined 'text' column, then keep only 'num_tokens' and 'text'.
ds_combined = ds_combined.map(combine_tokens)
# NOTE(review): `ds` is rebound here from the per-split DatasetDict to a single
# Dataset, so earlier cells that index ds['train'] need `ds` reloaded before
# they can be re-run.
ds = ds_combined.remove_columns(["ner_tags", "tokens"])

ds

Map: 100%|██████████| 501435/501435 [00:21<00:00, 23412.90 examples/s]


Dataset({
    features: ['num_tokens', 'text'],
    num_rows: 501435
})

In [77]:
# Keep only the rows that have at least 6 tokens.
ds_filtered_dataset = ds.filter(
    lambda example: example['num_tokens'] >= 6
)
# Heading, 20-dash rule, then the filtered Dataset, one per line.
print('after filtering', 20*'-', ds_filtered_dataset, sep='\n')

Filter: 100%|██████████| 501435/501435 [00:01<00:00, 390749.63 examples/s]

after filtering
--------------------
Dataset({
    features: ['num_tokens', 'text'],
    num_rows: 370495
})





In [79]:
# Load every split of the 'inltkh.ta' config of indic_glue.
glue_ds = load_dataset('ai4bharat/indic_glue', 'inltkh.ta')
print(glue_ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5346
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 669
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 669
    })
})


In [80]:
# Peek at the first training example: a text string with an integer label.
glue_ds['train'][0]

{'text': 'கே.வி.ஆனந்தே ட்விட்டரில் இதை அறிவித்துள்ளார். இந்தப் படத்துக்கு கேவ்மிக் ஆரி ஒளிப்பதிவு செய்ய, ஹாரிஸ் ஜெயராஜ் இசையமைக்கிறார். பட்டுக்கோட்டை பிரபாகர் வசனம் எழுத, கலை இயக்குநராக கிரண் பணியாற்றுகிறார். இந்தப் படத்தை லைகா புரொடக்\u200cஷன்ஸ் நிறுவனம் தயாரிக்கிறது.',
 'label': 6}

In [81]:
# In every split, keep only examples whose text splits into at least 6
# space-separated pieces.
glue_ds_filter = glue_ds.filter(
    lambda example: len(example['text'].split(' ')) >= 6
)

Filter: 100%|██████████| 5346/5346 [00:00<00:00, 102810.90 examples/s]
Filter: 100%|██████████| 669/669 [00:00<00:00, 26762.19 examples/s]
Filter: 100%|██████████| 669/669 [00:00<00:00, 24890.36 examples/s]


In [84]:
# Merge the three filtered splits into one Dataset.
glue_ds_filter_combined = datasets.concatenate_datasets([glue_ds_filter['train'], glue_ds_filter['test'], glue_ds_filter['validation']])
glue_ds_filter_combined

Dataset({
    features: ['text', 'label'],
    num_rows: 6428
})

In [85]:
# Mix the two datasets, drawing each example from the first with probability
# 0.8 and from the second with probability 0.2; the seed fixes the sampled order.
# NOTE(review): the inputs have different columns (['num_tokens', 'text'] vs
# ['text', 'label']) and the result carries the union of both. Presumably each
# row gets a null for the column its source lacks — confirm, or drop the extra
# columns before interleaving.
inter_datasets = interleave_datasets(
    [ds_filtered_dataset, glue_ds_filter_combined], probabilities=[0.8, 0.2], seed=42
)
print(inter_datasets)

Dataset({
    features: ['num_tokens', 'text', 'label'],
    num_rows: 32354
})
