In [1]:
!pip install nlp 
!pip install tokenizers
!pip install transformers

Collecting nlp
[?25l  Downloading https://files.pythonhosted.org/packages/09/e3/bcdc59f3434b224040c1047769c47b82705feca2b89ebbc28311e3764782/nlp-0.4.0-py3-none-any.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 2.8MB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)
[K     |████████████████████████████████| 245kB 18.5MB/s 
Collecting pyarrow>=0.16.0
[?25l  Downloading https://files.pythonhosted.org/packages/a1/0a/a89de6d747c4698af128a46398703e3d1889f196478fd94a4e16bd1b5c65/pyarrow-1.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.2MB)
[K     |████████████████████████████████| 17.2MB 239kB/s 
Installing collected packages: xxhash, pyarrow, nlp
  Found existing installation: pyarrow 0.14.1
    Uninstalling pyarrow-0.14.1:
      Successfully uninstalled pyarrow-0.14.1
Successfully installed nlp-0.4.0 pyarrow-1.0.0 xxhash-2

In [2]:
import os
import torch

import nlp
import tokenizers
import transformers

In [4]:
mnli = nlp.load_dataset(path='glue', name='mnli', split='train[:50%]')

In [9]:
from transformers import XLMRobertaTokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

In [10]:
def convert_to_features(batch):
    input_pairs = list(zip(batch['premise'], batch['hypothesis']))
    encodings = tokenizer.batch_encode_plus(input_pairs, 
                                            add_special_tokens=True, 
                                            padding=True, 
                                            max_length=96, 
                                            truncation=True, 
                                            return_attention_mask=True, 
                                            return_token_type_ids=True)
    return encodings

### MNLI

In [253]:
mnli_encoded_dataset = mnli.map(convert_to_features, batched=True, remove_columns=['idx', 'premise', 'hypothesis'])
mnli_encoded_dataset.set_format("torch", columns=['attention_mask', 'input_ids', 'token_type_ids', 'label'])

HBox(children=(FloatProgress(value=0.0, max=197.0), HTML(value='')))




In [254]:
print(mnli_encoded_dataset.num_rows)
print(mnli_encoded_dataset.num_columns)
print(mnli_encoded_dataset.column_names)

196351
4
['label', 'input_ids', 'token_type_ids', 'attention_mask']


### XNLI

In [255]:
xnli = nlp.load_dataset(path='xnli')
xnli = nlp.concatenate_datasets([xnli['test'], xnli['validation']])

In [256]:
def preprocess_xnli(example):
    premise_output = []
    hypothesis_output = []
    label_output = []
    for prem, hyp, lab in zip(example['premise'],  example['hypothesis'], example["label"]):
        label = lab
        langs = hyp['language']
        translations = hyp['translation']
        hypothesis = {k: v for k, v in zip(langs, translations)}
        for lang in prem:
            if lang in hypothesis:
                premise_output += [prem[lang]]
                hypothesis_output += [hypothesis[lang]]
                label_output += [label]
    return {'premise':premise_output, 'hypothesis':hypothesis_output, 'label':label_output}

In [257]:
xnli_processed = xnli.map(preprocess_xnli, batched=True).shuffle(seed=2020)
xnli_encoded = xnli_processed.map(convert_to_features, batched=True, remove_columns=['premise', 'hypothesis'])
xnli_encoded.set_format("torch", columns=['attention_mask', 'input_ids', 'token_type_ids', 'label']) 

HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




In [258]:
print(xnli_processed.num_rows)
print(xnli_processed.num_columns)
print(xnli_processed.column_names)

print(xnli_encoded.num_rows)
print(xnli_encoded.num_columns)
print(xnli_encoded.column_names)

112500
3
['hypothesis', 'label', 'premise']
112500
4
['label', 'input_ids', 'token_type_ids', 'attention_mask']


## The Stanford Natural Language Inference Corpus (SNLI)

In [259]:
snli = nlp.load_dataset(path='snli', split='train[:20%]')

In [260]:
print(len(snli.filter(lambda x: x['label']==0)))
print(len(snli.filter(lambda x: x['label']==1)))
print(len(snli.filter(lambda x: x['label']==2)))

36726
36524
36658


In [262]:
snli_encoded = snli.map(convert_to_features, batched=True, remove_columns=['premise', 'hypothesis'])
snli_encoded.set_format("torch", columns=['attention_mask', 'input_ids', 'token_type_ids', 'label']) 

HBox(children=(FloatProgress(value=0.0, max=111.0), HTML(value='')))




### Check all three datasets

In [263]:
mnli_encoded_dataset.num_rows + snli_encoded.num_rows + xnli_encoded.num_rows

418881

In [264]:
print(mnli_encoded_dataset.column_names)
print(snli_encoded.column_names)
print(xnli_encoded.column_names)

['label', 'input_ids', 'token_type_ids', 'attention_mask']
['label', 'input_ids', 'token_type_ids', 'attention_mask']
['label', 'input_ids', 'token_type_ids', 'attention_mask']


In [266]:
dataset = nlp.concatenate_datasets([mnli_encoded_dataset, snli_encoded, xnli_encoded])

In [268]:
dataset = dataset.shuffle(seed=2020)

HBox(children=(FloatProgress(value=0.0, max=419.0), HTML(value='')))


