In [0]:
try: import nlp
except ImportError:
  %pip install -q fastai2 transformers nlp
  !rm -rf Pretrain-MLM-and-finetune-on-GLUE-with-fastai
  !git clone https://github.com/richardyy1188/Pretrain-MLM-and-finetune-on-GLUE-with-fastai.git
  exit() # In Colab, to use the newer installed pyarrow, you need to restart your session for first use

In [2]:
%cd Pretrain-MLM-and-finetune-on-GLUE-with-fastai

from pathlib import Path
from pprint import pprint
from functools import partial
from collections import namedtuple
import pickle
from IPython.core.debugger import set_trace as bk
import csv
import pandas as pd
from sklearn import metrics as skm
from scipy import stats as spm
from torch import nn
import nlp
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertTokenizerFast
from fastai2.text.all import *
from _utils.would_like_to_pr import TextDataloader

/content/Pretrain-MLM-and-finetune-on-GLUE-with-fastai


In [0]:
""" tokenizer and fast tokenizer
We use normal tokenizer to get vocab, use fast tokenizer to convert tokens to ids.
Because we can't get vocab from fast tokenizer and fast tokenizer is faster,
and they have the same token-id mapping.
"""
hf_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
hf_fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
seq_clas_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# 1. Prepare data

**Download, Preprocess, and Cache**

In Colab, it takes you 20+ minutes for the creation, and about 3 minutes using cache for the first time, and few seconds if it is not using cache for the first time. 

In [4]:
# create a 'glue' folder under it, and all cache files will be under 'glue'
cache_dir=Path('/content/drive/My Drive/datasets')
cache_dir.mkdir(parents=True, exist_ok=True) # create recursively if not exist

def textcols(dataset):
  "Infer text cols of different GLUE datasets in huggingface/nlp"
  column_names = dataset.column_names
  if 'question' in column_names: return ['question', 'sentence']
  elif 'sentence1' in column_names: return ['sentence1', 'sentence2']
  elif 'question1' in column_names: return ['question1','question2']
  elif 'premise' in column_names: return ['premise','hypothesis']
  elif 'sentence' in column_names: return ['sentence']

def tokenize_sents(example, cols):
  if len(cols)==1:
    example['input_ids'] = hf_fast_tokenizer.convert_tokens_to_ids(hf_fast_tokenizer.tokenize(f"[CLS] {example[cols[0]]} [SEP]"))
  elif len(cols)==2:
    example['input_ids'] = concat(hf_fast_tokenizer.convert_tokens_to_ids(hf_fast_tokenizer.tokenize(f'[CLS] {example[cols[0]]} [SEP]')),
                                  hf_fast_tokenizer.convert_tokens_to_ids(hf_fast_tokenizer.tokenize(f'{example[cols[1]]} [SEP]')))
  else: raise ValueError()
  return example

glue_dsets = {}
for glue_task in ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'qnli', 'rte', 'wnli', 'ax']:
  task = nlp.load_dataset('glue', glue_task, cache_dir=cache_dir)
  glue_dsets[glue_task] = {}
  print(f'loading processed datasets of {glue_task} ...')
  for split in task.keys():
    raw_dataset = task[split]
    cache_file = Path(raw_dataset.cache_files[0]['filename']).parent / f'tokenized_{split}.arrow'
    if cache_file.exists():dataset = nlp.Dataset.from_file(str(cache_file))
    else: dataset = raw_dataset.map(partial(tokenize_sents, cols=textcols(raw_dataset)),
                                    cache_file_name=str(cache_file))
    glue_dsets[glue_task][split] = dataset 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28998.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=30329.0, style=ProgressStyle(descriptio…


Downloading and preparing dataset glue/cola (download: 368.14 KiB, generated: 596.73 KiB, total: 964.86 KiB) to /content/drive/My Drive/datasets/glue/cola/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=376971.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0it [00:00, ?it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/cola/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of cola ...


1063it [00:00, 8437.85it/s]
8551it [00:00, 8807.21it/s]
1043it [00:00, 8367.09it/s]


Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, total: 11.90 MiB) to /content/drive/My Drive/datasets/glue/sst2/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7439277.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0it [00:00, ?it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/sst2/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of sst2 ...


1821it [00:00, 6354.51it/s]
67349it [00:08, 8225.62it/s]
872it [00:00, 6227.49it/s]


Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, total: 2.85 MiB) to /content/drive/My Drive/datasets/glue/mrpc/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=6222.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', max=1.0, style=ProgressSt…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', max=1.0, style=ProgressSt…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

351it [00:00, 3500.69it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/mrpc/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of mrpc ...


1725it [00:00, 3521.85it/s]
3668it [00:01, 3463.15it/s]
408it [00:00, 3350.54it/s]


Downloading and preparing dataset glue/qqp (download: 57.73 MiB, generated: 107.02 MiB, total: 164.75 MiB) to /content/drive/My Drive/datasets/glue/qqp/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=60534884.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

441it [00:00, 4406.52it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/qqp/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of qqp ...


390965it [01:27, 4462.52it/s]
363849it [01:21, 4475.85it/s]
40430it [00:08, 4529.89it/s]


Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, total: 1.86 MiB) to /content/drive/My Drive/datasets/glue/stsb/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=802872.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0it [00:00, ?it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/stsb/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of stsb ...


1379it [00:00, 4515.31it/s]
5749it [00:01, 4655.81it/s]
1500it [00:00, 4644.14it/s]


Downloading and preparing dataset glue/mnli (download: 298.29 MiB, generated: 78.65 MiB, total: 376.95 MiB) to /content/drive/My Drive/datasets/glue/mnli/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=312783507.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



420it [00:00, 4198.05it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/mnli/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of mnli ...


9796it [00:02, 4006.05it/s]
9847it [00:02, 3976.12it/s]
392702it [01:38, 3966.78it/s]
9815it [00:02, 3992.00it/s]
9832it [00:02, 4041.29it/s]


Downloading and preparing dataset glue/qnli (download: 10.14 MiB, generated: 27.11 MiB, total: 37.24 MiB) to /content/drive/My Drive/datasets/glue/qnli/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10627589.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0it [00:00, ?it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/qnli/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of qnli ...


5463it [00:01, 3403.51it/s]
104743it [00:29, 3551.54it/s]
5463it [00:01, 3552.89it/s]


Downloading and preparing dataset glue/rte (download: 680.81 KiB, generated: 1.83 MiB, total: 2.49 MiB) to /content/drive/My Drive/datasets/glue/rte/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=697150.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0it [00:00, ?it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/rte/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of rte ...


3000it [00:00, 3071.49it/s]
2490it [00:00, 3036.02it/s]
277it [00:00, 3168.02it/s]


Downloading and preparing dataset glue/wnli (download: 28.32 KiB, generated: 154.03 KiB, total: 182.35 KiB) to /content/drive/My Drive/datasets/glue/wnli/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28999.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

0it [00:00, ?it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/wnli/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of wnli ...


146it [00:00, 2672.35it/s]
635it [00:00, 4110.91it/s]
71it [00:00, 4111.72it/s]


Downloading and preparing dataset glue/ax (download: 217.05 KiB, generated: 232.80 KiB, total: 449.85 KiB) to /content/drive/My Drive/datasets/glue/ax/1.0.0...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=222257.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

417it [00:00, 4165.53it/s]

Dataset glue downloaded and prepared to /content/drive/My Drive/datasets/glue/ax/1.0.0. Subsequent calls will reuse this data.
loading processed datasets of ax ...


1104it [00:00, 3747.31it/s]


I don't know how to let `Learner` validate two validation sets every epochs, so I just merged `mnli validation mismatched` and `mnli validation matched`.

In [0]:
class HF_MergedDataset():
  def __init__(self, *datasets):
    self.dsets = datasets
  def __len__(self):
    return reduce(lambda a,d: a+len(d), self.dsets, 0)
  def __getitem__(self, i):
    for dset in self.dsets:
      if i < len(dset): return dset[i]
      else: i -= len(dset)
  def set_format(self, type, columns):
    for dset in self.dsets: dset.set_format(type, columns)
  @property
  def cache_files(self):
    return concat(*[ds.cache_files for ds in self.dsets])
    
glue_dsets['mnli']['validation'] = HF_MergedDataset(glue_dsets['mnli']['validation_matched'], glue_dsets['mnli']['validation_mismatched'])

 **Novel huggingface/nlp integration**, which mimics `fastai2.data.core.Datasets`

In [0]:
class HF_Dataset():

  """ Inheritance by object composition
  I want this class behave like nlp.arrow_dataset.Dataset, and overload some methods. (Inheritance),
  But I don't know how to initialize a nlp.arrow_dataset.Dataset with existing Dataset properly and without additional cost such as a new copy.
  
  So I add every attributes/methods of nlp.arrow_dataset.Dataset, ans pass execution to composed Dataset.
  Notice that __init__, __repr__,__getattribute__,__new__ should'nt be added, when doing this I call it Inheritance by object composition
  ,otherwise it won't work for the reason I don't know.
  """
  for attr_name, attr in nlp.arrow_dataset.Dataset.__dict__.items():
    if attr_name not in ['__init__', '__repr__','__getattribute__','__new__'] + ['__getitem__','__iter__',]:
      if callable(attr): exec(f'def {attr_name}(self,*args,**kwargs): return self.dataset.{attr_name}(*args,**kwargs)')
      else: exec(f'@property\ndef {attr_name}(self): return self.dataset.{attr_name}')

  def __init__(self, dataset, cols, encode_types, decode_funcs, decode_types):
    store_attr(self, 'dataset,cols,encode_types,decode_funcs,decode_types')

  def __getitem__(self, i):
    sample = self.dataset[i]
    return tuple( enc_type(sample[col]) for col, enc_type in zip(self.cols, self.encode_types))

  def __iter__(self):
    """
    default __iter__ will iter until get IndexError, 
    but ArrowDataset gives you ValueError when out of index.
    So we have to explicitly define __iter__ method
    """
    for i in range(len(self)): yield self[i] 

  def decode(self, o, full=True): return tuple( de_type(de_fc(o_)) for o_,de_fc,de_type in zip(o,self.decode_funcs,self.decode_types))
  #def __len__(self): return len(self.dataset)

class HF_Datasets(FilteredBase):
  def __init__(self, datasets, cols, encode_types, decode_funcs, decode_types):
    assert len(cols) == len(decode_funcs) == len(encode_types) == len(decode_types) == len(decode_funcs)
    for ds in datasets: ds.set_format(type='torch', columns=cols)
    self.datasets = L(HF_Dataset(ds, cols, encode_types, decode_funcs, decode_types) for ds in datasets)
  def subset(self, i): return self.datasets[i]
  def __getitem__(self, i): return self.datasets[i]
  @property
  def n_subsets(self): return len(self.datasets)

Use this novel integration to get `Dataloaders` of GLUE tasks

In [0]:
def text_decode_fc(x, pretty=True):
  if pretty:
    return hf_fast_tokenizer.decode([idx for idx in x if idx != hf_fast_tokenizer.pad_token_id])
  else:
    tokens = hf_fast_tokenizer.convert_ids_to_tokens(x)
    return ' '.join(tokens)

@delegates(FilteredBase.dataloaders)
def get_glue_dls(task_name, **kwargs):
  assert task_name in ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'qnli', 'rte', 'wnli', 'ax']
  splits = ['train','validation']+(['test'] if task_name != 'mnli' else ['test_matched', 'test_mismatched'])
  if task_name == 'ax': splits = ['test']
  arrow_dsets = [glue_dsets[task_name][s] for s in splits ]
  show_pretty = kwargs.pop('show_pretty', True) 
  dsets = HF_Datasets(datasets=arrow_dsets, 
                      cols=['input_ids', 'label'],
                      encode_types=[TensorText, noop],
                      decode_funcs=[partial(text_decode_fc,pretty=show_pretty),noop],
                      decode_types=[TitledStr, lambda x: Category(x.item())])
  dl_cache_files = [Path(dset.cache_files[0]['filename']).parent/f'dl_{s}.pth' for dset, s in zip(dsets,splits)]
  if all([ p.exists() for p in dl_cache_files]):
    device = kwargs.pop('device', default_device())
    dl_s = [TextDataloader.from_cache(f, dsets[i], **kwargs) for i, f in enumerate(dl_cache_files)]  
    dls = DataLoaders(*dl_s, device=device)
  else:
    dls = dsets.dataloaders(before_batch=partial(pad_input_chunk,pad_first=False,pad_idx=hf_fast_tokenizer.pad_token_id,),
                             dl_type=partial(TextDataloader, sort_by_len=False),
                             **kwargs)
    for dl, cache_f in zip(dls,dl_cache_files): dl.cache(cache_f)
  return dls

## 1.1 Get dataloaders fror each dataset

**[CoLA](https://nyu-mll.github.io/CoLA/)** (*The Corpus of Linguistic Acceptability*):


Check whether a sentence is linguistically acceptable. 

(0: unacceptable, 1: acceptable) 


In [10]:
cola_dls = get_glue_dls('cola', show_bar=False)
print(f"Dataset size (train/valid/test): {len(cola_dls[0].dataset)}/{len(cola_dls[1].dataset)}/{len(cola_dls[2].dataset)}")
cola_dls.show_batch(max_n=2)

Dataset size (train/valid/test): 8551/1043/1063


Unnamed: 0,text,category
0,"[CLS] our friends won't buy this analysis, let alone the next one we propose. [SEP]",1
1,[CLS] one more pseudo generalization and i'm giving up. [SEP],1


**Note**: for the readibility, we won't show pad and result of sentencepiece ('##...') here, which are the actual results in a batch.

In [11]:
get_glue_dls('cola', show_bar=False, show_pretty=False).show_batch(max_n=2)

Unnamed: 0,text,category
0,"[CLS] our friends won ' t buy this analysis , let alone the next one we propose . [SEP]",1
1,[CLS] one more pseudo general ##ization and i ' m giving up . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD],1


**[SST-2](https://nlp.stanford.edu/sentiment/index.html)** (*The Stanford Sentiment Treebank*): 

Identify the sentiment of a work/phrase/sentence. 

(1: positvie, 0: negative)

In [12]:
sst2_dls = get_glue_dls('sst2')
print(f"Dataset size (train/valid/test): {len(sst2_dls[0].dataset)}/{len(sst2_dls[1].dataset)}/{len(sst2_dls[2].dataset)}")
sst2_dls.show_batch(max_n=2)

TextDataloader init:: 100%|██████████| 67349/67349 [00:09<00:00, 6808.92it/s]
TextDataloader init:: 100%|██████████| 872/872 [00:00<00:00, 8424.39it/s]
TextDataloader init:: 100%|██████████| 1821/1821 [00:00<00:00, 6615.68it/s]


Dataset size (train/valid/test): 67349/872/1821


Unnamed: 0,text,category
0,[CLS] hide new secretions from the parental units [SEP],0
1,"[CLS] contains no wit, only labored gags [SEP]",0


**[MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398)** (*Microsoft Research Paraphrase Corpus*): 

Whether each pair captures a paraphrase/semantic equivalence relationship. 

(1: yes, 0: no)

In [14]:
mrpc_dls = get_glue_dls('mrpc',show_bar=False)
print(f"Dataset size (train/valid/test): {len(sst2_dls.train_ds)}/{len(sst2_dls.valid_ds)}/{len(sst2_dls[2])}")
mrpc_dls.show_batch(max_n=2)

Dataset size (train/valid/test): 67349/872/29


Unnamed: 0,text,category
0,"[CLS] amrozi accused his brother, whom he called "" the witness "", of deliberately distorting his evidence. [SEP] referring to him as only "" the witness "", amrozi accused his brother of deliberately distorting his evidence. [SEP]",1
1,[CLS] yucaipa owned dominick's before selling the chain to safeway in 1998 for $ 2. 5 billion. [SEP] yucaipa bought dominick's in 1995 for $ 693 million and sold it to safeway for $ 1. 8 billion in 1998. [SEP],0


**[STS-B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark)** (*Semantic Textual Similarity Benchmark*):

Score the similarity of meanings of two sentences. The only regression task in GLUE 

(0.0 ~ 5.0)

In [15]:
stsb_dls = get_glue_dls('stsb',show_bar=False)
print(f"Dataset size (train/valid/test): {len(stsb_dls.train_ds)}/{len(stsb_dls.valid_ds)}/{len(stsb_dls[2])}")
stsb_dls.show_batch(max_n=2)

Dataset size (train/valid/test): 5749/1500/22


Unnamed: 0,text,category
0,[CLS] a plane is taking off. [SEP] an air plane is taking off. [SEP],5.0
1,[CLS] a man is playing a large flute. [SEP] a man is playing a flute. [SEP],3.799999952316284


**[QQP](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)** (*Quora Question Pairs*)

Check whether two questions are duplicated. 

(0: no, 1: duplicated)

In [16]:
qqp_dls = get_glue_dls('qqp')
print(f"Dataset size (train/valid/test): {len(qqp_dls.train_ds)}/{len(qqp_dls.valid_ds)}/{len(qqp_dls[2])}")
qqp_dls.show_batch(max_n=2)

TextDataloader init:: 100%|██████████| 363849/363849 [01:07<00:00, 5402.32it/s]
TextDataloader init:: 100%|██████████| 40430/40430 [00:05<00:00, 6843.85it/s]
TextDataloader init:: 100%|██████████| 390965/390965 [01:12<00:00, 5380.07it/s]


Dataset size (train/valid/test): 363849/40430/6109


Unnamed: 0,text,category
0,[CLS] how is the life of a math student? could you describe your own experiences? [SEP] which level of prepration is enough for the exam jlpt5? [SEP],0
1,[CLS] how do i control my horny emotions? [SEP] how do you control your horniness? [SEP],1


**[MNLI](https://cims.nyu.edu/~sbowman/multinli/)** (*The Multi-Genre NLI Corpus*)

Whether the premise (sentence 1) entails the hypothesis (sentence 2) (entailment), contradicts the hypothesis (contradiction), or neither (neutral) 

(0: entailment, 1: neutral, 2: contradiction)

In [24]:
mnli_dls = get_glue_dls('mnli')
print(f"Dataset size (train/valid/test_matched/test_mismatched): {len(mnli_dls[0].dataset)}/{len(mnli_dls[1].dataset)}/{len(mnli_dls[2].dataset)}/{len(mnli_dls[3].dataset)}")
mnli_dls.show_batch(max_n=2)

TextDataloader init:: 100%|██████████| 392702/392702 [01:13<00:00, 5359.05it/s]
TextDataloader init:: 100%|██████████| 19647/19647 [00:02<00:00, 6980.73it/s]
TextDataloader init:: 100%|██████████| 9796/9796 [00:01<00:00, 7218.74it/s]
TextDataloader init:: 100%|██████████| 9847/9847 [00:01<00:00, 7161.08it/s]


Dataset size (train/valid/test_matched/test_mismatched): 392702/19647/9796/9847


Unnamed: 0,text,category
0,[CLS] conceptually cream skimming has two basic dimensions - product and geography. [SEP] product and geography are what make cream skimming work. [SEP],1
1,[CLS] you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the braves decide to call to recall a guy from triple a then a double a guy goes up to replace him and a single a guy goes up to replace him [SEP] you lose the things to the following level if the people recall. [SEP],0


**QNLI** (*The Stanford Question Answering Dataset*):

The task is to determine whether the context sentence (sentence 2) contains the answer to the question (sentence 1).

(0: entailment, 1: not_entailment)

In [25]:
qnli_dls = get_glue_dls('qnli')
print(f"Dataset size (train/valid/test): {len(qnli_dls[0].dataset)}/{len(qnli_dls[1].dataset)}/{len(qnli_dls[2].dataset)}")
qnli_dls.show_batch(max_n=2)

Dataset size (train/valid/test): 104743/5463/5463


Unnamed: 0,text,category
0,"[CLS] when did the third digimon series begin? [SEP] unlike the two seasons before it and most of the seasons that followed, digimon tamers takes a darker and more realistic approach to its story featuring digimon who don't reincarnate after their deaths and more complex character development in the original japanese. [SEP]",1
1,"[CLS] which missile batteries often have individual launchers several kilometres from one another? [SEP] when manpads is operated by specialists, batteries may have several dozen teams deploying separately in small sections ; self - propelled air defence guns may deploy in pairs. [SEP]",1


**[RTE](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment)** (*Recognizing_Textual_Entailment*):

Whether hypothesis (sentence 2) is entailed (can be inferred) from the premise (sentence 1).

(0: entailment, 1: not_entailment)

In [26]:
rte_dls = get_glue_dls('rte', show_bar=False)
print(f"Dataset size (train/valid/test): {len(rte_dls[0].dataset)}/{len(rte_dls[1].dataset)}/{len(rte_dls[2].dataset)}")
rte_dls.show_batch(max_n=2)

Dataset size (train/valid/test): 2490/277/3000


Unnamed: 0,text,category
0,[CLS] no weapons of mass destruction found in iraq yet. [SEP] weapons of mass destruction found in iraq. [SEP],1
1,"[CLS] a place of sorrow, after pope john paul ii died, became a place of celebration, as roman catholic faithful gathered in downtown chicago to mark the installation of new pope benedict xvi. [SEP] pope benedict xvi is the new leader of the roman catholic church. [SEP]",0


**[WNLI](https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html)** (*The Winograd Schema Challenge*)

Check whether sentence 2 (which is rephrased sentence of sentence 1) correctly solve the pronoun in sentence 1.

(0: wrong, 1: correct)

In [27]:
wnli_dls = get_glue_dls('wnli', show_bar=False)
print(f"Dataset size (train/valid/test): {len(wnli_dls[0].dataset)}/{len(wnli_dls[1].dataset)}/{len(wnli_dls[2].dataset)}")
wnli_dls.show_batch(max_n=2)

Dataset size (train/valid/test): 635/71/146


Unnamed: 0,text,category
0,"[CLS] i stuck a pin through a carrot. when i pulled the pin out, it had a hole. [SEP] the carrot had a hole. [SEP]",1
1,[CLS] john couldn't see the stage with billy in front of him because he is so short. [SEP] john is so short. [SEP],1


**[AX](https://gluebenchmark.com/diagnostics)** (*GLUE Diagnostic Dataset*):

Whether the premise (sentence 1) entails the hypothesis (sentence 2) (entailment), contradicts the hypothesis (contradiction), or neither (neutral) 

Test set only.

Currently, this dataset huggingface/nlp has labels of all -1s. 
I have submitted [issue](https://github.com/huggingface/nlp/issues/183).

In [28]:
ax_dls = get_glue_dls('ax')
print(f"Dataset size (test): {len(ax_dls[0].dataset)}")
ax_dls.show_batch(max_n=2)

Dataset size (test): 1104


Unnamed: 0,text,category
0,[CLS] the cat sat on the mat. [SEP] the cat did not sit on the mat. [SEP],-1
1,[CLS] the cat did not sit on the mat. [SEP] the cat sat on the mat. [SEP],-1


In [0]:
glue_dls = {}
for task_name in ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'qnli', 'rte', 'wnli', 'ax']:
  glue_dls[task_name] = eval(f"{task_name}_dls")
glue_dls['ax'][0].shuffle = False # fastai see 0th dl as train_dl which is default shuffled