In [2]:

GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]


In [3]:

GLUE_TASKS


['cola',
 'mnli',
 'mnli-mm',
 'mrpc',
 'qnli',
 'qqp',
 'rte',
 'sst2',
 'stsb',
 'wnli']

In [4]:

task = "cola"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16


In [6]:

from datasets import load_dataset, load_metric


In [7]:


actual_task = "mnli" if task == "mnli-mm" else task


In [8]:

actual_task


'cola'

In [9]:

dataset = load_dataset("glue", actual_task)
metric  = load_metric('glue',  actual_task)


  metric  = load_metric('glue',  actual_task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [10]:

dataset


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [11]:

dataset["train"][0]


{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [12]:

dataset["train"][17]


{'sentence': 'They drank the pub dry.', 'label': 1, 'idx': 17}

In [13]:

import datasets
import random
import pandas as pd
from IPython.display import display, HTML


In [14]:


def show_random_elements(dataset, num_examples=10):
    
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


In [15]:

show_random_elements(dataset["train"])


Unnamed: 0,sentence,label,idx
0,"After John comes home, will Sally take a shower?",acceptable,5323
1,The beaver built a dam.,acceptable,6944
2,We contributed her with our paycheck.,unacceptable,2725
3,I played a tune on my iPod.,acceptable,5915
4,"Kim likes Lee, and to Ronnie.",unacceptable,7107
5,Nora sent at the book to Peter.,unacceptable,2671
6,Zeke cooked and ate the chili.,acceptable,5631
7,John did leave.,acceptable,4459
8,You shouldn't play with rifles because to is dangerous.,unacceptable,857
9,It is this hat that I know the boy who is wearing.,unacceptable,1671


In [16]:

metric


Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res


## COLA GLUE metric


In [17]:

'''

>>> glue_metric = datasets.load_metric('glue', 'cola')
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)

'''


"\n\n>>> glue_metric = datasets.load_metric('glue', 'cola')\n>>> references = [0, 1]\n>>> predictions = [0, 1]\n>>> results = glue_metric.compute(predictions=predictions, references=references)\n>>> print(results)\n\n"

In [18]:

import numpy as np


In [20]:

fake_preds  = np.random.randint(0, 2, size=(64,))
fake_labels = np.random.randint(0, 2, size=(64,))

print( len(fake_preds) )
print(fake_preds)
print(fake_labels)



64
[0 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 1 0 0 0 0 0 0 0
 0 1 0 1 1 0 0 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0]
[0 1 1 1 1 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0
 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1 1 0 1 1 1 1 0 1 1]


In [21]:

metric.compute(predictions=fake_preds, references=fake_labels)


{'matthews_correlation': -0.0905982365507463}

In [22]:

from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


In [23]:

tokenizer("Hello, this one sentence!", "And this sentence goes with it.")


{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}


In [25]:

sentence1_key, sentence2_key = task_to_keys[task]
sentence1_key, sentence2_key


('sentence', None)

In [26]:

if sentence2_key is None:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")
else:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")



Sentence: Our friends won't buy this analysis, let alone the next one we propose.


In [27]:

def preprocess_function(examples):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)


In [29]:

preprocess_function( dataset['train'][:5] )


{'input_ids': [[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 2030, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 1996, 2062, 2057, 2817, 16025, 1010, 1996, 13675, 16103, 2121, 2027, 2131, 1012, 102], [101, 2154, 2011, 2154, 1996, 8866, 2024, 2893, 14163, 8024, 3771, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [31]:

encoded_dataset = dataset.map(preprocess_function, batched=True)


In [32]:

encoded_dataset 


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1063
    })
})

In [33]:

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [34]:

num_labels = 3 if task.startswith("mnli") else 1 if task=="stsb" else 2
print( num_labels )


2


In [35]:

model      = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model 


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [36]:

metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

metric_name


'matthews_correlation'

In [37]:

model_name = model_checkpoint.split("/")[-1]
model_name


'distilbert-base-uncased'

In [38]:



args = TrainingArguments(
    f"/scratch/scholar/rcalix/{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)


In [39]:

args


TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=True,
group_by_

In [40]:

def compute_metrics(eval_pred):
    
    predictions, labels = eval_pred
    
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]
        
    return metric.compute(predictions=predictions, references=labels)


In [41]:

validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
validation_key


'validation'

In [42]:

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset =encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [43]:

trainer.train()


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5191,0.461798,0.458846
2,0.3467,0.537853,0.472806
3,0.2233,0.627159,0.502552
4,0.1801,0.70941,0.529037
5,0.1341,0.812698,0.541935


TrainOutput(global_step=2675, training_loss=0.2692002604832159, metrics={'train_runtime': 183.4147, 'train_samples_per_second': 233.106, 'train_steps_per_second': 14.584, 'total_flos': 229437415353012.0, 'train_loss': 0.2692002604832159, 'epoch': 5.0})

In [44]:

trainer.evaluate()


{'eval_loss': 0.8126975893974304,
 'eval_matthews_correlation': 0.541934635424655,
 'eval_runtime': 0.6518,
 'eval_samples_per_second': 1600.132,
 'eval_steps_per_second': 101.255,
 'epoch': 5.0}


## Hyperparameter search



The Trainer supports hyperparameter search using optuna or Ray Tune. For this last section you will need either of those libraries installed, just uncomment the line you want on the next cell and run it.


In [None]:

## !pip install optuna


In [None]:

## !pip install ray[tune]


In [None]:

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)


In [None]:

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:

best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")


In [None]:

best_run


In [None]:

for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()
