In [1]:

# GLUE task identifiers; the same names are used as keys of `task_to_keys`.
GLUE_TASKS = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]


In [2]:

# GLUE task to fine-tune on.
task = "cola"
# Pretrained checkpoint used to load both the tokenizer and the model.
model_checkpoint = "distilbert-base-uncased"
# Per-device batch size, used for both training and evaluation.
batch_size = 16


In [3]:

from datasets import load_dataset, load_metric


In [4]:

# "mnli-mm" is not loaded as a separate configuration: it reuses the "mnli"
# data and metric, and differs only in the validation split selected later.
actual_task = "mnli" if task == "mnli-mm" else task
dataset = load_dataset("glue", actual_task)
# Opt in explicitly to running the metric's loading script. This silences the
# warning emitted by `load_metric` and keeps the call working once passing
# `trust_remote_code=True` becomes mandatory.
metric = load_metric("glue", actual_task, trust_remote_code=True)


  metric = load_metric('glue', actual_task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [5]:

# Show the splits, columns and row counts of the loaded dataset.
dataset


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [6]:

# Look at the first training example.
dataset["train"][0]


{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [7]:

import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is a ``datasets.ClassLabel`` are shown with
    their class names instead of integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to display (default 10).

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws the requested number of distinct positions in one
    # call, replacing the manual draw-and-retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer label ids with their human-readable class names.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


In [8]:

# Preview random training examples (default sample size of the helper).
show_random_elements(dataset["train"])


Unnamed: 0,sentence,label,idx
0,The Dodgers beat the Red Sox and the Dodgers were beaten by the Giants.,acceptable,6510
1,Anson demonized David every day.,acceptable,8535
2,Which picture of himself does Mary think that John said that Susan likes?,acceptable,367
3,It struck John that it was so.,acceptable,550
4,All Mr Collins has done is have praised Lady de Bourg.,unacceptable,6792
5,He does be leaving.,unacceptable,4456
6,"Ann is going to send a picture of Chairman Mao to her teacher, as soon as she gets home.",acceptable,1486
7,John to call would be unlikely.,unacceptable,413
8,Jeff must have eaten the deep fried muffin.,acceptable,5972
9,The cat chased the long string.,acceptable,3601


In [9]:

# Show the metric object, including its expected inputs and usage notes.
metric


Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [10]:

import numpy as np

# Sanity-check the metric on random binary predictions and labels. A locally
# seeded generator makes the printed score reproducible across kernel restarts
# without touching NumPy's global random state.
rng = np.random.default_rng(0)
fake_preds = rng.integers(0, 2, size=(64,))
fake_labels = rng.integers(0, 2, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels)


{'matthews_correlation': 0.10746372201097237}

In [11]:

from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the chosen checkpoint, requesting the
# fast tokenizer variant.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


In [12]:

# Passing two strings encodes them together as a single sentence pair.
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")


{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:

# For each task: the dataset column(s) holding the text to tokenize, as
# (first_column, second_column). The second entry is None for tasks that
# have a single sentence per example.
task_to_keys = dict(
    [
        ("cola", ("sentence", None)),
        ("mnli", ("premise", "hypothesis")),
        ("mnli-mm", ("premise", "hypothesis")),
        ("mrpc", ("sentence1", "sentence2")),
        ("qnli", ("question", "sentence")),
        ("qqp", ("question1", "question2")),
        ("rte", ("sentence1", "sentence2")),
        ("sst2", ("sentence", None)),
        ("stsb", ("sentence1", "sentence2")),
        ("wnli", ("sentence1", "sentence2")),
    ]
)


In [14]:

# Look up which column(s) hold this task's text and print them for the first
# training example.
sentence1_key, sentence2_key = task_to_keys[task]
if sentence2_key is not None:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")
else:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")


Sentence: Our friends won't buy this analysis, let alone the next one we propose.


In [15]:

def preprocess_function(examples):
    """Tokenize the task's text column(s) of ``examples`` with truncation.

    Reads the module-level ``sentence1_key`` / ``sentence2_key`` to decide
    whether to tokenize one column or a pair of columns, and returns whatever
    the module-level ``tokenizer`` returns.
    """
    text_columns = (examples[sentence1_key],)
    if sentence2_key is not None:
        # Pair task: the second column is passed as the second text argument.
        text_columns += (examples[sentence2_key],)
    return tokenizer(*text_columns, truncation=True)


In [16]:

# Try the preprocessing on the first five training rows.
preprocess_function(dataset['train'][:5])


{'input_ids': [[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 2030, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 1996, 2062, 2057, 2817, 16025, 1010, 1996, 13675, 16103, 2121, 2027, 2131, 1012, 102], [101, 2154, 2011, 2154, 1996, 8866, 2024, 2893, 14163, 8024, 3771, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [17]:

# Tokenize the whole dataset; batched=True hands preprocess_function several
# rows at a time instead of one example per call.
encoded_dataset = dataset.map(preprocess_function, batched=True)


In [18]:

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size of the classification head: 3 outputs for the MNLI variants, a single
# output for STS-B, and 2 outputs for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:

import os

# Metric used to pick the best checkpoint: "pearson" for STS-B,
# "matthews_correlation" for CoLA, "accuracy" for every other task.
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"
# Keep only the last path component of the checkpoint id (drops any "org/" prefix).
model_name = model_checkpoint.split("/")[-1]

# Root directory for checkpoints. It defaults to the original location and can
# be redirected with the GLUE_OUTPUT_ROOT environment variable, so the notebook
# also runs on machines where that user-specific path does not exist.
output_root = os.environ.get("GLUE_OUTPUT_ROOT", "/scratch/scholar/rcalix")

args = TrainingArguments(
    os.path.join(output_root, f"{model_name}-finetuned-{task}"),
    # Evaluate and checkpoint once per epoch so the best epoch can be restored
    # when training ends (see load_best_model_at_end below).
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)


In [20]:

def compute_metrics(eval_pred):
    """Turn raw model outputs into the inputs expected by the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``.

    Returns:
        The result of ``metric.compute`` for the converted predictions.
    """
    raw_outputs, references = eval_pred
    if task == "stsb":
        # STS-B: use the first output column as the predicted value.
        predictions = raw_outputs[:, 0]
    else:
        # Other tasks: predict the index of the largest output along axis 1.
        predictions = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=references)


In [21]:

# Name of the split to evaluate on: the two MNLI variants select their own
# matched/mismatched split, anything else falls back to "validation".
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:

# Fine-tune the model with the arguments configured above.
trainer.train()


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5266,0.467504,0.47512
2,0.3618,0.457907,0.535758
3,0.2286,0.591004,0.545606
4,0.1777,0.748982,0.52824
5,0.1249,0.838311,0.532073


TrainOutput(global_step=2675, training_loss=0.27245647679979557, metrics={'train_runtime': 248.7602, 'train_samples_per_second': 171.872, 'train_steps_per_second': 10.753, 'total_flos': 229437415353012.0, 'train_loss': 0.27245647679979557, 'epoch': 5.0})

In [23]:

# Score the model on the Trainer's eval_dataset. With load_best_model_at_end=True
# the reported metric matches the best epoch of the run, not the last one.
trainer.evaluate()


{'eval_loss': 0.5910043120384216,
 'eval_matthews_correlation': 0.5456062114587601,
 'eval_runtime': 1.0824,
 'eval_samples_per_second': 963.556,
 'eval_steps_per_second': 60.973,
 'epoch': 5.0}


## Hyperparameter search



The `Trainer` supports hyperparameter search using Optuna or Ray Tune. For this last section you will need one of those libraries installed: uncomment the install line for the one you want in one of the next two cells and run it.


In [24]:

## !pip install optuna


In [25]:

## !pip install ray[tune]


In [26]:

def model_init():
    """Build a new sequence-classification model from the pretrained checkpoint.

    Uses the module-level ``model_checkpoint`` and ``num_labels``; every call
    loads the model again and returns that new instance.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
    )
    return fresh_model


In [27]:

# Rebuild the Trainer with a model factory (model_init) instead of a model
# instance; the remaining arguments are the same as for the first Trainer.
# NOTE(review): presumably this lets each search trial start from a freshly
# loaded model — confirm against the Trainer documentation.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Run a 10-trial hyperparameter search, keeping the run with the highest
# objective value (direction="maximize").
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")


2024-02-02 19:11:00,047	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.5.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
[I 2024-02-02 19:11:00,274] A new study created in memory with name: no-name-9383033c-9f2e-4054-a0aa-ab218559cb8d
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.487226,0.388288
2,0.416700,0.506046,0.493867


[I 2024-02-02 19:12:20,629] Trial 0 finished with value: 0.49386659520144205 and parameters: {'learning_rate': 9.12793605422809e-05, 'num_train_epochs': 2, 'seed': 34, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.49386659520144205.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.602584,0.0
2,No log,0.563072,0.0
3,No log,0.54265,0.221116
4,0.565600,0.540584,0.262732


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
[I 2024-02-02 19:14:44,925] Trial 1 finished with value: 0.2627316656479125 and parameters: {'learning_rate': 2.9505454399078704e-06, 'num_train_epochs': 4, 'seed': 2, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.49386659520144205.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5648,0.568635,0.046356
2,0.5262,0.550058,0.268473


[I 2024-02-02 19:17:06,758] Trial 2 finished with value: 0.26847252023507995 and parameters: {'learning_rate': 1.9208303050787203e-06, 'num_train_epochs': 2, 'seed': 38, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.49386659520144205.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5523,0.540803,0.367733
2,0.4345,0.511979,0.439473
3,0.3644,0.506572,0.466913
4,0.328,0.526144,0.463343


[I 2024-02-02 19:20:28,672] Trial 3 finished with value: 0.46334348866082065 and parameters: {'learning_rate': 7.019805944754896e-06, 'num_train_epochs': 4, 'seed': 14, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.49386659520144205.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.574625,0.0
2,No log,0.533712,0.330757
3,No log,0.529433,0.379474
4,0.529800,0.528598,0.391503


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
[I 2024-02-02 19:22:54,152] Trial 4 finished with value: 0.39150252536899116 and parameters: {'learning_rate': 3.9799885743763455e-06, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.49386659520144205.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5285,0.563079,0.330259
2,0.5075,0.576722,0.40346


[I 2024-02-02 19:26:48,293] Trial 5 finished with value: 0.40345980880864174 and parameters: {'learning_rate': 2.4805182091287745e-06, 'num_train_epochs': 2, 'seed': 38, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.49386659520144205.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5069,0.478778,0.445211
2,0.3331,0.578454,0.549474


[I 2024-02-02 19:29:09,792] Trial 6 finished with value: 0.5494735380761103 and parameters: {'learning_rate': 4.112775448423998e-05, 'num_train_epochs': 2, 'seed': 25, 'per_device_train_batch_size': 8}. Best is trial 6 with value: 0.5494735380761103.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.6109,0.603979,0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
[I 2024-02-02 19:29:57,680] Trial 7 pruned. 
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.539726,0.414998
2,0.465900,0.488486,0.471844
3,0.465900,0.507487,0.517926
4,0.276100,0.57326,0.532507
5,0.276100,0.614081,0.535357


[I 2024-02-02 19:33:17,647] Trial 8 finished with value: 0.5353569722427551 and parameters: {'learning_rate': 1.5150873092072758e-05, 'num_train_epochs': 5, 'seed': 30, 'per_device_train_batch_size': 32}. Best is trial 6 with value: 0.5494735380761103.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5359,0.616398,0.39731
2,0.5073,0.867633,0.441345


In [None]:

# Inspect the best run returned by the search.
best_run


In [None]:

# Copy every hyperparameter of the best run onto the trainer's arguments,
# then train again with those settings.
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()
