<a href="https://colab.research.google.com/github/simulate111/Deep-Learning-in-Human-Language-Technology/blob/main/ex4_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!pip3 install -q transformers datasets evaluate accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from pprint import pprint
import logging

# Suppress every log record of severity INFO or lower (for all loggers),
# so that only warnings and errors show up in the cell outputs below.
logging.disable(logging.INFO)

---
# Download and prepare data

In [3]:
import datasets

# Load all splits of the IMDB dataset
dataset = datasets.load_dataset('imdb')
# Delete the unlabeled part of the dataset *before* shuffling: there is no
# point in spending time on shuffling a split that is thrown away anyway
del dataset["unsupervised"]
# Shuffling is never a bad idea, datasets may have ordering to them, which is
# not what we want. The fixed seed makes the order (and thus every training
# run below) repeatable after a kernel restart.
dataset = dataset.shuffle(seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

---

# Tokenize and vectorize data

In [4]:
import transformers

# Name of the pretrained checkpoint. In this cell it is only used to select
# which tokenizer to load.
model_name = "bert-base-cased"
# Load the tokenizer that belongs to that checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Small wrapper around the tokenizer, in the shape that .map() expects
def tokenize(example):
    """Run the tokenizer on the "text" field of `example`.

    Truncation is switched on with a maximum length of 128, so longer
    texts are cut off. The tokenizer's result is returned unchanged.
    """
    text = example["text"]
    encoding = tokenizer(text, truncation=True, max_length=128)
    return encoding

# Apply the tokenizer to the whole dataset using .map().
# With batched=True, tokenize() receives many examples per call (a list of
# texts under "text") instead of one example at a time. The tokenizer accepts
# a list of strings, and since no padding is requested every example still
# gets its own token list, so the resulting columns are the same -- only with
# far fewer Python-level tokenizer calls.
dataset = dataset.map(tokenize, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

---

# Define model

(Note: this cell only defines the model's structure and its forward computation. No model is instantiated yet, so no parameter values exist until `SimpleCNN(config)` is called further below.)

In [5]:
import torch


# This gives a new name to the config class, just for convenience.
# The keyword arguments given to BasicConfig(...) further down (vocab_size,
# embedding_dim, filter_size, ...) are the values SimpleCNN reads back as
# config.<name>.
BasicConfig = transformers.PretrainedConfig


# This is the model
class SimpleCNN(transformers.PreTrainedModel):
    """Text classifier: embedding -> 1D convolution -> ReLU -> global max pooling -> linear layer.

    The config must provide `vocab_size`, `embedding_dim`, `num_filters`,
    `filter_size` and `num_labels`.
    """

    config_class = BasicConfig

    # In the initialization method, one instantiates the layers
    # these will be the parameters of the model
    def __init__(self, config):
        """Create all layers from the sizes stored in `config`."""
        super().__init__(config)
        # Embedding layer: vocab size x embedding dim
        self.embeddings = torch.nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.embedding_dim
        )
        # Convolution layer: slides `num_filters` filters of width `filter_size`
        # over the token dimension. The padding is derived from the filter
        # width instead of being fixed to 1: for filter_size=3 this is still 1,
        # for any other odd width the sequence length is preserved as well, and
        # an input of any non-zero length is always long enough for the filter.
        self.convolution = torch.nn.Conv1d(
            config.embedding_dim,
            config.num_filters,
            config.filter_size,
            padding=config.filter_size // 2
        )
        # Activation function following convolution
        self.activation = torch.nn.ReLU()
        # Pooling layer: global max pooling, regardless of input length
        self.pooling_layer = torch.nn.AdaptiveMaxPool1d(
            output_size=1
        )
        # Output layer: num filters to output size
        self.output_layer = torch.nn.Linear(
            in_features=config.num_filters,
            out_features=config.num_labels
        )
        # Loss function: standard loss for classification
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Compute the class scores for a batch of token id sequences.

        Args:
            input_ids: token ids, shape [batch_size, maxlen].
            labels: optional gold labels; when given, the loss is returned too.
            attention_mask: accepted so that padded batches can be passed in,
                but not used in the computation.

        Returns:
            `(loss, output)` if `labels` is given, otherwise `(output,)`,
            where `output` has shape [batch_size, num_labels].
        """
        # NOTE(review): attention_mask is ignored, so padded positions take
        # part in the convolution and the max pooling like any other token.
        #shape of input: [batch_size, maxlen]
        x = self.embeddings(input_ids)
        #shape of x: [batch_size, maxlen, embedding_dim]
        x = x.permute((0,2,1))
        #shape of x: [batch_size, embedding_dim, maxlen]
        x = self.convolution(x)
        #shape of x: [batch_size, filters, maxlen] (maxlen + 1 if filter_size is even)
        x = self.activation(x)
        #shape of x: unchanged by the activation
        x = self.pooling_layer(x)
        #shape of x: [batch_size, filters, 1]
        x = x.flatten(start_dim=1)
        #shape of x: [batch_size, filters]
        output = self.output_layer(x)

        # Return value computed as in the MLP:
        if labels is not None:
            # We have labels, so we can calculate the loss
            return (self.loss(output,labels), output)
        else:
            # No labels, so just return the output
            return (output,)

---
# Define training support

(Collator, evaluation, Callbacks)

In [6]:
import evaluate

# evaluation: load the accuracy metric once, it is reused for every evaluation
accuracy = evaluate.load("accuracy")

def compute_accuracy(outputs_and_labels):
    """Turn a pair of (model outputs, gold labels) into an accuracy result.

    The predicted label of each example is the index of its highest score
    along the last axis of the outputs.
    """
    scores, gold = outputs_and_labels
    winners = scores.argmax(axis=-1)
    return accuracy.compute(references=gold, predictions=winners)

# collator: tokenize() above truncates but does not pad, so the examples have
# different lengths. This collator is handed the tokenizer and is passed to
# every Trainer below to assemble the batches (padding them, per its name).
data_collator = transformers.DataCollatorWithPadding(tokenizer)

# Callbacks / logging
from collections import defaultdict

class LogSavingCallback(transformers.TrainerCallback):
    """Collects the values the Trainer logs during training.

    After training, `self.logs` maps each logged key (e.g. "loss") to the
    list of values logged for it, in the order they were logged.
    """

    def __init__(self):
        super().__init__()
        # Create the state right away: on_log may fire before any training
        # has started (e.g. when only evaluating), and would otherwise fail
        # on the missing attributes.
        self.logs = defaultdict(list)
        self.training = False

    def on_train_begin(self, *args, **kwargs):
        # Every training run starts with a fresh, empty log
        self.logs = defaultdict(list)
        self.training = True

    def on_train_end(self, *args, **kwargs):
        # Anything logged after this point is not recorded
        self.training = False

    def on_log(self, args, state, control, logs=None, model=None, **kwargs):
        # Only record what is logged while training is running, and tolerate
        # being called without any log values
        if not self.training or not logs:
            return
        for k, v in logs.items():
            # An "epoch" value that is already stored is not stored again;
            # all other keys keep every logged value
            if k != "epoch" or v not in self.logs[k]:
                self.logs[k].append(v)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

---
# Hyperparameter search – First option

In [7]:
# The number of distinct labels is the same for every run, so it is computed
# once here rather than rebuilding the full label column in every iteration
num_labels = len(set(dataset['train']['label']))

# Final accuracy and collected training logs of each run, keyed by learning rate
lr_accuracies = {}
lr_logs = {}

for lr in [0.000005, 0.00005, 0.0005, 0.005, 0.05, 0.5]:

    # create the model (a fresh, untrained one for every learning rate)
    config = BasicConfig(
        vocab_size = tokenizer.vocab_size,
        num_labels = num_labels,
        embedding_dim = 64,
        filter_size = 3,
        num_filters = 10,
    )

    model = SimpleCNN(config)

    # Set training arguments
    # NOTE(review): newer transformers releases name the first keyword below
    # `eval_strategy` -- confirm against the installed version.
    trainer_args = transformers.TrainingArguments(
        "checkpoints",
        evaluation_strategy="steps",
        logging_strategy="steps",
        load_best_model_at_end=True,
        eval_steps=500,
        logging_steps=500,
        learning_rate=lr, # <--- parameter goes here
        per_device_train_batch_size=8,
        max_steps=2500,
    )

    # Keep a handle on the logging callback; created inline inside the list,
    # its collected logs could not be reached after training
    log_callback = LogSavingCallback()

    # NOTE(review): the test split serves as evaluation set during training
    # and is also the set the final accuracy is reported on.
    trainer = transformers.Trainer(
        model=model,
        args=trainer_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_accuracy,
        data_collator=data_collator,
        callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=5), log_callback]
    )

    trainer.train()
    eval_results = trainer.evaluate(dataset["test"])
    lr_accuracies[lr] = eval_results['eval_accuracy']
    lr_logs[lr] = log_callback.logs
    print('Learning rate:', lr, 'Accuracy:', eval_results['eval_accuracy'])

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy
500,0.7496,0.75305,0.50008
1000,0.7447,0.740807,0.50004
1500,0.7408,0.732967,0.50004
2000,0.7373,0.728923,0.5002
2500,0.7216,0.727872,0.50012


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 5e-06 Accuracy: 0.50012


Step,Training Loss,Validation Loss,Accuracy
500,0.6961,0.695939,0.5102
1000,0.6934,0.693632,0.51992
1500,0.6943,0.6925,0.5238
2000,0.691,0.691781,0.52716
2500,0.6918,0.691466,0.529


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 5e-05 Accuracy: 0.529


Step,Training Loss,Validation Loss,Accuracy
500,0.6866,0.672584,0.58068
1000,0.6618,0.647147,0.61844
1500,0.6362,0.625056,0.64592
2000,0.61,0.614749,0.65336
2500,0.6121,0.610213,0.66144


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.0005 Accuracy: 0.66144


Step,Training Loss,Validation Loss,Accuracy
500,0.6942,0.636144,0.63804
1000,0.5976,0.579763,0.70292
1500,0.5564,0.538429,0.72788
2000,0.5175,0.525946,0.734
2500,0.5123,0.511693,0.7446


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.005 Accuracy: 0.7446


Step,Training Loss,Validation Loss,Accuracy
500,1.3565,1.105459,0.52208
1000,1.1575,1.40216,0.58732
1500,1.0483,0.947593,0.65152
2000,0.7774,0.830967,0.59012
2500,0.5963,0.548808,0.72536


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.05 Accuracy: 0.72536


Step,Training Loss,Validation Loss,Accuracy
500,332.4566,465.178802,0.49392
1000,394.1323,418.406433,0.5062
1500,266.9355,135.760727,0.50988
2000,107.9983,59.493389,0.50916
2500,34.748,4.669114,0.502


Learning rate: 0.5 Accuracy: 0.502


---
# Hyperparameter search – Second option

* Hyperparameter search using [Optuna](https://optuna.org/)

In [8]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [9]:
import optuna

def objective(trial):
    """Train one SimpleCNN with hyperparameters suggested by `trial`.

    Two hyperparameters are searched: the learning rate (log scale between
    5e-4 and 5e-2) and the number of convolution filters (10, 16 or 24).
    Returns the 'eval_accuracy' obtained on the test split, which is the
    value the study optimizes.
    """
    # Ask the trial for this run's hyperparameter values
    lr = trial.suggest_float("learning_rate", 5e-4, 5e-2, log=True)
    n_filters = trial.suggest_categorical("num_filters", [10, 16, 24])

    # Build a fresh model; only the number of filters varies between trials
    label_count = len(set(dataset['train']['label']))
    model = SimpleCNN(
        BasicConfig(
            vocab_size=tokenizer.vocab_size,
            num_labels=label_count,
            embedding_dim=64,
            filter_size=3,
            num_filters=n_filters,  # <--- parameter goes here
        )
    )

    # Training setup; only the learning rate varies between trials
    trainer_args = transformers.TrainingArguments(
        "checkpoints",
        evaluation_strategy="steps",
        eval_steps=500,
        logging_strategy="steps",
        logging_steps=500,
        load_best_model_at_end=True,
        learning_rate=lr,  # <--- parameter goes here
        per_device_train_batch_size=8,
        max_steps=2500,
    )

    run_callbacks = [
        transformers.EarlyStoppingCallback(early_stopping_patience=5),
        LogSavingCallback(),
    ]
    trainer = transformers.Trainer(
        model=model,
        args=trainer_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=data_collator,
        compute_metrics=compute_accuracy,
        callbacks=run_callbacks,
    )

    # Train, then score the model on the test split
    trainer.train()
    test_accuracy = trainer.evaluate(dataset["test"])['eval_accuracy']
    print('Learning rate:', lr, 'Filters:', n_filters, 'Accuracy:', test_accuracy)
    return test_accuracy



# Create a study whose goal is to maximize the value returned by objective(),
# i.e. the accuracy
study = optuna.create_study(direction="maximize")
# Every trial is one call to objective(), which trains a model from scratch
study.optimize(objective, n_trials=3) # <--- How many trials we run; a real search would need many more!

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy
500,1.4311,1.168275,0.5196
1000,1.1236,0.885249,0.66316
1500,1.0302,1.153721,0.52904
2000,0.775,0.738994,0.67596
2500,0.6064,0.547992,0.72704


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.03658989696731933 Filters: 16 Accuracy: 0.72704


Step,Training Loss,Validation Loss,Accuracy
500,0.6664,0.621111,0.65316
1000,0.5926,0.571317,0.69704
1500,0.5643,0.55062,0.70924
2000,0.5428,0.542512,0.71412
2500,0.5308,0.537103,0.72144


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.0013554067287896206 Filters: 24 Accuracy: 0.72144


Step,Training Loss,Validation Loss,Accuracy
500,0.8083,0.689389,0.63368
1000,0.6922,0.60546,0.706
1500,0.6103,0.661865,0.68876
2000,0.573,0.505944,0.75376
2500,0.4838,0.487471,0.7662


Learning rate: 0.010987437345294407 Filters: 24 Accuracy: 0.7662
