In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [1]:
!nvidia-smi

Fri Apr 28 22:20:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    48W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [22]:
!pip install transformers
!pip install pytorch_lightning==2.0.2
!pip install sentencepiece
!pip install optuna==3.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K

In [9]:
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
import glob
import os
import re
import argparse
from sklearn import metrics
import locale
import gc

import optuna
from optuna.integration import PyTorchLightningPruningCallback
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, DistilBertForSequenceClassification

#torch.set_float32_matmul_precision("medium")

In [4]:
!unzip -qq steam-ds.zip -d steam-ds

In [10]:
train_pos_files = glob.glob('steam-ds/train/pos/*.txt')
train_neg_files = glob.glob('steam-ds/train/pos/*.txt')
print(len(train_pos_files), len(train_neg_files))

24000 24000


#Dataset
* Based on T5 Text classification code (https://huggingface.co/docs/transformers/v4.28.1/en/model_doc/t5#resources) 

In [11]:
class SteamDataset(Dataset):
  def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
    self.pos_file_path = os.path.join(data_dir, type_path, 'pos')
    self.neg_file_path = os.path.join(data_dir, type_path, 'neg')
    
    self.pos_files = glob.glob("%s/*.txt" % self.pos_file_path)
    self.neg_files = glob.glob("%s/*.txt" % self.neg_file_path)
    
    self.max_len = max_len
    self.tokenizer = tokenizer

    self.inputs = []
    self.targets = []

    self._build()
  
  def __len__(self):
    return len(self.inputs)

In [12]:
class SteamT5Dataset(SteamDataset):
  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
    target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

  def _build(self):
    self._build_examples_from_files(self.pos_files, 'positive')
    self._build_examples_from_files(self.neg_files, 'negative')
  
  def _build_examples_from_files(self, files, sentiment):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")

    for path in files:
      with open(path, 'r') as f:
        text = f.read()
      
      line = text.strip()
      line = REPLACE_NO_SPACE.sub("", line) 
      line = line

      target = sentiment

       # tokenize inputs
      tokenized_inputs = self.tokenizer([line], max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
       # tokenize targets
      tokenized_targets = self.tokenizer([target], max_length=2, padding='max_length', return_tensors="pt")

      assert tokenized_inputs['input_ids'].shape[1] == 512, "actual shape was " + str(tokenized_inputs['input_ids'].shape)

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)


In [13]:
def get_dataset(tokenizer, type_path, args):
  return SteamT5Dataset(tokenizer=tokenizer, data_dir = args['data_dir'], type_path = type_path, max_len=args.max_seq_length)

#T5 Transformer
* Also adapted from T5 Text classification code (https://huggingface.co/docs/transformers/v4.28.1/en/model_doc/t5#resources), but heavilly modified to newer versions of Pytorch Lightning and Transformers

In [14]:
class TransformerBase(pl.LightningModule):
  def __init__(self):
    super(TransformerBase, self).__init__()
    self.training_step_outputs = []
    self.validation_step_outputs = []
    self.predictions = []
    self.actual = []
    self.validation_step_acc = []
    self.test_step_acc = []

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams['train_batch_size'], drop_last=True, shuffle=True, num_workers=self.hparams['num_workers'])
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams['eval_batch_size'], num_workers=self.hparams['num_workers'])

  def test_dataloader(self):
    test_dataset = get_dataset(tokenizer=self.tokenizer, type_path="test", args=self.hparams)
    return DataLoader(test_dataset, batch_size=self.hparams['test_batch_size'], num_workers=self.hparams['num_workers'])

In [15]:
class T5Classification(TransformerBase):
  def __init__(self, hparams):
    super(T5Classification, self).__init__()
    self.hparams.update(hparams)
    
    self.model = T5ForConditionalGeneration.from_pretrained(hparams['model_name_or_path'])
    self.tokenizer = AutoTokenizer.from_pretrained(hparams['tokenizer_name_or_path'])

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )
  
  def training_step(self, batch, batch_idx):
    loss = self._step(batch)
    self.training_step_outputs.append(loss)
    self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

    return {"loss": loss}

  def on_training_epoch_start(self):
    self.training_step_outputs = []
  
  def on_training_epoch_end(self):
    avg_train_loss = torch.stack(self.training_step_outputs).mean()
    self.log("avg_train_loss", prog_bar=True)
    
    return {"avg_train_loss": avg_train_loss}

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]
    return loss

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    self.validation_step_outputs.append(loss)
    self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

    #Generate uses pad_token as start token, so remove it after
    outs = self.model.generate(input_ids = batch['source_ids'], attention_mask=batch['source_mask'], max_length=3)
    dec = [self.tokenizer.decode(ids[1:]) for ids in outs]
    target = [self.tokenizer.decode(ids) for ids in batch['target_ids']]
    accuracy = torch.tensor(metrics.accuracy_score(target, dec))

    self.predictions.extend(dec)
    self.actual.extend(target)

    self.log("val_acc", accuracy, on_step=True, on_epoch=True, prog_bar=True)
    self.validation_step_acc.append(accuracy)

    return {"val_loss": loss, "val_acc": accuracy}

  def on_validation_epoch_start(self):
    self.validation_step_outputs = []
    self.validation_step_acc = []
    self.predictions = []
    self.actual = []

  def on_validation_epoch_end(self):
    avg_loss = torch.stack(self.validation_step_outputs).mean()
    avg_acc = torch.stack(self.validation_step_acc).mean()

    f1 = metrics.f1_score(self.actual, self.predictions)

    self.log("val_f1", f1, prog_bar=True)
    self.log("avg_val_loss", avg_loss, prog_bar=True)
    self.log("avg_val_acc", avg_acc, prog_bar=True)

    return {"avg_val_loss": avg_loss, "avg_val_acc": avg_acc, "val_f1": f1}

  def test_step(self, batch, batch_idx):
    #Generate uses pad_token as start token, so remove it after
    outs = self.model.generate(input_ids = batch['source_ids'], attention_mask=batch['source_mask'], max_length=3)
    dec = [self.tokenizer.decode(ids[1:]) for ids in outs]
    target = [self.tokenizer.decode(ids) for ids in batch['target_ids']]
    accuracy = torch.tensor(metrics.accuracy_score(target, dec))

    self.predictions.extend(dec)
    self.actual.extend(target)

    self.log("test_acc", accuracy, on_step=True, prog_bar=True)
    self.test_step_acc.append(accuracy)

    return {"test_acc": accuracy}

  def on_test_epoch_start(self):
    self.test_step_outputs = []
    self.test_step_acc = []
    self.predictions = []
    self.actual = []

  def on_test_epoch_end(self):
    avg_acc = torch.stack(self.test_step_acc).mean()
    f1 = metrics.f1_score(self.actual, self.predictions)
    self.log("test_f1", f1, prog_bar=True)
    self.log("avg_test_acc", avg_acc, prog_bar=True)
    return {"avg_test_acc": avg_acc, "test_f1": f1}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams['weight_decay'],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams['learning_rate'], eps=self.hparams['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.hparams['warmup_steps'], num_training_steps=self.trainer.estimated_stepping_batches
    )
    return [optimizer], scheduler
  
  def lr_scheduler_step(self, scheduler, metric):
    scheduler.step()

In [None]:
args_dict = dict(
    data_dir="steam-ds", # path for data files
    output_dir="t5-steam-ds-sentiment", # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_seq_length=512,
    learning_rate=0.001,
    weight_decay=3e-7,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    test_batch_size=8,
    num_train_epochs=4,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0,
    num_workers=6,
    seed=42,
)

In [None]:
model = T5Classification(args_dict)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
!mkdir -p t5-steam-ds-sentiment

In [None]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args_dict['output_dir'], monitor="avg_val_acc", mode="max", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args_dict['gradient_accumulation_steps'],
    max_epochs = args_dict['num_train_epochs'],
    precision= 16 if args_dict['fp_16'] else 32,
    gradient_clip_val=args_dict['max_grad_norm'],
    enable_checkpointing=True,
    callbacks=[checkpoint_callback]
)
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


In [None]:
trainer.test(model)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_acc_epoch': 0.8538333333333333, 'avg_test_acc': 0.8538333333333333}]

#T5 Hyperparameter Tuning

In [None]:
def objective(trial):

  lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
  weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-2, log=True)
  warmup_steps = trial.suggest_int("warmup_steps", 0, 100, step=10)
  batch_size = trial.suggest_int("batch_size", 4, 16, step=2)
  train_epochs = trial.suggest_int("train_epochs", 1, 4)
  grad_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 10, 30, step=2)

  args_dict = dict(
    data_dir="steam-ds", # path for data files
    output_dir="t5-steam-ds-sentiment", # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_seq_length=512,
    learning_rate=lr,
    weight_decay=weight_decay,
    adam_epsilon=1e-8,
    warmup_steps=warmup_steps,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    test_batch_size=batch_size,
    num_train_epochs=train_epochs,
    gradient_accumulation_steps=grad_accumulation_steps,
    n_gpu=1,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0,
    num_workers=4,
    seed=42,
  )

  checkpoint_callback = pl.callbacks.ModelCheckpoint(
    os.path.join(args_dict["output_dir"],"trial_{}".format(trial.number)), monitor="avg_val_acc", mode="max")
  
  earlystop_callback = pl.callbacks.EarlyStopping(
      monitor="avg_val_acc", mode="max", check_on_train_epoch_end=False, divergence_threshold=0.6, patience=1
    )

  train_params = dict(
    accumulate_grad_batches=args_dict['gradient_accumulation_steps'],
    max_epochs = args_dict['num_train_epochs'],
    precision= 16 if args_dict['fp_16'] else 32,
    gradient_clip_val=args_dict['max_grad_norm'],
    enable_checkpointing=True,
    callbacks=[checkpoint_callback, earlystop_callback],
    limit_train_batches=0.25
  )

  model = T5Classification(args_dict)
  trainer = pl.Trainer(**train_params)

  trainer.fit(model)

  return trainer.callback_metrics["avg_val_acc"]

study = optuna.create_study(direction="maximize", pruner=None)
study.enqueue_trial(
    {"lr": 3e-4, 
     "weight_decay": 1e-8, 
     "warmup_steps": 0, 
     "batch_size": 8, 
     "train_epochs": 2, 
     "gradient_accumulation_steps": 16}
    )

study.optimize(objective, n_trials=25, gc_after_trial=True)

print("Best trial:")
trial = study.best_trial

print("Params:")
for key, value in trial.params.items():
  print("    {}: {}".format(key, value))

[32m[I 2023-04-28 05:53:58,339][0m A new study created in memory with name: no-name-80289401-7289-4b5a-8235-ace58acd2219[0m


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-28 06:00:07,254][0m Trial 0 finished with value: 0.8373333333333334 and parameters: {'lr': 0.0003, 'weight_decay': 1e-08, 'warmup_steps': 0, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 16}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(0.2024), 'train_loss_step': tensor(0.1659), 'val_loss': tensor(0.1813), 'val_loss_epoch': tensor(0.1813), 'val_acc': tensor(0.8373, dtype=torch.float64), 'val_acc_epoch': tensor(0.8373, dtype=torch.float64), 'avg_val_loss': tensor(0.1813), 'avg_val_acc': tensor(0.8373, dtype=torch.float64), 'train_loss_epoch': tensor(0.2024)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-28 06:05:22,626][0m Trial 1 finished with value: 0.0 and parameters: {'lr': 8.93696813766729e-05, 'weight_decay': 6.249812226425759e-08, 'warmup_steps': 10, 'batch_size': 4, 'train_epochs': 1, 'gradient_accumulation_steps': 16}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.3865), 'train_loss_step': tensor(7.9145), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.3865)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:08:31,053][0m Trial 2 finished with value: 0.0 and parameters: {'lr': 0.0002436969132857979, 'weight_decay': 0.0016363700286995792, 'warmup_steps': 100, 'batch_size': 10, 'train_epochs': 2, 'gradient_accumulation_steps': 18}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4171), 'train_loss_step': tensor(9.4114), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4171)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:11:37,795][0m Trial 3 finished with value: 0.0 and parameters: {'lr': 0.0003047700148634345, 'weight_decay': 2.167019255120676e-06, 'warmup_steps': 100, 'batch_size': 10, 'train_epochs': 3, 'gradient_accumulation_steps': 20}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4321), 'train_loss_step': tensor(9.9953), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4321)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:15:39,460][0m Trial 4 finished with value: 0.0 and parameters: {'lr': 0.0003976085668719768, 'weight_decay': 3.794834949339979e-06, 'warmup_steps': 70, 'batch_size': 6, 'train_epochs': 3, 'gradient_accumulation_steps': 28}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4042), 'train_loss_step': tensor(8.8060), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4042)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:18:47,223][0m Trial 5 finished with value: 0.0 and parameters: {'lr': 1.1373255474129947e-05, 'weight_decay': 4.091894895749748e-08, 'warmup_steps': 60, 'batch_size': 10, 'train_epochs': 4, 'gradient_accumulation_steps': 18}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4558), 'train_loss_step': tensor(10.4809), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4558)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-28 06:21:54,855][0m Trial 6 finished with value: 0.0 and parameters: {'lr': 0.0002907669609615392, 'weight_decay': 5.7103490339448833e-08, 'warmup_steps': 30, 'batch_size': 10, 'train_epochs': 1, 'gradient_accumulation_steps': 10}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.3919), 'train_loss_step': tensor(11.0528), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.3919)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-28 06:24:40,340][0m Trial 7 finished with value: 0.0 and parameters: {'lr': 6.046470320453397e-05, 'weight_decay': 0.004634309226262638, 'warmup_steps': 70, 'batch_size': 16, 'train_epochs': 1, 'gradient_accumulation_steps': 22}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4499), 'train_loss_step': tensor(9.6051), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4499)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:27:25,047][0m Trial 8 finished with value: 0.0 and parameters: {'lr': 0.008296347018292958, 'weight_decay': 2.7758913583460726e-07, 'warmup_steps': 20, 'batch_size': 16, 'train_epochs': 2, 'gradient_accumulation_steps': 30}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4195), 'train_loss_step': tensor(9.8825), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4195)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:30:17,335][0m Trial 9 finished with value: 0.0 and parameters: {'lr': 1.1111633560048988e-05, 'weight_decay': 0.00015789983988414053, 'warmup_steps': 80, 'batch_size': 14, 'train_epochs': 2, 'gradient_accumulation_steps': 22}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(9.4507), 'train_loss_step': tensor(9.4188), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5353), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4507)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:40:42,613][0m Trial 10 finished with value: 0.8206666666666668 and parameters: {'lr': 0.002026396893622944, 'weight_decay': 1.0989470574195546e-08, 'warmup_steps': 0, 'batch_size': 6, 'train_epochs': 4, 'gradient_accumulation_steps': 12}. Best is trial 0 with value: 0.8373333333333334.[0m


{'train_loss': tensor(0.1805), 'train_loss_step': tensor(0.3041), 'val_loss': tensor(0.1923), 'val_loss_epoch': tensor(0.1923), 'val_acc': tensor(0.8207, dtype=torch.float64), 'val_acc_epoch': tensor(0.8207, dtype=torch.float64), 'avg_val_loss': tensor(0.1923), 'avg_val_acc': tensor(0.8207, dtype=torch.float64), 'train_loss_epoch': tensor(0.1805)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.
[32m[I 2023-04-28 06:54:47,767][0m Trial 11 finished with value: 0.8518333333333334 and parameters: {'lr': 0.0013441512255227151, 'weight_decay': 1.0563432528643968e-08, 'warmup_steps': 0, 'batch_size': 6, 'train_epochs': 4, 'gradient_accumulation_steps': 12}. Best is trial 11 with value: 0.8518333333333334.[0m


{'train_loss': tensor(0.1703), 'train_loss_step': tensor(0.2216), 'val_loss': tensor(0.1667), 'val_loss_epoch': tensor(0.1667), 'val_acc': tensor(0.8518, dtype=torch.float64), 'val_acc_epoch': tensor(0.8518, dtype=torch.float64), 'avg_val_loss': tensor(0.1667), 'avg_val_acc': tensor(0.8518, dtype=torch.float64), 'train_loss_epoch': tensor(0.1703)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 06:58:57,797][0m Trial 12 finished with value: 0.0 and parameters: {'lr': 0.0013488419786428433, 'weight_decay': 1.2855330083960802e-08, 'warmup_steps': 40, 'batch_size': 6, 'train_epochs': 3, 'gradient_accumulation_steps': 14}. Best is trial 11 with value: 0.8518333333333334.[0m


{'train_loss': tensor(9.4165), 'train_loss_step': tensor(7.8355), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4165)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.
[32m[I 2023-04-28 07:10:59,901][0m Trial 13 finished with value: 0.8525 and parameters: {'lr': 0.0010260066116536213, 'weight_decay': 3.350496690379203e-07, 'warmup_steps': 0, 'batch_size': 8, 'train_epochs': 4, 'gradient_accumulation_steps': 10}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(0.1685), 'train_loss_step': tensor(0.1180), 'val_loss': tensor(0.1705), 'val_loss_epoch': tensor(0.1705), 'val_acc': tensor(0.8525, dtype=torch.float64), 'val_acc_epoch': tensor(0.8525, dtype=torch.float64), 'avg_val_loss': tensor(0.1705), 'avg_val_acc': tensor(0.8525, dtype=torch.float64), 'train_loss_epoch': tensor(0.1685)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:16:21,621][0m Trial 14 finished with value: 0.0 and parameters: {'lr': 0.0016533634307510064, 'weight_decay': 4.272362693753768e-07, 'warmup_steps': 20, 'batch_size': 4, 'train_epochs': 4, 'gradient_accumulation_steps': 10}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4552), 'train_loss_step': tensor(8.9354), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4552)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:19:58,774][0m Trial 15 finished with value: 0.0 and parameters: {'lr': 0.004843848267874706, 'weight_decay': 4.82023147829043e-07, 'warmup_steps': 40, 'batch_size': 8, 'train_epochs': 4, 'gradient_accumulation_steps': 12}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4535), 'train_loss_step': tensor(9.9538), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4535)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
[32m[I 2023-04-28 07:27:34,844][0m Trial 16 finished with value: 0.8408333333333333 and parameters: {'lr': 0.003690390941588992, 'weight_decay': 3.615437246378056e-05, 'warmup_steps': 0, 'batch_size': 12, 'train_epochs': 3, 'gradient_accumulation_steps': 10}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(0.1953), 'train_loss_step': tensor(0.1218), 'val_loss': tensor(0.1813), 'val_loss_epoch': tensor(0.1813), 'val_acc': tensor(0.8408, dtype=torch.float64), 'val_acc_epoch': tensor(0.8408, dtype=torch.float64), 'avg_val_loss': tensor(0.1813), 'avg_val_acc': tensor(0.8408, dtype=torch.float64), 'train_loss_epoch': tensor(0.1953)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:31:10,416][0m Trial 17 finished with value: 0.0 and parameters: {'lr': 0.0009775012421560424, 'weight_decay': 1.736088262847559e-07, 'warmup_steps': 20, 'batch_size': 8, 'train_epochs': 4, 'gradient_accumulation_steps': 14}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4174), 'train_loss_step': tensor(9.0276), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4174)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:35:16,656][0m Trial 18 finished with value: 0.0 and parameters: {'lr': 0.0007571764772753644, 'weight_decay': 9.842824050493262e-07, 'warmup_steps': 10, 'batch_size': 6, 'train_epochs': 4, 'gradient_accumulation_steps': 24}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.3862), 'train_loss_step': tensor(7.9409), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.3862)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:38:19,048][0m Trial 19 finished with value: 0.0 and parameters: {'lr': 0.0029127675414518713, 'weight_decay': 6.1943836024871676e-06, 'warmup_steps': 40, 'batch_size': 12, 'train_epochs': 3, 'gradient_accumulation_steps': 14}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4578), 'train_loss_step': tensor(9.4256), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4578)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:43:40,836][0m Trial 20 finished with value: 0.0 and parameters: {'lr': 0.007838581868397244, 'weight_decay': 1.1555496778957182e-07, 'warmup_steps': 10, 'batch_size': 4, 'train_epochs': 4, 'gradient_accumulation_steps': 12}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4622), 'train_loss_step': tensor(10.5139), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4622)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 07:48:56,915][0m Trial 21 finished with value: 0.8015 and parameters: {'lr': 0.003136929325222933, 'weight_decay': 1.8813351531865904e-05, 'warmup_steps': 0, 'batch_size': 12, 'train_epochs': 3, 'gradient_accumulation_steps': 10}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(0.2079), 'train_loss_step': tensor(0.2103), 'val_loss': tensor(0.2195), 'val_loss_epoch': tensor(0.2195), 'val_acc': tensor(0.8015, dtype=torch.float64), 'val_acc_epoch': tensor(0.8015, dtype=torch.float64), 'avg_val_loss': tensor(0.2195), 'avg_val_acc': tensor(0.8015, dtype=torch.float64), 'train_loss_epoch': tensor(0.2079)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
[32m[I 2023-04-28 07:56:31,417][0m Trial 22 finished with value: 0.8116666666666668 and parameters: {'lr': 0.004185819388967604, 'weight_decay': 1.7745857264498175e-05, 'warmup_steps': 0, 'batch_size': 12, 'train_epochs': 3, 'gradient_accumulation_steps': 10}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(0.2011), 'train_loss_step': tensor(0.1666), 'val_loss': tensor(0.1991), 'val_loss_epoch': tensor(0.1991), 'val_acc': tensor(0.8117, dtype=torch.float64), 'val_acc_epoch': tensor(0.8117, dtype=torch.float64), 'avg_val_loss': tensor(0.1991), 'avg_val_acc': tensor(0.8117, dtype=torch.float64), 'train_loss_epoch': tensor(0.2011)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 08:00:06,348][0m Trial 23 finished with value: 0.0 and parameters: {'lr': 0.0023198748736610675, 'weight_decay': 1.3430953898712854e-06, 'warmup_steps': 10, 'batch_size': 8, 'train_epochs': 4, 'gradient_accumulation_steps': 12}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4613), 'train_loss_step': tensor(7.8130), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5325), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4613)}


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type              

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 08:03:02,817][0m Trial 24 finished with value: 0.0 and parameters: {'lr': 0.0008179805550911611, 'weight_decay': 5.654790795188224e-05, 'warmup_steps': 30, 'batch_size': 14, 'train_epochs': 3, 'gradient_accumulation_steps': 16}. Best is trial 13 with value: 0.8525.[0m


{'train_loss': tensor(9.4500), 'train_loss_step': tensor(10.0912), 'val_loss': tensor(9.5325), 'val_loss_epoch': tensor(9.5325), 'val_acc': tensor(0., dtype=torch.float64), 'val_acc_epoch': tensor(0., dtype=torch.float64), 'avg_val_loss': tensor(9.5353), 'avg_val_acc': tensor(0., dtype=torch.float64), 'train_loss_epoch': tensor(9.4500)}
Best trial:
Params:
    lr: 0.0010260066116536213
    weight_decay: 3.350496690379203e-07
    warmup_steps: 0
    batch_size: 8
    train_epochs: 4
    gradient_accumulation_steps: 10


#DistilBERT

In [16]:
class SteamBERTDataset(SteamDataset):
  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze

    target = self.targets[index]

    return {"source_ids": source_ids, "source_mask": src_mask, "labels": target}

  def _build(self):
    self._build_examples_from_files(self.pos_files, 'positive')
    self._build_examples_from_files(self.neg_files, 'negative')

    self.targets = torch.stack(self.targets).squeeze()
  
  def _build_examples_from_files(self, files, sentiment):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")

    for path in files:
      with open(path, 'r') as f:
        text = f.read()
      
      line = text.strip()
      line = REPLACE_NO_SPACE.sub("", line) 
      line = line

      # tokenize inputs
      tokenized_inputs = self.tokenizer([line], max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt")
      assert tokenized_inputs['input_ids'].shape[1] == 512, "actual shape was " + str(tokenized_inputs['input_ids'].shape)

      if (sentiment == 'positive'):
        target = torch.ones(1)
      else:
        target = torch.zeros(1)

      self.inputs.append(tokenized_inputs)
      self.targets.append(target)


In [17]:
def get_dataset(tokenizer, type_path, args):
  return SteamBERTDataset(tokenizer=tokenizer, data_dir = args['data_dir'], type_path = type_path, max_len=args["max_seq_length"])

In [24]:
class DistilBertClassification(TransformerBase):
  def __init__(self, hparams):
    super(DistilBertClassification, self).__init__()
    self.hparams.update(hparams)
    
    self.model = DistilBertForSequenceClassification.from_pretrained(hparams['model_name_or_path'], num_labels=2)
    self.tokenizer = AutoTokenizer.from_pretrained(hparams['tokenizer_name_or_path'])

  def forward(
      self, input_ids, attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    one_hot = torch.nn.functional.one_hot(batch["labels"].long(), num_classes=2).float()
    one_hot = one_hot.to(self.device)
    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=one_hot
    )

    loss = outputs[0]
    logits = outputs[1]
    return loss, logits

  def training_step(self, batch, batch_idx):
    loss, logits = self._step(batch)
    self.training_step_outputs.append(loss)
    self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

    return {"loss": loss}

  def on_training_epoch_start(self):
    self.training_step_outputs = []
  
  def on_training_epoch_end(self):
    avg_train_loss = torch.stack(self.training_step_outputs).mean()
    self.log("avg_train_loss", prog_bar=True)
    return {"avg_train_loss": avg_train_loss}

  def validation_step(self, batch, batch_idx):
    loss, logits = self._step(batch)
    self.validation_step_outputs.append(loss)
    self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)

    pred = torch.argmax(logits, -1).to('cpu')
    target = batch['labels'].to('cpu')
    accuracy = torch.tensor(metrics.accuracy_score(pred, target))

    self.predictions.extend(pred)
    self.actual.extend(target)

    self.log("val_acc", accuracy, on_step=True, on_epoch=True, prog_bar=True)
    self.validation_step_acc.append(accuracy)

    return {"val_loss": loss, "val_acc": accuracy}

  def on_validation_epoch_start(self):
    self.validation_step_outputs = []
    self.validation_step_acc = []
    self.predictions = []
    self.actual = []

  def on_validation_epoch_end(self):
    avg_loss = torch.stack(self.validation_step_outputs).mean()
    avg_acc = torch.stack(self.validation_step_acc).mean()

    f1 = metrics.f1_score(self.actual, self.predictions)
    self.log("val_f1", f1, prog_bar=True)

    self.log("avg_val_loss", avg_loss, prog_bar=True)
    self.log("avg_val_acc", avg_acc, prog_bar=True)

    return {"avg_val_loss": avg_loss, "avg_val_acc": avg_acc, "val_f1": f1}

  def test_step(self, batch, batch_idx):
    loss, logits = self._step(batch)
    pred = torch.argmax(logits, -1).to('cpu')
    target = batch['labels'].to('cpu')
    accuracy = torch.tensor(metrics.accuracy_score(pred, target))

    self.predictions.extend(pred)
    self.actual.extend(target)

    self.log("test_acc", accuracy, on_step=True, prog_bar=True)
    self.test_step_acc.append(accuracy)

    return {"test_acc": accuracy}

  def on_test_epoch_start(self):
    self.validation_step_outputs = []
    self.validation_step_acc = []
    self.predictions = []
    self.actual = []

  def on_test_epoch_end(self):
    avg_acc = torch.stack(self.test_step_acc).mean()
    f1 = metrics.f1_score(self.actual, self.predictions)
    self.log("test_f1", f1, prog_bar=True)
    self.log("avg_test_acc", avg_acc, prog_bar=True)
    return {"avg_test_acc": avg_acc, "test_f1": f1}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams['weight_decay'],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams['learning_rate'], eps=self.hparams['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.hparams['warmup_steps'], num_training_steps=self.trainer.estimated_stepping_batches
    )
    return [optimizer], scheduler
  
  def lr_scheduler_step(self, scheduler, metric):
    scheduler.step()

In [27]:
args_dict = dict(
    data_dir="steam-ds", # path for data files
    output_dir="distilbert-steam-ds-sentiment", # path to save the checkpoints
    model_name_or_path="distilbert-base-uncased",
    tokenizer_name_or_path="distilbert-base-uncased",
    max_seq_length=512,
    learning_rate=1e-4,
    weight_decay=2.4e-8,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=6,
    eval_batch_size=6,
    test_batch_size=6,
    num_train_epochs=2,
    gradient_accumulation_steps=14,
    n_gpu=1,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0,
    num_workers=4,
    seed=42,
  )

In [28]:
model = DistilBertClassification(args_dict)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [16]:
!mkdir -p distilbert-steam-ds-sentiment

In [29]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args_dict['output_dir'], monitor="avg_val_acc", mode="max", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args_dict['gradient_accumulation_steps'],
    max_epochs = args_dict['num_train_epochs'],
    precision= 16 if args_dict['fp_16'] else 32,
    gradient_clip_val=args_dict['max_grad_norm'],
    enable_checkpointing=True,
    callbacks=[checkpoint_callback],
)
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [30]:
trainer.fit(model)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                                | Params
--------------------------------------------------------------
0 | model | DistilBertForSequenceClassification | 67.0 M
--------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [31]:
trainer.test(model)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_acc_epoch': 0.8596666666666667,
  'test_f1': 0.8588669125041904,
  'avg_test_acc': 0.8596666666666667}]

In [20]:
def objective(trial):
  lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
  weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-2, log=True)
  warmup_steps = trial.suggest_int("warmup_steps", 0, 100, step=10)
  batch_size = trial.suggest_int("batch_size", 4, 16, step=2)
  train_epochs = trial.suggest_int("train_epochs", 1, 4)
  grad_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 10, 30, step=2)

  args_dict = dict(
    data_dir="steam-ds", # path for data files
    output_dir="distilbert-steam-ds-sentiment", # path to save the checkpoints
    model_name_or_path="distilbert-base-uncased",
    tokenizer_name_or_path="distilbert-base-uncased",
    max_seq_length=512,
    learning_rate=lr,
    weight_decay=weight_decay,
    adam_epsilon=1e-8,
    warmup_steps=warmup_steps,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    test_batch_size=batch_size,
    num_train_epochs=train_epochs,
    gradient_accumulation_steps=grad_accumulation_steps,
    n_gpu=1,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0,
    num_workers=4,
    seed=42,
  )

  checkpoint_callback = pl.callbacks.ModelCheckpoint(
    os.path.join(args_dict["output_dir"],"trial_{}".format(trial.number)), monitor="avg_val_acc", mode="max")
  
  earlystop_callback = pl.callbacks.EarlyStopping(
      monitor="avg_val_acc", mode="max", check_on_train_epoch_end=False, divergence_threshold=0.6, patience=1
    )
  
  train_params = dict(
    accumulate_grad_batches=args_dict['gradient_accumulation_steps'],
    max_epochs = args_dict['num_train_epochs'],
    precision= 16 if args_dict['fp_16'] else 32,
    gradient_clip_val=args_dict['max_grad_norm'],
    enable_checkpointing=True,
    callbacks=[checkpoint_callback, earlystop_callback],
  )

  model = DistilBertClassification(args_dict)
  trainer = pl.Trainer(**train_params)

  trainer.fit(model)
  return trainer.callback_metrics["avg_val_acc"]

torch.set_float32_matmul_precision('medium')
study = optuna.create_study(direction="maximize", pruner=None)
study.enqueue_trial(
    {"lr": 3e-4, 
     "weight_decay": 1e-8, 
     "warmup_steps": 0, 
     "batch_size": 8, 
     "train_epochs": 2, 
     "gradient_accumulation_steps": 16}
    )

study.optimize(objective, n_trials=25, gc_after_trial=True)

print("Best trial:")
trial = study.best_trial

print("Params:")
for key, value in trial.params.items():
  print("    {}: {}".format(key, value))

[32m[I 2023-04-28 22:54:43,831][0m A new study created in memory with name: no-name-c845d29d-733f-4550-b7f8-7c08f6d03cdf[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-28 23:02:28,148][0m Trial 0 finished with value: 0.836 and parameters: {'lr': 0.0003, 'weight_decay': 1e-08, 'warmup_steps': 0, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 16}. Best is trial 0 with value: 0.836.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:08:38,364][0m Trial 1 finished with value: 0.5 and parameters: {'lr': 1.3298002711780248e-05, 'weight_decay': 3.31200144079998e-06, 'warmup_steps': 70, 'batch_size': 4, 'train_epochs': 4, 'gradient_accumulation_steps': 12}. Best is trial 0 with value: 0.836.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Bert

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:12:40,061][0m Trial 2 finished with value: 0.498 and parameters: {'lr': 3.607242457307323e-05, 'weight_decay': 1.2934583288066039e-05, 'warmup_steps': 20, 'batch_size': 12, 'train_epochs': 3, 'gradient_accumulation_steps': 16}. Best is trial 0 with value: 0.836.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:17:35,431][0m Trial 3 finished with value: 0.503 and parameters: {'lr': 0.00011778127111264571, 'weight_decay': 3.7076964354474197e-06, 'warmup_steps': 20, 'batch_size': 6, 'train_epochs': 3, 'gradient_accumulation_steps': 24}. Best is trial 0 with value: 0.836.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-28 23:21:37,687][0m Trial 4 finished with value: 0.4977938727938728 and parameters: {'lr': 1.2697716732790216e-05, 'weight_decay': 2.3698021388921193e-07, 'warmup_steps': 10, 'batch_size': 14, 'train_epochs': 1, 'gradient_accumulation_steps': 16}. Best is trial 0 with value: 0.836.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequen

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-28 23:29:35,960][0m Trial 5 finished with value: 0.8551666666666666 and parameters: {'lr': 0.0003169290772186897, 'weight_decay': 1.9547180843018065e-05, 'warmup_steps': 60, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 14}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBe

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:33:38,048][0m Trial 6 finished with value: 0.4905 and parameters: {'lr': 3.921903289238671e-05, 'weight_decay': 7.39737109599311e-05, 'warmup_steps': 30, 'batch_size': 12, 'train_epochs': 4, 'gradient_accumulation_steps': 20}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (ini

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:39:43,736][0m Trial 7 finished with value: 0.5268333333333334 and parameters: {'lr': 0.0008843333729455197, 'weight_decay': 2.5457424696403123e-06, 'warmup_steps': 60, 'batch_size': 4, 'train_epochs': 2, 'gradient_accumulation_steps': 30}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly i

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:44:03,425][0m Trial 8 finished with value: 0.5098333333333334 and parameters: {'lr': 0.00010530166560071933, 'weight_decay': 2.127797712275022e-08, 'warmup_steps': 30, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 20}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly i

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:49:00,126][0m Trial 9 finished with value: 0.5001666666666666 and parameters: {'lr': 1.1964017054301105e-05, 'weight_decay': 1.465848373114684e-05, 'warmup_steps': 100, 'batch_size': 6, 'train_epochs': 3, 'gradient_accumulation_steps': 16}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-28 23:52:55,955][0m Trial 10 finished with value: 0.49666666666666665 and parameters: {'lr': 0.0004017874977674611, 'weight_decay': 7.874075007501666e-05, 'warmup_steps': 90, 'batch_size': 16, 'train_epochs': 1, 'gradient_accumulation_steps': 10}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Distil

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-28 23:57:06,959][0m Trial 11 finished with value: 0.5006666666666666 and parameters: {'lr': 0.0002874645961354959, 'weight_decay': 0.0007403748562794647, 'warmup_steps': 50, 'batch_size': 10, 'train_epochs': 2, 'gradient_accumulation_steps': 14}. Best is trial 5 with value: 0.8551666666666666.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-29 00:05:03,743][0m Trial 12 finished with value: 0.8596666666666667 and parameters: {'lr': 0.00027151325393454985, 'weight_decay': 1.0059032825767392e-08, 'warmup_steps': 0, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 24}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Distil

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-29 00:09:25,936][0m Trial 13 finished with value: 0.5001666666666666 and parameters: {'lr': 0.0001834505146551348, 'weight_decay': 1.6675320393268466e-07, 'warmup_steps': 50, 'batch_size': 8, 'train_epochs': 1, 'gradient_accumulation_steps': 26}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Distil

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-29 00:13:42,016][0m Trial 14 finished with value: 0.49866666666666676 and parameters: {'lr': 0.0006489487310164763, 'weight_decay': 1.9711475648798274e-07, 'warmup_steps': 80, 'batch_size': 10, 'train_epochs': 2, 'gradient_accumulation_steps': 24}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exact

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-29 00:18:39,695][0m Trial 15 finished with value: 0.42216666666666663 and parameters: {'lr': 0.0005134481190232774, 'weight_decay': 5.957902325620379e-08, 'warmup_steps': 40, 'batch_size': 6, 'train_epochs': 3, 'gradient_accumulation_steps': 30}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-29 00:22:43,573][0m Trial 16 finished with value: 0.4854999999999999 and parameters: {'lr': 0.00022677539104451382, 'weight_decay': 6.442473880398936e-07, 'warmup_steps': 70, 'batch_size': 12, 'train_epochs': 1, 'gradient_accumulation_steps': 22}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Disti

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-29 00:31:00,656][0m Trial 17 finished with value: 0.8206666666666667 and parameters: {'lr': 0.0004881628151818152, 'weight_decay': 1.0288643545576478e-08, 'warmup_steps': 0, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 26}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilB

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-29 00:35:19,473][0m Trial 18 finished with value: 0.5026666666666667 and parameters: {'lr': 0.0009895705119245674, 'weight_decay': 5.713909620644664e-08, 'warmup_steps': 60, 'batch_size': 10, 'train_epochs': 3, 'gradient_accumulation_steps': 10}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
[32m[I 2023-04-29 00:40:21,903][0m Trial 19 finished with value: 0.5006666666666667 and parameters: {'lr': 0.0001891735345740964, 'weight_decay': 1.4871516804352527e-06, 'warmup_steps': 40, 'batch_size': 6, 'train_epochs': 1, 'gradient_accumulation_steps': 18}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Distil

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-29 00:44:34,033][0m Trial 20 finished with value: 0.5043333333333334 and parameters: {'lr': 0.0003243203408050634, 'weight_decay': 6.297679973767024e-07, 'warmup_steps': 80, 'batch_size': 10, 'train_epochs': 2, 'gradient_accumulation_steps': 28}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-29 00:52:39,529][0m Trial 21 finished with value: 0.8393333333333334 and parameters: {'lr': 0.0003121756399223699, 'weight_decay': 2.32314617859495e-08, 'warmup_steps': 0, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 14}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBer

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-29 00:57:12,685][0m Trial 22 finished with value: 0.5041666666666667 and parameters: {'lr': 0.00040604939437149267, 'weight_decay': 3.8432423580763265e-08, 'warmup_steps': 10, 'batch_size': 8, 'train_epochs': 2, 'gradient_accumulation_steps': 12}. Best is trial 12 with value: 0.8596666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactl

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.
[32m[I 2023-04-29 01:06:42,281][0m Trial 23 finished with value: 0.8616666666666667 and parameters: {'lr': 0.00015244618212887392, 'weight_decay': 2.4120724200537892e-08, 'warmup_steps': 0, 'batch_size': 6, 'train_epochs': 2, 'gradient_accumulation_steps': 14}. Best is trial 23 with value: 0.8616666666666667.[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Distil

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[32m[I 2023-04-29 01:13:12,646][0m Trial 24 finished with value: 0.477 and parameters: {'lr': 0.00015034407643404617, 'weight_decay': 1.0927143784594459e-07, 'warmup_steps': 10, 'batch_size': 4, 'train_epochs': 3, 'gradient_accumulation_steps': 18}. Best is trial 23 with value: 0.8616666666666667.[0m


Best trial:
Params:
    lr: 0.00015244618212887392
    weight_decay: 2.4120724200537892e-08
    warmup_steps: 0
    batch_size: 6
    train_epochs: 2
    gradient_accumulation_steps: 14
