In [1]:
# Import Packages

import datasets
import torch
import optuna

from datetime import datetime
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments

In [2]:
torch.cuda.is_available()

True

# Load the dataset

In [3]:
dataset = datasets.load_from_disk("data")
dataset

DatasetDict({
    train: Dataset({
        features: ['selftext', 'label'],
        num_rows: 3984
    })
    test: Dataset({
        features: ['selftext', 'label'],
        num_rows: 498
    })
    valid: Dataset({
        features: ['selftext', 'label'],
        num_rows: 498
    })
})

# Prepare Model and Dataset

In [4]:
# Define 4096 as our maximum sentence length

MAX_LEN = 4096

In [5]:
# Import tokenizer

tokenizer = LongformerTokenizer.from_pretrained('AIMH/mental-longformer-base-4096', max_length = MAX_LEN)



In [6]:
# Define tokenization function

def tokenization(text):
    return tokenizer(text["selftext"], padding='max_length', truncation=True, max_length=MAX_LEN)

In [7]:
# Tokenize data

dataset["train"] = dataset["train"].map(tokenization)
dataset["valid"] = dataset["valid"].map(tokenization)
dataset["test"] = dataset["test"].map(tokenization)

In [8]:
# Sanity check: make sure our tokenization follows our max sentence length

len(dataset["valid"][4]["input_ids"])

4096

In [9]:
# Convert to Pytorch Tensor

dataset["train"].set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
dataset["valid"].set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
dataset["test"].set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Setting Up Hyperparameter Search

We will use HuggingFace Trainer API's [built-in hyperparameter testing feature](https://huggingface.co/docs/transformers/hpo_train) using Optuna backend

In [10]:
# Set up hyperparameter search space, compute metrics, and compute objective

def optuna_hp_space(trial: optuna.Trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-6, 5e-3, log=True),
        "warmup_steps": trial.suggest_int("warmup_steps", 50, 250, step=25),
        "weight_decay": trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True),
    }

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, weighted_f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    macro_f1 = f1_score(labels, preds, average="macro")
    class_f1 = f1_score(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'macro_f1': macro_f1,
        # 'accuracy': acc,
        # 'weighted_f1': weighted_f1,
        # 'class_f1': class_f1,
        # 'precision': precision,
        # 'recall': recall
    }

def objective(metrics):
    print("Metrics:", metrics)
    return metrics["eval_macro_f1"]

In [11]:
# Set up other training arguments

training_args = TrainingArguments(
    output_dir = "frames",
    num_train_epochs = 5,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 8,
    per_device_eval_batch_size = 16,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = True, 
    load_best_model_at_end=True,
    logging_strategy = "epoch",
    fp16 = True,
    dataloader_num_workers = 0,
)

In [12]:
# Set up model init, trainer, and other training arguments

def model_init(trial):
    return LongformerForSequenceClassification.from_pretrained(
        'AIMH/mental-longformer-base-4096', 
        gradient_checkpointing=True, 
        attention_window=512, 
        num_labels=6,
    )

trainer = Trainer(
    model=None,
    model_init=model_init,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = dataset["train"],
    eval_dataset = dataset["valid"],
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Hyperparameter search

best_trial = trainer.hyperparameter_search(
    # compute_objective=objective,
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=20,
    study_name="MentalLongformer-" + datetime.now().strftime("%d-%m-%Y-%H-%M-%S"),
    storage="sqlite:///optuna/optuna.db",
    sampler=optuna.samplers.RandomSampler(),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=5, n_warmup_steps=124, interval_steps=24
    ),
)

[I 2024-10-16 22:09:38,565] A new study created in RDB with name: MentalLongformer-16-10-2024-22-09-37
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Initializing global attention on CLS token...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.722, 'grad_norm': 1.5232858657836914, 'learning_rate': 0.0010899623215575313, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.7097824811935425, 'eval_macro_f1': 0.07067510548523206, 'eval_runtime': 81.4369, 'eval_samples_per_second': 6.115, 'eval_steps_per_second': 0.393, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6986, 'grad_norm': 0.8284668326377869, 'learning_rate': 0.0008158269288116735, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.698832392692566, 'eval_macro_f1': 0.07355242566510171, 'eval_runtime': 81.0925, 'eval_samples_per_second': 6.141, 'eval_steps_per_second': 0.395, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.7096, 'grad_norm': 0.9869685173034668, 'learning_rate': 0.0005438846192077823, 'epoch': 2.995983935742972}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6851096153259277, 'eval_macro_f1': 0.07355242566510171, 'eval_runtime': 81.5178, 'eval_samples_per_second': 6.109, 'eval_steps_per_second': 0.393, 'epoch': 2.995983935742972}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6839, 'grad_norm': 0.7177553772926331, 'learning_rate': 0.00026974922646192425, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6852312088012695, 'eval_macro_f1': 0.07067510548523206, 'eval_runtime': 81.1214, 'eval_samples_per_second': 6.139, 'eval_steps_per_second': 0.394, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6841, 'grad_norm': 0.9751347303390503, 'learning_rate': 2.193083141966864e-06, 'epoch': 4.979919678714859}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.6801031827926636, 'eval_macro_f1': 0.07355242566510171, 'eval_runtime': 81.3826, 'eval_samples_per_second': 6.119, 'eval_steps_per_second': 0.393, 'epoch': 4.979919678714859}


[I 2024-10-17 03:14:04,293] Trial 0 finished with value: 0.07355242566510171 and parameters: {'learning_rate': 0.0011952303123719408, 'warmup_steps': 75, 'weight_decay': 0.00019466016011908684}. Best is trial 0 with value: 0.07355242566510171.


{'train_runtime': 18262.8665, 'train_samples_per_second': 1.091, 'train_steps_per_second': 0.034, 'train_loss': 1.6996839954007057, 'epoch': 4.979919678714859}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6156, 'grad_norm': 7.629766941070557, 'learning_rate': 3.100281674601555e-05, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.327921986579895, 'eval_macro_f1': 0.40332625612333306, 'eval_runtime': 81.9311, 'eval_samples_per_second': 6.078, 'eval_steps_per_second': 0.391, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2074, 'grad_norm': 6.547703742980957, 'learning_rate': 3.727590464351013e-05, 'epoch': 2.0}
{'eval_loss': 1.2164682149887085, 'eval_macro_f1': 0.482433233955098, 'eval_runtime': 81.6335, 'eval_samples_per_second': 6.1, 'eval_steps_per_second': 0.392, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.9287, 'grad_norm': 7.6176605224609375, 'learning_rate': 2.4883914896069764e-05, 'epoch': 2.995983935742972}
{'eval_loss': 1.2769542932510376, 'eval_macro_f1': 0.5121330387825234, 'eval_runtime': 81.5826, 'eval_samples_per_second': 6.104, 'eval_steps_per_second': 0.392, 'epoch': 2.995983935742972}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.6301, 'grad_norm': 10.97100830078125, 'learning_rate': 1.2491925148629401e-05, 'epoch': 4.0}
{'eval_loss': 1.4182852506637573, 'eval_macro_f1': 0.5004934566361768, 'eval_runtime': 81.6346, 'eval_samples_per_second': 6.1, 'eval_steps_per_second': 0.392, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.4138, 'grad_norm': 11.014628410339355, 'learning_rate': 2.9980620356710563e-07, 'epoch': 4.979919678714859}
{'eval_loss': 1.5379713773727417, 'eval_macro_f1': 0.48816754668723333, 'eval_runtime': 81.9451, 'eval_samples_per_second': 6.077, 'eval_steps_per_second': 0.391, 'epoch': 4.979919678714859}


[I 2024-10-17 08:19:17,496] Trial 1 finished with value: 0.48816754668723333 and parameters: {'learning_rate': 4.4471253529120666e-05, 'warmup_steps': 175, 'weight_decay': 0.05779812628585346}. Best is trial 1 with value: 0.48816754668723333.


{'train_runtime': 18311.1512, 'train_samples_per_second': 1.088, 'train_steps_per_second': 0.034, 'train_loss': 0.9607640912455897, 'epoch': 4.979919678714859}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6294, 'grad_norm': 5.57384729385376, 'learning_rate': 1.2000124420904386e-05, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.370368242263794, 'eval_macro_f1': 0.3276223144137645, 'eval_runtime': 82.0159, 'eval_samples_per_second': 6.072, 'eval_steps_per_second': 0.39, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.2404, 'grad_norm': 5.58507776260376, 'learning_rate': 8.988044997986617e-06, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2155190706253052, 'eval_macro_f1': 0.4514773759576978, 'eval_runtime': 82.1358, 'eval_samples_per_second': 6.063, 'eval_steps_per_second': 0.39, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.0981, 'grad_norm': 10.094168663024902, 'learning_rate': 6.000062210452193e-06, 'epoch': 2.995983935742972}
{'eval_loss': 1.2048016786575317, 'eval_macro_f1': 0.4934102735270541, 'eval_runtime': 82.6599, 'eval_samples_per_second': 6.025, 'eval_steps_per_second': 0.387, 'epoch': 2.995983935742972}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.9899, 'grad_norm': 7.117640972137451, 'learning_rate': 2.9879827875344252e-06, 'epoch': 4.0}
{'eval_loss': 1.2140772342681885, 'eval_macro_f1': 0.47282739389770945, 'eval_runtime': 82.7184, 'eval_samples_per_second': 6.02, 'eval_steps_per_second': 0.387, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.9166, 'grad_norm': 11.135368347167969, 'learning_rate': 7.228990615002642e-08, 'epoch': 4.979919678714859}
{'eval_loss': 1.2129782438278198, 'eval_macro_f1': 0.49136383088358676, 'eval_runtime': 82.4693, 'eval_samples_per_second': 6.039, 'eval_steps_per_second': 0.388, 'epoch': 4.979919678714859}


[I 2024-10-17 13:26:17,034] Trial 2 finished with value: 0.49136383088358676 and parameters: {'learning_rate': 1.3132666283921465e-05, 'warmup_steps': 75, 'weight_decay': 9.03407344099097e-05}. Best is trial 2 with value: 0.49136383088358676.


{'train_runtime': 18417.3904, 'train_samples_per_second': 1.082, 'train_steps_per_second': 0.034, 'train_loss': 1.1755312027469758, 'epoch': 4.979919678714859}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.5358, 'grad_norm': 6.262286186218262, 'learning_rate': 5.830732304358221e-05, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2977831363677979, 'eval_macro_f1': 0.40261045421197134, 'eval_runtime': 83.3029, 'eval_samples_per_second': 5.978, 'eval_steps_per_second': 0.384, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.143, 'grad_norm': 6.288310527801514, 'learning_rate': 4.5510766379288227e-05, 'epoch': 2.0}
{'eval_loss': 1.2132340669631958, 'eval_macro_f1': 0.4662443555334855, 'eval_runtime': 82.9094, 'eval_samples_per_second': 6.007, 'eval_steps_per_second': 0.386, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.8204, 'grad_norm': 6.839576721191406, 'learning_rate': 3.042163528027288e-05, 'epoch': 2.995983935742972}
{'eval_loss': 1.2936232089996338, 'eval_macro_f1': 0.5186932447884401, 'eval_runtime': 83.0413, 'eval_samples_per_second': 5.997, 'eval_steps_per_second': 0.385, 'epoch': 2.995983935742972}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.5207, 'grad_norm': 10.311201095581055, 'learning_rate': 1.521081764013644e-05, 'epoch': 4.0}
{'eval_loss': 1.4750739336013794, 'eval_macro_f1': 0.49039909719583985, 'eval_runtime': 82.4309, 'eval_samples_per_second': 6.041, 'eval_steps_per_second': 0.388, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.3144, 'grad_norm': 7.241485595703125, 'learning_rate': 3.6505962336327456e-07, 'epoch': 4.979919678714859}
{'eval_loss': 1.658421516418457, 'eval_macro_f1': 0.4890503214669162, 'eval_runtime': 81.997, 'eval_samples_per_second': 6.073, 'eval_steps_per_second': 0.39, 'epoch': 4.979919678714859}


[I 2024-10-17 18:34:25,635] Trial 3 finished with value: 0.4890503214669162 and parameters: {'learning_rate': 6.02348378549403e-05, 'warmup_steps': 125, 'weight_decay': 0.0007284599020382071}. Best is trial 2 with value: 0.49136383088358676.


{'train_runtime': 18486.5053, 'train_samples_per_second': 1.078, 'train_steps_per_second': 0.034, 'train_loss': 0.8685197337981193, 'epoch': 4.979919678714859}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.4511, 'grad_norm': 7.17288875579834, 'learning_rate': 0.00014224776688742427, 'epoch': 0.9959839357429718}
{'eval_loss': 1.4019070863723755, 'eval_macro_f1': 0.4243562601309085, 'eval_runtime': 82.2366, 'eval_samples_per_second': 6.056, 'eval_steps_per_second': 0.389, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.1178, 'grad_norm': 7.772177696228027, 'learning_rate': 0.00010675737836822787, 'epoch': 2.0}
{'eval_loss': 1.3052412271499634, 'eval_macro_f1': 0.41561740560728805, 'eval_runtime': 81.8013, 'eval_samples_per_second': 6.088, 'eval_steps_per_second': 0.391, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.7716, 'grad_norm': 10.242654800415039, 'learning_rate': 7.183941547030883e-05, 'epoch': 2.995983935742972}
{'eval_loss': 1.5007514953613281, 'eval_macro_f1': 0.4928721211110054, 'eval_runtime': 81.6296, 'eval_samples_per_second': 6.101, 'eval_steps_per_second': 0.392, 'epoch': 2.995983935742972}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.4158, 'grad_norm': 9.3931303024292, 'learning_rate': 3.606281414047376e-05, 'epoch': 4.0}
{'eval_loss': 1.7232837677001953, 'eval_macro_f1': 0.5070583911391978, 'eval_runtime': 81.6566, 'eval_samples_per_second': 6.099, 'eval_steps_per_second': 0.392, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.1764, 'grad_norm': 9.114377975463867, 'learning_rate': 1.1448512425547225e-06, 'epoch': 4.979919678714859}
{'eval_loss': 2.0324361324310303, 'eval_macro_f1': 0.4978889407926912, 'eval_runtime': 81.9691, 'eval_samples_per_second': 6.075, 'eval_steps_per_second': 0.39, 'epoch': 4.979919678714859}


[I 2024-10-17 23:39:58,742] Trial 4 finished with value: 0.4978889407926912 and parameters: {'learning_rate': 0.00015598598179808094, 'warmup_steps': 75, 'weight_decay': 0.00382949096326044}. Best is trial 4 with value: 0.4978889407926912.


{'train_runtime': 18331.0198, 'train_samples_per_second': 1.087, 'train_steps_per_second': 0.034, 'train_loss': 0.7884649799716088, 'epoch': 4.979919678714859}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.7729, 'grad_norm': 1.484627604484558, 'learning_rate': 0.0025823232400513157, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-10-18 00:40:49,971] Trial 5 pruned. 
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


{'eval_loss': 1.727986216545105, 'eval_macro_f1': 0.07067510548523206, 'eval_runtime': 81.6596, 'eval_samples_per_second': 6.098, 'eval_steps_per_second': 0.392, 'epoch': 0.9959839357429718}


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6293, 'grad_norm': 6.508909702301025, 'learning_rate': 2.101356090846672e-05, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-10-18 01:42:47,312] Trial 6 pruned. 
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


{'eval_loss': 1.3444750308990479, 'eval_macro_f1': 0.341103296682689, 'eval_runtime': 138.7481, 'eval_samples_per_second': 3.589, 'eval_steps_per_second': 0.231, 'epoch': 0.9959839357429718}


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.7085, 'grad_norm': 6.752180099487305, 'learning_rate': 7.126427561442535e-06, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-10-18 02:43:41,173] Trial 7 pruned. 
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


{'eval_loss': 1.656005859375, 'eval_macro_f1': 0.12528447555670785, 'eval_runtime': 81.7095, 'eval_samples_per_second': 6.095, 'eval_steps_per_second': 0.392, 'epoch': 0.9959839357429718}


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.5524, 'grad_norm': 6.748057842254639, 'learning_rate': 5.200020271295372e-05, 'epoch': 0.9959839357429718}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2833495140075684, 'eval_macro_f1': 0.4214461466642995, 'eval_runtime': 81.688, 'eval_samples_per_second': 6.096, 'eval_steps_per_second': 0.392, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.1366, 'grad_norm': 5.936586856842041, 'learning_rate': 4.025516603461809e-05, 'epoch': 2.0}
{'eval_loss': 1.1855095624923706, 'eval_macro_f1': 0.4901134882667068, 'eval_runtime': 81.5458, 'eval_samples_per_second': 6.107, 'eval_steps_per_second': 0.392, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.8281, 'grad_norm': 7.850034236907959, 'learning_rate': 2.690853344560033e-05, 'epoch': 2.995983935742972}
{'eval_loss': 1.2965290546417236, 'eval_macro_f1': 0.5243953310228161, 'eval_runtime': 81.6376, 'eval_samples_per_second': 6.1, 'eval_steps_per_second': 0.392, 'epoch': 2.995983935742972}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.5417, 'grad_norm': 11.685166358947754, 'learning_rate': 1.3454266722800164e-05, 'epoch': 4.0}
{'eval_loss': 1.4349157810211182, 'eval_macro_f1': 0.5016302973204662, 'eval_runtime': 81.5577, 'eval_samples_per_second': 6.106, 'eval_steps_per_second': 0.392, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.34, 'grad_norm': 8.456482887268066, 'learning_rate': 3.2290240134720393e-07, 'epoch': 4.979919678714859}
{'eval_loss': 1.5720268487930298, 'eval_macro_f1': 0.5083860446558671, 'eval_runtime': 81.6326, 'eval_samples_per_second': 6.101, 'eval_steps_per_second': 0.392, 'epoch': 4.979919678714859}


[I 2024-10-18 07:48:58,736] Trial 8 finished with value: 0.5083860446558671 and parameters: {'learning_rate': 5.327889622228865e-05, 'warmup_steps': 125, 'weight_decay': 0.0020390468797308954}. Best is trial 8 with value: 0.5083860446558671.


{'train_runtime': 18315.5556, 'train_samples_per_second': 1.088, 'train_steps_per_second': 0.034, 'train_loss': 0.8813935064500378, 'epoch': 4.979919678714859}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at AIMH/mental-longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.4753, 'grad_norm': 6.962917804718018, 'learning_rate': 0.00011099180685069985, 'epoch': 0.9959839357429718}
{'eval_loss': 1.2829155921936035, 'eval_macro_f1': 0.42417051864609095, 'eval_runtime': 82.1667, 'eval_samples_per_second': 6.061, 'eval_steps_per_second': 0.389, 'epoch': 0.9959839357429718}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
[W 2024-10-18 09:31:36,481] Trial 9 failed with parameters: {'learning_rate': 0.00012146693721612734, 'warmup_steps': 75, 'weight_decay': 0.0015260719324010359} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Server\Documents\Stackofsugar\ta\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "c:\Users\Server\Documents\Stackofsugar\ta\.venv\Lib\site-packages\transformers\integrations\integration_utils.py", line 247, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "c:\Users\Server\Documents\Stackofsugar\ta\.venv\Lib\site-packages\transformers\trainer.py", line 1938, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Server\Docume

KeyboardInterrupt: 