In [1]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner
from transformers.trainer_callback import TrainerCallback

In [2]:
model_name = "EleutherAI/pythia-70m"
# Get the current directory
current_directory = os.getcwd()
# Join the folder path
folder_path = os.path.join(current_directory, "content")
dataset_name = "ai-medical-chatbot_processed.jsonl"
dataset_path = os.path.join(folder_path, dataset_name)
#dataset_path = f"/content/{dataset_name}"
use_hf = False
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)
base_model = AutoModelForCausalLM.from_pretrained(model_name)
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

2024-04-09 18:13:23,020 - DEBUG - utilities - Config: datasets.path: c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl
datasets.use_hf: false
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-70m
verbose: true



tokenize False c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl


2024-04-09 18:13:23,341 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-04-09 18:13:23,429 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-04-09 18:13:24,355 - DEBUG - utilities - Select CPU device


In [3]:
def train_model(hyperparameters):
  max_steps = hyperparameters["max_steps"]
  trained_model_name = f"ai_medical_{max_steps}_steps"
  output_dir = trained_model_name
  training_args = TrainingArguments(
    # Learning rate
    learning_rate=hyperparameters["learning_rate"],

    # Number of training epochs
    num_train_epochs=hyperparameters["num_train_epochs"],

    # Max steps to train for (each step is a batch of data)
    # Overrides num_train_epochs, if not -1
    max_steps=max_steps,

    # Batch size for training
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],

    # Directory to save model checkpoints
    output_dir=output_dir,

    # Other arguments
    overwrite_output_dir=False, # Overwrite the content of the output directory
    disable_tqdm=False, # Disable progress bars
    eval_steps=120, # Number of update steps between two evaluations
    save_steps=120, # After # steps model is saved
    warmup_steps=1, # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1, # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim=hyperparameters["optim"],
    gradient_accumulation_steps = hyperparameters['gradient_accumulation_steps'],
    gradient_checkpointing=False,
    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
  )
  base_model.to(device)
  model_flops = (
    base_model.floating_point_ops(
      {
        "input_ids": torch.zeros(
            (1, training_config["model"]["max_length"])
        )
      }
    )
    * training_args.gradient_accumulation_steps
  )

  #print(base_model)
  print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
  print("Flops", model_flops / 1e9, "GFLOPs")

  trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
  training_output = trainer.train()
  # Evaluate the model
  eval_results = trainer.evaluate()

  return eval_results, training_output


In [4]:
hyperparameters={'learning_rate': 1e-06,
'num_train_epochs': 1,
'per_device_train_batch_size': 1,
'optim': 'adafactor',
'num_iterations': 1,
'max_steps':3,
'gradient_accumulation_steps':2}
#eval_results, training_output =train_model(hyperparameters)
import itertools

In [5]:
from tqdm import tqdm

In [6]:
def find_best_hyperparameters():
        best_hyperparameters = None
        best_loss = float('inf')
        # Define hyperparameter search space
        hyperparameter_space = {
        "learning_rate": [1e-6, 1e-5, 1e-4],
        "num_train_epochs": [1,5,10,20],
        "per_device_train_batch_size": [1],
        "optim": ["adafactor"],
        "num_iterations": [1],
        "max_steps": [3],
        "gradient_accumulation_steps": [3],
        }
        # Generate all combinations of hyperparameters
        all_hyperparameters = list(itertools.product(*hyperparameter_space.values()))

        # Assuming all_hyperparameters is a list of hyperparameter combinations
        for hyperparameter_values in tqdm(all_hyperparameters):
            hyperparameters = dict(zip(hyperparameter_space.keys(), hyperparameter_values))
            
            # Evaluate the model
            # Print the current hyperparameters
            print("Using hyperparameters:")
            for key, value in hyperparameters.items():
                print(f"{key}: {value}")
            eval_results, training_output = train_model(hyperparameters)

            # Check if this set of hyperparameters gives better results
            if eval_results["eval_loss"] < best_loss:
                    best_loss = eval_results["eval_loss"]
                    best_hyperparameters = hyperparameters

        return best_hyperparameters, best_loss

In [7]:
# Call the function to find the best hyperparameters
best_hyperparameters, best_loss = find_best_hyperparameters()

print("Best hyperparameters:", best_hyperparameters)
print("Best loss:", best_loss)

  0%|          | 0/12 [00:00<?, ?it/s]

Using hyperparameters:
learning_rate: 1e-06
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:13:26,191 - DEBUG - utilities - Step (1) Logs: {'loss': 5.2655, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
  0%|          | 0/12 [00:01<?, ?it/s]

{'loss': 5.2655, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:13:27,365 - DEBUG - utilities - Step (2) Logs: {'loss': 5.2699, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 1.1740014553070068, 'flops': 1402682127709.4304, 'remaining_time': 1.1740014553070068}
  0%|          | 0/12 [00:02<?, ?it/s]

{'loss': 5.2699, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 1.1740014553070068, 'flops': 1402682127709.4304, 'remaining_time': 1.1740014553070068}


2024-04-09 18:13:28,384 - DEBUG - utilities - Step (3) Logs: {'loss': 4.7525, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.0962685346603394, 'flops': 1502141863238.1147, 'remaining_time': 0.0}
  0%|          | 0/12 [00:03<?, ?it/s]2024-04-09 18:13:28,389 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.2506, 'train_samples_per_second': 2.769, 'train_steps_per_second': 0.923, 'total_flos': 138837393408.0, 'train_loss': 5.0959804852803545, 'epoch': 0.01, 'iter_time': 1.098726511001587, 'flops': 1498781400808.141, 'remaining_time': 0.0}
  0%|          | 0/12 [00:03<?, ?it/s]

{'loss': 4.7525, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.0962685346603394, 'flops': 1502141863238.1147, 'remaining_time': 0.0}
{'train_runtime': 3.2506, 'train_samples_per_second': 2.769, 'train_steps_per_second': 0.923, 'train_loss': 5.0959804852803545, 'epoch': 0.01, 'iter_time': 1.098726511001587, 'flops': 1498781400808.141, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:13:36,058 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.753756999969482, 'eval_runtime': 7.6692, 'eval_samples_per_second': 13.039, 'eval_steps_per_second': 13.039, 'epoch': 0.01, 'iter_time': 4.933305621147156, 'flops': 333802724932.5527, 'remaining_time': 0.0}
  8%|▊         | 1/12 [00:11<02:02, 11.17s/it]

Using hyperparameters:
learning_rate: 1e-06
num_train_epochs: 5
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:13:37,162 - DEBUG - utilities - Step (1) Logs: {'loss': 5.2069, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
  8%|▊         | 1/12 [00:12<02:02, 11.17s/it]

{'loss': 5.2069, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:13:38,112 - DEBUG - utilities - Step (2) Logs: {'loss': 5.1601, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 0.950427770614624, 'flops': 1732641774765.353, 'remaining_time': 0.950427770614624}
  8%|▊         | 1/12 [00:13<02:02, 11.17s/it]

{'loss': 5.1601, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 0.950427770614624, 'flops': 1732641774765.353, 'remaining_time': 0.950427770614624}


2024-04-09 18:13:39,120 - DEBUG - utilities - Step (3) Logs: {'loss': 4.6892, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 0.9791241884231567, 'flops': 1681861074146.3054, 'remaining_time': 0.0}
  8%|▊         | 1/12 [00:14<02:02, 11.17s/it]2024-04-09 18:13:39,120 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 2.8834, 'train_samples_per_second': 3.121, 'train_steps_per_second': 1.04, 'total_flos': 138837393408.0, 'train_loss': 5.018730163574219, 'epoch': 0.01, 'iter_time': 0.9791241884231567, 'flops': 1681861074146.3054, 'remaining_time': 0.0}
  8%|▊         | 1/12 [00:14<02:02, 11.17s/it]

{'loss': 4.6892, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 0.9791241884231567, 'flops': 1681861074146.3054, 'remaining_time': 0.0}
{'train_runtime': 2.8834, 'train_samples_per_second': 3.121, 'train_steps_per_second': 1.04, 'train_loss': 5.018730163574219, 'epoch': 0.01, 'iter_time': 0.9791241884231567, 'flops': 1681861074146.3054, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:13:46,954 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.7248992919921875, 'eval_runtime': 7.8178, 'eval_samples_per_second': 12.791, 'eval_steps_per_second': 12.791, 'epoch': 0.01, 'iter_time': 4.896141290664673, 'flops': 336336466107.26105, 'remaining_time': 0.0}
 17%|█▋        | 2/12 [00:22<01:50, 11.01s/it]

Using hyperparameters:
learning_rate: 1e-06
num_train_epochs: 10
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:13:47,989 - DEBUG - utilities - Step (1) Logs: {'loss': 5.1464, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 17%|█▋        | 2/12 [00:23<01:50, 11.01s/it]

{'loss': 5.1464, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:13:49,149 - DEBUG - utilities - Step (2) Logs: {'loss': 5.051, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 1.1603829860687256, 'flops': 1419144264466.5496, 'remaining_time': 1.1603829860687256}
 17%|█▋        | 2/12 [00:24<01:50, 11.01s/it]

{'loss': 5.051, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 1.1603829860687256, 'flops': 1419144264466.5496, 'remaining_time': 1.1603829860687256}


2024-04-09 18:13:50,232 - DEBUG - utilities - Step (3) Logs: {'loss': 4.6271, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1215522289276123, 'flops': 1468278352795.539, 'remaining_time': 0.0}
 17%|█▋        | 2/12 [00:25<01:50, 11.01s/it]2024-04-09 18:13:50,238 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.1527, 'train_samples_per_second': 2.855, 'train_steps_per_second': 0.952, 'total_flos': 138837393408.0, 'train_loss': 4.941505591074626, 'epoch': 0.01, 'iter_time': 1.1246023178100586, 'flops': 1464296163350.1902, 'remaining_time': 0.0}
 17%|█▋        | 2/12 [00:25<01:50, 11.01s/it]

{'loss': 4.6271, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1215522289276123, 'flops': 1468278352795.539, 'remaining_time': 0.0}
{'train_runtime': 3.1527, 'train_samples_per_second': 2.855, 'train_steps_per_second': 0.952, 'train_loss': 4.941505591074626, 'epoch': 0.01, 'iter_time': 1.1246023178100586, 'flops': 1464296163350.1902, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:13:57,830 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.696102142333984, 'eval_runtime': 7.5822, 'eval_samples_per_second': 13.189, 'eval_steps_per_second': 13.189, 'epoch': 0.01, 'iter_time': 4.92068612575531, 'flops': 334658788871.8931, 'remaining_time': 0.0}
 25%|██▌       | 3/12 [00:32<01:38, 10.95s/it]

Using hyperparameters:
learning_rate: 1e-06
num_train_epochs: 20
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:13:58,865 - DEBUG - utilities - Step (1) Logs: {'loss': 5.0853, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 25%|██▌       | 3/12 [00:33<01:38, 10.95s/it]

{'loss': 5.0853, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:13:59,767 - DEBUG - utilities - Step (2) Logs: {'loss': 4.9464, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 0.9016876220703125, 'flops': 1826298619341.1318, 'remaining_time': 0.9016876220703125}
 25%|██▌       | 3/12 [00:34<01:38, 10.95s/it]

{'loss': 4.9464, 'learning_rate': 5e-07, 'epoch': 0.01, 'iter_time': 0.9016876220703125, 'flops': 1826298619341.1318, 'remaining_time': 0.9016876220703125}


2024-04-09 18:14:00,700 - DEBUG - utilities - Step (3) Logs: {'loss': 4.5643, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 0.9176473617553711, 'flops': 1794535600379.1304, 'remaining_time': 0.0}
 25%|██▌       | 3/12 [00:35<01:38, 10.95s/it]2024-04-09 18:14:00,700 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 2.7178, 'train_samples_per_second': 3.311, 'train_steps_per_second': 1.104, 'total_flos': 138837393408.0, 'train_loss': 4.8653279940287275, 'epoch': 0.01, 'iter_time': 0.9176473617553711, 'flops': 1794535600379.1304, 'remaining_time': 0.0}
 25%|██▌       | 3/12 [00:35<01:38, 10.95s/it]

{'loss': 4.5643, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 0.9176473617553711, 'flops': 1794535600379.1304, 'remaining_time': 0.0}
{'train_runtime': 2.7178, 'train_samples_per_second': 3.311, 'train_steps_per_second': 1.104, 'train_loss': 4.8653279940287275, 'epoch': 0.01, 'iter_time': 0.9176473617553711, 'flops': 1794535600379.1304, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:14:07,632 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.6673688888549805, 'eval_runtime': 6.9097, 'eval_samples_per_second': 14.472, 'eval_steps_per_second': 14.472, 'epoch': 0.01, 'iter_time': 4.383354663848877, 'flops': 375682778499.6168, 'remaining_time': 0.0}
 33%|███▎      | 4/12 [00:42<01:24, 10.50s/it]

Using hyperparameters:
learning_rate: 1e-05
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:14:08,680 - DEBUG - utilities - Step (1) Logs: {'loss': 5.0239, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 33%|███▎      | 4/12 [00:43<01:24, 10.50s/it]

{'loss': 5.0239, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:14:09,575 - DEBUG - utilities - Step (2) Logs: {'loss': 4.8441, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 0.8954942226409912, 'flops': 1838929629727.15, 'remaining_time': 0.8954942226409912}
 33%|███▎      | 4/12 [00:44<01:24, 10.50s/it]

{'loss': 4.8441, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 0.8954942226409912, 'flops': 1838929629727.15, 'remaining_time': 0.8954942226409912}


2024-04-09 18:14:10,394 - DEBUG - utilities - Step (3) Logs: {'loss': 4.3529, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 0.8570363521575928, 'flops': 1921448086908.213, 'remaining_time': 0.0}
 33%|███▎      | 4/12 [00:45<01:24, 10.50s/it]2024-04-09 18:14:10,398 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 2.5223, 'train_samples_per_second': 3.568, 'train_steps_per_second': 1.189, 'total_flos': 138837393408.0, 'train_loss': 4.740296681722005, 'epoch': 0.01, 'iter_time': 0.8591451644897461, 'flops': 1916731801944.11, 'remaining_time': 0.0}
 33%|███▎      | 4/12 [00:45<01:24, 10.50s/it]

{'loss': 4.3529, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 0.8570363521575928, 'flops': 1921448086908.213, 'remaining_time': 0.0}
{'train_runtime': 2.5223, 'train_samples_per_second': 3.568, 'train_steps_per_second': 1.189, 'train_loss': 4.740296681722005, 'epoch': 0.01, 'iter_time': 0.8591451644897461, 'flops': 1916731801944.11, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:14:17,829 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.421558856964111, 'eval_runtime': 7.4209, 'eval_samples_per_second': 13.475, 'eval_steps_per_second': 13.475, 'epoch': 0.01, 'iter_time': 4.574566006660461, 'flops': 359979691377.57965, 'remaining_time': 0.0}
 42%|████▏     | 5/12 [00:52<01:12, 10.39s/it]

Using hyperparameters:
learning_rate: 1e-05
num_train_epochs: 5
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:14:19,039 - DEBUG - utilities - Step (1) Logs: {'loss': 4.4951, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 42%|████▏     | 5/12 [00:54<01:12, 10.39s/it]

{'loss': 4.4951, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:14:20,184 - DEBUG - utilities - Step (2) Logs: {'loss': 4.0308, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.1457362174987793, 'flops': 1437286204375.1833, 'remaining_time': 1.1457362174987793}
 42%|████▏     | 5/12 [00:55<01:12, 10.39s/it]

{'loss': 4.0308, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.1457362174987793, 'flops': 1437286204375.1833, 'remaining_time': 1.1457362174987793}


2024-04-09 18:14:21,208 - DEBUG - utilities - Step (3) Logs: {'loss': 3.853, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.0848429203033447, 'flops': 1517962488803.0186, 'remaining_time': 0.0}
 42%|████▏     | 5/12 [00:56<01:12, 10.39s/it]2024-04-09 18:14:21,212 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.1746, 'train_samples_per_second': 2.835, 'train_steps_per_second': 0.945, 'total_flos': 138837393408.0, 'train_loss': 4.126304626464844, 'epoch': 0.01, 'iter_time': 1.0868380069732666, 'flops': 1515175995592.9714, 'remaining_time': 0.0}
 42%|████▏     | 5/12 [00:56<01:12, 10.39s/it]

{'loss': 3.853, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.0848429203033447, 'flops': 1517962488803.0186, 'remaining_time': 0.0}
{'train_runtime': 3.1746, 'train_samples_per_second': 2.835, 'train_steps_per_second': 0.945, 'train_loss': 4.126304626464844, 'epoch': 0.01, 'iter_time': 1.0868380069732666, 'flops': 1515175995592.9714, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:14:30,037 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.211146354675293, 'eval_runtime': 8.8153, 'eval_samples_per_second': 11.344, 'eval_steps_per_second': 11.344, 'epoch': 0.01, 'iter_time': 5.499008059501648, 'flops': 299463256181.0135, 'remaining_time': 0.0}
 50%|█████     | 6/12 [01:05<01:06, 11.01s/it]

Using hyperparameters:
learning_rate: 1e-05
num_train_epochs: 10
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:14:31,124 - DEBUG - utilities - Step (1) Logs: {'loss': 4.0376, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 50%|█████     | 6/12 [01:06<01:06, 11.01s/it]

{'loss': 4.0376, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:14:32,330 - DEBUG - utilities - Step (2) Logs: {'loss': 3.3165, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.20524263381958, 'flops': 1366323106282.1099, 'remaining_time': 1.20524263381958}
 50%|█████     | 6/12 [01:07<01:06, 11.01s/it]

{'loss': 3.3165, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.20524263381958, 'flops': 1366323106282.1099, 'remaining_time': 1.20524263381958}


2024-04-09 18:14:33,567 - DEBUG - utilities - Step (3) Logs: {'loss': 3.414, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.2213252782821655, 'flops': 1348331102734.7356, 'remaining_time': 0.0}
 50%|█████     | 6/12 [01:08<01:06, 11.01s/it]2024-04-09 18:14:33,571 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.3383, 'train_samples_per_second': 2.696, 'train_steps_per_second': 0.899, 'total_flos': 138837393408.0, 'train_loss': 3.589368979136149, 'epoch': 0.01, 'iter_time': 1.2235573530197144, 'flops': 1345871409460.1719, 'remaining_time': 0.0}
 50%|█████     | 6/12 [01:08<01:06, 11.01s/it]

{'loss': 3.414, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.2213252782821655, 'flops': 1348331102734.7356, 'remaining_time': 0.0}
{'train_runtime': 3.3383, 'train_samples_per_second': 2.696, 'train_steps_per_second': 0.899, 'train_loss': 3.589368979136149, 'epoch': 0.01, 'iter_time': 1.2235573530197144, 'flops': 1345871409460.1719, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:14:45,804 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.028529167175293, 'eval_runtime': 12.2173, 'eval_samples_per_second': 8.185, 'eval_steps_per_second': 8.185, 'epoch': 0.01, 'iter_time': 7.339709520339966, 'flops': 224361857196.1857, 'remaining_time': 0.0}
 58%|█████▊    | 7/12 [01:20<01:02, 12.56s/it]

Using hyperparameters:
learning_rate: 1e-05
num_train_epochs: 20
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:14:47,014 - DEBUG - utilities - Step (1) Logs: {'loss': 3.6411, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 58%|█████▊    | 7/12 [01:22<01:02, 12.56s/it]

{'loss': 3.6411, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:14:48,169 - DEBUG - utilities - Step (2) Logs: {'loss': 2.6848, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.155155897140503, 'flops': 1425565902697.9834, 'remaining_time': 1.155155897140503}
 58%|█████▊    | 7/12 [01:23<01:02, 12.56s/it]

{'loss': 2.6848, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.155155897140503, 'flops': 1425565902697.9834, 'remaining_time': 1.155155897140503}


2024-04-09 18:14:49,239 - DEBUG - utilities - Step (3) Logs: {'loss': 3.0217, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.112558126449585, 'flops': 1480148155961.1992, 'remaining_time': 0.0}
 58%|█████▊    | 7/12 [01:24<01:02, 12.56s/it]2024-04-09 18:14:49,244 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.2074, 'train_samples_per_second': 2.806, 'train_steps_per_second': 0.935, 'total_flos': 138837393408.0, 'train_loss': 3.1158535480499268, 'epoch': 0.01, 'iter_time': 1.115080714225769, 'flops': 1476799695533.5955, 'remaining_time': 0.0}
 58%|█████▊    | 7/12 [01:24<01:02, 12.56s/it]

{'loss': 3.0217, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.112558126449585, 'flops': 1480148155961.1992, 'remaining_time': 0.0}
{'train_runtime': 3.2074, 'train_samples_per_second': 2.806, 'train_steps_per_second': 0.935, 'train_loss': 3.1158535480499268, 'epoch': 0.01, 'iter_time': 1.115080714225769, 'flops': 1476799695533.5955, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:14:57,500 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 3.8763885498046875, 'eval_runtime': 8.2503, 'eval_samples_per_second': 12.121, 'eval_steps_per_second': 12.121, 'epoch': 0.01, 'iter_time': 5.243195295333862, 'flops': 314073912281.98425, 'remaining_time': 0.0}
 67%|██████▋   | 8/12 [01:32<00:49, 12.29s/it]

Using hyperparameters:
learning_rate: 0.0001
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:14:58,739 - DEBUG - utilities - Step (1) Logs: {'loss': 3.2677, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 67%|██████▋   | 8/12 [01:33<00:49, 12.29s/it]

{'loss': 3.2677, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:14:59,814 - DEBUG - utilities - Step (2) Logs: {'loss': 2.1176, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 1.0753891468048096, 'flops': 1531306935872.3, 'remaining_time': 1.0753891468048096}
 67%|██████▋   | 8/12 [01:34<00:49, 12.29s/it]

{'loss': 2.1176, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 1.0753891468048096, 'flops': 1531306935872.3, 'remaining_time': 1.0753891468048096}


2024-04-09 18:15:00,989 - DEBUG - utilities - Step (3) Logs: {'loss': 4.3445, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1252036094665527, 'flops': 1463513665801.9675, 'remaining_time': 0.0}
 67%|██████▋   | 8/12 [01:36<00:49, 12.29s/it]2024-04-09 18:15:00,996 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.302, 'train_samples_per_second': 2.726, 'train_steps_per_second': 0.909, 'total_flos': 138837393408.0, 'train_loss': 3.243257204691569, 'epoch': 0.01, 'iter_time': 1.1283220052719116, 'flops': 1459468885273.7153, 'remaining_time': 0.0}
 67%|██████▋   | 8/12 [01:36<00:49, 12.29s/it]

{'loss': 4.3445, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1252036094665527, 'flops': 1463513665801.9675, 'remaining_time': 0.0}
{'train_runtime': 3.302, 'train_samples_per_second': 2.726, 'train_steps_per_second': 0.909, 'train_loss': 3.243257204691569, 'epoch': 0.01, 'iter_time': 1.1283220052719116, 'flops': 1459468885273.7153, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:15:10,040 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 3.96282958984375, 'eval_runtime': 9.0344, 'eval_samples_per_second': 11.069, 'eval_steps_per_second': 11.069, 'epoch': 0.01, 'iter_time': 5.650584936141968, 'flops': 291430157740.15546, 'remaining_time': 0.0}
 75%|███████▌  | 9/12 [01:45<00:37, 12.37s/it]

Using hyperparameters:
learning_rate: 0.0001
num_train_epochs: 5
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:15:11,341 - DEBUG - utilities - Step (1) Logs: {'loss': 2.6473, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 75%|███████▌  | 9/12 [01:46<00:37, 12.37s/it]

{'loss': 2.6473, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:15:13,461 - DEBUG - utilities - Step (2) Logs: {'loss': 1.3063, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 2.120425224304199, 'flops': 776613502041.4918, 'remaining_time': 2.120425224304199}
 75%|███████▌  | 9/12 [01:48<00:37, 12.37s/it]

{'loss': 1.3063, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 2.120425224304199, 'flops': 776613502041.4918, 'remaining_time': 2.120425224304199}


2024-04-09 18:15:15,658 - DEBUG - utilities - Step (3) Logs: {'loss': 3.0391, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 2.158801317214966, 'flops': 762807974097.61, 'remaining_time': 0.0}
 75%|███████▌  | 9/12 [01:50<00:37, 12.37s/it]2024-04-09 18:15:15,658 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 5.4211, 'train_samples_per_second': 1.66, 'train_steps_per_second': 0.553, 'total_flos': 138837393408.0, 'train_loss': 2.330891410509745, 'epoch': 0.01, 'iter_time': 2.158801317214966, 'flops': 762807974097.61, 'remaining_time': 0.0}
 75%|███████▌  | 9/12 [01:50<00:37, 12.37s/it]

{'loss': 3.0391, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 2.158801317214966, 'flops': 762807974097.61, 'remaining_time': 0.0}
{'train_runtime': 5.4211, 'train_samples_per_second': 1.66, 'train_steps_per_second': 0.553, 'train_loss': 2.330891410509745, 'epoch': 0.01, 'iter_time': 2.158801317214966, 'flops': 762807974097.61, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:15:24,110 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.060362339019775, 'eval_runtime': 8.4364, 'eval_samples_per_second': 11.853, 'eval_steps_per_second': 11.853, 'epoch': 0.01, 'iter_time': 6.384790658950806, 'flops': 257917752864.05487, 'remaining_time': 0.0}
 83%|████████▎ | 10/12 [01:59<00:25, 12.89s/it]

Using hyperparameters:
learning_rate: 0.0001
num_train_epochs: 10
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:15:25,295 - DEBUG - utilities - Step (1) Logs: {'loss': 2.1804, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 83%|████████▎ | 10/12 [02:00<00:25, 12.89s/it]

{'loss': 2.1804, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:15:26,417 - DEBUG - utilities - Step (2) Logs: {'loss': 1.0426, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 1.1210155487060547, 'flops': 1468981283234.4578, 'remaining_time': 1.1210155487060547}
 83%|████████▎ | 10/12 [02:01<00:25, 12.89s/it]

{'loss': 1.0426, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 1.1210155487060547, 'flops': 1468981283234.4578, 'remaining_time': 1.1210155487060547}


2024-04-09 18:15:27,585 - DEBUG - utilities - Step (3) Logs: {'loss': 2.3931, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1446044445037842, 'flops': 1438707378056.6257, 'remaining_time': 0.0}
 83%|████████▎ | 10/12 [02:02<00:25, 12.89s/it]2024-04-09 18:15:27,585 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.2902, 'train_samples_per_second': 2.735, 'train_steps_per_second': 0.912, 'total_flos': 138837393408.0, 'train_loss': 1.8720334370930989, 'epoch': 0.01, 'iter_time': 1.1446044445037842, 'flops': 1438707378056.6257, 'remaining_time': 0.0}
 83%|████████▎ | 10/12 [02:02<00:25, 12.89s/it]

{'loss': 2.3931, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1446044445037842, 'flops': 1438707378056.6257, 'remaining_time': 0.0}
{'train_runtime': 3.2902, 'train_samples_per_second': 2.735, 'train_steps_per_second': 0.912, 'train_loss': 1.8720334370930989, 'epoch': 0.01, 'iter_time': 1.1446044445037842, 'flops': 1438707378056.6257, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:15:36,105 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.44040584564209, 'eval_runtime': 8.5047, 'eval_samples_per_second': 11.758, 'eval_steps_per_second': 11.758, 'epoch': 0.01, 'iter_time': 5.404757380485535, 'flops': 304685436058.56824, 'remaining_time': 0.0}
 92%|█████████▏| 11/12 [02:11<00:12, 12.62s/it]

Using hyperparameters:
learning_rate: 0.0001
num_train_epochs: 20
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1
max_steps: 3
gradient_accumulation_steps: 3
Memory footprint 0.30687256 GB
Flops 1646.750859264 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-09 18:15:37,369 - DEBUG - utilities - Step (1) Logs: {'loss': 2.117, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
 92%|█████████▏| 11/12 [02:12<00:12, 12.62s/it]

{'loss': 2.117, 'learning_rate': 0.0001, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-09 18:15:38,498 - DEBUG - utilities - Step (2) Logs: {'loss': 0.3299, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 1.1296303272247314, 'flops': 1457778548943.2874, 'remaining_time': 1.1296303272247314}
 92%|█████████▏| 11/12 [02:13<00:12, 12.62s/it]

{'loss': 0.3299, 'learning_rate': 5e-05, 'epoch': 0.01, 'iter_time': 1.1296303272247314, 'flops': 1457778548943.2874, 'remaining_time': 1.1296303272247314}


2024-04-09 18:15:39,688 - DEBUG - utilities - Step (3) Logs: {'loss': 1.7966, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1598482131958008, 'flops': 1419798591340.3328, 'remaining_time': 0.0}
 92%|█████████▏| 11/12 [02:14<00:12, 12.62s/it]2024-04-09 18:15:39,691 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.4156, 'train_samples_per_second': 2.635, 'train_steps_per_second': 0.878, 'total_flos': 138837393408.0, 'train_loss': 1.4144899249076843, 'epoch': 0.01, 'iter_time': 1.1611294746398926, 'flops': 1418231898535.446, 'remaining_time': 0.0}
 92%|█████████▏| 11/12 [02:14<00:12, 12.62s/it]

{'loss': 1.7966, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1598482131958008, 'flops': 1419798591340.3328, 'remaining_time': 0.0}
{'train_runtime': 3.4156, 'train_samples_per_second': 2.635, 'train_steps_per_second': 0.878, 'train_loss': 1.4144899249076843, 'epoch': 0.01, 'iter_time': 1.1611294746398926, 'flops': 1418231898535.446, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-09 18:15:50,110 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 5.092215061187744, 'eval_runtime': 10.4103, 'eval_samples_per_second': 9.606, 'eval_steps_per_second': 9.606, 'epoch': 0.01, 'iter_time': 6.370881915092468, 'flops': 258480832200.46603, 'remaining_time': 0.0}
100%|██████████| 12/12 [02:25<00:00, 12.10s/it]

Best hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 20, 'per_device_train_batch_size': 1, 'optim': 'adafactor', 'num_iterations': 1, 'max_steps': 3, 'gradient_accumulation_steps': 3}
Best loss: 3.8763885498046875



