In [1]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner


logger = logging.getLogger(__name__)
global_config = None

### Load the Lamini docs dataset

# Get the current directory
current_directory = os.getcwd()
# Join the folder path
folder_path = os.path.join(current_directory, "content")
dataset_name = "ai-medical-chatbot_processed.jsonl"
dataset_path = os.path.join(folder_path, dataset_name)
#dataset_path = f"/content/{dataset_name}"
use_hf = False

### Set up the model, training config, and tokenizer

model_name = "EleutherAI/pythia-70m"

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}


tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)
print(train_dataset)
print(test_dataset)

### Load the base model

base_model = AutoModelForCausalLM.from_pretrained(model_name)

device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")


base_model.to(device)

### Define function to carry out inference
def inference_new(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
      text,
      return_tensors="pt",
      truncation=True,
      max_length=max_input_tokens
  )

  # Generate
  device = model.device
  attention_mask = torch.ones_like(input_ids)  # Create mask with all 1s

  # Fix: Mask all padding tokens, including the first element
  attention_mask[input_ids == tokenizer.pad_token_id] = 0

  generated_tokens_with_prompt = model.generate(
      input_ids.to(device),
      max_length=max_output_tokens,
      attention_mask=attention_mask,
      pad_token_id=tokenizer.eos_token_id  # Set pad token
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]
  return generated_text_answer

test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from ai-medical-chatbot: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference_new(test_text, base_model, tokenizer))
### Setup training
max_steps = 3
trained_model_name = f"ai_medical_{max_steps}_steps"
output_dir = trained_model_name

training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,
  # Number of training epochs
  num_train_epochs=10,
  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,
  # Batch size for training
  per_device_train_batch_size=1,
  # Directory to save model checkpoints
  output_dir=output_dir,
  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,
  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")


trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
training_output = trainer.train()


2024-04-08 13:37:12,147 - DEBUG - utilities - Config: datasets.path: c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl
datasets.use_hf: false
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-70m
verbose: true



tokenize False c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl


2024-04-08 13:37:12,483 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-59ea57fe03c7d0e8/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-04-08 13:37:12,598 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-59ea57fe03c7d0e8/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json


Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1350
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 150
})


2024-04-08 13:37:13,810 - DEBUG - __main__ - Select CPU device


Question input (test): ### Question:
Will Kalarchikai cure multiple ovarian cysts in PCOD?
### Answer:
Correct answer from ai-medical-chatbot: Hello. I just read your query. See Kalarachi Kai choornam is helpful in amenorrhea. As far as small cysts are concerned they are unmatured eggs which failed to induce menstrual cycle previously, as a result, they got collected in the ovary and they will remain in the ovary. Now, you have got your periods you can start trying for conception. But I advise you to do it under the supervision of a nearby gynecologist because egg size is important while conception and that you can know by ovulation study. Ovulation study is performed under the supervision of a gynecologist. For gall stones, surgical intervention is required generally. Medicine is not of much help.
Model's answer: 


The answer is "Yes" and "No"

The answer is "Yes" and "No"

The answer is "Yes" and "No"

The answer is "Yes" and "No"

The answer is "Yes" and "No"

The answer is "Yes" a

  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-08 13:37:16,914 - DEBUG - utilities - Step (1) Logs: {'loss': 4.8075, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


{'loss': 4.8075, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-08 13:37:18,667 - DEBUG - utilities - Step (2) Logs: {'loss': 4.059, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.7527174949645996, 'flops': 1252722026601.5242, 'remaining_time': 1.7527174949645996}


{'loss': 4.059, 'learning_rate': 5e-06, 'epoch': 0.01, 'iter_time': 1.7527174949645996, 'flops': 1252722026601.5242, 'remaining_time': 1.7527174949645996}


2024-04-08 13:37:19,832 - DEBUG - utilities - Step (3) Logs: {'loss': 4.6117, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.4589784145355225, 'flops': 1504935090524.2888, 'remaining_time': 0.0}
2024-04-08 13:37:19,832 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 4.5758, 'train_samples_per_second': 2.623, 'train_steps_per_second': 0.656, 'total_flos': 405255094272.0, 'train_loss': 4.492728392283122, 'epoch': 0.01, 'iter_time': 1.45903742313385, 'flops': 1504874225663.0742, 'remaining_time': 0.0}


{'loss': 4.6117, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.4589784145355225, 'flops': 1504935090524.2888, 'remaining_time': 0.0}
{'train_runtime': 4.5758, 'train_samples_per_second': 2.623, 'train_steps_per_second': 0.656, 'train_loss': 4.492728392283122, 'epoch': 0.01, 'iter_time': 1.45903742313385, 'flops': 1504874225663.0742, 'remaining_time': 0.0}


In [2]:
# Evaluate the model
eval_results = trainer.evaluate()

  0%|          | 0/150 [00:00<?, ?it/s]

2024-04-08 13:38:04,209 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.126183986663818, 'eval_runtime': 10.9009, 'eval_samples_per_second': 13.76, 'eval_steps_per_second': 13.76, 'epoch': 0.01, 'iter_time': 23.647719264030457, 'flops': 92849030717.8053, 'remaining_time': 0.0}


In [3]:
eval_results

{'eval_loss': 4.126183986663818,
 'eval_runtime': 10.9009,
 'eval_samples_per_second': 13.76,
 'eval_steps_per_second': 13.76,
 'epoch': 0.01,
 'iter_time': 23.647719264030457,
 'flops': 92849030717.8053,
 'remaining_time': 0.0}

In [1]:
import os
import random
import torch
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoTokenizer
from llama import BasicModelRunner
from utilities import tokenize_and_split_data
from transformers import AutoModelForCausalLM
def find_best_hyperparameters():
    model_name = "EleutherAI/pythia-70m"
    use_hf = False
    # Get the current directory
    current_directory = os.getcwd()
    # Join the folder path
    folder_path = os.path.join(current_directory, "content")
    dataset_name = "ai-medical-chatbot_processed.jsonl"
    dataset_path = os.path.join(folder_path, dataset_name)
    ### Load the base model
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    training_config = {
        "model": {
            "pretrained_name": model_name,
            "max_length" : 2048
        },
        "datasets": {
            "use_hf": use_hf,
            "path": dataset_path
        },
        "verbose": True
    }
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)

    best_hyperparameters = None
    best_loss = float('inf')  # Initialize with a very high value

    import itertools

    # Define hyperparameter search space
    hyperparameter_space = {
        "learning_rate": [1e-5, 5e-5, 1e-4],
        "num_train_epochs": [1],
        "per_device_train_batch_size": [1],
        "optim": ["adafactor"],
        "num_iterations": [1],
    }

    # Generate all combinations of hyperparameters
    all_hyperparameters = list(itertools.product(*hyperparameter_space.values()))

    for hyperparameter_values in all_hyperparameters:
        hyperparameters = dict(zip(hyperparameter_space.keys(), hyperparameter_values))

        # Print the current hyperparameters
        print("Using hyperparameters:")
        for key, value in hyperparameters.items():
            print(f"{key}: {value}")

        # Setup training_args with the current hyperparameters
        training_args = TrainingArguments(
            learning_rate=hyperparameters["learning_rate"],
            num_train_epochs=hyperparameters["num_train_epochs"],
            per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
            output_dir="./results",  # Provide a dummy output directory
            overwrite_output_dir=False,
            disable_tqdm=False,
            eval_steps=120,
            save_steps=120,
            warmup_steps=1,
            per_device_eval_batch_size=1,
            evaluation_strategy="steps",
            logging_strategy="steps",
            logging_steps=1,
            optim=hyperparameters["optim"],
            gradient_accumulation_steps=4,
            gradient_checkpointing=False,
            load_best_model_at_end=True,
            save_total_limit=1,
            metric_for_best_model="eval_loss",
            greater_is_better=False
        )
        # Ensure the model is on a CUDA device if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        base_model.to(device)

        # Setup Trainer with the new hyperparameters
        trainer = Trainer(
            model=base_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset
        )

        # Train the model
        training_output = trainer.train()

        # Evaluate the model
        eval_results = trainer.evaluate()

        # Check if this set of hyperparameters gives better results
        if eval_results["eval_loss"] < best_loss:
            best_loss = eval_results["eval_loss"]
            best_hyperparameters = hyperparameters
    return best_hyperparameters, best_loss
# Call the function to find the best hyperparameters
best_hyperparameters, best_loss = find_best_hyperparameters()

print("Best hyperparameters:", best_hyperparameters)
print("Best loss:", best_loss)


2024-04-08 22:36:46,412 - DEBUG - utilities - Config: datasets.path: c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl
datasets.use_hf: false
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-70m
verbose: true



tokenize False c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl


2024-04-08 22:36:47,143 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-59ea57fe03c7d0e8/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-04-08 22:36:47,264 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-59ea57fe03c7d0e8/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json


Using hyperparameters:
learning_rate: 1e-05
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1


  0%|          | 0/337 [00:00<?, ?it/s]

{'loss': 4.8075, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 4.059, 'learning_rate': 9.970238095238096e-06, 'epoch': 0.01}
{'loss': 4.6117, 'learning_rate': 9.940476190476192e-06, 'epoch': 0.01}
{'loss': 3.897, 'learning_rate': 9.910714285714288e-06, 'epoch': 0.01}
{'loss': 4.0117, 'learning_rate': 9.880952380952381e-06, 'epoch': 0.01}
{'loss': 4.0519, 'learning_rate': 9.851190476190477e-06, 'epoch': 0.02}
{'loss': 3.2927, 'learning_rate': 9.821428571428573e-06, 'epoch': 0.02}
{'loss': 3.7339, 'learning_rate': 9.791666666666666e-06, 'epoch': 0.02}
{'loss': 3.4974, 'learning_rate': 9.761904761904762e-06, 'epoch': 0.03}
{'loss': 3.3106, 'learning_rate': 9.732142857142858e-06, 'epoch': 0.03}
{'loss': 3.796, 'learning_rate': 9.702380952380953e-06, 'epoch': 0.03}
{'loss': 3.3639, 'learning_rate': 9.672619047619049e-06, 'epoch': 0.04}
{'loss': 3.4088, 'learning_rate': 9.642857142857144e-06, 'epoch': 0.04}
{'loss': 3.2516, 'learning_rate': 9.61309523809524e-06, 'epoch': 0.04}
{'loss': 3.36

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.2275211811065674, 'eval_runtime': 12.403, 'eval_samples_per_second': 12.094, 'eval_steps_per_second': 12.094, 'epoch': 0.36}
{'loss': 1.3276, 'learning_rate': 6.4285714285714295e-06, 'epoch': 0.36}
{'loss': 2.7252, 'learning_rate': 6.398809523809524e-06, 'epoch': 0.36}
{'loss': 2.7075, 'learning_rate': 6.369047619047619e-06, 'epoch': 0.36}
{'loss': 1.9005, 'learning_rate': 6.3392857142857145e-06, 'epoch': 0.37}
{'loss': 3.3708, 'learning_rate': 6.30952380952381e-06, 'epoch': 0.37}
{'loss': 1.1787, 'learning_rate': 6.279761904761906e-06, 'epoch': 0.37}
{'loss': 1.9572, 'learning_rate': 6.25e-06, 'epoch': 0.38}
{'loss': 1.494, 'learning_rate': 6.220238095238096e-06, 'epoch': 0.38}
{'loss': 2.0389, 'learning_rate': 6.1904761904761914e-06, 'epoch': 0.38}
{'loss': 2.2449, 'learning_rate': 6.160714285714286e-06, 'epoch': 0.39}
{'loss': 2.406, 'learning_rate': 6.130952380952382e-06, 'epoch': 0.39}
{'loss': 3.184, 'learning_rate': 6.101190476190477e-06, 'epoch': 0.39}
{'loss': 

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.127291679382324, 'eval_runtime': 12.8375, 'eval_samples_per_second': 11.685, 'eval_steps_per_second': 11.685, 'epoch': 0.71}
{'loss': 2.1837, 'learning_rate': 2.8571428571428573e-06, 'epoch': 0.71}
{'loss': 2.9419, 'learning_rate': 2.8273809523809524e-06, 'epoch': 0.72}
{'loss': 0.0664, 'learning_rate': 2.797619047619048e-06, 'epoch': 0.72}
{'loss': 1.4968, 'learning_rate': 2.767857142857143e-06, 'epoch': 0.72}
{'loss': 2.0161, 'learning_rate': 2.7380952380952387e-06, 'epoch': 0.73}
{'loss': 1.3777, 'learning_rate': 2.7083333333333334e-06, 'epoch': 0.73}
{'loss': 2.8532, 'learning_rate': 2.6785714285714285e-06, 'epoch': 0.73}
{'loss': 3.6408, 'learning_rate': 2.648809523809524e-06, 'epoch': 0.73}
{'loss': 3.6157, 'learning_rate': 2.6190476190476192e-06, 'epoch': 0.74}
{'loss': 1.4322, 'learning_rate': 2.5892857142857148e-06, 'epoch': 0.74}
{'loss': 1.7301, 'learning_rate': 2.5595238095238095e-06, 'epoch': 0.74}
{'loss': 2.024, 'learning_rate': 2.529761904761905e-06, 'ep

  0%|          | 0/150 [00:00<?, ?it/s]

Using hyperparameters:
learning_rate: 5e-05
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1


  0%|          | 0/337 [00:00<?, ?it/s]

{'loss': 1.7219, 'learning_rate': 5e-05, 'epoch': 0.0}
{'loss': 2.5403, 'learning_rate': 4.985119047619048e-05, 'epoch': 0.01}
{'loss': 1.8522, 'learning_rate': 4.9702380952380955e-05, 'epoch': 0.01}
{'loss': 3.1502, 'learning_rate': 4.955357142857143e-05, 'epoch': 0.01}
{'loss': 3.3608, 'learning_rate': 4.940476190476191e-05, 'epoch': 0.01}
{'loss': 3.4451, 'learning_rate': 4.925595238095238e-05, 'epoch': 0.02}
{'loss': 1.9429, 'learning_rate': 4.910714285714286e-05, 'epoch': 0.02}
{'loss': 2.8489, 'learning_rate': 4.8958333333333335e-05, 'epoch': 0.02}
{'loss': 2.0629, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.03}
{'loss': 2.1752, 'learning_rate': 4.866071428571429e-05, 'epoch': 0.03}
{'loss': 2.6894, 'learning_rate': 4.8511904761904764e-05, 'epoch': 0.03}
{'loss': 2.2427, 'learning_rate': 4.836309523809524e-05, 'epoch': 0.04}
{'loss': 2.8168, 'learning_rate': 4.8214285714285716e-05, 'epoch': 0.04}
{'loss': 2.1201, 'learning_rate': 4.806547619047619e-05, 'epoch': 0.04}
{'los

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.3299429416656494, 'eval_runtime': 13.357, 'eval_samples_per_second': 11.23, 'eval_steps_per_second': 11.23, 'epoch': 0.36}
{'loss': 1.0521, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.36}
{'loss': 2.9697, 'learning_rate': 3.199404761904762e-05, 'epoch': 0.36}
{'loss': 2.9199, 'learning_rate': 3.1845238095238096e-05, 'epoch': 0.36}
{'loss': 1.6859, 'learning_rate': 3.169642857142857e-05, 'epoch': 0.37}
{'loss': 3.4755, 'learning_rate': 3.154761904761905e-05, 'epoch': 0.37}
{'loss': 1.0215, 'learning_rate': 3.1398809523809525e-05, 'epoch': 0.37}
{'loss': 1.8486, 'learning_rate': 3.125e-05, 'epoch': 0.38}
{'loss': 1.2737, 'learning_rate': 3.110119047619048e-05, 'epoch': 0.38}
{'loss': 1.8317, 'learning_rate': 3.095238095238095e-05, 'epoch': 0.38}
{'loss': 2.3297, 'learning_rate': 3.080357142857143e-05, 'epoch': 0.39}
{'loss': 2.5795, 'learning_rate': 3.0654761904761905e-05, 'epoch': 0.39}
{'loss': 3.3708, 'learning_rate': 3.0505952380952385e-05, 'epoch': 0.39}
{'lo

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.207026243209839, 'eval_runtime': 12.9115, 'eval_samples_per_second': 11.618, 'eval_steps_per_second': 11.618, 'epoch': 0.71}
{'loss': 2.2343, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.71}
{'loss': 3.2538, 'learning_rate': 1.4136904761904762e-05, 'epoch': 0.72}
{'loss': 0.0528, 'learning_rate': 1.398809523809524e-05, 'epoch': 0.72}
{'loss': 1.5524, 'learning_rate': 1.3839285714285715e-05, 'epoch': 0.72}
{'loss': 2.1422, 'learning_rate': 1.3690476190476192e-05, 'epoch': 0.73}
{'loss': 1.3807, 'learning_rate': 1.3541666666666666e-05, 'epoch': 0.73}
{'loss': 3.1919, 'learning_rate': 1.3392857142857144e-05, 'epoch': 0.73}
{'loss': 3.8646, 'learning_rate': 1.324404761904762e-05, 'epoch': 0.73}
{'loss': 3.8602, 'learning_rate': 1.3095238095238096e-05, 'epoch': 0.74}
{'loss': 1.4858, 'learning_rate': 1.2946428571428574e-05, 'epoch': 0.74}
{'loss': 1.8031, 'learning_rate': 1.2797619047619047e-05, 'epoch': 0.74}
{'loss': 2.2941, 'learning_rate': 1.2648809523809524e-05, 

  0%|          | 0/150 [00:00<?, ?it/s]

Using hyperparameters:
learning_rate: 0.0001
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1


  0%|          | 0/337 [00:00<?, ?it/s]

{'loss': 1.7181, 'learning_rate': 0.0001, 'epoch': 0.0}
{'loss': 2.2646, 'learning_rate': 9.970238095238096e-05, 'epoch': 0.01}
{'loss': 1.6909, 'learning_rate': 9.940476190476191e-05, 'epoch': 0.01}
{'loss': 3.1052, 'learning_rate': 9.910714285714286e-05, 'epoch': 0.01}
{'loss': 3.2772, 'learning_rate': 9.880952380952381e-05, 'epoch': 0.01}
{'loss': 3.346, 'learning_rate': 9.851190476190477e-05, 'epoch': 0.02}
{'loss': 1.6313, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.02}
{'loss': 2.8151, 'learning_rate': 9.791666666666667e-05, 'epoch': 0.02}
{'loss': 2.2995, 'learning_rate': 9.761904761904762e-05, 'epoch': 0.03}
{'loss': 2.1667, 'learning_rate': 9.732142857142858e-05, 'epoch': 0.03}
{'loss': 2.4295, 'learning_rate': 9.702380952380953e-05, 'epoch': 0.03}
{'loss': 2.2649, 'learning_rate': 9.672619047619048e-05, 'epoch': 0.04}
{'loss': 2.6327, 'learning_rate': 9.642857142857143e-05, 'epoch': 0.04}
{'loss': 2.026, 'learning_rate': 9.613095238095238e-05, 'epoch': 0.04}
{'loss': 3

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.616199254989624, 'eval_runtime': 13.5111, 'eval_samples_per_second': 11.102, 'eval_steps_per_second': 11.102, 'epoch': 0.36}
{'loss': 1.0211, 'learning_rate': 6.428571428571429e-05, 'epoch': 0.36}
{'loss': 2.988, 'learning_rate': 6.398809523809524e-05, 'epoch': 0.36}
{'loss': 2.8994, 'learning_rate': 6.369047619047619e-05, 'epoch': 0.36}
{'loss': 1.7205, 'learning_rate': 6.339285714285714e-05, 'epoch': 0.37}
{'loss': 3.4134, 'learning_rate': 6.30952380952381e-05, 'epoch': 0.37}
{'loss': 1.0706, 'learning_rate': 6.279761904761905e-05, 'epoch': 0.37}
{'loss': 1.9448, 'learning_rate': 6.25e-05, 'epoch': 0.38}
{'loss': 1.3066, 'learning_rate': 6.220238095238095e-05, 'epoch': 0.38}
{'loss': 1.7735, 'learning_rate': 6.19047619047619e-05, 'epoch': 0.38}
{'loss': 2.2413, 'learning_rate': 6.160714285714286e-05, 'epoch': 0.39}
{'loss': 2.628, 'learning_rate': 6.130952380952381e-05, 'epoch': 0.39}
{'loss': 3.477, 'learning_rate': 6.101190476190477e-05, 'epoch': 0.39}
{'loss': 2.62

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.4516303539276123, 'eval_runtime': 13.1963, 'eval_samples_per_second': 11.367, 'eval_steps_per_second': 11.367, 'epoch': 0.71}
{'loss': 2.4676, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.71}
{'loss': 3.6556, 'learning_rate': 2.8273809523809523e-05, 'epoch': 0.72}
{'loss': 0.055, 'learning_rate': 2.797619047619048e-05, 'epoch': 0.72}
{'loss': 1.7965, 'learning_rate': 2.767857142857143e-05, 'epoch': 0.72}
{'loss': 2.4816, 'learning_rate': 2.7380952380952383e-05, 'epoch': 0.73}
{'loss': 1.4736, 'learning_rate': 2.7083333333333332e-05, 'epoch': 0.73}
{'loss': 3.6722, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.73}
{'loss': 4.3311, 'learning_rate': 2.648809523809524e-05, 'epoch': 0.73}
{'loss': 4.3169, 'learning_rate': 2.6190476190476192e-05, 'epoch': 0.74}
{'loss': 1.6305, 'learning_rate': 2.5892857142857148e-05, 'epoch': 0.74}
{'loss': 2.034, 'learning_rate': 2.5595238095238093e-05, 'epoch': 0.74}
{'loss': 2.6591, 'learning_rate': 2.529761904761905e-05, 'epo

  0%|          | 0/150 [00:00<?, ?it/s]

Best hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 1, 'optim': 'adafactor', 'num_iterations': 1}
Best loss: 2.127291679382324
