In [14]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner
from transformers.trainer_callback import TrainerCallback
from IPython.display import clear_output

In [15]:
#model_name = "EleutherAI/pythia-160m"
model_name = "EleutherAI/pythia-70m-deduped"
#model_name = "EleutherAI/pythia-70m"
# Get the current directory
current_directory = os.getcwd()
# Join the folder path
folder_path = os.path.join(current_directory, "content")
dataset_name = "ai-medical-chatbot_processed.jsonl"
dataset_path = os.path.join(folder_path, dataset_name)
#dataset_path = f"/content/{dataset_name}"
use_hf = False
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)
base_model = AutoModelForCausalLM.from_pretrained(model_name)
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

2024-04-10 14:42:01,555 - DEBUG - utilities - Config: datasets.path: c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl
datasets.use_hf: false
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-70m-deduped
verbose: true



tokenize False c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl


2024-04-10 14:42:01,931 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-04-10 14:42:01,945 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
2024-04-10 14:42:02,040 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/tmp4ae1ycbg
2024-04-10 14:42:04,738 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/tmpynrhkjug
2024-04-10 14:42:04,762 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-f1c6af33428df321/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/tmpjsdsk3lr


config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

2024-04-10 14:42:38,571 - DEBUG - utilities - Select CPU device


In [16]:
def train_model(hyperparameters):
  max_steps = hyperparameters["max_steps"]
  trained_model_name = f"ai_medical_{max_steps}_steps"
  output_dir = trained_model_name
  training_args = TrainingArguments(
    # Learning rate
    learning_rate=hyperparameters["learning_rate"],

    # Number of training epochs
    num_train_epochs=hyperparameters["num_train_epochs"],

    # Max steps to train for (each step is a batch of data)
    # Overrides num_train_epochs, if not -1
    max_steps=max_steps,

    # Batch size for training
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],

    # Directory to save model checkpoints
    output_dir=output_dir,

    # Other arguments
    overwrite_output_dir=False, # Overwrite the content of the output directory
    disable_tqdm=False, # Disable progress bars
    eval_steps=120, # Number of update steps between two evaluations
    save_steps=120, # After # steps model is saved
    warmup_steps=1, # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1, # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim=hyperparameters["optim"],
    gradient_accumulation_steps = hyperparameters['gradient_accumulation_steps'],
    gradient_checkpointing=False,
    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
  )
  base_model.to(device)
  model_flops = (
    base_model.floating_point_ops(
      {
        "input_ids": torch.zeros(
            (1, training_config["model"]["max_length"])
        )
      }
    )
    * training_args.gradient_accumulation_steps
  )

  #print(base_model)
  print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
  print("Flops", model_flops / 1e9, "GFLOPs")

  trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
  training_output = trainer.train()
  # Evaluate the model
  eval_results = trainer.evaluate()
  
  return eval_results, training_output


In [17]:
hyperparameters={'learning_rate': 1e-06,
'num_train_epochs': 1,
'per_device_train_batch_size': 1,
'optim': 'adafactor',
'num_iterations': 1,
'max_steps':3,
'gradient_accumulation_steps':2}
eval_results, training_output =train_model(hyperparameters)
import itertools

Memory footprint 0.30687256 GB
Flops 1097.833906176 GFLOPs


  0%|          | 0/3 [00:00<?, ?it/s]

2024-04-10 14:42:40,807 - DEBUG - utilities - Step (1) Logs: {'loss': 5.0693, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


{'loss': 5.0693, 'learning_rate': 1e-06, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}


2024-04-10 14:42:41,984 - DEBUG - utilities - Step (2) Logs: {'loss': 5.0176, 'learning_rate': 5e-07, 'epoch': 0.0, 'iter_time': 1.176931619644165, 'flops': 932793280299.4285, 'remaining_time': 1.176931619644165}


{'loss': 5.0176, 'learning_rate': 5e-07, 'epoch': 0.0, 'iter_time': 1.176931619644165, 'flops': 932793280299.4285, 'remaining_time': 1.176931619644165}


2024-04-10 14:42:43,130 - DEBUG - utilities - Step (3) Logs: {'loss': 5.3909, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1616612672805786, 'flops': 945055100912.5088, 'remaining_time': 0.0}
2024-04-10 14:42:43,133 - DEBUG - utilities - Step (3) Logs: {'train_runtime': 3.8223, 'train_samples_per_second': 1.57, 'train_steps_per_second': 0.785, 'total_flos': 91932868608.0, 'train_loss': 5.159242312113444, 'epoch': 0.01, 'iter_time': 1.1631618738174438, 'flops': 943835876061.6521, 'remaining_time': 0.0}


{'loss': 5.3909, 'learning_rate': 0.0, 'epoch': 0.01, 'iter_time': 1.1616612672805786, 'flops': 945055100912.5088, 'remaining_time': 0.0}
{'train_runtime': 3.8223, 'train_samples_per_second': 1.57, 'train_steps_per_second': 0.785, 'train_loss': 5.159242312113444, 'epoch': 0.01, 'iter_time': 1.1631618738174438, 'flops': 943835876061.6521, 'remaining_time': 0.0}


  0%|          | 0/100 [00:00<?, ?it/s]

2024-04-10 14:42:52,253 - DEBUG - utilities - Step (3) Logs: {'eval_loss': 4.671843528747559, 'eval_runtime': 9.1069, 'eval_samples_per_second': 10.981, 'eval_steps_per_second': 10.981, 'epoch': 0.01, 'iter_time': 5.72236967086792, 'flops': 191849525514.74884, 'remaining_time': 0.0}


In [18]:
from tqdm import tqdm

In [19]:
def find_best_hyperparameters():
        best_hyperparameters = None
        best_loss = float('inf')
        # Define hyperparameter search space
        hyperparameter_space = {
        "learning_rate": [6.0e-5, 3.0e-4],
        "num_train_epochs": [1,5],
        "per_device_train_batch_size": [1],
        "optim": ["adafactor"],
        "num_iterations": [1],
        "max_steps": [1,5,10,50],
        "gradient_accumulation_steps": [2],
        }
        # Generate all combinations of hyperparameters
        all_hyperparameters = list(itertools.product(*hyperparameter_space.values()))

        # Assuming all_hyperparameters is a list of hyperparameter combinations
        for hyperparameter_values in tqdm(all_hyperparameters):
            hyperparameters = dict(zip(hyperparameter_space.keys(), hyperparameter_values))
            
            # Evaluate the model
            # Print the current hyperparameters
            print("Using hyperparameters:")
            for key, value in hyperparameters.items():
                print(f"{key}: {value}")
            eval_results, training_output = train_model(hyperparameters)
            clear_output()
            # Check if this set of hyperparameters gives better results
            if eval_results["eval_loss"] < best_loss:
                    best_loss = eval_results["eval_loss"]
                    best_hyperparameters = hyperparameters

        return best_hyperparameters, best_loss

In [20]:
# Call the function to find the best hyperparameters
best_hyperparameters, best_loss = find_best_hyperparameters()

print("Best hyperparameters:", best_hyperparameters)
print("Best loss:", best_loss)

100%|██████████| 16/16 [07:50<00:00, 29.43s/it]

Best hyperparameters: {'learning_rate': 6e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 1, 'optim': 'adafactor', 'num_iterations': 1, 'max_steps': 50, 'gradient_accumulation_steps': 2}
Best loss: 2.6349027156829834



