In [1]:
import os
import random
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from llama import BasicModelRunner
from utilities import tokenize_and_split_data
from transformers.trainer_callback import TrainerCallback
import matplotlib.pyplot as plt


class MetricsCollector(TrainerCallback):
    """
    Callback that records every log payload emitted during training.

    Each dictionary passed to `on_log` is stored, in arrival order, in the
    `metrics` list. That list can afterwards be used to plot training loss,
    learning rate, and any other logged value.
    """

    def __init__(self):
        super().__init__()
        # One dict per `on_log` call that carried a payload, oldest first.
        self.metrics = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Store a snapshot of the logs received during training.

        Calls without a payload (`logs is None`) are ignored so that every
        entry in `metrics` is a dictionary and consumers can safely call
        `.get()` on it.

        Args:
          args: Arguments passed to the trainer.
          state: State of the trainer.
          control: Control object for the trainer.
          logs: Dictionary containing the logged metrics. (optional)
          **kwargs: Additional keyword arguments.
        """
        if logs is None:
            return
        # Shallow copy: later mutation of the caller's dict must not alter
        # what was recorded here.
        self.metrics.append(dict(logs))


def plot_loss(metrics, output_dir):
    """
    Plots the training loss from the collected metrics and saves the plot.

    Every entry of `metrics` that carries a non-None `loss` value contributes
    one point, in list order; entries that are empty/None or have no `loss`
    are skipped. The figure is written to
    `<output_dir>/training_loss_plot.png`.

    Args:
      metrics: List of dictionaries containing training logs.
      output_dir: Directory to save the plot. Created if it does not exist.
    """
    losses = [
        entry['loss']
        for entry in metrics
        if entry and entry.get('loss') is not None
    ]

    os.makedirs(output_dir, exist_ok=True)

    # Draw on a dedicated figure instead of pyplot's implicit "current"
    # figure, so a figure the caller already has open is neither drawn on
    # nor closed by this function.
    fig, ax = plt.subplots()
    try:
        ax.plot(losses)
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Loss')
        ax.set_title('Training Loss')
        fig.savefig(os.path.join(output_dir, 'training_loss_plot.png'))
    finally:
        # Release the figure even if saving fails.
        plt.close(fig)


def plot_learning_rate(metrics, output_dir):
    """
    Plots the learning rate from the collected metrics and saves the plot.

    Every entry of `metrics` that carries a non-None `learning_rate` value
    contributes one point, in list order; entries that are empty/None or have
    no `learning_rate` are skipped. The figure is written to
    `<output_dir>/learning_rate_plot.png`.

    Args:
      metrics: List of dictionaries containing training logs.
      output_dir: Directory to save the plot. Created if it does not exist.
    """
    learning_rates = [
        entry['learning_rate']
        for entry in metrics
        if entry and entry.get('learning_rate') is not None
    ]

    os.makedirs(output_dir, exist_ok=True)

    # Draw on a dedicated figure instead of pyplot's implicit "current"
    # figure, so a figure the caller already has open is neither drawn on
    # nor closed by this function.
    fig, ax = plt.subplots()
    try:
        ax.plot(learning_rates)
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Learning Rate')
        ax.set_title('Learning Rate')
        fig.savefig(os.path.join(output_dir, 'learning_rate_plot.png'))
    finally:
        # Release the figure even if saving fails.
        plt.close(fig)


def find_best_hyperparameters():
    """
    Grid-search training hyperparameters and return the best combination.

    For every combination in the search space a fresh copy of the pretrained
    model is fine-tuned on the training split and then evaluated on the test
    split; the combination with the lowest `eval_loss` wins. Loss and
    learning-rate plots for each trial are written to
    `<cwd>/SearchGrid/trial_<n>/`, and checkpoints to `./results/trial_<n>/`.

    Returns:
      A tuple `(best_hyperparameters, best_loss)`: the winning hyperparameter
      dictionary and its evaluation loss. If no trial achieves a finite loss
      lower than infinity, this is `(None, float('inf'))`.
    """
    import itertools

    model_name = "EleutherAI/pythia-70m"
    use_hf = False
    current_directory = os.getcwd()
    dataset_path = os.path.join(
        current_directory, "content", "ai-medical-chatbot_processed.jsonl"
    )

    training_config = {
        "model": {
            "pretrained_name": model_name,
            "max_length": 2048
        },
        "datasets": {
            "use_hf": use_hf,
            "path": dataset_path
        },
        "verbose": True
    }
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    # Project helper; presumably returns tokenized (train, test) splits —
    # verify against `utilities.tokenize_and_split_data`.
    train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)

    # Define hyperparameter search space
    # NOTE(review): "num_iterations" is part of the grid (and of the returned
    # dictionary) but is not passed to TrainingArguments below, so it does
    # not currently influence training.
    hyperparameter_space = {
        "learning_rate": [1e-05],
        "num_train_epochs": [1],
        "per_device_train_batch_size": [1],
        "optim": ["adafactor"],
        "num_iterations": [1],
    }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    plots_root = os.path.join(current_directory, "SearchGrid")

    best_hyperparameters = None
    best_loss = float('inf')

    # Generate all combinations of hyperparameters
    all_combinations = itertools.product(*hyperparameter_space.values())

    for trial_index, hyperparameter_values in enumerate(all_combinations, start=1):
        hyperparameters = dict(zip(hyperparameter_space, hyperparameter_values))

        # Print the current hyperparameters
        print("Using hyperparameters:")
        for key, value in hyperparameters.items():
            print(f"{key}: {value}")

        # Load pristine pretrained weights for every trial. Reusing a single
        # model object would make each trial continue from the previous
        # trial's fine-tuned weights, so the losses would not be comparable.
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model.to(device)

        # Setup training_args with the current hyperparameters
        training_args = TrainingArguments(
            learning_rate=hyperparameters["learning_rate"],
            num_train_epochs=hyperparameters["num_train_epochs"],
            per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
            # Separate checkpoint directory per trial so that checkpoints of
            # different trials never mix.
            output_dir=os.path.join("./results", f"trial_{trial_index}"),
            overwrite_output_dir=False,
            disable_tqdm=False,
            eval_steps=120,
            save_steps=120,
            warmup_steps=1,
            per_device_eval_batch_size=1,
            evaluation_strategy="steps",
            logging_strategy="steps",
            logging_steps=1,
            optim=hyperparameters["optim"],
            gradient_accumulation_steps=4,
            gradient_checkpointing=False,
            load_best_model_at_end=True,
            save_total_limit=1,
            metric_for_best_model="eval_loss",
            greater_is_better=False
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset
        )

        metrics_collector = MetricsCollector()
        trainer.add_callback(metrics_collector)

        trainer.train()

        eval_results = trainer.evaluate()

        if eval_results["eval_loss"] < best_loss:
            best_loss = eval_results["eval_loss"]
            best_hyperparameters = hyperparameters

        # Separate plot directory per trial; the plot helpers use fixed file
        # names, so a shared directory would keep only the last trial.
        output_dir = os.path.join(plots_root, f"trial_{trial_index}")
        os.makedirs(output_dir, exist_ok=True)
        plot_loss(metrics_collector.metrics, output_dir)
        plot_learning_rate(metrics_collector.metrics, output_dir)

        # Drop this trial's model before the next one is loaded to keep peak
        # memory down.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return best_hyperparameters, best_loss


# Run the search only when this file is executed directly (script or
# notebook cell, where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    best_hyperparameters, best_loss = find_best_hyperparameters()

    print("Best hyperparameters:", best_hyperparameters)
    print("Best loss:", best_loss)

2024-04-09 10:32:12,721 - DEBUG - utilities - Config: datasets.path: c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl
datasets.use_hf: false
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-70m
verbose: true



tokenize False c:\Blog\How-to-Finetuning-Large-Language-Models\content\ai-medical-chatbot_processed.jsonl


2024-04-09 10:32:13,075 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-59ea57fe03c7d0e8/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-04-09 10:32:13,190 - DEBUG - fsspec.local - open file: C:/Users/066226758/.cache/huggingface/datasets/json/default-59ea57fe03c7d0e8/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json


Using hyperparameters:
learning_rate: 1e-05
num_train_epochs: 1
per_device_train_batch_size: 1
optim: adafactor
num_iterations: 1


  0%|          | 0/337 [00:00<?, ?it/s]

{'loss': 4.8075, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 4.059, 'learning_rate': 9.970238095238096e-06, 'epoch': 0.01}
{'loss': 4.6117, 'learning_rate': 9.940476190476192e-06, 'epoch': 0.01}
{'loss': 3.897, 'learning_rate': 9.910714285714288e-06, 'epoch': 0.01}
{'loss': 4.0117, 'learning_rate': 9.880952380952381e-06, 'epoch': 0.01}
{'loss': 4.0519, 'learning_rate': 9.851190476190477e-06, 'epoch': 0.02}
{'loss': 3.2927, 'learning_rate': 9.821428571428573e-06, 'epoch': 0.02}
{'loss': 3.7339, 'learning_rate': 9.791666666666666e-06, 'epoch': 0.02}
{'loss': 3.4974, 'learning_rate': 9.761904761904762e-06, 'epoch': 0.03}
{'loss': 3.3106, 'learning_rate': 9.732142857142858e-06, 'epoch': 0.03}
{'loss': 3.796, 'learning_rate': 9.702380952380953e-06, 'epoch': 0.03}
{'loss': 3.3639, 'learning_rate': 9.672619047619049e-06, 'epoch': 0.04}
{'loss': 3.4088, 'learning_rate': 9.642857142857144e-06, 'epoch': 0.04}
{'loss': 3.2516, 'learning_rate': 9.61309523809524e-06, 'epoch': 0.04}
{'loss': 3.36

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.2275211811065674, 'eval_runtime': 17.4102, 'eval_samples_per_second': 8.616, 'eval_steps_per_second': 8.616, 'epoch': 0.36}
{'loss': 1.3276, 'learning_rate': 6.4285714285714295e-06, 'epoch': 0.36}
{'loss': 2.7252, 'learning_rate': 6.398809523809524e-06, 'epoch': 0.36}
{'loss': 2.7075, 'learning_rate': 6.369047619047619e-06, 'epoch': 0.36}
{'loss': 1.9005, 'learning_rate': 6.3392857142857145e-06, 'epoch': 0.37}
{'loss': 3.3708, 'learning_rate': 6.30952380952381e-06, 'epoch': 0.37}
{'loss': 1.1787, 'learning_rate': 6.279761904761906e-06, 'epoch': 0.37}
{'loss': 1.9572, 'learning_rate': 6.25e-06, 'epoch': 0.38}
{'loss': 1.494, 'learning_rate': 6.220238095238096e-06, 'epoch': 0.38}
{'loss': 2.0389, 'learning_rate': 6.1904761904761914e-06, 'epoch': 0.38}
{'loss': 2.2449, 'learning_rate': 6.160714285714286e-06, 'epoch': 0.39}
{'loss': 2.406, 'learning_rate': 6.130952380952382e-06, 'epoch': 0.39}
{'loss': 3.184, 'learning_rate': 6.101190476190477e-06, 'epoch': 0.39}
{'loss': 2

  0%|          | 0/150 [00:00<?, ?it/s]

{'eval_loss': 2.127291679382324, 'eval_runtime': 16.3041, 'eval_samples_per_second': 9.2, 'eval_steps_per_second': 9.2, 'epoch': 0.71}
{'loss': 2.1837, 'learning_rate': 2.8571428571428573e-06, 'epoch': 0.71}
{'loss': 2.9419, 'learning_rate': 2.8273809523809524e-06, 'epoch': 0.72}
{'loss': 0.0664, 'learning_rate': 2.797619047619048e-06, 'epoch': 0.72}
{'loss': 1.4968, 'learning_rate': 2.767857142857143e-06, 'epoch': 0.72}
{'loss': 2.0161, 'learning_rate': 2.7380952380952387e-06, 'epoch': 0.73}
{'loss': 1.3777, 'learning_rate': 2.7083333333333334e-06, 'epoch': 0.73}
{'loss': 2.8532, 'learning_rate': 2.6785714285714285e-06, 'epoch': 0.73}
{'loss': 3.6408, 'learning_rate': 2.648809523809524e-06, 'epoch': 0.73}
{'loss': 3.6157, 'learning_rate': 2.6190476190476192e-06, 'epoch': 0.74}
{'loss': 1.4322, 'learning_rate': 2.5892857142857148e-06, 'epoch': 0.74}
{'loss': 1.7301, 'learning_rate': 2.5595238095238095e-06, 'epoch': 0.74}
{'loss': 2.024, 'learning_rate': 2.529761904761905e-06, 'epoch': 

  0%|          | 0/150 [00:00<?, ?it/s]

2024-04-09 10:45:51,851 - DEBUG - matplotlib.pyplot - Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-04-09 10:45:51,853 - DEBUG - matplotlib.pyplot - Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-04-09 10:45:51,858 - DEBUG - matplotlib.font_manager - findfont: Matching sans\-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=10.0.
2024-04-09 10:45:51,859 - DEBUG - matplotlib.font_manager - findfont: score(FontEntry(fname='c:\\Blog\\How-to-Finetuning-Large-Language-Models\\.venv\\lib\\site-packages\\matplotlib\\mpl-data\\fonts\\ttf\\DejaVuSansMono-BoldOblique.ttf', name='DejaVu Sans Mono', style='oblique', variant='normal', weight=700, stretch='normal', size='scalable')) = 11.335
2024-04-09 10:45:51,861 - DEBUG - matplotlib.font_manager - findfont: score(FontEntry(fname='c:\\Blog\\How-to-Finetuning-Large-Language-Models\\.venv\\lib\\site-packages\\matplotlib\\mpl-data\\fonts\\ttf\\cmtt10.ttf', name='cmtt10

Best hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 1, 'optim': 'adafactor', 'num_iterations': 1}
Best loss: 2.127291679382324
