<a href="https://colab.research.google.com/github/rezzie-rich/colab-notebooks/blob/main/RB_MisLoROpt_t1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Modularized and Dynamic Training Script for Mistral 7B with LoRA, PEFT, and Hyperparameter Optimization

import copy
import logging
import os

import numpy as np
import optuna
import pandas as pd
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from huggingface_hub import notebook_login, HfFolder
from optuna.integration import HuggingFacePruner
from peft import LoraConfig, prepare_model_for_int8_training, get_peft_model
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForCausalLM, AutoTokenizer, Trainer, TrainerCallback, TrainingArguments, DataCollatorForSeq2Seq, trainer_utils)

# Authenticate with Hugging Face. This runs at import time, before any other
# work; main() later calls push_to_hub, which relies on being logged in.
notebook_login()

# Configure root logging once for the whole script: INFO level and a
# "timestamp - level - message" line format, used by every logging.* call below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Custom Callback for Checkpointing during training
class CustomCheckpointCallback(TrainerCallback):
    """Trainer callback that snapshots the model at a fixed step interval.

    The snapshot is written with ``model.save_pretrained`` to
    ``<output_dir>/checkpoint_<global_step>`` so an earlier state can be
    restored if training degrades.

    Args:
        model: The model object to save; must expose ``save_pretrained``.
        save_every: Number of optimizer steps between snapshots (default 100).
    """

    def __init__(self, model, save_every=100):
        if save_every <= 0:
            raise ValueError("save_every must be a positive integer")
        self.model = model
        self.save_every = save_every

    def on_step_end(self, args, state, control, **kwargs):
        """Save the model whenever ``state.global_step`` hits the interval."""
        if state.global_step % self.save_every == 0:
            self.model.save_pretrained(f'{args.output_dir}/checkpoint_{state.global_step}')

# Function Definitions

def input_with_validation(prompt, type_=None, validation=None, error_msg='Invalid input'):
    """Prompt repeatedly until the user supplies an acceptable value.

    The raw answer is optionally converted with ``type_`` and then checked
    with the ``validation`` predicate. A ``ValueError`` from either step, or a
    falsy predicate result, prints ``error_msg`` and asks again.

    Returns the (converted) value that passed every check.
    """
    while True:
        try:
            candidate = input(prompt)
            if type_:
                candidate = type_(candidate)
            rejected = bool(validation) and not validation(candidate)
        except ValueError:
            # Conversion (or the predicate itself) refused the answer.
            rejected = True

        if not rejected:
            return candidate
        print(error_msg)

def initialize_model_and_tokenizer(model_name):
    """Load the causal-LM model (in bfloat16) and its tokenizer.

    Args:
        model_name: Hub identifier or local path accepted by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: Any loading error is logged and re-raised unchanged.
    """
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The dataset preparation step pads to max_length, which fails when the
        # tokenizer ships without a pad token. Reuse EOS as the padding token
        # in that case.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        return model, tokenizer
    except Exception as e:
        logging.error(f"Error initializing model and tokenizer: {e}")
        raise

def configure_lora(model, r, lora_alpha, target_modules, lora_dropout, bias_config):
    """
    Wraps the model with LoRA adapters.

    Builds a LoraConfig from the given rank, alpha, target modules, dropout and
    bias setting, passes the model through prepare_model_for_int8_training, and
    returns the result of get_peft_model. Any failure is logged and re-raised.
    """
    try:
        adapter_settings = LoraConfig(
            r=r,
            lora_alpha=lora_alpha,
            target_modules=target_modules,
            lora_dropout=lora_dropout,
            bias=bias_config,
        )
        # Run the PEFT preparation helper first, then attach the adapters.
        prepared = prepare_model_for_int8_training(model)
        return get_peft_model(prepared, adapter_settings)
    except Exception as e:
        logging.error(f"Error configuring LoRA: {e}")
        raise

def load_and_prepare_dataset(dataset_name, text_field, response_field=None, local_path=None, tokenizer=None, max_length=512):
    """
    Loads a dataset and tokenizes it for causal-LM training.

    The data comes from a local CSV when ``local_path`` is non-empty, otherwise
    from the ``train`` split of the Hugging Face dataset ``dataset_name``.
    When ``response_field`` is non-empty each example is rendered as
    ``'Prompt: <text>\\nResponse: <response>'``; otherwise the raw text field is used.

    Args:
        dataset_name: Hugging Face dataset identifier (ignored if ``local_path`` is set).
        text_field: Column holding the input text.
        response_field: Optional column holding the response text.
        local_path: Optional path to a CSV file.
        tokenizer: Tokenizer used to encode the text (required).
        max_length: Length every example is truncated/padded to.

    Returns:
        The dataset with ``input_ids``, ``attention_mask`` and ``labels`` columns
        added (original columns are kept).

    Raises:
        ValueError: If no tokenizer is supplied.
        Exception: Any other loading/processing error is logged and re-raised.
    """
    try:
        if tokenizer is None:
            raise ValueError("A tokenizer is required to prepare the dataset")

        if local_path:
            dataset = Dataset.from_pandas(pd.read_csv(local_path))
        else:
            dataset = load_dataset(dataset_name, split='train')

        def preprocess_function(examples):
            # With batched=True, `examples` is a dict mapping column name to a
            # list of values, so the two columns are walked in lockstep.
            if response_field:
                text = [
                    'Prompt: ' + prompt + '\nResponse: ' + response
                    for prompt, response in zip(examples[text_field], examples[response_field])
                ]
            else:
                text = examples[text_field]
            encoded = tokenizer(text, truncation=True, padding='max_length', max_length=max_length)
            # Causal-LM training needs labels. They mirror input_ids, with padded
            # positions set to -100 so they are ignored by the loss.
            encoded['labels'] = [
                [token if keep else -100 for token, keep in zip(ids, mask)]
                for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
            ]
            return encoded

        dataset = dataset.map(preprocess_function, batched=True)
        return dataset
    except Exception as e:
        logging.error(f"Error loading and preparing dataset: {e}")
        raise

def train_model(model, training_args, train_dataset, test_dataset, tokenizer):
    """
    Runs one Hugging Face Trainer training pass and returns the Trainer.

    Batches are assembled by a DataCollatorForSeq2Seq (labels padded with -100,
    lengths padded to a multiple of 8), and a CustomCheckpointCallback is
    registered on the Trainer. Any failure is logged and re-raised.
    """
    try:
        collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            model=model,
            label_pad_token_id=-100,
            pad_to_multiple_of=8,
        )
        checkpointing = CustomCheckpointCallback(model)
        hf_trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            data_collator=collator,
            callbacks=[checkpointing],
        )
        hf_trainer.train()
    except Exception as e:
        logging.error(f"Error during model training: {e}")
        raise
    return hf_trainer

def optimize_hyperparameters(model, tokenizer, datasets_info, base_training_args):
    """
    Searches learning rate, epoch count and batch size with Optuna.

    Each trial copies ``base_training_args`` (so settings such as precision,
    evaluation strategy and warmup are preserved), overrides the three searched
    values, trains on every dataset in ``datasets_info`` in order, and reports
    the ``eval_loss`` of the last trained dataset's held-out split.

    Args:
        model: Model to train in each trial.
        tokenizer: Tokenizer used for dataset preparation and collation.
        datasets_info: Non-empty list of keyword dicts for ``load_and_prepare_dataset``.
        base_training_args: TrainingArguments used as the template for each trial.

    Returns:
        Dict of the best trial's parameters (``learning_rate``,
        ``num_train_epochs``, ``per_device_train_batch_size``).

    Raises:
        ValueError: If ``datasets_info`` is empty.
    """
    if not datasets_info:
        raise ValueError("datasets_info must contain at least one dataset description")

    def objective(trial):
        # Define the hyperparameters to optimize
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
        num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
        per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])

        # Start from a copy of the base arguments so every other setting is
        # carried over, and the caller's object is left untouched.
        training_args = copy.copy(base_training_args)
        training_args.learning_rate = learning_rate
        training_args.num_train_epochs = num_train_epochs
        training_args.per_device_train_batch_size = per_device_train_batch_size

        # NOTE(review): the same model instance is trained in every trial, so
        # weights carry over between trials and the trials are not independent.
        trainer = None
        for dataset_info in datasets_info:
            dataset = load_and_prepare_dataset(**dataset_info, tokenizer=tokenizer)
            # Use the Dataset's own splitter so both halves stay Dataset objects.
            split = dataset.train_test_split(test_size=0.2)
            trainer = train_model(model, training_args, split['train'], split['test'], tokenizer)

        # The objective is the evaluation loss on the last dataset's test split.
        eval_result = trainer.evaluate()
        return eval_result["eval_loss"]

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=10)

    # Log the best hyperparameters
    logging.info(f"Best trial: {study.best_trial.params}")

    return study.best_trial.params

# Experience Replay
def experience_replay(previous_dataset, current_dataset, replay_rate=0.1):
    """
    Mixes a sample of earlier data into the current dataset.

    A ``replay_rate`` fraction of ``previous_dataset`` (taken after a seeded
    shuffle) is appended to ``current_dataset``, and the combined dataset is
    returned after another seeded shuffle. Revisiting earlier examples is
    intended to reduce catastrophic forgetting.
    """
    shuffled_previous = previous_dataset.shuffle(seed=42)
    sample_count = int(replay_rate * len(previous_dataset))
    replayed = shuffled_previous.select(range(sample_count))
    merged = concatenate_datasets([current_dataset, replayed])
    return merged.shuffle(seed=42)

# Main Script Execution
def main():
    """Interactive entry point: configure, optionally tune, train and upload.

    Prompts for the model, LoRA settings, training arguments and dataset
    descriptions; optionally runs the Optuna search and applies its best
    parameters; then trains on each dataset in turn (mixing in replayed
    samples from the previous one), saves the result and pushes it to the Hub.

    Any exception is logged with its traceback and swallowed, so this function
    always returns ``None``.
    """
    try:
        # Model and tokenizer initialization
        model_name = input("Enter the model name (e.g., 'mistralai/Mistral-7B-v0.1'): ")
        model, tokenizer = initialize_model_and_tokenizer(model_name)

        # Configuring LoRA parameters
        # LoRA rank and alpha values should be tuned based on the specific model and training data
        r = input_with_validation("Enter LoRA rank (e.g., 16): ", int)
        lora_alpha = input_with_validation("Enter LoRA alpha (e.g., 32): ", int)
        raw_modules = input("Enter target modules separated by comma (e.g., 'q_proj,k_proj,v_proj,o_proj'): ")
        # Tolerate "q_proj, k_proj" style input: drop surrounding spaces and empty entries.
        target_modules = [name.strip() for name in raw_modules.split(',') if name.strip()]
        lora_dropout = input_with_validation("Enter LoRA dropout rate (e.g., 0.05): ", float)
        bias_config = input("Enter bias configuration ('none' or other): ")
        model = configure_lora(model, r, lora_alpha, target_modules, lora_dropout, bias_config)

        # Ask the user if they want to perform hyperparameter optimization
        perform_optuna_optimization = input("Do you want to perform hyperparameter optimization? (yes/no): ").lower() == 'yes'

        # Training arguments setup
        learning_rate = input_with_validation("Enter learning rate (e.g., 1e-3): ", float)
        num_train_epochs = input_with_validation("Enter number of training epochs (e.g., 5): ", int)
        batch_size = input_with_validation("Enter per device train batch size (e.g., 8): ", int)
        output_dir = input("Enter output directory path: ")
        training_args = TrainingArguments(output_dir=output_dir, learning_rate=learning_rate, num_train_epochs=num_train_epochs, per_device_train_batch_size=batch_size, logging_steps=50, save_steps=100, fp16=True, gradient_checkpointing=True, evaluation_strategy="epoch", warmup_steps=500, weight_decay=0.01, lr_scheduler_type="linear")

        # Prepare datasets information
        num_datasets = input_with_validation("Enter the number of datasets: ", int)
        datasets_info = []
        for i in range(num_datasets):
            dataset_name = input(f"Enter the dataset name for dataset {i+1} (leave blank if using local dataset): ")
            text_field = input(f"Enter the name of the text field in the dataset for dataset {i+1}: ")
            response_field = input(f"Enter the name of the response field in the dataset for dataset {i+1} (leave blank if not applicable): ")
            local_path = input(f"Enter local dataset path for dataset {i+1} (leave blank if using Hugging Face dataset): ")
            datasets_info.append({
                'dataset_name': dataset_name,
                'text_field': text_field,
                'response_field': response_field,
                'local_path': local_path
            })

        if perform_optuna_optimization:
            # Perform hyperparameter optimization
            best_params = optimize_hyperparameters(model, tokenizer, datasets_info, training_args)
            logging.info(f"Optuna optimization completed. Best parameters: {best_params}")

            # Update training_args with best_params
            training_args.learning_rate = best_params.get('learning_rate', training_args.learning_rate)
            training_args.num_train_epochs = best_params.get('num_train_epochs', training_args.num_train_epochs)
            training_args.per_device_train_batch_size = best_params.get('per_device_train_batch_size', training_args.per_device_train_batch_size)

        trainer = None
        previous_dataset = None
        for dataset_info in datasets_info:
            dataset = load_and_prepare_dataset(**dataset_info, tokenizer=tokenizer)
            if previous_dataset is not None:
                dataset = experience_replay(previous_dataset, dataset)
            previous_dataset = dataset

            # Split with the Dataset's own method so both halves remain Dataset
            # objects that the Trainer can consume directly.
            split = dataset.train_test_split(test_size=0.2)

            # Starting the training process
            trainer = train_model(model, training_args, split['train'], split['test'], tokenizer)

        if trainer is None:
            # Zero datasets were requested, so there is nothing to save or upload.
            logging.warning("No datasets were provided; nothing was trained.")
            return

        # Uploading the trained model to Hugging Face for easy access and version control
        trainer.save_model()
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)
            # Keep only the final path component so the upload goes to the
            # logged-in user's namespace rather than the upstream owner's.
            repo_name = model_name.rstrip('/').split('/')[-1]
            model.push_to_hub(f"{repo_name}_trained_model")
    except Exception as e:
        # Top-level boundary: record the full traceback instead of only the message.
        logging.exception(f"An error occurred in the main script: {e}")

# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
