In [None]:
# Environment setup: install the libraries used by the rest of the notebook.
# transformers, bitsandbytes, accelerate, datasets and wandb are left unpinned;
# peft, torchvision and torchaudio are pinned to explicit versions.
!pip install transformers bitsandbytes accelerate  # we need latest transformers for this
!pip install peft==0.5.0
!pip install -U datasets
# Force locale.getpreferredencoding() to always report "UTF-8".
# NOTE(review): presumably needed so later `!` shell commands do not fail on a
# non-UTF-8 locale in Colab — confirm it is still required.
import locale # colab workaround
locale.getpreferredencoding = lambda: "UTF-8" # colab workaround
!pip install wandb
# torchvision/torchaudio are fetched from the PyTorch cu118 wheel index.
!pip install torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118

In [None]:
from datetime import datetime
import os
import sys
import torch

In [None]:
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [None]:
# notebook_login is the interactive login helper from huggingface_hub.
from huggingface_hub import notebook_login

# Ask for a Hugging Face access token and log this notebook session in to the Hub.
# The token is entered interactively, so nothing secret is stored in the notebook source.
notebook_login()

In [None]:
# Import the load_dataset function from the 'datasets' module, which is part of the Hugging Face 'datasets' library
from datasets import load_dataset

# Load the "train" split of the ttbui/html_alpaca dataset from the Hugging Face Hub.
dataset = load_dataset("ttbui/html_alpaca", split="train")

# Split ONCE and take both subsets from the same result. Calling
# train_test_split() separately for each subset reshuffles the data on every
# call, so rows held out for evaluation could also end up in the training set.
# The fixed seed makes the 90/10 partition reproducible across runs.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)

# 90% of the rows: used for fine-tuning.
train_dataset = dataset_splits["train"]

# Remaining 10%: held out for evaluation, disjoint from train_dataset.
eval_dataset = dataset_splits["test"]


In [None]:
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import bitsandbytes

In [None]:
# Hub identifier of the base checkpoint; reused for both the model and the tokenizer
# so the two can never drift apart.
base_model = "codellama/CodeLlama-7b-hf"

# Load the causal language model with 8-bit weights.
# The quantization settings are passed as a BitsAndBytesConfig through
# `quantization_config` instead of the bare `load_in_8bit` keyword, which
# transformers has deprecated in favour of the config object.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights to reduce memory usage
    torch_dtype=torch.float16,  # half precision for the non-quantized parts
    device_map="auto",  # let the library place layers on the available device(s)
)

# Tokenizer belonging to the same checkpoint (text <-> token ids).
tokenizer = AutoTokenizer.from_pretrained(base_model)


In [None]:
import accelerate

In [None]:
# Baseline prompt for the model before fine-tuning. It follows the exact template
# produced by generate_and_tokenize_prompt, including the blank line before
# "### Input:", so outputs before and after fine-tuning are compared on the same
# prompt format the model is trained on.
eval_prompt = """You are a powerful text-to-HTML-generation model. Your job is to generate code. You are given a question.

You must output the HTML code that answers the question.

### Input:
create a Health website for 4 Common Behavior Disorders in Children

### Response:
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, and no gradient tracking.
model.eval()
with torch.no_grad():
    # Generate at most 500 new tokens after the prompt and take the first
    # (only) sequence of the returned batch.
    generated_ids = model.generate(**model_input, max_new_tokens=500)[0]
    # Decode back to text, dropping special tokens.
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))


In [None]:
# Tokenizer settings applied before the training examples are encoded.

# Padding goes on the left-hand side of a sequence.
tokenizer.padding_side = "left"

# Token id 0 is used as the padding id.
tokenizer.pad_token_id = 0

# Every encoded sequence gets an end-of-sequence token appended.
tokenizer.add_eos_token = True


In [None]:
def tokenize(prompt):
    """Encode ``prompt`` with the notebook-level ``tokenizer`` and attach labels.

    The text is truncated to at most 512 tokens, no padding is added, and no
    tensor conversion is requested (``return_tensors=None``).

    Args:
        prompt: Text to encode.

    Returns:
        The tokenizer's output with an extra ``"labels"`` entry that holds a
        copy of ``"input_ids"``.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,
        "return_tensors": None,
    }
    encoded = tokenizer(prompt, **encoding_options)

    # The targets are the inputs themselves; copy so the two entries do not
    # share one list object.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded


In [None]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset row into the training prompt and tokenize it.

    Args:
        data_point: Mapping with an ``"instruction"`` entry (the request) and
            an ``"output"`` entry (the expected HTML answer).

    Returns:
        Whatever ``tokenize`` returns for the rendered prompt.
    """
    # Template layout: fixed task description, the instruction under
    # "### Input:", and the expected answer under "### Response:".
    prompt_text = f"""You are a powerful text-to-HTML-generation model. Your job is to generate code. You are given a question.

You must output the HTML code that answers the question.

### Input:
{data_point["instruction"]}

### Response:
{data_point["output"]}
"""
    return tokenize(prompt_text)


In [None]:
# Render and tokenize every row of both splits with the same function, so the
# training and validation examples share one prompt format. The training split
# is mapped first, then the evaluation split.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)


In [None]:
# Reload the base checkpoint and its tokenizer for fine-tuning.
# Note: this creates a NEW tokenizer object, so attributes set on the previous
# one (add_eos_token, pad_token_id, padding_side) do not carry over. Datasets
# that were already tokenized are unaffected.
base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # Quantization is configured through BitsAndBytesConfig; the bare
    # `load_in_8bit` keyword is deprecated in transformers.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,  # half precision for the non-quantized parts
    device_map="auto",  # let the library place layers on the available device(s)
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# If the tokenizer has no padding token, register "[PAD]" as one and grow the
# model's embedding table to match the enlarged vocabulary.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))


In [None]:
# Put the model in training mode, then run PEFT's int8 preparation step on it.
model.train()
model = prepare_model_for_int8_training(model)

# LoRA adapter settings: rank-16 adapters (alpha 16, dropout 0.05, no bias
# terms) attached to the four attention projection modules, for a causal
# language-modeling task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapters described by `config`.
model = get_peft_model(model, config)


In [None]:
# Weights & Biases project under which the training run is recorded.
wandb_project = "html-generator"

# Publish the project name through the WANDB_PROJECT environment variable;
# an empty name leaves the environment untouched.
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project


In [None]:
# With two or more CUDA devices visible, mark the model as parallelizable and
# as running in model-parallel mode.
# NOTE(review): these flags are presumably read by the Trainer so it does not
# wrap the model in its own data-parallel wrapper — confirm.
if torch.cuda.device_count() >= 2:
    for flag in ("is_parallelizable", "model_parallel"):
        setattr(model, flag, True)

In [None]:
# Installs wandb again; the first cell of the notebook already runs this same command.
!pip install wandb

In [None]:
import wandb

In [None]:
# Target batch of 128 sequences, processed as micro-batches of 32 per device:
# gradients are accumulated over 128 // 32 = 4 micro-batches.
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size

# Checkpoints and other training outputs are written here.
output_dir = "html-code-llama"

training_args = TrainingArguments(
    # Where results go and how the run is labelled / reported.
    output_dir=output_dir,
    run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    report_to="wandb",
    # Batching.
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    # Optimizer and schedule: AdamW, 100 warmup steps, 400 steps in total.
    optim="adamw_torch",
    learning_rate=3e-4,
    warmup_steps=100,
    max_steps=400,
    fp16=True,
    # Log every 10 steps; evaluate and checkpoint every 20 steps.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
)

# The Trainer ties together the model, both tokenized splits, the arguments
# above, and a collator that pads each batch (to a multiple of 8) and returns
# PyTorch tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)


In [None]:
# Turn off the model's `use_cache` config flag for training.
model.config.use_cache = False

# state_dict() is deliberately NOT monkey-patched here. Replacing it with a
# wrapper around get_peft_model_state_dict() makes model.state_dict() return
# keys that already have the adapter name stripped. PEFT's own save path then
# filters that dict by adapter name a second time, nothing matches, and the
# adapter checkpoints come out empty. PeftModel.save_pretrained() (which the
# Trainer uses for PEFT models) already saves only the adapter weights.

# Compile on PyTorch 2+ when not on Windows. The major version is parsed to an
# int because comparing version strings is lexicographic ("10.0" < "2").
torch_major_version = int(torch.__version__.split(".")[0])
if torch_major_version >= 2 and sys.platform != "win32":
    print("compiling the model")
    # NOTE(review): the Trainer was built from the uncompiled `model` in an
    # earlier cell and keeps its own reference to it, so this only rebinds the
    # notebook-level name `model`.
    model = torch.compile(model)


In [None]:
# Run the fine-tuning loop. The model, datasets, training arguments and data
# collator were all supplied when `trainer` was constructed.
trainer.train()

In [None]:
# Everything this cell needs is imported here, including BitsAndBytesConfig,
# so the cell does not depend on the earlier import cells having been run.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub identifier of the base checkpoint; reused for both the model and the tokenizer.
base_model = "codellama/CodeLlama-7b-hf"

# Load the base causal language model with 8-bit weights.
# Quantization is configured through BitsAndBytesConfig; the bare
# `load_in_8bit` keyword is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,  # half precision for the non-quantized parts
    device_map="auto",  # let the library place layers on the available device(s)
)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
# PeftModel is the PEFT wrapper class used to load saved adapter weights onto a base model.
from peft import PeftModel

# Wrap the base model loaded above with the adapter stored in the local
# "model_code_llama" directory.
# NOTE(review): that directory is written by trainer.save_model(...) and
# model.save_pretrained(...) in cells further down, so one of them must have
# run (in this or an earlier session) before this cell can succeed.
model = PeftModel.from_pretrained(model, "model_code_llama")

In [None]:
# Evaluation prompt for the fine-tuned model. It follows the exact template
# produced by generate_and_tokenize_prompt, including the blank line before
# "### Input:", so inference uses the same format the model was trained on.
eval_prompt = """You are a powerful text-to-HTML-generation model. Your job is to generate code. You are given a question.

You must output the HTML code that answers the question.

### Input:
Create a Health website for Dalmane (Flurazepam): Side Effects, Uses, Dosage, Interactions, Warnings.

### Response:
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, and no gradient tracking.
model.eval()
with torch.no_grad():
    # Generate at most 500 new tokens after the prompt and take the first
    # (only) sequence of the returned batch.
    generated_ids = model.generate(**model_input, max_new_tokens=500)[0]
    # Decode back to text, dropping special tokens.
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))


In [None]:
# Have the Trainer write its model to the local "model_code_llama" directory,
# the same directory name that PeftModel.from_pretrained(...) reads from.
trainer.save_model("model_code_llama")

In [None]:
# Write whatever `model` currently refers to into the "model_code_llama" directory.
# NOTE(review): trainer.save_model(...) in the previous cell targets the same
# directory — confirm which object is meant to be the final saved artifact.
model.save_pretrained("model_code_llama")

# Store the tokenizer files alongside the model in the same directory.
tokenizer.save_pretrained("model_code_llama")

In [None]:
# Evaluation prompt for the fine-tuned model. It follows the exact template
# produced by generate_and_tokenize_prompt, including the blank line before
# "### Input:", so inference uses the same format the model was trained on.
eval_prompt = """You are a powerful text-to-HTML-generation model. Your job is to generate code. You are given a question.

You must output the HTML code that answers the question.

### Input:
Create a Health website for Self-Improvement Strategies for Mental Health.

### Response:
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, and no gradient tracking.
model.eval()
with torch.no_grad():
    # Generate at most 500 new tokens after the prompt and take the first
    # (only) sequence of the returned batch.
    generated_ids = model.generate(**model_input, max_new_tokens=500)[0]
    # Decode back to text, dropping special tokens.
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))


In [None]:
# Same request as the previous generation cell, run again. The prompt follows
# the exact template produced by generate_and_tokenize_prompt, including the
# blank line before "### Input:", so inference uses the format the model was
# trained on.
eval_prompt = """You are a powerful text-to-HTML-generation model. Your job is to generate code. You are given a question.

You must output the HTML code that answers the question.

### Input:
Create a Health website for Self-Improvement Strategies for Mental Health.

### Response:
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, and no gradient tracking.
model.eval()
with torch.no_grad():
    # Generate at most 500 new tokens after the prompt and take the first
    # (only) sequence of the returned batch.
    generated_ids = model.generate(**model_input, max_new_tokens=500)[0]
    # Decode back to text, dropping special tokens.
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))
