In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install trl
!pip install datasets
!pip install numpy==1.22.1
!python3 -m pip install --upgrade h5py

In [None]:
#Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
from huggingface_hub import notebook_login

# Start the interactive Hugging Face login. The call prompts for an access token
# directly in the notebook. Being authenticated is what allows access to private
# models/datasets and, later in this notebook, the uploads requested through
# `push_to_hub=True` and `trainer.push_to_hub()`.
notebook_login()

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import accelerate
import bitsandbytes

In [None]:
# Hub identifiers used in this cell: the published LoRA adapter and the base checkpoint.
adapter_repo = "PrincySinghal991/falcon-7b-sharded-bf16-finetuned-html-code-generation"
model_name = "ybelkada/falcon-7b-sharded-bf16"  # sharded falcon-7b model

# PEFT configuration stored alongside the adapter weights.
config = PeftConfig.from_pretrained(adapter_repo)

# 4-bit loading settings for bitsandbytes:
#   - NF4 quantization type,
#   - double (nested) quantization, i.e. the quantization constants are quantized
#     as well, which saves additional memory,
#   - bfloat16 as the dtype used for computation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Base model: quantized on load, with layers placed on the available devices automatically.
# `trust_remote_code=True` permits running the custom modelling code shipped with the repo.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Wrap the quantized base model with the adapter weights from `adapter_repo`.
# NOTE(review): how `torch_dtype` / `trust_remote_code` are consumed by
# `PeftModel.from_pretrained` depends on the installed PEFT version - confirm.
model = PeftModel.from_pretrained(
    model,
    adapter_repo,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map='auto',
)


In [None]:
# Tokenizer of the base checkpoint. Padding is applied on the left side of each
# sequence, and the repository's custom code is allowed to run while loading.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
    trust_remote_code=True,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# JSON handling for the dataset files.
import json

# Read the records from "data/data03.json" into `data`.
# The `with` block closes the file handle as soon as parsing finishes (or fails),
# instead of leaving it open for the lifetime of the kernel.
with open("data/data03.json", "r") as f:
    data = json.load(f)

In [None]:
# Build two parallel lists from the loaded records: entry k of `prompts` and entry k
# of `responses` come from the same record (its 'prompt' and 'output' keys).
prompts = [record["prompt"] for record in data]
responses = [record["output"] for record in data]

In [None]:
# scikit-learn helper used to split the data.
from sklearn.model_selection import train_test_split

# Hold out 10% of the (prompt, response) pairs for evaluation.
# Shuffling is enabled and seeded (random_state=104) so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    prompts,
    responses,
    test_size=0.1,
    shuffle=True,
    random_state=104,
)

In [None]:
# Hugging Face `datasets` containers.
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

# One Dataset per split. Each has two columns:
#   'Code'   - the responses (y_*),
#   'Prompt' - the prompts (X_*).
d = dict(
    train=Dataset.from_dict({'Code': y_train, 'Prompt': X_train}),
    test=Dataset.from_dict({'Code': y_test, 'Prompt': X_test}),
)

# Bundle both splits under the keys 'train' and 'test'.
webdataset = DatasetDict(d)

In [None]:
# BitsAndBytesConfig is not referenced in this cell; the import is kept as in the original.
from transformers import BitsAndBytesConfig

# LoRA hyperparameters.
lora_rank = 32       # Rank of the low-rank update matrices.
lora_alpha = 32      # Scaling factor applied to the LoRA update.
lora_dropout = 0.05  # Dropout probability inside the LoRA layers.

# Pass the quantized model through PEFT's k-bit training preparation helper.
model = prepare_model_for_kbit_training(model)

# LoRA is attached to the modules named below; bias terms are not trained.
peft_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the LoRA configuration.
# NOTE(review): `model` is already a PeftModel carrying the adapter loaded earlier in
# this notebook, so this adds a new LoRA configuration on top of it - confirm that
# stacking (rather than continuing to train the loaded adapter) is intended.
peft_model = get_peft_model(model, peft_config)

In [None]:
# Training hyperparameters. These variables are the single source of truth: they are
# passed to TrainingArguments below instead of being repeated there as literals.
output_dir = "webcraft_falcon"       # Directory for checkpoints and other training outputs.
per_device_train_batch_size = 2      # Batch size of 16 or 32 might give better results.
gradient_accumulation_steps = 2      # Gradients are summed over 2 batches before each optimizer
                                     # update (effective per-device batch of 2 * 2 = 4).
optim = "paged_adamw_32bit"          # Paged AdamW variant, for better memory management.
save_strategy = "steps"              # Save checkpoints on a step schedule.
save_steps = 20                      # Number of update steps between two checkpoint saves.
logging_steps = 20                   # Number of update steps between two logs.
learning_rate = 2e-4                 # Peak learning rate for the optimizer.
max_grad_norm = 0.3                  # Gradient-clipping threshold, against exploding gradients.
max_steps = 320                      # Total number of training steps.
warmup_ratio = 0.03                  # Fraction of the training steps spent on linear warmup
                                     # from 0 to `learning_rate` (roughly 10 of 320 steps).
lr_scheduler_type = "cosine"         # Learning rate decays along a cosine curve after warmup.

# Assemble the trainer configuration from the values above.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    bf16=False,  # bfloat16 mixed-precision training is not enabled.
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,  # Batch together samples of similar length.
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,  # Push the model to the Hugging Face Hub when it is saved.
    tf32=False,  # TF32 (NVIDIA TensorFloat-32) mode is not enabled.
    evaluation_strategy="steps",  # Evaluate on a step schedule ...
    eval_steps=20,  # ... every 20 steps, the same cadence as `save_steps`.
    load_best_model_at_end=True,  # Reload the best checkpoint once training ends.
)

In [None]:
# EarlyStoppingCallback halts training once the monitored metric stops improving;
# it is handed to the trainer in a later cell.
from transformers import EarlyStoppingCallback

# Parse "data/data27.json" into `test_data`.
# The `with` block guarantees the file handle is closed after parsing instead of
# staying open for the lifetime of the kernel.
with open("data/data27.json", "r") as f2:
    test_data = json.load(f2)

In [None]:
def concat_fields(example):
    """Return a new one-key dict holding only the 'Code' value of `example`.

    Args:
        example: Mapping with at least a 'Code' key (e.g. one dataset row).

    Returns:
        ``{'Code': example['Code']}``; any other keys of `example` are not copied.

    Raises:
        KeyError: If `example` has no 'Code' key.
    """
    code_text = example['Code']
    return {'Code': code_text}

# Build 'Code'-only views of both splits.
# `Dataset.map` merges the function's output into each row, so the remaining columns
# (here 'Prompt') have to be dropped explicitly via `remove_columns`; without it the
# mapped datasets would still contain every original column.
# NOTE(review): the trainer cell below is given `webdataset["train"]` /
# `webdataset["test"]` directly, not these two variables.
train_dataset = webdataset["train"].map(
    concat_fields,
    remove_columns=[col for col in webdataset["train"].column_names if col != "Code"],
)
test_dataset = webdataset["test"].map(
    concat_fields,
    remove_columns=[col for col in webdataset["test"].column_names if col != "Code"],
)

# Show the structure (columns and row counts) of both resulting datasets.
print(train_dataset)
print(test_dataset)

In [None]:
# Supervised fine-tuning trainer.
#   model / peft_config   - the LoRA-wrapped model and the LoRA settings it was built with
#   args                  - the TrainingArguments assembled above
#   tokenizer             - tokenizer used to preprocess the text
#   train/eval datasets   - the 'train' and 'test' splits of `webdataset`
#   dataset_text_field    - only the 'Code' column is used as training text
#   max_seq_length        - tokenized inputs are limited to 1024 tokens
#   callbacks             - early stopping with a patience of 3 evaluations
trainer = SFTTrainer(
    model=peft_model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=webdataset["train"],
    eval_dataset=webdataset["test"],
    dataset_text_field="Code",
    max_seq_length=1024,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Cast every sub-module whose qualified name contains "norm" to float32.
# `Module.to` converts the module in place, so the return value is not needed.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module.to(torch.float32)

In [None]:
# Turn off the `use_cache` flag on the model config before training starts.
# NOTE(review): presumably because the generation cache is of no use during training
# - confirm whether it should be re-enabled before the inference cells.
peft_model.config.use_cache = False

# Run the fine-tuning loop configured by `training_arguments`.
trainer.train()

In [None]:
# Training log (training vs. validation loss, recorded every 20 steps):
        
    
# [260/320 1:34:09 < 21:53, 0.05 it/s, Epoch 2/4]
# Step	Training Loss	Validation Loss
# 20	1.140900	1.104002
# 40	1.069200	0.946198
# 60	0.968000	0.840632
# 80	0.659900	0.771222
# 100	0.796700	0.767768
# 120	0.599200	0.737263
# 140	0.542400	0.724135
# 160	0.507300	0.699540
# 180	0.578600	0.697789
# 200	0.606400	0.695403
# 220	0.463400	0.707009
# 240	0.473700	0.701999
# 260	0.496700	0.702401

In [None]:
# Upload the trainer's model and associated files to the Hugging Face Hub.
# `TrainingArguments` was already created with `push_to_hub=True`; this explicit call
# runs once training has finished.
trainer.push_to_hub()

In [None]:
# Device the prompt tensors are moved to before generation (GPU).
DEVICE = "cuda"

# Tokenize the prompt; `return_tensors="pt"` yields PyTorch tensors.
inputs = tokenizer("create a Recreation website for Home - BMW Riders of Oregon", return_tensors="pt")

# Move both tensors to DEVICE. The tokenizer output is created on the CPU, whereas
# the model was loaded with device_map="auto"; previously DEVICE was defined but
# never applied, leaving inputs and model on different devices.
input_ids = inputs["input_ids"].to(DEVICE)                   # Token IDs of the prompt.
input_attention_mask = inputs["attention_mask"].to(DEVICE)   # Attention mask marking valid tokens.

In [None]:
# Inference only: no gradients are tracked while generating.
with torch.no_grad():
    # Generate up to 1000 new tokens for the prompt, stopping early at the tokenizer's
    # end-of-sequence token; the result is returned as a structured output object.
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=input_attention_mask,
        max_new_tokens=1000,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
    )

# Keep only the first sequence of the returned batch (a single prompt was supplied).
generation_output = generation_output.sequences[0]


In [None]:
# Turn the generated token IDs back into text; special tokens are left out of the
# decoded string.
output = tokenizer.decode(generation_output, skip_special_tokens=True)

In [None]:
# Print the decoded text (the prompt followed by the model's continuation).
print(output)

In [None]:
# This cell repeats the earlier prompt-tokenization cell with the same prompt.
# Device the prompt tensors are moved to before generation (GPU).
DEVICE = "cuda"

# Tokenize the prompt; `return_tensors="pt"` yields PyTorch tensors.
inputs = tokenizer("create a Recreation website for Home - BMW Riders of Oregon", return_tensors="pt")

# Move both tensors to DEVICE. The tokenizer output is created on the CPU, whereas
# the model was loaded with device_map="auto"; previously DEVICE was defined but
# never applied, leaving inputs and model on different devices.
input_ids = inputs["input_ids"].to(DEVICE)                   # Token IDs of the prompt.
input_attention_mask = inputs["attention_mask"].to(DEVICE)   # Attention mask marking valid tokens.

In [None]:
# Inference only: no gradients are tracked while generating.
with torch.no_grad():
    # Generate up to 1000 new tokens for the prompt, stopping early at the tokenizer's
    # end-of-sequence token; the result is returned as a structured output object.
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=input_attention_mask,
        max_new_tokens=1000,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
    )

# Keep only the first sequence of the returned batch (a single prompt was supplied).
generation_output = generation_output.sequences[0]

In [None]:
# Turn the generated token IDs back into text; special tokens are left out of the
# decoded string.
output = tokenizer.decode(generation_output, skip_special_tokens=True)

In [None]:
# Print the decoded text (the prompt followed by the model's continuation).
print(output)