In [1]:
import sys
import os

import argparse
import time
import json
from datetime import date

from pprint import pprint

import torch

#Transformers
import transformers
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM , AutoTokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import AutoConfig
from transformers import BitsAndBytesConfig
from lightning.pytorch.loggers import TensorBoardLogger

#Dataset
from datasets import load_dataset

#PEFT
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model
from peft import prepare_model_for_kbit_training

#SFTT
from trl import SFTTrainer

#NLTK
from nltk.tokenize import word_tokenize


import warnings
warnings.filterwarnings("ignore")

torch.set_float32_matmul_precision('medium')
torch.cuda.empty_cache()
cache_dir = "/work/LitArt/cache" 

In [2]:
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer_name = "meta-llama/Llama-2-7b-hf"
dataset_name = "ccdv/govreport-summarization"

In [3]:
#Define a function to print the number of trainable parameters in the given model
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {100 * trainable_params / all_param}")


In [4]:
dataset = load_dataset(dataset_name,cache_dir=cache_dir)

In [5]:
dataset["train"]

Dataset({
    features: ['report', 'summary'],
    num_rows: 17517
})

In [6]:
def find_length(row):
    row["length"] = len(word_tokenize(row["report"]))
    return row

In [7]:
dataset = dataset.map(find_length)

In [8]:
train_dataset = dataset["train"].filter( lambda row: row["length"] <= 4000) 

In [9]:
train_dataset

Dataset({
    features: ['report', 'summary', 'length'],
    num_rows: 3476
})

In [10]:
#Bnb Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, #4bit quantizaition - load_in_4bit is used to load models in 4-bit quantization 
    bnb_4bit_use_double_quant=True, #nested quantization technique for even greater memory efficiency without sacrificing performance. This technique has proven beneficial, especially when fine-tuning large models
    bnb_4bit_quant_type="nf4", #quantization type used is 4 bit Normal Float Quantization- The NF4 data type is designed for weights initialized using a normal distribution
    bnb_4bit_compute_dtype=torch.bfloat16, #modify the data type used during computation. This can result in speed improvements. 
)

In [11]:
#Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,cache_dir=cache_dir)
# Set the padding token of the tokenizer to its end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token

In [12]:
def tokenize_input(example,tokenizer):

    prompt_start = "Summarize the following government report : \n"
    prompt_end = "\n\nSummary:"

    prompt = [prompt_start + dialogue + prompt_end for dialogue in example["report"]]

    example["input_ids"] = tokenizer(prompt ,max_length=4096 , padding="max_length" , truncation=True , return_tensors="pt").input_ids
    example["labels"] = tokenizer(example["summary"] , max_length=256 , padding="max_length" , truncation=True , return_tensors="pt").input_ids

    return example

In [13]:
tokenized_dataset = train_dataset.shuffle().map(tokenize_input, batched=True, fn_kwargs={"tokenizer": tokenizer})
tokenized_dataset = tokenized_dataset.remove_columns(['report', 'summary', 'length'])

Map:   0%|          | 0/3476 [00:00<?, ? examples/s]

In [14]:
#Loading the base model
base_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                    device_map="auto",
                                                    trust_remote_code=True,
                                                    quantization_config=bnb_config,
                                                    cache_dir=cache_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Enable gradient checkpointing for the model. Gradient checkpointing is a technique used to reduce the memory consumption during the backward pas. Instead of storing all intermediate activations in the forward pass (which is what's typically done to compute gradients in the backward pass), gradient checkpointing stores only a subset of them
base_model.gradient_checkpointing_enable() 

# Prepare the model for k-bit training . Applies some preprocessing to the model to prepare it for training.
base_model = prepare_model_for_kbit_training(base_model)

qlora_config = LoraConfig(
    r=32, #The rank of decomposition r is << min(d,k). The default of r is 8.
    lora_alpha=32,#∆W is scaled by α/r where α is a constant. When optimizing with Adam, tuning α is similar as tuning the learning rate.
    target_modules=["q_proj", "v_proj"], #Modules to Apply LoRA to target_modules. You can select specific modules to fine-tune.
    lora_dropout=0.05,#Dropout Probability for LoRA Layers #to reduce overfitting
    bias="none", #Bias Type for Lora. Bias can be ‘none’, ‘all’ or ‘lora_only’. If ‘all’ or ‘lora_only’, the corresponding biases will be updated during training. 
    task_type= "CAUSAL_LM", #Task Type
    )

base_model_qlora = get_peft_model(base_model, qlora_config)

# Print the number of trainable parameters in the model
print_trainable_parameters(base_model_qlora)

Trainable params: 16777216 || All params: 3517190144 || Trainable %: 0.477006226934315


In [16]:
batch_size = 16
epochs = 2

In [17]:
#Training Arguments
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size = batch_size ,     # Specifies the batch size for training on each device (GPU).
    #auto_find_batch_size=True,      # Uncommenting this would let the library automatically find an optimal batch size.
    gradient_accumulation_steps=2,   # Number of forward and backward passes to accumulate gradients before performing an optimizer step.
    # This effectively multiplies the batch size by this number without increasing memory usage.
    num_train_epochs=epochs,              # Specifies the total number of training epochs.
    learning_rate=2e-4,              # Specifies the learning rate for the optimizer.
    fp16=True,     # Enables mixed precision training (fp16) which can speed up training and reduce memory usage.
    save_total_limit=3,              # Limits the total number of model checkpoints saved. Only the last 3 checkpoints are saved.
    logging_steps=10,                 # Specifies how often to log training updates. 
    output_dir="results/",          # Directory where the model checkpoints and training outputs will be saved.
    # max_steps = 200 ,                 # Limits the total number of training steps. Training will stop after 80 steps regardless of epochs.
    save_strategy='epoch',    # Uncommenting this would change the strategy for saving model checkpoints. 'epoch' means save after each epoch.
    optim="paged_adamw_8bit",     # Specifies the optimizer to use. it's set to a specific variant of AdamW.
    lr_scheduler_type = 'constant_with_warmup',     # Specifies the learning rate scheduler type. 'cosine' means it uses cosine annealing.
    warmup_ratio = 0.05,           # Specifies the ratio of total steps for the learning rate warmup phase.
    report_to="tensorboard"
)

In [18]:
trainer = SFTTrainer(
    model=base_model_qlora,
    train_dataset=tokenized_dataset,
    peft_config=qlora_config,
    max_seq_length=None,
    args=training_arguments,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    packing=True,
)

base_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


In [None]:
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['report', 'summary', 'length'],
        num_rows: 3476
    })
    validation: Dataset({
        features: ['report', 'summary', 'length'],
        num_rows: 155
    })
    test: Dataset({
        features: ['report', 'summary', 'length'],
        num_rows: 169
    })
})