In [1]:
import sys
import os

import argparse
import time
import json
from datetime import date

import torch
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint,EarlyStopping

#Transformers
import transformers
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM , AutoTokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import AutoConfig
from transformers import BitsAndBytesConfig
from lightning.pytorch.loggers import TensorBoardLogger

#Dataset
from datasets import load_dataset

#PEFT
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model
from peft import prepare_model_for_kbit_training


import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by torch / transformers / peft are hidden as well — consider narrowing it.
warnings.filterwarnings("ignore")

# Select the 'medium' float32 matmul precision setting (per the PyTorch docs this
# trades some float32 matmul precision for speed on GPUs that support it).
torch.set_float32_matmul_precision('medium')
# Ask PyTorch to release cached CUDA memory it is not currently using.
torch.cuda.empty_cache()

In [2]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. A single summary line is written to stdout with the trainable
        count, the total count and the trainable percentage.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()  # computed once, used for both tallies
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model with no parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    # NOTE(review): the run recorded in this notebook reports ~3.6B total
    # parameters for falcon-7b loaded in 4-bit; numel() presumably counts the
    # packed quantized storage, so "All params" may undercount — confirm.
    print(f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {trainable_pct}")

def tokenize_input(df, tokenizer, tokenizer_chapter_max_length, tokenizer_summary_max_length):
    """
    Tokenize a batch of chapters (wrapped in a summarization prompt) and their summaries.

    Args:
        df: Mapping-like batch with a "chapter" and a "summary_text" entry,
            each an iterable of strings. It is modified in place.
        tokenizer: Callable tokenizer whose result exposes ``input_ids``.
        tokenizer_chapter_max_length: ``max_length`` used for the prompts.
        tokenizer_summary_max_length: ``max_length`` used for the summaries.

    Returns:
        The same ``df`` object with "input_ids" (tokenized prompts) and
        "labels" (tokenized summaries) added.

    NOTE(review): "labels" is padded/truncated to the summary length while
    "input_ids" uses the chapter length, so the two are not aligned
    token-for-token. The Trainer cell in this notebook uses
    DataCollatorForLanguageModeling(mlm=False), which appears to rebuild labels
    from input_ids — so these summary labels may never reach the loss. Confirm.
    """
    instruction = "Summarize the following : \n"
    answer_cue = "\n\nSummary:"

    # Options shared by both tokenizer calls; only max_length differs.
    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")

    # Wrap every chapter as: <instruction><chapter><answer cue>
    prompts = []
    for chapter in df["chapter"]:
        prompts.append(instruction + chapter + answer_cue)

    encoded_prompts = tokenizer(prompts, max_length=tokenizer_chapter_max_length, **shared_kwargs)
    df["input_ids"] = encoded_prompts.input_ids

    encoded_summaries = tokenizer(df["summary_text"], max_length=tokenizer_summary_max_length, **shared_kwargs)
    df["labels"] = encoded_summaries.input_ids

    return df

In [3]:
# --- Filesystem locations ---
cache_dir = "/work/LitArt/cache"   # where downloaded model/tokenizer files are cached
log_path = "/work/LitArt/verma/"   # base directory; extended to a per-run directory below

# --- Model / tokenizer selection ---
base_model_name = "tiiuae/falcon-7b"
tokenizer_name = "tiiuae/falcon-7b"

# --- Token budgets for the prompt (chapter) and the target (summary) ---
tokenizer_chapter_max_length = 1024
tokenizer_summary_max_length = 128

# --- Training parameters ---
batch_size = 1
epochs = 3

# Per-run directory: <base><model name with "/" -> "-">-<YYYY-MM-DD>-<HH:MM:SS>
today = date.today()
run_clock = time.strftime("%H:%M:%S", time.localtime())
log_path = f"{log_path}{base_model_name.replace('/', '-')}-{today}-{run_clock}"

# NOTE(review): `logger` is not passed to anything visible in this notebook — confirm it is still needed.
logger = TensorBoardLogger(log_path, name="my_model")


In [4]:
# `AutoTokenizer` comes from the imports cell and `cache_dir` from the
# configuration cell. They are deliberately not re-declared here, so that
# changing the configuration cell is never silently overridden by this cell.

# --- 4-bit quantization settings (bitsandbytes) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the model weights in 4-bit quantized form
    bnb_4bit_use_double_quant=True,         # nested ("double") quantization for extra memory savings
    bnb_4bit_quant_type="nf4",              # 4-bit NormalFloat, designed for normally distributed weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation on the quantized weights
)

# --- Base model ---
# NOTE(review): trust_remote_code=True runs modeling code shipped with the
# checkpoint repository; the recorded output of this cell shows deprecation
# warnings about that code's checkpointing format — confirm it is still required.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
    cache_dir=cache_dir,
)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, cache_dir=cache_dir)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Gradient checkpointing reduces memory use in the backward pass: only a subset
# of the forward activations is stored and the rest are recomputed on demand.
base_model.gradient_checkpointing_enable()

# Apply PEFT's preprocessing that prepares a k-bit quantized model for training.
base_model = prepare_model_for_kbit_training(base_model)

# --- LoRA adapter configuration ---
config = LoraConfig(
    r=16,                                # rank of the low-rank decomposition (r << min(d, k))
    lora_alpha=32,                       # the update ∆W is scaled by alpha / r
    target_modules=["query_key_value"],  # modules that receive LoRA adapters
    lora_dropout=0.05,                   # dropout on the LoRA layers, to reduce overfitting
    bias="none",                         # 'none', 'all' or 'lora_only'; 'none' leaves biases untouched
    task_type="CAUSAL_LM",               # task type
)

base_model = get_peft_model(base_model, config)

# Report how many parameters remain trainable once the adapters are attached.
print_trainable_parameters(base_model)





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


Trainable params: 4718592 || All params: 3613463424 || Trainable %: 0.13058363808693696


In [5]:
# CSV file backing each split of the chunked chapter/summary dataset.
split_csv_files = {
    'train': "/work/LitArt/data/chunked_dataset/train_dataset_with_summaries.csv",
    'test': "/work/LitArt/data/chunked_dataset/test_dataset_with_summaries.csv",
    'val': "/work/LitArt/data/chunked_dataset/validation_dataset_with_summaries.csv",
}

# Load all three splits through the generic CSV loader.
data = load_dataset('csv', data_files=split_csv_files)

In [6]:
# Display the loaded dataset object (its splits, column names and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 10668
    })
    test: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 1614
    })
    val: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 1548
    })
})

In [7]:
# Raw columns of the train split; they are dropped after tokenization so that
# only the columns added by `tokenize_input` remain.
raw_train_columns = data["train"].column_names

tokenized_dataset = (
    data["train"]
    .shuffle(seed=42)  # fixed seed so Restart & Run All reproduces the same sample order
    .map(
        tokenize_input,
        batched=True,
        fn_kwargs={
            "tokenizer": tokenizer,
            # Use the limits from the configuration cell instead of repeating
            # them here as magic numbers.
            "tokenizer_chapter_max_length": tokenizer_chapter_max_length,
            "tokenizer_summary_max_length": tokenizer_summary_max_length,
        },
    )
    # Kept as a separate step (rather than map(remove_columns=...)) because
    # tokenize_input returns the incoming batch, raw columns included.
    .remove_columns(raw_train_columns)
)

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

In [8]:
training_args = transformers.TrainingArguments(
    # --- Output and bookkeeping ---
    output_dir=log_path,                     # checkpoints and training outputs are written here
    save_total_limit=3,                      # keep at most 3 checkpoints on disk
    logging_steps=5,                         # log the training loss every 5 steps

    # --- Batch composition ---
    per_device_train_batch_size=batch_size,  # samples per device per forward pass
    gradient_accumulation_steps=4,           # accumulate 4 passes per optimizer step (effective batch = 4 x batch_size)
    # (auto_find_batch_size=True could be used instead to search for a batch size that fits.)

    # --- Run length ---
    num_train_epochs=epochs,                 # requested number of epochs
    # NOTE(review): max_steps caps training at 10 optimizer steps; the recorded
    # run stops at global_step=10 (epoch 0.0), so num_train_epochs has no effect
    # while this is set. Confirm this short run is intended.
    max_steps=10,
    # (save_strategy='epoch' could be used to checkpoint once per epoch.)

    # --- Optimization ---
    learning_rate=2e-4,                      # peak learning rate
    optim="paged_adamw_8bit",                # paged 8-bit AdamW variant
    lr_scheduler_type='cosine',              # cosine learning-rate schedule
    warmup_ratio=0.05,                       # fraction of total steps spent warming up

    # --- Precision ---
    # NOTE(review): the quantization config in this notebook computes in bfloat16
    # while this enables fp16 mixed precision — confirm the mismatch is intended.
    fp16=True,
)

In [9]:
# Batch collator for language modeling; mlm=False turns off masked-LM masking.
# NOTE(review): with this collator the "labels" column produced during
# tokenization is presumably replaced by labels derived from input_ids — confirm
# this is the intended training objective.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    args=training_args,
    model=base_model,
    train_dataset=tokenized_dataset,
    data_collator=causal_lm_collator,
)

# Turn the generation cache off during training to silence the related
# warnings. Re-enable it before using the model for inference.
base_model.config.use_cache = False

# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
5,3.5322
10,3.3107


TrainOutput(global_step=10, training_loss=3.4214361190795897, metrics={'train_runtime': 330.2808, 'train_samples_per_second': 0.121, 'train_steps_per_second': 0.03, 'total_flos': 1629627246182400.0, 'train_loss': 3.4214361190795897, 'epoch': 0.0})