In [1]:
import os

if "KAGGLE_KERNEL_RUN_TYPE" in os.environ:
    print("Running on Kaggle")
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HUGGINGFACE_API_KEY = user_secrets.get_secret("SKT_HUGGINGFACE_API_KEY")
    # WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
else:
    from dotenv import load_dotenv
    import os
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('SKT_HUGGINGFACE_API_KEY')
    # WANDB_API_KEY = os.getenv("WANDB_API_KEY")

Running on Kaggle


In [3]:
!pip install -q --upgrade datasets

!pip install -q loralib einops
!pip install -q -U bitsandbytes
!pip install -q peft==0.6.0
!pip install -q -U git+https://github.com/huggingface/accelerate.git

In [4]:
from datasets import load_dataset, Dataset
import torch
from torch.optim import AdamW
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import bitsandbytes as bnb
from transformers import AutoTokenizer, get_scheduler, BitsAndBytesConfig, GenerationConfig, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from peft import PeftModel, PeftConfig, prepare_model_for_kbit_training, LoraConfig, get_peft_model

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# API login

In [5]:
# import wandb
# !wandb login WANDB_API_KEY

In [6]:
from huggingface_hub import notebook_login
from huggingface_hub import login
login(token=HUGGINGFACE_API_KEY, write_permission=True)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# commit_log = "Fine tuning on first 10k dataset for 5 epochs, made the model's head as Trainable"
commit_log = "full_run_epoch6_data75k_batch16"
model_save = True
num_epochs = 1
BATCH_SIZE = 16 
n_samples = 75000
checkpoint = "google/flan-t5-large"
initial_learning_rate = 1e-6
peft_combine = True    # turn it false for the first epoch only
patience = 1000
decay_factor = 0.8

# lora_config is effective only when peft_combine = False
lora_config = LoraConfig( r=8, lora_alpha=512, target_modules=['q', 'v'], lora_dropout=0.01, bias="none", task_type= "SEQ_2_SEQ_LM" )#,modules_to_save=["lm_head"])

In [8]:
# dataset = load_dataset("SKT27182/Preprocessed_OpenOrca", streaming=True)
dataset = load_dataset("shirsh10mall/LLM_Instruct_Learning_Project_Preprocessed_Tokenized_Open_Orca_Dataset_Flan_T5", streaming=True)

data = { "input_ids":[], "attention_mask":[], "labels": [], "system_prompt":[], "question":[], "response":[]}

for i, sample in enumerate(dataset["train"]):
    if i >= n_samples:
        break
    for key, value in sample.items():
        if key in data.keys():
            data[key].append(value)
        
open_orca = Dataset.from_dict(data)

Downloading readme:   0%|          | 0.00/886 [00:00<?, ?B/s]

In [9]:
open_orca

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'system_prompt', 'question', 'response'],
    num_rows: 75000
})

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_length

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

512

In [12]:
# if peft_combine:
#     peft_model_id = "shirsh10mall/First_LLM_Project"
#     # peft_model_id = "("SKT27182/Flan_T5_Instruct_Fine_Tuning_QLoRA_Dataset_Open_Orca"

#     config = PeftConfig.from_pretrained(peft_model_id)
#     model = AutoModelForSeq2SeqLM.from_pretrained( config.base_model_name_or_path, return_dict=True, load_in_8bit=True, 
#                                                      device_map={"":0}, trust_remote_code=True, )

#     tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

#     # Load the Lora model
#     model = PeftModel.from_pretrained(model, peft_model_id) #, is_trainable=True)
# else:
#     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map={"":0}, trust_remote_code=True)

#     model.gradient_checkpointing_enable()

#     model = prepare_model_for_kbit_training(model)

#     model = get_peft_model(model, lora_config)

In [13]:
# # Function to compare model layers
# def compare_models(model1, model2):
#     for (name1, param1), (name2, param2) in zip(model1.named_parameters(), model2.named_parameters()):
#         print(param1.shape, param2.shape)
#         print(name1 , "  |  " , name2 , "\n \n")
#         if not torch.equal(param1, param2):
#             print(f"Weights are not the same in layer: {name1}")
#             return False
#     return True

In [14]:
import copy

if peft_combine:
#     peft_model_id = "shirsh10mall/First_LLM_Project"
    peft_model_id = "SKT27182/Flan_T5_Instruct_Fine_Tuning_QLoRA_Dataset_Open_Orca"
    config = PeftConfig.from_pretrained(peft_model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained( config.base_model_name_or_path, return_dict=True, quantization_config=bnb_config,
                                                     device_map={"":0}, trust_remote_code=True, )
    
    model = prepare_model_for_kbit_training(model)

    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

    # Load the Lora model
    model = PeftModel.from_pretrained(model, peft_model_id, is_trainable=True)
    
    original_head = copy.deepcopy(model.lm_head.state_dict())
#     print(original_head)
    
    # adding the fine-tuned head
    lm_head_state_dict = torch.load("/kaggle/input/qlora-llm-instruct-fine-tuning-flan-t5-large/my_model/LLM_Fine_Tuned_Model_NumExample_75000_Epoch_5.pt")
    model.lm_head.load_state_dict(lm_head_state_dict)
        
else:
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map={"":0}, trust_remote_code=True)
    # model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    

try:
    !mkdir /kaggle/working/my_model
except:
    print("Directory Exists")

adapter_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/9.54M [00:00<?, ?B/s]

In [15]:
# model.lm_head.state_dict()

In [16]:
def check_lm_head_same(model, old_head):
    new_head = model.lm_head.state_dict()["weight"]
    old_head = old_head["weight"]
    if not torch.equal(new_head, old_head):
        print("Weights are different, Good to go")
    else:
        print("Weights are same. Try to add the correct lm_head")
        
if peft_combine:
    check_lm_head_same(model, original_head)

Weights are different, Good to go


In [17]:
# def get_base_and_lora_model(peft_model, full_model_config):
#     torch.cuda.empty_cache()
#     full_model_config = peft_model.model.config
    
#     base_model = AutoModelForSeq2SeqLM.from_config(full_model_config).to(device)
#     base_model_dict = base_model.state_dict()
    
#     for name, param in peft_model.model.named_parameters():
        
#         # skip the lora layers
#         if "lora_A" in name or "lora_B" in name:
#             continue
#         name = ".".join(name.split(".")[1:])
#         base_model_dict[name] = param
        
#     base_model.load_state_dict(base_model_dict)
    
#     return base_model

# torch.cuda.empty_cache()
# # out_config
# trr = get_base_and_lora_model(model, base_model_2.config)

In [18]:
# # Function to compare model layers
# def compare_models(model1, model2):
#         if "lora_A" in name1 or "lora_B" in name1:
#         print(param1.shape, param2.shape)
#         print(name1 , "  |  " , name2 , "\n \n")
#             print(param1[:5], param2[:5])
#             return False
#     return True

# compare_models(model.model, trr)

In [19]:
# For encoder-decoder based model (AutoModelForSeq2SeqLM)  for pre_tokenized dataset
tokenized_open_orca_data = open_orca.remove_columns(["system_prompt", "question", "response"])

# setting to format to torch as using torch for training
tokenized_open_orca_data.set_format("torch")

# printing the columns which are present in the tokenized dataset
print(tokenized_open_orca_data.column_names)

# creating dataloader not for Trainer
from torch.utils.data import DataLoader
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, label_pad_token_id=-100, return_tensors="pt")

# train dataloader
train_dataloader = DataLoader( tokenized_open_orca_data, shuffle=True, batch_size=BATCH_SIZE , collate_fn=data_collator , pin_memory=True, prefetch_factor=2 , num_workers=2)

['input_ids', 'attention_mask', 'labels']


In [22]:
print(model.base_model.model.lm_head.weight.requires_grad)
model.base_model.model.lm_head.weight.requires_grad = True
print(model.base_model.model.lm_head.weight.requires_grad)

False
True


The code calculates and prints the number of trainable parameters, the total number of parameters, and the percentage of trainable parameters in the given model. In the provided example, there are 35,258,368 trainable parameters out of a total of 496,102,400 parameters, representing 7.11% of the total parameters.

In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print( f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" )


print_trainable_parameters(model)

trainable params: 35258368 || all params: 496102400 || trainable%: 7.107074668455545


The code verifies the data types used in the model's parameters and calculates the percentage of parameters for each data type. The output shows that approximately 41.66% of the model's parameters are of type `torch.float32`, and about 58.34% of the parameters are of type `torch.uint8`.

In [24]:
# Verifying the datatypes.
dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    if dtype not in dtypes:
        dtypes[dtype] = 0
    dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items():
    total += v
for k, v in dtypes.items():
    print(k, v, v / total)

torch.float32 206695424 0.4166386294442438
torch.uint8 289406976 0.5833613705557562


# Fine Tuning

## Trainer Style 

In [26]:
# trainer.train() 

## Fine Tuning Model | Pytorch Style

In [28]:
from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau

steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch
progress_bar = tqdm(range(num_training_steps))
warmup_steps = int(0.05*num_training_steps)

optimizer = AdamW(model.parameters(), lr=initial_learning_rate)

step_count = 0

# scheduler_exp = ExponentialLR(optimizer, gamma=0.95)
reduce_lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=decay_factor, patience=patience, verbose=True)
# linear_lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=warmup_steps, 
#                                         num_training_steps=num_training_steps)

# best_loss = 10000
# max_patience = 2

history = {"loss":[],"steps":[],"learning_rate":[],"epochs":[]}

model.train()
for epoch in range(num_epochs):
    losses = 0
    for batch in train_dataloader:
        
        batch = {k: v for k, v in batch.items()}
        outputs = model(**batch)
        
        loss = outputs.loss
        loss = torch.mean(loss)
        history["epochs"].append(epoch)
        history["loss"].append(loss.detach().numpy())
        history["steps"].append(step_count)
        
        # Get the current learning rate
        current_lr = optimizer.param_groups[0]['lr']
        history["learning_rate"].append(current_lr)
        
#         # Update the learning rate (warm-up and decay)
#         if step_count < warmup_steps:
#             linear_lr_scheduler.step()
#         else:
#             # Exponential decay: use the ExponentialLR scheduler for the main learning rate schedule
#             scheduler_exp.step()

        
        # Check if the validation loss has improved, if not, increase the patience
#         if loss < best_loss:
#             best_loss = loss
#             patience = 0
#         else:
#             patience += 1
            
#         if patience==max_patience:
#             scheduler_exp.gamma = 1.0

        step_count+=1
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.set_postfix(loss=loss.item())
        progress_bar.update(1)
        losses += loss.detach().numpy()
        
        reduce_lr_scheduler.step(loss)
    # model.save_pretrained("/kaggle/working/")
    print("Epoch: ", epoch, "  |  Loss: ", losses/len(train_dataloader) )
    torch.save(model.lm_head.state_dict(), "/kaggle/working/my_model/LLM_Fine_Tuned_Model_NumExample_"+str(n_samples)+"_Epoch_"+str(6)+".pt")

  0%|          | 0/4688 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch 01340: reducing learning rate of group 0 to 8.0000e-07.
Epoch 02368: reducing learning rate of group 0 to 6.4000e-07.
Epoch 04079: reducing learning rate of group 0 to 5.1200e-07.
Epoch:  0   |  Loss:  1.2402301286267745


In [29]:
# model.push_to_hub("shirsh10mall/First_LLM_Project",  commit_message = "NumExample_"+str(n_samples)+"_Epoch_"+str(epoch))

In [30]:
import pandas as pd
history_df = pd.DataFrame.from_dict(history)
history_df.to_csv("history_df.csv")
history_df

Unnamed: 0,loss,steps,learning_rate,epochs
0,1.3820169,0,1.000000e-06,0
1,1.2483375,1,1.000000e-06,0
2,1.1928674,2,1.000000e-06,0
3,1.2828995,3,1.000000e-06,0
4,1.4154831,4,1.000000e-06,0
...,...,...,...,...
4683,0.85566443,4683,5.120000e-07,0
4684,1.3587308,4684,5.120000e-07,0
4685,1.3001906,4685,5.120000e-07,0
4686,1.3864732,4686,5.120000e-07,0


In [31]:
if model_save:
    model.push_to_hub("SKT27182/Flan_T5_Instruct_Fine_Tuning_QLoRA_Dataset_Open_Orca", commit_message = "NumExample_"+str(n_samples)+"Finetune_Epoch_6")
    login(token=user_secrets.get_secret("SHIRSH_HUGGINGFACE_API_KEY"), write_permission=True)
    model.push_to_hub("shirsh10mall/First_LLM_Project",  commit_message = "NumExample_"+str(n_samples)+"_Epoch_"+str(6))
    

adapter_model.bin:   0%|          | 0.00/9.54M [00:00<?, ?B/s]

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


adapter_model.bin:   0%|          | 0.00/9.54M [00:00<?, ?B/s]

In [32]:
# Empty the cache
torch.cuda.empty_cache()

The code uses the Plotly library to create two line plots: one showing the loss vs. steps and the other showing the learning rate vs. steps. Each data point is represented as a marker colored based on a category. The categories are mapped to colors, such as red, blue, green, orange, and purple. The plots are displayed with appropriate titles and axis labels using the `fig.show()` function.

In [34]:
# # now getting back the model from the DataParallel instance
# model = model.module

# Evaluation

# Prediction

In [36]:
def analyse_zero_shot_model(data, indx, tokenizer, model, peft=False):
    
    prompt = data[indx]["system_prompt"]

    question = data[indx]["question"]
    
    response = data[indx]["response"]
    
    print(f"Input: \n\n{prompt}\n\n{question}")
    print()
    
    print(f"Exact Response: \n\n{response}")
    print()
    
    tokenized_input = tokenizer( f"{prompt}\n\n{question}", padding=False, truncation=True, return_tensors="pt")
    
    device = model.device
    
    if peft:
    
        predicted_response = model.generate(input_ids = tokenized_input["input_ids"].to(device), 
                                            generation_config=GenerationConfig(max_new_tokens=512, num_beams=25, early_stopping=True,
                                                                              min_length=1, num_beam_groups=5, repetition_penalty=1.75,
                                                                              length_penalty=2.25, diversity_penalty=0.25 ))
    else:
        predicted_response = model.generate(input_ids = tokenized_input["input_ids"].to(device))
        
    predicted_output = tokenizer.decode(predicted_response[0], skip_special_tokens=True)
        
    print(f"Fine-tuned Model's Response: \n\n{predicted_output}")
    
index = 1
analyse_zero_shot_model(open_orca, index, tokenizer, model, peft=True)

Input: 

You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.

Sinceitspremierein1842,ithasundergonechanges,abbreviationsandadditions,butmostnotablyinthisbeautifulandcriticallyacclaimedstagedirection,wherethesecondactappearswithnewchoreographybySorellaEnglundandNikolajHübbeandnewmusicbyLouiseAlenius.
Generate a sentence using the above characters:

Exact Response: 

Sorella Englund and Nikolaj Hübbe created new choreography for the second act of the performance, while Louise Alenius composed new music for it.





Fine-tuned Model's Response: 

Since the production premiered in 1842, it has undergone changes, modifications, and additions, but most notably in this beautiful and critically acclaimed stage direction, where the second act appears with new choreography by Sorella Englund and Nikolalaj Hübbe, and new music by Louise Alenius.


In [37]:
import numpy as np
number_exmaples_for_inference=10
for yo in range(number_exmaples_for_inference):
    index = np.random.randint(0,n_samples)
    print("example index:", index)
    analyse_zero_shot_model(open_orca, index, tokenizer, model, peft=True)
    print("\n\n\n\n  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- ")

example index: 26967
Input: 

You are an AI assistant. Provide a detailed answer so user don’t need to search outside to understand the answer.

Please answer the following question: What is the missing first step of the following process:  -  Carbon dioxide from the burning fossil fuels enter the atmosphere - The make-up of the atmosphere changes - Some energy from the sun cannot escape back through the changed atmosphere - The earth begins to heat because of the extra trapped energy - Human beings cut down trees - Human beings do not re-plant the trees - Trees cannot help take the extra carbon dioxide from the atmosphere - The extra trapped energy starts to change the earth&#x27;s short term weather - Eventually the long-term climate patterns start to change -
Answer:

Exact Response: 

The missing first step in the given process is the burning of fossil fuels. The process starts with the burning of fossil fuels, which releases carbon dioxide into the atmosphere. This carbon dioxide 