# Environment Setup

In [22]:
import sys

# Resolve the Hugging Face token from whichever secret store this runtime offers.
if 'kaggle_web_client' not in sys.modules:
    # No Kaggle web client loaded: read the token from the process environment,
    # after pulling in any variables declared in a local .env file.
    from dotenv import load_dotenv
    import os
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')
    # WANDB_API_KEY = os.getenv("WANDB_API_KEY")
else:
    # Kaggle web client is loaded (presumably a Kaggle kernel): use its secret store.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HUGGINGFACE_API_KEY = user_secrets.get_secret("SKT_HUGGINGFACE_API_KEY")
    # WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")

# Installing Required Libraries

In [2]:
!pip install -q accelerate evaluate rouge_score
!pip install -q datasets loralib einops
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git

# Hugging Face login

In [23]:
from huggingface_hub import login, notebook_login

# Authenticate this session against the Hugging Face Hub with the token
# stored in HUGGINGFACE_API_KEY.
login(token=HUGGINGFACE_API_KEY)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Imports


In [4]:
from datasets import load_dataset, Dataset
import evaluate

import torch
import copy

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import Accelerator

import bitsandbytes as bnb
from peft import PeftModel, PeftConfig, prepare_model_for_kbit_training, LoraConfig, get_peft_model

from transformers import AutoTokenizer, get_scheduler, BitsAndBytesConfig, GenerationConfig
from transformers import  AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [5]:
# Use a CUDA device when PyTorch reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Loading the data, tokenizer and PEFT fine-tuned model

In [6]:
# dataset = load_dataset("SKT27182/Preprocessed_OpenOrca", streaming=True)
# Stream the dataset so only the rows consumed below are fetched.
dataset = load_dataset("shirsh10mall/LLM_Instruct_Learning_Project_Preprocessed_Tokenized_Open_Orca_Dataset_Flan_T5", streaming=True)

# The prediction cells at the end of the notebook read rows up to index 899,
# so the in-memory sample must hold at least 900 rows.
n_samples = 1000

# Columns copied from the stream into the in-memory dataset.
columns = ("input_ids", "attention_mask", "labels", "system_prompt", "question", "response")
data = {column: [] for column in columns}

# `take` stops the stream after `n_samples` rows. Reading every column explicitly
# raises a KeyError on a missing column instead of silently building ragged lists.
for sample in dataset["train"].take(n_samples):
    for column in columns:
        data[column].append(sample[column])

open_orca = Dataset.from_dict(data)

In [7]:
# split the data for training and validation purpose
# open_orca = open_orca.train_test_split(train_size=0.9, seed=42)

In [8]:
# peft_model_id = "shirsh10mall/First_LLM_Project"
# # peft_model_id = "SKT27182/Qlora"

# config = PeftConfig.from_pretrained(peft_model_id)
# model = AutoModelForSeq2SeqLM.from_pretrained( config.base_model_name_or_path, return_dict=True, load_in_8bit=True, 
#                                                  device_map={"":0}, trust_remote_code=True, )

# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# # Load the Lora model
# model = PeftModel.from_pretrained(model, peft_model_id)

In [9]:
# NOTE(review): `load_in_8bit=True` selects 8-bit quantization, while the three
# `bnb_4bit_*` options describe a 4-bit (QLoRA-style) setup — confirm which
# precision is intended; the settings are left exactly as they were.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

checkpoint = "google/flan-t5-large"
# LoRA adapters of rank 8 on the `q` and `v` modules.
lora_config = LoraConfig( r=8, lora_alpha=16, target_modules=['q', 'v'], lora_dropout=0.05, bias="none", task_type= "SEQ_2_SEQ_LM" )

# The prediction cells further down need a tokenizer; it was previously only
# created in a commented-out cell, so a fresh run failed with NameError.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the quantized base model onto device 0, then wrap it with the LoRA adapters.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map={"":0}, trust_remote_code=True)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [28]:
import copy
# Keep an independent deep copy of the current model; it is the second operand
# of the weight comparison performed later in the notebook.
tuned_model  = copy.deepcopy(model)

In [8]:
# def load_checkpoint(model,filepath):
#     checkpoint = torch.load(filepath)
#     model.load_state_dict(checkpoint['state_dict'])
# #     for parameter in model.parameters():
# #         parameter.requires_grad = False
# #     model.eval()
#     return model

# tuned_model = load_checkpoint(tuned_model,'/kaggle/input/llm-google-flan-t5-large-qlora-fine-tuning/LLM_Project_model_dict.pth.pth')

In [13]:
# Display the adapter configuration(s) attached to the PEFT-wrapped model.
model.peft_config

{'default': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='google/flan-t5-large', revision=None, task_type='SEQ_2_SEQ_LM', inference_mode=False, r=8, target_modules=['q', 'v'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)}

In [14]:
# Interactive IPython help lookup on the object reached through `model.model`.
model.model?

[0;31mSignature:[0m      [0mmodel[0m[0;34m.[0m[0mmodel[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           T5ForConditionalGeneration
[0;31mString form:[0m   
T5ForConditionalGeneration(
           (shared): Embedding(32128, 1024)
           (encoder): T5Stack(
           (embed_t <...> p=0.1, inplace=False)
           )
           (lm_head): Linear(in_features=1024, out_features=32128, bias=False)
           )
[0;31mFile:[0m           /opt/conda/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py
[0;31mDocstring:[0m     
T5 Model with a `language modeling` head on top.

The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder trans

In [11]:
# Interactive IPython help lookup on the PEFT wrapper itself.
model?

[0;31mSignature:[0m   [0mmodel[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m        PeftModelForSeq2SeqLM
[0;31mString form:[0m
PeftModelForSeq2SeqLM(
           (base_model): LoraModel(
           (model): T5ForConditionalGeneration(
           <...> se)
           )
           (lm_head): Linear(in_features=1024, out_features=32128, bias=False)
           )
           )
           )
[0;31mFile:[0m        /opt/conda/lib/python3.10/site-packages/peft/peft_model.py
[0;31mDocstring:[0m  
Peft model for sequence-to-sequence language modeling.

Args:
    model ([`~transformers.PreTrainedModel`]): Base transformer model.
    peft_config ([`PeftConfig`]): Peft config.


Example:

    ```py
    >>> from transformers import AutoModelForSeq2SeqLM
    >>> from peft import PeftModelForSeq2SeqLM, get_peft_config

    >>> config = {
    ...     "peft_type": "LORA",
    ...     "task_type": "SEQ_2_SEQ_LM

In [19]:
# Interactive IPython help lookup: shows the signature of `push_to_hub`.
model.model.push_to_hub?

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mmodel[0m[0;34m.[0m[0mpush_to_hub[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrepo_id[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_temp_dir[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcommit_message[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprivate[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_shard_size[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;

In [16]:
# Upload the wrapped T5 model itself. Its checkpoint carries the injected LoRA
# tensors under `lora_A` / `lora_B` keys, which a plain `from_pretrained` on this
# repo reports as unused — so the adapters are not restored on reload.
model.model.push_to_hub("shirsh10mall/First_LLM_Project")   # full model LoRA + Base



pytorch_model.bin:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shirsh10mall/First_LLM_Project/commit/9835b6b83ae46184d3093246f6e44e4391bfc1e2', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='9835b6b83ae46184d3093246f6e44e4391bfc1e2', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
# Upload through the PEFT wrapper (intended to publish the LoRA adapter).
# NOTE(review): `use_authentication` does not appear in the `push_to_hub`
# signature displayed earlier in this notebook (which lists `token`) — confirm
# that it has any effect.
model.push_to_hub("SKT27182/Qlora", use_authentication=True)  # LoRA layer

CommitInfo(commit_url='https://huggingface.co/SKT27182/Qlora/commit/f6908d4795f2f095f20cc6cdc2e62fd429693cf2', commit_message='Upload model', commit_description='', oid='f6908d4795f2f095f20cc6cdc2e62fd429693cf2', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
# Reload the uploaded checkpoint as a plain seq2seq model to see what is restored.
new_model = AutoModelForSeq2SeqLM.from_pretrained("shirsh10mall/First_LLM_Project", trust_remote_code=True)  # this is ignoring the LoRA layer's weights

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

Some weights of the model checkpoint at shirsh10mall/First_LLM_Project were not used when initializing T5ForConditionalGeneration: ['decoder.block.7.layer.1.EncDecAttention.v.lora_A.default.weight', 'decoder.block.12.layer.0.SelfAttention.v.lora_A.default.weight', 'decoder.block.13.layer.1.EncDecAttention.v.lora_A.default.weight', 'decoder.block.6.layer.0.SelfAttention.v.lora_B.default.weight', 'decoder.block.16.layer.0.SelfAttention.q.lora_A.default.weight', 'decoder.block.13.layer.0.SelfAttention.v.lora_B.default.weight', 'decoder.block.5.layer.1.EncDecAttention.v.lora_B.default.weight', 'decoder.block.10.layer.0.SelfAttention.q.lora_B.default.weight', 'encoder.block.5.layer.0.SelfAttention.v.lora_A.default.weight', 'encoder.block.11.layer.0.SelfAttention.q.lora_B.default.weight', 'decoder.block.5.layer.0.SelfAttention.v.lora_A.default.weight', 'encoder.block.4.layer.0.SelfAttention.q.lora_A.default.weight', 'decoder.block.17.layer.1.EncDecAttention.v.lora_A.default.weight', 'encoder

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
def compare_model_weights(model1, model2, tolerance=0):
    """Report whether two models hold the same weights.

    Args:
        model1: First model; must expose ``state_dict()``.
        model2: Second model; must expose ``state_dict()``.
        tolerance: Maximum absolute difference allowed per element. The default
            of 0 demands exact equality.

    Returns:
        True when both state dicts have the same keys and every pair of tensors
        matches in shape, dtype and (within ``tolerance``) value; False otherwise.
    """
    state_dict1 = model1.state_dict()
    state_dict2 = model2.state_dict()

    # Different parameter/buffer names means the models cannot be equal.
    if set(state_dict1.keys()) != set(state_dict2.keys()):
        return False

    for key, tensor1 in state_dict1.items():
        tensor2 = state_dict2[key]

        # A shape or dtype mismatch is reported as "different" rather than
        # raising inside torch.allclose.
        if tensor1.shape != tensor2.shape or tensor1.dtype != tensor2.dtype:
            return False

        # Compare on a common device so models placed differently can be checked.
        tensor2 = tensor2.to(tensor1.device)

        # rtol=0 makes `tolerance` the only slack; otherwise the default relative
        # tolerance (1e-5) would let tolerance=0 accept slightly different values.
        if not torch.allclose(tensor1, tensor2, rtol=0, atol=tolerance):
            return False

    return True

# Compare the live model against its stored copy and report the verdict.
are_models_equal = compare_model_weights(model, tuned_model)

verdict = "The models have the same weights." if are_models_equal else "The models have different weights."
print(verdict)

The models have different weights.


In [None]:
# Show whether the LM head weight is flagged as trainable (requires_grad).
print(model.base_model.model.lm_head.weight.requires_grad)

# Zero-shot prediction


In [None]:
def analyse_zero_shot_model(data, indx, tokenizer, model, peft=False, max_tokens=200, num_beams=100):
    """Print one sample and the answer the model generates for it.

    The prompt, question and reference response are printed first, followed by
    the tokenized input and finally the decoded prediction.

    Args:
        data: Indexable collection whose items expose the keys
            "system_prompt", "question" and "response".
        indx: Position of the sample inside ``data``.
        tokenizer: Callable tokenizer returning "input_ids" and "attention_mask";
            must also provide ``convert_ids_to_tokens`` and ``decode``.
        model: Model exposing ``.device`` and ``.generate(**kwargs)``.
        peft: When True, generate with beam search (``num_beams`` beams);
            otherwise use the model's default decoding strategy.
        max_tokens: Upper bound on newly generated tokens. Applied in both
            modes, so the two settings are compared at the same output length.
        num_beams: Beam width used when ``peft`` is True.

    Returns:
        None. All results are written to stdout.
    """
    sample = data[indx]
    prompt = sample["system_prompt"]
    question = sample["question"]
    response = sample["response"]

    print("Prompt:")
    print(prompt)
    print()

    print('Question:')
    print(question)
    print()

    print("Response")
    print(response)
    print()

    tokenized_input = tokenizer( prompt + " " + question, padding=False, truncation=True, return_tensors="pt")

    print("Tokenized Input: Prompt + Question")
    print(tokenized_input)
    print()

    print(tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0]))
    print()

    print("Zero-shot Prediction:")

    device = model.device

    # Keyword arguments only; the attention mask is passed explicitly so
    # generation does not have to infer it from the input ids.
    generate_kwargs = {
        "input_ids": tokenized_input["input_ids"].to(device),
        "attention_mask": tokenized_input["attention_mask"].to(device),
    }
    if peft:
        generate_kwargs["generation_config"] = GenerationConfig(max_new_tokens=max_tokens, num_beams=num_beams)
    else:
        # Previously max_tokens was ignored here and the model's default length applied.
        generate_kwargs["max_new_tokens"] = max_tokens

    predicted_response = model.generate(**generate_kwargs)

    predicted_output = tokenizer.decode(predicted_response[0], skip_special_tokens=True)
    print(predicted_output)
    
    
# Run the analysis on sample 34 using the beam-search (peft=True) generation path.
index = 34

analyse_zero_shot_model(open_orca, index, tokenizer, model, peft=True, max_tokens=200)

In [None]:
# Same analysis for sample 110, with the function's default max_tokens.
index = 110
analyse_zero_shot_model(open_orca, index, tokenizer, model, peft=True)

In [None]:
# Same analysis for sample 899, allowing up to 300 new tokens.
# NOTE(review): requires `open_orca` to hold at least 900 rows — check this
# against the number of samples loaded into it.
index = 899
analyse_zero_shot_model(open_orca, index, tokenizer, model, peft=True, max_tokens=300)

In [None]:
# Same analysis for sample 655, with the function's default max_tokens.
# NOTE(review): requires `open_orca` to hold at least 656 rows — check this
# against the number of samples loaded into it.
index = 655
analyse_zero_shot_model(open_orca, index, tokenizer, model, peft=True)

---