## Installing libraries

In [1]:
!pip install "transformers==4.35" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken"

Collecting transformers==4.35
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.13.0
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [1]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

# Load the dataset from a local file

In [3]:
# Read the local Q&A CSV; the following cells rely on its 'Question' and 'Answer' columns.
df = pd.read_csv("qna_data.csv")

In [4]:
# Build the training text of every row: the question under a "Question:" header,
# an empty line, then the answer under an "Answer:" header.
question_part = 'Question:\n' + df['Question']
answer_part = '\n\nAnswer:\n' + df['Answer']
df['text'] = question_part + answer_part

In [5]:
# Preview the first rows, including the newly added 'text' column.
df.head()

Unnamed: 0,Question,Answer,text
0,1. What is the name of the new vision foundati...,Florence-2,Question:\n1. What is the name of the new visi...
1,2. What is the main objective of the Florence-...,To perform a diversity of tasks with simple in...,Question:\n2. What is the main objective of th...
2,3. How does the Florence-2 model take user ins...,The Florence-2 model takes text-prompt as task...,Question:\n3. How does the Florence-2 model ta...
3,4. What is the output format of the tasks that...,The output format of the tasks that the Floren...,Question:\n4. What is the output format of the...
4,5. Which annotation process do you use to gene...,The annotation process used in the paper is no...,Question:\n5. Which annotation process do you ...


In [7]:
# Show the complete training text of the row labelled 0.
print(df.loc[0, 'text'])

Question:
1. What is the name of the new vision foundation model introduced in the paper?

Answer:
Florence-2


In [8]:
# Keep only the 'text' column; nothing else is used for training.
# Selecting the column (instead of dropping 'Question'/'Answer' in place) also removes
# any other column the CSV may carry and lets this cell be re-run without a KeyError.
df = df[['text']].copy()

In [9]:
# Convert the pandas DataFrame into a Hugging Face `Dataset` for the trainer.
train = Dataset.from_pandas(df)

In [10]:
# Display the dataset summary (features and number of rows).
train

Dataset({
    features: ['text'],
    num_rows: 321
})

# Load the dataset from Huggingface

In [None]:
# !huggingface-cli login

In [None]:
# from datasets import load_dataset, Dataset
# dataset = load_dataset("HuggingFaceH4/no_robots")

In [None]:
# dataset

# Fine-Tuning

In [6]:
# Hub id of the checkpoint to fine-tune; read by the tokenizer and model loading cells below.
model_id = "genaitraining/llama-2-7b-domain-tuned"

In [12]:
# Interactive Hugging Face Hub login; prompts for an access token.
# NOTE(review): pushing the model to the Hub later presumably needs a token with
# write permission — confirm before running the upload cell.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [13]:
# dtype handed to bitsandbytes as the 4-bit compute dtype
compute_dtype = torch.float16

# 4-bit NF4 quantization settings, with double quantization switched off
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)


In [7]:
%%time
# Load the tokenizer that belongs to the `model_id` checkpoint.
# NOTE(review): trust_remote_code=True permits code shipped in the model repository
# to run locally — confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
# (presumably the checkpoint's tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

CPU times: user 76.2 ms, sys: 9.77 ms, total: 86 ms
Wall time: 207 ms


In [15]:
%%time
# Load the pretrained causal LM named by `model_id`, quantized according to `bnb_config`;
# device_map="auto" leaves the device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto")

Downloading config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

CPU times: user 28.1 s, sys: 27.1 s, total: 55.1 s
Wall time: 6min 46s


In [16]:
# LoRA adapter hyper-parameters (chosen after the QLoRA paper)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [17]:
# Training arguments; every available option is documented at
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
args = TrainingArguments(
    output_dir='llama2-7b-tuned-qna',
    seed=42,
    fp16=True,
    learning_rate=2e-4,
    num_train_epochs=10,  # tune to the size of the dataset
    per_device_train_batch_size=2,  # 4 is an option with more GPU RAM
    save_strategy="epoch",  # alternative: "steps"
    # evaluation_strategy="epoch",  # disabled, no evaluation set is passed to the trainer
)

In [18]:
# Supervised fine-tuning trainer: trains LoRA adapters on the 'text' field of `train`,
# packing the examples into fixed-length sequences.
trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=train,
    # eval_dataset=test,
    dataset_text_field='text',
    packing=True,
    # NOTE(review): 1042 may be a typo for 1024 — confirm the intended sequence length.
    max_seq_length=1042,
)



In [19]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss




TrainOutput(global_step=90, training_loss=1.0671097649468315, metrics={'train_runtime': 687.5173, 'train_samples_per_second': 4.669, 'train_steps_per_second': 2.342, 'total_flos': 7058212081336320.0, 'train_loss': 1.0671097649468315, 'epoch': 9.06})

In [20]:
# Save the trained model locally. No path is passed here; presumably the trainer
# writes to the `output_dir` of the training arguments ('llama2-7b-tuned-qna'),
# which is the folder the reload cell reads from — confirm.
trainer.save_model()

# Merge the base model and adapters and save it

Free the memory held by the training objects before reloading the model.

In [21]:
# Drop the references to the training objects so that the memory they hold can be reclaimed.
del model, trainer
import gc
gc.collect()  # first collection pass
gc.collect()  # second pass; its return value is the cell output

0

In [22]:
# Ask PyTorch to release the CUDA memory it has cached.
torch.cuda.empty_cache()

In [23]:
# Run the garbage collector once more; the number it returns is shown as the cell output.
gc.collect()

20730

Reload the saved adapter model and merge it into the base model, so that the whole model can be saved.

In [2]:
%%time
from peft import AutoPeftModelForCausalLM

new_model = AutoPeftModelForCausalLM.from_pretrained(
    'llama2-7b-tuned-qna',
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 24.1 s, sys: 8.19 s, total: 32.3 s
Wall time: 1min 26s


# Test the model

In [18]:
# Question used to sanity-check the fine-tuned model.
# NOTE(review): unlike the training text, this prompt has neither the 'Question:\n'
# prefix nor the '\n\nAnswer:\n' suffix — confirm whether that is intended.
prompt = "What is the output format of the tasks that the Florence-2 model can handle?"
# Ground truth: "The output format of the tasks that the Florence-2 model can handle is text forms, whether it be captioning, object detection, grounding or segmentation."

In [19]:
# Tokenize the prompt and place the ids on the device the model lives on,
# rather than assuming the default CUDA device.
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(new_model.device)

In [20]:
# Generate at most 200 new tokens as a continuation of the prompt.
# NOTE(review): the sampling options are commented out; `temperature` presumably only
# takes effect when sampling is enabled — confirm against the model's generation config.
outputs = new_model.generate(
    input_ids=input_ids,
    max_new_tokens=200,
    # do_sample=True,
    # top_p=0.9,
    temperature=0.6,
)

In [21]:
# Decode the first (and only) generated sequence back to text, dropping special tokens.
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [22]:
# Print the decoded text (it starts with the prompt itself, as the output below shows).
print(result)

What is the output format of the tasks that the Florence-2 model can handle?

Answer:
The output format of the tasks that the Florence-2 model can handle depends on the specific task and the input format used. However, in general, the Florence-2 model can generate text with formats such as:

1. Caption: The output format is a sentence or phrase describing the visual content of an image or video.
2. Text-phrase-region triplets: The output format consists of a sentence or phrase, which acts as the region'

a. Question: What is the input format of the tasks that the Florence-2 model can handle?

Answer:
The input format of the tasks that the Florence-2 model can handle depends on the specific task and the input format used. However, in general, the Florence-2 model can handle a wide range of input formats, including text, images, and videos.


In [23]:
# Merge the LoRA adapter into the base model and keep the resulting model.
merged_model = new_model.merge_and_unload()

In [24]:
# Write the merged model (safetensors serialization) and the tokenizer into one folder.
merged_dir = "metallama2-7b-qa-tuned-merged"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

('metallama2-7b-qa-tuned-merged/tokenizer_config.json',
 'metallama2-7b-qa-tuned-merged/special_tokens_map.json',
 'metallama2-7b-qa-tuned-merged/tokenizer.json')

In [None]:
# !huggingface-cli login

In [25]:
# push merged model to the hub
%%time
hf_model_repo = "genaitraining/llama-2-7b-qna-tuned"
merged_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)

HfHubHTTPError: ignored

# Load the model from the HF Hub and test it

In [None]:
import torch
from transformers import BitsAndBytesConfig

# dtype handed to bitsandbytes as the 4-bit compute dtype
compute_dtype = torch.float16

# 4-bit NF4 quantization settings, with double quantization switched off
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository the merged, fine-tuned model was pushed to
hf_model_repo = "genaitraining/llama-2-7b-qna-tuned"

# Tokenizer stored in that repository
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)

# Model weights, quantized according to `bnb_config`
model = AutoModelForCausalLM.from_pretrained(
    hf_model_repo,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Prompts use the same template as the fine-tuning text:
# 'Question:\n<question>\n\nAnswer:\n' (a newline, not a space, after "Question:").
# prompt = "Question:\nWhat is the name of the new vision foundation model introduced in the paper?\n\nAnswer:\n"
# prompt = "Question:\nHow does the Florence-2 model take user instructions?\n\nAnswer:\n"
# prompt = "Question:\nWhat is the output format of the tasks that the Florence-2 model can handle?\n\nAnswer:\n"
prompt = "Question:\nWhat is the main challenge addressed by the paper?\n\nAnswer:\n"

In [None]:
# Generate response
%%time
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
outputs = model.generate(input_ids=input_ids,
                         max_new_tokens=200,
                         temperature=0.6)

result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print the result
print(f"Generated response:\n{result}")


# Use Transformers Pipeline for Inference

In [None]:
import torch
import transformers

# Checkpoint served by the pipeline.
# NOTE(review): this is the checkpoint the fine-tuning started from, not the
# fine-tuned "genaitraining/llama-2-7b-qna-tuned" repository — confirm which is intended.
pipeline_model_id = "genaitraining/llama-2-7b-domain-tuned"

# Load the tokenizer once and hand it to the pipeline instead of letting the pipeline
# load a second copy; it is also used for `eos_token_id` in the generation cell.
tokenizer = AutoTokenizer.from_pretrained(pipeline_model_id, trust_remote_code=True)

# Load the weights in float16 and let accelerate place them on the available device(s),
# rather than loading the 7B model with the default precision and placement.
pipeline = transformers.pipeline(
    "text-generation",
    model=pipeline_model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
%%time
sequences = pipeline(
    prompt,
    temperature=0.6,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)

In [None]:
# Print the generated text of every returned sequence, one per line.
for generated_text in (seq['generated_text'] for seq in sequences):
    print(generated_text)