<a href="https://colab.research.google.com/github/shake/colab-Llama-2-ipynb/blob/main/domain_knowledge_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing libraries

In [None]:
!pip install "transformers==4.35" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken"

In [None]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

# Load the dataset from a local file

In [None]:
# List the CSV files in the working directory in long format, oldest first.
!ls -ltr *.csv

-rw-r--r-- 1 root root 128607 Dec  1 12:36 train.csv


In [None]:
# Read the training data from the local train.csv into a pandas DataFrame.
df = pd.read_csv("train.csv")

In [None]:
# Convert the pandas DataFrame into a Hugging Face `Dataset`, the object
# later handed to the trainer as `train_dataset`.
train = Dataset.from_pandas(df)

In [None]:
# Display the dataset summary (feature names and row count).
train

Dataset({
    features: ['text'],
    num_rows: 132
})

# Load the dataset from Huggingface

In [None]:
# !huggingface-cli login

In [None]:
# from datasets import load_dataset, Dataset
# dataset = load_dataset("HuggingFaceH4/no_robots")

In [None]:
# dataset

# Fine-Tuning

In [None]:
# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# Authenticate with the Hugging Face Hub via the CLI.
# NOTE(review): presumably required because the base model repo is gated — confirm.
!huggingface-cli login

In [None]:
# 4-bit bitsandbytes settings used when loading the base model:
# NF4 quantization, no double quantization, float16 compute.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)


In [None]:
%%time
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the EOS token for padding and pad on the right-hand side.
# NOTE(review): presumably the checkpoint defines no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

CPU times: user 132 ms, sys: 14.9 ms, total: 147 ms
Wall time: 319 ms


In [None]:
%%time
# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto")

Downloading config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

CPU times: user 23.2 s, sys: 24.3 s, total: 47.5 s
Wall time: 2min 27s


In [None]:
# LoRA adapter hyperparameters (values follow the QLoRA paper)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Training hyperparameters. The complete argument list is documented at
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
args = TrainingArguments(
    output_dir="llama2-7b",
    seed=42,
    learning_rate=2e-4,
    num_train_epochs=10,            # adjust based on the data size
    per_device_train_batch_size=2,  # use 4 if you have more GPU RAM
    fp16=True,
    save_strategy="epoch",          # alternative: "steps"
    # evaluation_strategy="epoch",
)

In [None]:
# Build the supervised fine-tuning trainer: the loaded base model plus the
# LoRA config, trained on the `text` column of `train` with sequence packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=train,
    # eval_dataset=test,
    dataset_text_field="text",
    packing=True,
    max_seq_length=1042,  # NOTE(review): possibly meant 1024 — confirm
)

In [None]:
# Run the fine-tuning loop; the call returns a TrainOutput summary when done.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss




TrainOutput(global_step=200, training_loss=1.301544189453125, metrics={'train_runtime': 1559.1291, 'train_samples_per_second': 0.847, 'train_steps_per_second': 0.423, 'total_flos': 1.66075578384384e+16, 'train_loss': 1.301544189453125, 'epoch': 9.3})

In [None]:
# Save the trained model locally.
# NOTE(review): presumably written to the trainer's output_dir ('llama2-7b'),
# the path that is reloaded for merging — confirm.
trainer.save_model()

# Merge the base model with the adapters and save the result

Free the memory held by the training objects

In [None]:
import gc

# Empty VRAM: drop the references to the model and the trainer, then force
# garbage collection (twice) so the objects can actually be reclaimed.
del model, trainer
gc.collect()
gc.collect()

In [None]:
# Ask PyTorch to release the GPU memory it is caching but no longer using.
torch.cuda.empty_cache()

In [None]:
# Run the garbage collector once more; the cell displays its return value.
gc.collect()

0

Reload the saved model, merge the adapters into it, and then save the complete merged model.

In [None]:
%%time
from peft import AutoPeftModelForCausalLM

new_model = AutoPeftModelForCausalLM.from_pretrained(
    'llama2-7b',
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 22.5 s, sys: 7.96 s, total: 30.4 s
Wall time: 1min 26s


In [None]:
# Merge the LoRA weights into the base model; the returned object is the
# merged model used for saving and inference below.
merged_model = new_model.merge_and_unload()

In [None]:
# Save the merged model (serialized as safetensors) and the tokenizer into
# the same directory so the pair can be loaded together.
merged_model.save_pretrained("metallama2-7b-tuned-merged", safe_serialization=True)
tokenizer.save_pretrained("metallama2-7b-tuned-merged")

('metallama2-7b-tuned-merged/tokenizer_config.json',
 'metallama2-7b-tuned-merged/special_tokens_map.json',
 'metallama2-7b-tuned-merged/tokenizer.json')

# Test the model

In [None]:
# Prompt used to spot-check the merged model.
prompt = "We introduce Florence-2 as"

In [None]:
# Tokenize the prompt and put the ids on the same device as the model
# (instead of hard-coding CUDA). truncation=True is dropped: with no
# max_length it had no effect other than emitting a warning.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(merged_model.device)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Generate up to 200 new tokens as a continuation of the prompt.
# NOTE(review): do_sample is not set here, so whether `temperature` takes
# effect depends on the model's generation config — confirm.
outputs = merged_model.generate(
    input_ids=input_ids,
    max_new_tokens=200,
    # do_sample=True,
    # top_p=0.9,
    temperature=0.6,
)

In [None]:
# Decode the first (and only) generated sequence. batch_decode accepts the
# output tensor directly, so no detach/cpu/numpy round trip is needed; this
# matches the decoding call used later for the model loaded from the Hub.
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
# Show the generated text.
print(result)

We introduce Florence-2 as a novel large language model
Florence-2
developed by Google. Florence-2 is a transformer-based
Foundation and Large Language Model (LLM) that
model trained on a diverse set of tasks, including text-
generates comprehensive caption, text-to-image and text-
input and text output. It is designed to handle complex
and open-ended tasks, such as answering complex ques-
tions, understanding abstract concepts, and generating text
in a wide range of formats and styles.
Florence-2 is a sequential model that processes input
sequences one token at a time. It takes as input a sequence
of tokens representing text, and produces as output a
sequence of tokens representing the predicted text.
Florence-2 is based on the Transformer architecture [6],
which is a type of encoder-decoder model that uses self-
attention mechanisms


In [None]:
# push merged model to the hub
%%time
hf_model_repo = "genaitraining/llama-2-7b-domain-tuned"
merged_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)

# Load the model from the HF Hub and test it

In [None]:
!pip install bitsandbytes accelerate #restart kernel

In [None]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit bitsandbytes settings for inference: NF4 quantization,
# no double quantization, float16 compute.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository that holds the merged, fine-tuned model
hf_model_repo = "genaitraining/llama-2-7b-domain-tuned"

# Tokenizer stored in the same repository
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)

# Model weights, quantized on load according to `bnb_config`
model = AutoModelForCausalLM.from_pretrained(
    hf_model_repo,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Prompt used to test the model loaded from the Hub.
prompt = "We introduce Florence-2 as"

In [None]:
# Generate response
%%time
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
outputs = model.generate(input_ids=input_ids,
                         max_new_tokens=200,
                         temperature=0.6)

result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print the result
print(f"Generated response:\n{result}")


# Use Transformers Pipeline for Inference

In [None]:
import transformers

# Build a text-generation pipeline for the published model.
# - The tokenizer loaded here is passed in explicitly so the pipeline and the
#   later `eos_token_id` lookup use the same object.
# - torch_dtype/device_map load the weights in float16 with automatic device
#   placement instead of the default full-precision load.
# NOTE(review): the 4-bit `model` loaded earlier may still occupy GPU memory;
# free it first if this load runs out of memory.
tokenizer = AutoTokenizer.from_pretrained("genaitraining/llama-2-7b-domain-tuned",  trust_remote_code=True)
pipeline = transformers.pipeline(
    "text-generation",
    model="genaitraining/llama-2-7b-domain-tuned",
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
%%time
sequences = pipeline(
    prompt,
    temperature=0.6,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)

In [None]:
# Print the generated text of every returned sequence, one per line.
for generated in (seq['generated_text'] for seq in sequences):
    print(generated)