### Upvote if this starter Notebook helps

### User Input: Give me a sky blue color.
### LLM response: #6092ff

# Import Required Library

In [None]:
!pip install -q bitsandbytes accelerate loralib trl 
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install -U datasets

In [1]:
import os
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig,AutoPeftModelForCausalLM,PeftModel
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig,TrainingArguments,pipeline
from trl import SFTTrainer
from time import perf_counter

import warnings
warnings.filterwarnings('ignore')



### Define params

In [2]:
class CFG:
    dataset_id="burkelibbey/colors"
    base_model_id="PY007/TinyLlama-1.1B-Chat-v0.3"
    output_directory="tinyllama-colorist-lora"
    access_token="Paste your key"

cfg = CFG()

### Prepare the dataset using template f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>\n"

---



In [3]:
def format_data(dataset_id):
    data = load_dataset(dataset_id,split="train")
    data_df = data.to_pandas()
    data_df["text"] = data_df[["description", "color"]].apply(lambda x: "<|im_start|>user\n" + x["description"] + " <|im_end|>\n<|im_start|>assistant\n" + x["color"] + "<|im_end|>\n", axis=1)
    data = Dataset.from_pandas(data_df)
    data = data.train_test_split(seed=42, test_size=0.2)
    return data

In [4]:
data = format_data(cfg.dataset_id)
data

Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.38M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['color', 'description', 'text'],
        num_rows: 27109
    })
    test: Dataset({
        features: ['color', 'description', 'text'],
        num_rows: 6778
    })
})

In [5]:
data['train'][0]

{'color': '#20a080',
 'description': "Medium blue-green: This shade is a medium intensity blue-green, somewhat similar to the color of a tropical ocean. It's a cool color that straddles the line between blue and green, but is slightly more on the green side.",
 'text': "<|im_start|>user\nMedium blue-green: This shade is a medium intensity blue-green, somewhat similar to the color of a tropical ocean. It's a cool color that straddles the line between blue and green, but is slightly more on the green side. <|im_end|>\n<|im_start|>assistant\n#20a080<|im_end|>\n"}

### Define tokenizer model

In [6]:
def get_tokenizer_and_model(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(model_id,quantization_config=bnb_config, device_map="auto")
    model.config.use_cache=False
    model.config.pretraining_tp=1
    return model,tokenizer

model,tokenizer = get_tokenizer_and_model(cfg.base_model_id)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

### Finetune TinyLLama

In [7]:
peft_config = LoraConfig(
    r=8,lora_alpha=16,lora_dropout=0.05,bias="none",task_type="CAUSAL_LM"
)
training_arguments = TrainingArguments(
    output_dir=cfg.output_directory,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=1,
    max_steps=200,
    fp16=True,
    push_to_hub=False
)
trainer = SFTTrainer(
        model=model,
        train_dataset=data['train'],
        eval_dataset=data['test'],
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024
    )

Map:   0%|          | 0/27109 [00:00<?, ? examples/s]

Map:   0%|          | 0/6778 [00:00<?, ? examples/s]

### Train

In [8]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,2.6984
20,2.3789
30,2.047
40,1.7977
50,1.6685
60,1.5954
70,1.5087
80,1.4892
90,1.4423
100,1.4444


TrainOutput(global_step=200, training_loss=1.5877786588668823, metrics={'train_runtime': 476.7214, 'train_samples_per_second': 13.425, 'train_steps_per_second': 0.42, 'total_flos': 3073578732158976.0, 'train_loss': 1.5877786588668823, 'epoch': 0.24})

### Evaluate

In [9]:
trainer.evaluate()

{'eval_loss': 1.3607922792434692,
 'eval_runtime': 227.1998,
 'eval_samples_per_second': 29.833,
 'eval_steps_per_second': 3.732,
 'epoch': 0.24}

### Merging base model and peft trained model

In [11]:
model = AutoModelForCausalLM.from_pretrained(cfg.base_model_id,torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)
peft_model = PeftModel.from_pretrained(model,'/kaggle/working/tinyllama-colorist-lora/checkpoint-200',from_transformers=True, device_map={"":0})
model = peft_model.merge_and_unload()

In [12]:
model_id_colorist_final="ssarkar4445/tinyllama-colorist-peft"
model.push_to_hub(model_id_colorist_final,token=cfg.access_token)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(cfg.base_model_id)
tokenizer.push_to_hub(model_id_colorist_final,token=cfg.access_token)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ssarkar4445/tinyllama-colorist-peft/commit/58b88a94bb25d486143811cfa4c685b5b51eb3fc', commit_message='Upload tokenizer', commit_description='', oid='58b88a94bb25d486143811cfa4c685b5b51eb3fc', pr_url=None, pr_revision=None, pr_num=None)

## Model Inference

In [14]:
model_id_colorist_final="ssarkar4445/tinyllama-colorist-peft"


def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"


tokenizer = AutoTokenizer.from_pretrained(model_id_colorist_final)
pipe = pipeline(
    "text-generation",
    model=model_id_colorist_final,
    torch_dtype=torch.float16,
    device_map="auto",
)

start_time = perf_counter()

prompt = formatted_prompt('give me a pure brown color')

sequences = pipe(
    prompt,
    do_sample=True,
    temperature=0.1,
    top_p=0.9,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=12
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

output_time = perf_counter() - start_time
print(f"Time taken for inference: {round(output_time,2)} seconds")

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Result: <|im_start|>user
give me a pure brown color<|im_end|>
<|im_start|>assistant: #806055 

Time taken for inference: 1.91 seconds
