In [None]:
!pip install accelerate peft bitsandbytes transformers trl

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

### Load Dataset
Load dataset from https://huggingface.co/datasets/burkelibbey/colors

In [3]:
# load dataset from https://huggingface.co/datasets/burkelibbey/colors?row=0
dataset = "burkelibbey/colors"

data = load_dataset(dataset, split="train")
data_df = data.to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.38M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
data_df.head()

Unnamed: 0,color,description
0,#000000,Pure Black: A shade that completely absorbs li...
1,#000010,Extremely dark blue: This is such a dark shade...
2,#000011,Very dark blue: A nearly black shade of blue t...
3,#000020,Very dark blue: Almost black with a very sligh...
4,#000022,Very dark blue: An almost black color with jus...


In [5]:
print(f"shape:{data_df.shape}")
print(f"max:{max([len(des) for des in data_df['description']])}")
print(f"min:{min([len(des) for des in data_df['description']])}")

shape:(33887, 2)
max:349
min:60


In [6]:
# Reformat the data in the ChatML format

def formatted_train(input,response)->str:
  return f"<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>\n"

# Example
input = 'Pure Black: A shade that completely absorbs light and does not reflect any colors. It is the darkest possible shade.'
response = '#000000'

print(formatted_train(input, response))

<|im_start|>user
Pure Black: A shade that completely absorbs light and does not reflect any colors. It is the darkest possible shade.<|im_end|>
<|im_start|>assistant
#000000<|im_end|>



In [7]:
def prepare_train_data(data_id):
    data = load_dataset(data_id, split="train")
    data_df = data.to_pandas()
    data_df["text"] = data_df[["description", "color"]].apply(lambda x: "<|im_start|>user\n" + x["description"] + " <|im_end|>\n<|im_start|>assistant\n" + x["color"] + "<|im_end|>\n", axis=1)
    data = Dataset.from_pandas(data_df)
    return data

In [8]:
data = prepare_train_data(dataset)

In [9]:
print(data)
print(data[0]['text'])

Dataset({
    features: ['color', 'description', 'text'],
    num_rows: 33887
})
<|im_start|>user
Pure Black: A shade that completely absorbs light and does not reflect any colors. It is the darkest possible shade. <|im_end|>
<|im_start|>assistant
#000000<|im_end|>



### Fetch Model

We will use TinyLlama pretrained model from https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

It has 1.1B parameters, but we won't be finetuning all the parameters. Instead we will use LoRA for fine-tuning

In [10]:
# Fetch TinyLlama pretrained model and tokenizer from
# https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token

  # use 4 bit quantization to load the model
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, quantization_config=bnb_config, device_map="auto"
  )

  model.config.use_cache = False
  model.config.pretraining_tp = 1
  return model, tokenizer

model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

### Set up LoRA

In [12]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [13]:
output_model="sparsh-tinyllama-colorist-v1"
training_args = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        optim="paged_adamw_32bit",
        learning_rate=2e-3,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=500,
        fp16=True,
    )

In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_args,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=2048
)

Map:   0%|          | 0/33887 [00:00<?, ? examples/s]

In [15]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.5985
20,0.9563
30,0.9216
40,0.8961
50,0.8881
60,0.8837
70,0.858
80,0.8637
90,0.8677
100,0.846


TrainOutput(global_step=500, training_loss=0.8314585075378418, metrics={'train_runtime': 389.5948, 'train_samples_per_second': 82.137, 'train_steps_per_second': 1.283, 'total_flos': 2.0458323373326336e+16, 'train_loss': 0.8314585075378418, 'epoch': 0.94})

### Merge LoRA with the base TinyLlama model

In [10]:
from peft import LoraConfig, AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch

In [11]:
model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tinyLlama_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

lora_model_path = "/content/sparsh-tinyllama-colorist-v1/checkpoint-500"

peft_model = PeftModel.from_pretrained(tinyLlama_model, lora_model_path, from_transformers=True, device_map="auto")

merged_model = peft_model.merge_and_unload()

merged_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

### Fine-tuned model Inference

In [12]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

In [13]:
def print_color_space(hex_color):
    def hex_to_rgb(hex_color):
        hex_color = hex_color.lstrip('#')
        return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
    r, g, b = hex_to_rgb(hex_color)
    print(f'{hex_color}: \033[48;2;{r};{g};{b}m           \033[0m')

In [14]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input):

  prompt = formatted_prompt(user_input)

  inputs = tokenizer([prompt], return_tensors="pt")
  generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=13,pad_token_id=tokenizer.eos_token_id
  )
  start_time = perf_counter()

  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

  outputs = merged_model.generate(**inputs, generation_config=generation_config)
  output_time = perf_counter() - start_time
  raw_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print('raw output:')
  print(raw_output)
  print('-----------------------------------')
  color_code = raw_output.split('\n')[3].split('<')[0]
  print(f'color code - {color_code}')
  print_color_space(color_code)

  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [15]:
generate_response(user_input='Dark yellow')

raw output:
<|im_start|>user
Dark yellow<|im_end|>
<|im_start|>assistant
#c0aa11<|im_end|>
-----------------------------------
color code - #c0aa11
#c0aa11: [48;2;192;170;17m           [0m
Time taken for inference: 0.99 seconds


In [16]:
generate_response(user_input='dark red')

raw output:
<|im_start|>user
dark red<|im_end|>
<|im_start|>assistant
#c02233<|im_end|
-----------------------------------
color code - #c02233
#c02233: [48;2;192;34;51m           [0m
Time taken for inference: 0.37 seconds


In [17]:
generate_response(user_input='dark green: Deep, dark lush green color of the rain forest')

raw output:
<|im_start|>user
dark green: Deep, dark lush green color of the rain forest<|im_end|>
<|im_start|>assistant
#207733<|im_end|
-----------------------------------
color code - #207733
#207733: [48;2;32;119;51m           [0m
Time taken for inference: 0.38 seconds
