In [1]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
import pandas as pd
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from trl import SFTTrainer, setup_chat_format

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, concatenate_datasets
dataset = load_dataset("SALT-NLP/CultureBank")

combined_dataset  = concatenate_datasets([dataset['tiktok'], dataset['reddit']])
train_test_split = combined_dataset.train_test_split(test_size=0.2)
train_data = train_test_split['train']
test_data = train_test_split['test']

In [4]:
def generate_gemma2_prompt(dataset):
    prompt_list = []
    for i in range(len(dataset['cultural group'])):
        # Extracting necessary fields from the dataset
        cultural_group = dataset['cultural group'][i]
        context = dataset['context'][i]
        topic = dataset['topic'][i]
        eval_question = dataset['eval_question'][i]
        eval_whole_desc = dataset['eval_whole_desc'][i]

        # Creating the prompt as a formatted string for gemma2-9b-it model
        prompt = (
            "System: You are a travel advisor specialized in providing culturally appropriate advice. "
            "You help people understand cultural norms and avoid faux pas when traveling abroad.\n"
            f"User: I'm traveling abroad and have a question regarding {cultural_group} culture in {context}. "
            f"Specifically, I want to know about {topic}. \n\n{eval_question}\n"
            f"Assistant: {eval_whole_desc}"
        )

        prompt_list.append(prompt)

    return prompt_list

In [5]:
BASE_MODEL = "google/gemma-2-2b-it"
bnbConfig = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map = "auto",
    quantization_config=bnbConfig
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
lora_config = LoraConfig(
    r=6,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [7]:
# if torch.cuda.get_device_capability()[0] >= 8:
#     # !pip install -qqq flash-attn
#     torch_dtype = torch.bfloat16
#     attn_implementation = "flash_attention_2"
# else:
#     torch_dtype = torch.float16
#     attn_implementation = "eager"

torch_dtype = torch.float16
attn_implementation = "eager"
    

In [8]:
BASE_MODEL = "google/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto", quantization_config=bnb_config, attn_implementation=attn_implementation)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs_gemma2_2b_it",
        num_train_epochs = 1,
        max_steps=9000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_gemma2_prompt,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/18392 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [10]:
trainer.train()

  0%|          | 0/9000 [00:00<?, ?it/s]

{'loss': 1.3478, 'grad_norm': 0.9412083625793457, 'learning_rate': 0.00019799777530589546, 'epoch': 0.02}
{'loss': 1.0566, 'grad_norm': 0.8407226800918579, 'learning_rate': 0.00019577308120133482, 'epoch': 0.04}
{'loss': 1.0227, 'grad_norm': 0.8355636596679688, 'learning_rate': 0.00019354838709677422, 'epoch': 0.07}
{'loss': 1.0067, 'grad_norm': 0.7948058843612671, 'learning_rate': 0.00019132369299221358, 'epoch': 0.09}
{'loss': 1.0002, 'grad_norm': 0.7955641150474548, 'learning_rate': 0.00018909899888765297, 'epoch': 0.11}
{'loss': 0.9733, 'grad_norm': 0.7029200196266174, 'learning_rate': 0.00018687430478309233, 'epoch': 0.13}
{'loss': 0.9825, 'grad_norm': 0.7755066752433777, 'learning_rate': 0.0001846496106785317, 'epoch': 0.15}
{'loss': 0.9696, 'grad_norm': 0.7542682886123657, 'learning_rate': 0.0001824249165739711, 'epoch': 0.17}
{'loss': 0.9693, 'grad_norm': 0.7301678657531738, 'learning_rate': 0.00018020022246941045, 'epoch': 0.2}
{'loss': 0.9552, 'grad_norm': 0.7631163001060486,

TrainOutput(global_step=9000, training_loss=0.8782309977213542, metrics={'train_runtime': 13290.2325, 'train_samples_per_second': 2.709, 'train_steps_per_second': 0.677, 'total_flos': 1.1500082641619712e+17, 'train_loss': 0.8782309977213542, 'epoch': 1.9573727707698998})

In [11]:
ADAPTER_MODEL = "lora_adapter"

trainer.model.save_pretrained(ADAPTER_MODEL)

In [12]:
!ls -alh lora_adapter

total 30M
drwxrwxr-x 2 baebro baebro 4.0K 10월  3 17:09 .
drwxrwxr-x 5 baebro baebro 4.0K 10월  3 17:09 ..
-rw-rw-r-- 1 baebro baebro  720 10월  3 17:09 adapter_config.json
-rw-rw-r-- 1 baebro baebro  30M 10월  3 17:09 adapter_model.safetensors
-rw-rw-r-- 1 baebro baebro 5.0K 10월  3 17:09 README.md


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)

model = model.merge_and_unload()
# model.save_pretrained('gemma-2b-it-CultureBank')
model.save_pretrained('gemma2-2b-it-CultureBank')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
!ls -alh ./gemma2-2b-it-CultureBank

total 4.9G
drwxrwxr-x 2 baebro baebro 4.0K 10월  3 17:09 .
drwxrwxr-x 6 baebro baebro 4.0K 10월  3 17:09 ..
-rw-rw-r-- 1 baebro baebro  880 10월  3 17:09 config.json
-rw-rw-r-- 1 baebro baebro  187 10월  3 17:09 generation_config.json
-rw-rw-r-- 1 baebro baebro 4.7G 10월  3 17:09 model-00001-of-00002.safetensors
-rw-rw-r-- 1 baebro baebro 230M 10월  3 17:09 model-00002-of-00002.safetensors
-rw-rw-r-- 1 baebro baebro  24K 10월  3 17:09 model.safetensors.index.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
BASE_MODEL = "google/gemma-2-2b-it"
FINETUNE_MODEL = "./gemma2-2b-it-CultureBank"

finetune_model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, add_special_tokens=True)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
pipe_finetuned = pipeline("text-generation", model=finetune_model, tokenizer=tokenizer, max_new_tokens=512)

In [7]:
def format_to_gemma2_prompt_inference(data):
    # 시스템 메시지 포함 없이 사용자 질문 생성
    user_message = (
        f"{data['eval_question']}"
    )

    # 최종 프롬프트 형식
    gemma2_prompt = [
        {"role": "user", "content": user_message},
    ]

    # 프롬프트를 템플릿으로 적용
    prompt = pipe_finetuned.tokenizer.apply_chat_template(gemma2_prompt, tokenize=False, add_generation_prompt=True)
    
    return prompt

In [8]:
messages = format_to_gemma2_prompt_inference(test_data[0])
print(messages)

<bos><start_of_turn>user
I'm planning a gap year trip to Sierra Leone, and I've heard a lot about how friendly and welcoming the people are. I'm really into volunteering and making a difference in the communities I visit. What are some ways I can connect with locals and contribute positively, maybe even help with something like education or community development?<end_of_turn>
<start_of_turn>model



In [28]:
# prompt = pipe_finetuned.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [9]:
outputs = pipe_finetuned(
    messages,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
# print(outputs[0]["generated_text"][len(prompt):])
print(outputs[0]["generated_text"])

<bos><start_of_turn>user
I'm planning a gap year trip to Sierra Leone, and I've heard a lot about how friendly and welcoming the people are. I'm really into volunteering and making a difference in the communities I visit. What are some ways I can connect with locals and contribute positively, maybe even help with something like education or community development?<end_of_turn>
<start_of_turn>model
In Sierra Leone, both locals and tourists are known for their warm hospitality and eagerness to help, creating a welcoming environment for visitors. This includes offering assistance with education, healthcare, and community development, reflecting the strong emphasis on community support and mutual aid. The goal of this behavior is to foster a sense of community and provide support to those in need. In return, visitors and the community are expected to accept help and actively participate in community development efforts. This cultural norm of hospitality and community engagement is widely re

In [10]:
prompt3 = [
    {"role": "user", 
    "content": "Tell me about Korea food culture"},
]
prompt3 = pipe_finetuned.tokenizer.apply_chat_template(prompt3, tokenize=False, add_generation_prompt=True)
print(prompt3)

<bos><start_of_turn>user
Tell me about Korea food culture<end_of_turn>
<start_of_turn>model



In [11]:
outputs = pipe_finetuned(
    prompt3,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
# print(outputs[0]["generated_text"][len(prompt):])
print(outputs[0]["generated_text"])

<bos><start_of_turn>user
Tell me about Korea food culture<end_of_turn>
<start_of_turn>model
Korean cuisine is a vibrant and diverse culinary tradition that has been shaped by influences from China, Japan, and other Asian countries. It is characterized by a rich tapestry of flavors, textures, and ingredients, reflecting the country's history and cultural diversity. Korean food is known for its bold and spicy flavors, often incorporating fermented ingredients like kimchi, gochujang, and doenjang, as well as unique dishes such as bibimbap and bulgogi. The goal of Korean cuisine is to satisfy hunger and provide a unique culinary experience. This culinary tradition is widely regarded as a normative and integral part of Korean culture, with a significant portion of the sampled population embracing and enjoying the diverse flavors and traditions that Korean food offers. 



In [13]:
prompt4 = [
    {"role": "user", 
    "content": "I will visit Korea. Is there anything to know before visiting like public etiquette?"},
]
prompt4 = pipe_finetuned.tokenizer.apply_chat_template(prompt4, tokenize=False, add_generation_prompt=True)
print(prompt4)

<bos><start_of_turn>user
I will visit Korea. Is there anything to know before visiting like public etiquette?<end_of_turn>
<start_of_turn>model



In [14]:
outputs = pipe_finetuned(
    prompt4,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
# print(outputs[0]["generated_text"][len(prompt):])
print(outputs[0]["generated_text"])

<bos><start_of_turn>user
I will visit Korea. Is there anything to know before visiting like public etiquette?<end_of_turn>
<start_of_turn>model
In Korea, it is customary for people to practice cultural norms such as bowing, using chopsticks, and maintaining personal space. These behaviors are deeply ingrained in Korean culture and are widely regarded as standard practices by the sampled population. Additionally, there are specific etiquette rules for using chopsticks, such as not using them for eating rice and using them in a specific manner. These cultural norms are considered a significant part of Korean identity and are highly influential in shaping social interactions. The majority of the sampled population agrees that these practices are essential for maintaining respect and harmony within Korean society. 

