In [1]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
import pandas as pd
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from trl import SFTTrainer, setup_chat_format

In [2]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; this renders an interactive login widget as
# the cell output.
# NOTE(review): presumably needed because the Gemma base checkpoint is gated —
# confirm, and consider a token from the environment for non-interactive runs.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, concatenate_datasets

# Fixed seed for the shuffled split so that the train/test partition -- and
# therefore test_data[0], which is used to build the sample prompt -- is the
# same on every Restart & Run All.
SPLIT_SEED = 42

dataset = load_dataset("SALT-NLP/CultureBank")

# Merge the two source splits into a single dataset before partitioning.
combined_dataset = concatenate_datasets([dataset['tiktok'], dataset['reddit']])

# Hold out 20% of the combined rows as the test partition.
train_test_split = combined_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_data = train_test_split['train']
test_data = train_test_split['test']

In [4]:
# BASE_MODEL: Hub id whose tokenizer is loaded below.
# FINETUNE_MODEL: local directory the fine-tuned weights are loaded from.
BASE_MODEL = "google/gemma-2-2b-it"
FINETUNE_MODEL = "./gemma2-2b-it-CultureBank"

# Weights come from the fine-tuned checkpoint; device placement is left to
# device_map="auto".
finetune_model = AutoModelForCausalLM.from_pretrained(
    FINETUNE_MODEL,
    device_map="auto",
)
# The tokenizer is taken from the base checkpoint, not the fine-tuned directory.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Text-generation pipeline around the fine-tuned model and base tokenizer;
# max_new_tokens=512 is set here as the pipeline-level generation limit.
pipe_finetuned = pipeline(
    task="text-generation",
    model=finetune_model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)

In [6]:
def format_to_gemma2_prompt_inference(data, tokenizer=None):
    """Render a record's evaluation question as a single-turn chat prompt.

    Args:
        data: Mapping-like record; only ``data['eval_question']`` is read.
        tokenizer: Object exposing ``apply_chat_template``. When omitted, the
            module-level ``pipe_finetuned.tokenizer`` is looked up at call
            time, so existing one-argument calls behave as before.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` when called
        with ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    if tokenizer is None:
        tokenizer = pipe_finetuned.tokenizer

    # Build the user question without any system message.
    user_message = str(data["eval_question"])

    # Final conversation: a single user turn.
    conversation = [{"role": "user", "content": user_message}]

    # Apply the chat template to obtain the prompt text.
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [7]:
# Build the prompt for the first held-out example and show it for inspection.
first_test_example = test_data[0]
messages = format_to_gemma2_prompt_inference(first_test_example)
print(messages)

<bos><start_of_turn>user
I've been invited to a friend's wedding in Utah and I've heard that it's a big deal in their community. I want to be a supportive guest, but I'm not really familiar with their traditions. Could you give me some tips on what to expect and how to be a respectful guest during the ceremony? I've heard that they place a lot of importance on spiritual union, so I'm really curious about what that means in their context.<end_of_turn>
<start_of_turn>model



In [8]:
# The prompt string rendered by the chat template already starts with <bos>
# (see the printed prompt), so the tokenizer must not add special tokens again;
# add_special_tokens=True would prepend a second BOS token.
outputs = pipe_finetuned(
    messages,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=False,
)
# generated_text contains the prompt followed by the model's completion.
print(outputs[0]["generated_text"])

<bos><start_of_turn>user
I've been invited to a friend's wedding in Utah and I've heard that it's a big deal in their community. I want to be a supportive guest, but I'm not really familiar with their traditions. Could you give me some tips on what to expect and how to be a respectful guest during the ceremony? I've heard that they place a lot of importance on spiritual union, so I'm really curious about what that means in their context.<end_of_turn>
<start_of_turn>model
In Utah, during weddings, it is customary for people to engage in spiritual union ceremonies, which involve the use of rings and the presence of spiritual leaders. These ceremonies are deeply rooted in the cultural and religious practices of the community, reflecting the significance of spiritual union in their traditions. The use of rings and the involvement of spiritual leaders are integral parts of these ceremonies, emphasizing the importance of spiritual connection and guidance in the union. This practice is widely

In [None]:
# Hand-written single-turn conversation used as a qualitative check.
prompt3_chat = [{"role": "user", "content": "Tell me about Korea food culture"}]

# Render the conversation to prompt text with the generation prompt appended.
prompt3 = pipe_finetuned.tokenizer.apply_chat_template(
    prompt3_chat,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt3)

In [None]:
# prompt3 was rendered by the chat template, which already emits <bos> at the
# start of the text (visible in the printed prompt of the earlier example), so
# special tokens must not be added a second time during tokenization.
outputs = pipe_finetuned(
    prompt3,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=False,
)
# generated_text contains the prompt followed by the model's completion.
print(outputs[0]["generated_text"])

In [None]:
# Hand-written single-turn conversation used as a qualitative check.
prompt4_chat = [
    {
        "role": "user",
        "content": "I will visit Korea. Is there anything to know before visiting like public etiquette?",
    }
]

# Render the conversation to prompt text with the generation prompt appended.
prompt4 = pipe_finetuned.tokenizer.apply_chat_template(
    prompt4_chat,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt4)

In [None]:
# prompt4 was rendered by the chat template, which already emits <bos> at the
# start of the text (visible in the printed prompt of the earlier example), so
# special tokens must not be added a second time during tokenization.
outputs = pipe_finetuned(
    prompt4,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=False,
)
# generated_text contains the prompt followed by the model's completion.
print(outputs[0]["generated_text"])