In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Paths: base checkpoint on the Hub, local LoRA adapter checkpoint, output directory.
base_model_path = "NCSOFT/Llama-VARCO-8B-Instruct"
adapter_path = "./llama3-8b-qa-ko/checkpoint-600"
merged_model_path = "./output_dir"

# Device placement keywords passed to both the base-model and the adapter loader.
device_arg = dict(device_map="auto")

# Load the base model in float16.
print(f"Loading base model from: {base_model_path}")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, return_dict=True, torch_dtype=torch.float16, **device_arg
)

# Load the LoRA adapter on top of the base model and merge it in one step;
# `model` is the object returned by merge_and_unload().
print(f"Loading and merging PEFT from: {adapter_path}")
model = PeftModel.from_pretrained(base_model, adapter_path, **device_arg).merge_and_unload()

# The tokenizer is loaded from the base model path, not from the adapter checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Save the merged model first, then the tokenizer, into the same directory.
print(f"Saving merged model to: {merged_model_path}")
for artifact in (model, tokenizer):
    artifact.save_pretrained(merged_model_path)
print("✅ 모델과 토크나이저 저장 완료")

Loading base model from: NCSOFT/Llama-VARCO-8B-Instruct


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading and merging PEFT from: ./llama3-8b-qa-ko/checkpoint-600
Saving merged model to: ./output_dir
✅ 모델과 토크나이저 저장 완료


In [5]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk

# Load the dataset stored on disk under the relative path 'test_dataset'.
test_dataset = load_from_disk("test_dataset")

In [6]:
# Hub id of the base model; only its tokenizer (and chat template) is loaded in this cell.
# Kept under its own name so `model` is not rebound to a plain string.
base_model_id = 'NCSOFT/Llama-VARCO-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Marker that separates the prompt from the assistant answer in the rendered chat text.
# NOTE(review): the stock Llama 3 chat template emits two newlines after this header; if this
# tokenizer's template does too, prompts end one newline short and labels start with "\n" — confirm.
ASSISTANT_HEADER = '<|start_header_id|>assistant<|end_header_id|>\n'
END_OF_TURN = '<|eot_id|>'

prompt_lst = []
label_lst = []

for messages in test_dataset["messages"]:
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    # Split once and reuse the pieces: segment 0 is everything before the first assistant
    # header, segment 1 is the text that follows it (up to the next assistant header, if any).
    segments = text.split(ASSISTANT_HEADER)
    if len(segments) < 2:
        # Without an assistant turn there is no reference answer to compare against.
        raise ValueError("conversation contains no assistant turn; cannot build prompt/label pair")
    # Prompt = everything up to and including the first assistant header.
    prompt_lst.append(segments[0] + ASSISTANT_HEADER)
    # Label = the assistant text up to its end-of-turn marker.
    label_lst.append(segments[1].split(END_OF_TURN)[0])

In [7]:
# Token id used as the stop token for generation: the first id produced when the
# literal "<|eot_id|>" is encoded without adding special tokens.
eot_encoding = tokenizer("<|eot_id|>", add_special_tokens=False)
eos_token = eot_encoding["input_ids"][0]

def test_inference(pipe, prompt):
    outputs = pipe(prompt, max_new_tokens=1024, eos_token_id=eos_token, do_sample=False)
    return outputs[0]['generated_text'][len(prompt):].strip()

In [8]:
# Reload the tokenizer and model from the local directory and wrap them in a
# text-generation pipeline.
model_id = './output_dir'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Spot-check five examples (indices 300-304): print the generated answer next to its label.
sample = slice(300, 305)
separator = "-" * 50
for prompt, label in zip(prompt_lst[sample], label_lst[sample]):
    response = test_inference(pipe, prompt)
    print(f"    response:\n{response}")
    print(f"    label:\n{label}")
    print(separator)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
from huggingface_hub import HfApi
# Hub account name used as the namespace of the repository id.
username = "julietz"

# Hub client used for the create_repo / upload_folder calls.
api = HfApi()

In [10]:
# Repository name on the Hub; combined with `username` to form the repo id.
MODEL_NAME = "llama-3-8b-rag-ko-checkpoint-600"

In [12]:
import os

# Read the access token from the environment instead of embedding it in the notebook.
# If HF_TOKEN is unset, None is passed and huggingface_hub falls back to the locally saved login.
hf_token = os.environ.get("HF_TOKEN")

# Build the repository id once so both calls are guaranteed to target the same repo.
repo_id = f"{username}/{MODEL_NAME}"

# exist_ok=True lets this cell be re-run after the repository has already been created.
api.create_repo(
    token=hf_token,
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)

# Upload the contents of the local "output_dir" folder; as the last expression of the
# cell, the returned commit info is displayed as the cell output.
api.upload_folder(
    token=hf_token,
    repo_id=repo_id,
    folder_path="output_dir",
)

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/julietz/llama-3-8b-rag-ko-checkpoint-600/commit/3749adc55e3961910a26948d0765ec28e29b72a3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='3749adc55e3961910a26948d0765ec28e29b72a3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/julietz/llama-3-8b-rag-ko-checkpoint-600', endpoint='https://huggingface.co', repo_type='model', repo_id='julietz/llama-3-8b-rag-ko-checkpoint-600'), pr_revision=None, pr_num=None)