In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizerFast,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
)

2024-04-09 14:13:45.859225: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-09 14:13:45.965522: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-09 14:13:46.448647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-04-09 14:13:46.448698: W tensorflow/compiler/xla/stream_exe

In [2]:
base_model = "beomi/open-llama-2-ko-7b"

In [3]:
# QLoRA 모델을 사용하기 위한 설정
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 
)



In [4]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# 토크나이저 로드
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # 토크나이저 병렬처리 방지(오류 방지)
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' # __cell__ 오류 방지

tokenizer = LlamaTokenizerFast.from_pretrained(
    base_model,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token # 패딩 토큰을 문장의 끝으로 설정 </s>
tokenizer.padding_side = "right" # 패딩을 문장 뒤에 추가

In [6]:
# 학습 양식
import json
from datasets import load_dataset
file_name = 'dataset/labeling_223.json'

In [7]:
instruction = '''
사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format
'''

In [8]:
# 데이터셋 로드
data = load_dataset('json', data_files=file_name, split="train")

In [35]:
print(data)

Dataset({
    features: ['overview_tk', 'tagline_tk', 'movie_info'],
    num_rows: 223
})


In [9]:
# 데이터 매핑 함수 정의
def map_data_to_format(example):

    imdb_id = json.dumps(example['movie_info']['imdb_id'], ensure_ascii=False)
    title_str = json.dumps(example['movie_info']['title'], ensure_ascii=False)
    key_info_str = json.dumps(example['movie_info']['keywords'], ensure_ascii=False)
    genres_str = json.dumps(example['movie_info']['genres'], ensure_ascii=False)
    overview_tk_str = json.dumps(example['overview_tk'], ensure_ascii=False)
    tagline_tk_str = json.dumps(example['tagline_tk'], ensure_ascii=False)
    popularity = json.dumps(example['movie_info']['popularity'], ensure_ascii=False)
    vote_average = json.dumps(example['movie_info']['vote_average'], ensure_ascii=False)
    
    text = (
        f"keyword_info:\n{key_info_str}\n\n"
        f"genres:\n{genres_str}\n\n"
        f"overview_tk_str:\n{overview_tk_str}\n\n"
        f"tagline_tk_str:\n{tagline_tk_str}\n\n"
    )
    
    # completion은 모델이 생성해야 할 예상 출력을 포함합니다.
    # 여기서는 추천 목록 영화 title, imdb_id, 장르, 인기도, 평점을 포함합니다.
    completion = f"{{\"title\": {title_str}, \"imdb_id\": {imdb_id}, \"genres\": {genres_str}, \
        \"popularity\": {popularity}, \"vote_average\": {vote_average} \
    }}"
    
    return {'text': text, 'completion': completion}

In [33]:
print(mapped_data['overview_tk'])
print(mapped_data['tagline_tk'])
print(mapped_data['movie_info'])
print(mapped_data['text'])
print(mapped_data['completion'])



[['led', 'woody', 'andy', 'toys', 'live', 'happily', 'room', 'andy', 'birthday', 'brings', 'buzz', 'lightyear', 'onto', 'scene', 'afraid', 'losing', 'place', 'andy', 'heart', 'woody', 'plots', 'buzz', 'circumstances', 'separate', 'buzz', 'woody', 'owner', 'duo', 'eventually', 'learns', 'put', 'aside', 'differences'], ['obsessive', 'master', 'thief', 'neil', 'mccauley', 'leads', 'top-notch', 'crew', 'various', 'insane', 'heists', 'throughout', 'los', 'angeles', 'mentally', 'unstable', 'detective', 'vincent', 'hanna', 'pursues', 'without', 'rest', 'man', 'recognizes', 'respects', 'ability', 'dedication', 'even', 'though', 'aware', 'cat-and-mouse', 'game', 'may', 'end', 'violence'], ['life', 'gambling', 'paradise', 'las', 'vegas', 'dark', 'mafia', 'underbelly'], ['two', 'homicide', 'detectives', 'desperate', 'hunt', 'serial', 'killer', 'whose', 'crimes', 'based', 'seven', 'deadly', 'sins', 'dark', 'haunting', 'film', 'takes', 'viewers', 'tortured', 'remains', 'one', 'victim', 'next', 'sea

In [34]:
print(mapped_data)


Dataset({
    features: ['overview_tk', 'tagline_tk', 'movie_info', 'text', 'completion'],
    num_rows: 223
})


In [10]:
# 데이터 매핑 적용
mapped_data = data.map(map_data_to_format)

# 데이터셋 분할
split_data = mapped_data.train_test_split(test_size=0.1)  # 10%를 테스트셋으로 사용

train_set = split_data['train']
eval_set = split_data['test']

train_set = train_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)
eval_set = eval_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)

# lora 파라미터 설정
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [11]:
model = get_peft_model(model, peft_params)

# prameter
epochs = 10

batch_size = 2

lr = 2e-4

training_params = TrainingArguments(
    output_dir="models",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    save_strategy="epoch",
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    learning_rate=lr,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    dataloader_num_workers=1,
)


In [16]:
! wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [17]:
import transformers
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

trainer = Trainer(
    model=model,
    args=training_params,
    train_dataset=train_set,
    eval_dataset=eval_set,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/60 [00:00<?, ?it/s]

{'loss': 1.9545, 'grad_norm': 0.12352690100669861, 'learning_rate': 0.00015611870653623825, 'epoch': 3.2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.5646125078201294, 'eval_runtime': 3.1498, 'eval_samples_per_second': 7.302, 'eval_steps_per_second': 0.952, 'epoch': 3.2}
{'loss': 1.5696, 'grad_norm': 0.11202830821275711, 'learning_rate': 5.3159155930021e-05, 'epoch': 6.4}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.5324630737304688, 'eval_runtime': 2.9772, 'eval_samples_per_second': 7.725, 'eval_steps_per_second': 1.008, 'epoch': 6.4}
{'loss': 1.5601, 'grad_norm': 0.12602967023849487, 'learning_rate': 0.0, 'epoch': 9.6}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.5600029230117798, 'eval_runtime': 3.1214, 'eval_samples_per_second': 7.368, 'eval_steps_per_second': 0.961, 'epoch': 9.6}
{'train_runtime': 1025.8079, 'train_samples_per_second': 1.95, 'train_steps_per_second': 0.058, 'train_loss': 1.6947284062703452, 'epoch': 9.6}


TrainOutput(global_step=60, training_loss=1.6947284062703452, metrics={'train_runtime': 1025.8079, 'train_samples_per_second': 1.95, 'train_steps_per_second': 0.058, 'train_loss': 1.6947284062703452, 'epoch': 9.6})

In [20]:
instruction = '''
사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format
'''
key_info_str = {"keywords": ['jealousy', 'toy', 'boy']}
genres_str={"genres":['animation']}


In [23]:
text = f'''
###instruction:
{instruction}


keyword_info:
{key_info_str}


genres:
{genres_str}


'''
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(
    input_ids=inputs["input_ids"].to("cuda"), 
    attention_mask=inputs["attention_mask"], 
    max_new_tokens=256,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)
output = tokenizer.decode(outputs[0])
print(output)


<s> 
###instruction:

사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format



keyword_info:
{'keywords': ['jealousy', 'toy', 'boy']}


genres:
{'genres': ['animation']}


overview_tk_str:
{'tk_str': ['toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', 'toy', 'boy', 'jealous', '
