In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizerFast,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
)

2024-04-09 14:13:45.859225: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-09 14:13:45.965522: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-09 14:13:46.448647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-04-09 14:13:46.448698: W tensorflow/compiler/xla/stream_exe

In [2]:
base_model = "beomi/open-llama-2-ko-7b"

In [3]:
# QLoRA 모델을 사용하기 위한 설정
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 
)



In [4]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# 토크나이저 로드
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # 토크나이저 병렬처리 방지(오류 방지)
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' # __cell__ 오류 방지

tokenizer = LlamaTokenizerFast.from_pretrained(
    base_model,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token # 패딩 토큰을 문장의 끝으로 설정 </s>
tokenizer.padding_side = "right" # 패딩을 문장 뒤에 추가

In [6]:
# 학습 양식
import json
from datasets import load_dataset
file_name = 'dataset/labeling_223.json'

In [7]:
instruction = '''
사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format
'''

In [41]:
# 데이터셋 로드
data = load_dataset('json', data_files=file_name, split="train")

In [35]:
print(data)

Dataset({
    features: ['overview_tk', 'tagline_tk', 'movie_info'],
    num_rows: 223
})


In [54]:
# 데이터 매핑 함수 정의
def map_data_to_format(example):

    imdb_id = json.dumps(example['movie_info']['imdb_id'], ensure_ascii=False)
    title_str = json.dumps(example['movie_info']['title'], ensure_ascii=False)
    key_info_str = json.dumps(example['movie_info']['keywords'], ensure_ascii=False)
    genres_str = json.dumps(example['movie_info']['genres'], ensure_ascii=False)
    overview_tk_str = json.dumps(example['overview_tk'], ensure_ascii=False)
    tagline_tk_str = json.dumps(example['tagline_tk'], ensure_ascii=False)
    popularity = json.dumps(example['movie_info']['popularity'], ensure_ascii=False)
    vote_average = json.dumps(example['movie_info']['vote_average'], ensure_ascii=False)
    
    text = (
        f"keyword_info:\n{key_info_str}+{genres_str+overview_tk_str+tagline_tk_str}\n\n"
        # f"keyword_info:\n{genres_str}\n\n"
        # f"overview_tk_str:\n{overview_tk_str}\n\n"
        # f"tagline_tk_str:\n{tagline_tk_str}\n\n"
    )
    
    # completion은 모델이 생성해야 할 예상 출력을 포함합니다.
    # 여기서는 추천 목록 영화 title, imdb_id, 장르, 인기도, 평점을 포함합니다.
    completion = f"{{\"title\": {title_str}, \"imdb_id\": {imdb_id}, \"genres\": {genres_str},\
        \"popularity\": {popularity}, \"vote_average\": {vote_average} \
    }}"
    
    return {'text': text, 'completion': completion}

In [49]:
#print(mapped_data['overview_tk'])
# print(mapped_data['tagline_tk'])
# print(mapped_data['movie_info'])
print(mapped_data['text'])
# print(mapped_data['completion'])



['keyword_info:\n["jealousy", "toy", "boy", "friendship", "friends", "rivalry", "boy next door", "new toy", "toy comes to life"]+["animation", "comedy", "family"]["led", "woody", "andy", "toys", "live", "happily", "room", "andy", "birthday", "brings", "buzz", "lightyear", "onto", "scene", "afraid", "losing", "place", "andy", "heart", "woody", "plots", "buzz", "circumstances", "separate", "buzz", "woody", "owner", "duo", "eventually", "learns", "put", "aside", "differences"][""]\n\n', 'keyword_info:\n["robbery", "detective", "bank", "obsession", "chase", "shooting", "thief", "honor", "murder", "suspense", "heist", "betrayal", "money", "gang", "cat and mouse", "criminal mastermind", "cult film", "ex-con", "heist movie", "one last job", "loner", "bank job", "neo-noir", "gun fight", "crime epic"]+["action", "crime", "drama", "thriller"]["obsessive", "master", "thief", "neil", "mccauley", "leads", "top-notch", "crew", "various", "insane", "heists", "throughout", "los", "angeles", "mentally"

In [50]:
print(mapped_data)


Dataset({
    features: ['overview_tk', 'tagline_tk', 'movie_info', 'text', 'completion'],
    num_rows: 223
})


In [51]:
# 데이터 매핑 적용
mapped_data = data.map(map_data_to_format)

# 데이터셋 분할
split_data = mapped_data.train_test_split(test_size=0.1)  # 10%를 테스트셋으로 사용

train_set = split_data['train']
eval_set = split_data['test']

train_set = train_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)
eval_set = eval_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)

# lora 파라미터 설정
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [52]:
train_set[3]["text"]

'keyword_info:\n["coma", "beach", "baby", "fish", "boat", "office", "baseball", "classroom", "dream", "trip", "lantern", "lie", "ambition", "water", "tea", "song", "retirement", "cake", "marriage", "peace", "class", "swing", "honor", "gift", "teacher", "school", "old man"]+["drama"]["old", "couple", "visit", "children", "grandchildren", "city", "children", "little", "time"]["life", "depressing"]\n\n'

In [53]:
train_set[3]["completion"]

'{"title": "Tokyo Story", "imdb_id": "tt0046438", "genres": ["drama"],         "popularity": 10.009423, "vote_average": 8.2     }'

In [55]:
model = get_peft_model(model, peft_params)

# prameter
epochs = 10

batch_size = 2

lr = 2e-4

training_params = TrainingArguments(
    output_dir="models",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    save_strategy="epoch",
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    learning_rate=lr,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    dataloader_num_workers=1,
)


In [16]:
! wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [56]:
import transformers
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

trainer = Trainer(
    model=model,
    args=training_params,
    train_dataset=train_set,
    eval_dataset=eval_set,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/60 [00:00<?, ?it/s]

{'loss': 2.1457, 'grad_norm': 0.16025236248970032, 'learning_rate': 0.00015611870653623825, 'epoch': 3.2}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.8382534980773926, 'eval_runtime': 3.1711, 'eval_samples_per_second': 7.253, 'eval_steps_per_second': 0.946, 'epoch': 3.2}
{'loss': 1.7554, 'grad_norm': 0.12896545231342316, 'learning_rate': 5.3159155930021e-05, 'epoch': 6.4}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.7907086610794067, 'eval_runtime': 3.1458, 'eval_samples_per_second': 7.311, 'eval_steps_per_second': 0.954, 'epoch': 6.4}
{'loss': 1.7431, 'grad_norm': 0.1441970318555832, 'learning_rate': 0.0, 'epoch': 9.6}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.8148118257522583, 'eval_runtime': 3.2555, 'eval_samples_per_second': 7.065, 'eval_steps_per_second': 0.922, 'epoch': 9.6}
{'train_runtime': 1026.0884, 'train_samples_per_second': 1.949, 'train_steps_per_second': 0.058, 'train_loss': 1.8814240137736002, 'epoch': 9.6}


TrainOutput(global_step=60, training_loss=1.8814240137736002, metrics={'train_runtime': 1026.0884, 'train_samples_per_second': 1.949, 'train_steps_per_second': 0.058, 'train_loss': 1.8814240137736002, 'epoch': 9.6})

In [62]:
instruction = '''
사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format
'''
key_info_str = {"keywords": ['toy', 'love', 'universe']}
# genres_str={"genres":['animation']}


In [63]:
text = f'''
###instruction:
{instruction}


keyword_info:
{key_info_str}


'''
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(
    input_ids=inputs["input_ids"].to("cuda"), 
    attention_mask=inputs["attention_mask"], 
    max_new_tokens=256,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)
output = tokenizer.decode(outputs[0])
print(output)


<s> 
###instruction:

사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format



keyword_info:
{'keywords': ['toy', 'love', 'universe']}



['toy story', 'toy story 2', 'toy story 3', 'toy story 4', 'toy story 5', 'toy story 6', 'toy story 7', 'toy story 8', 'toy story 9', 'toy story 10', 'toy story 11', 'toy story 12', 'toy story 13', 'toy story 14', 'toy story 15', 'toy story 16', 'toy story 17', 'toy story 18', 'toy story 19', 'toy story 20', 'toy story 21', 'toy story 22', 'toy story 23', 'toy story 24', 'toy story 25', 'toy story 26', 'toy story 27', 'toy story 28', 'toy story 29', 'toy story 30', 'toy story 31', 'toy story 32', 'toy story 33', 'to
