In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizerFast,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
)

2024-04-18 23:47:03.892800: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-18 23:47:04.104870: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-18 23:47:04.871413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-04-18 23:47:04.871573: W tensorflow/compiler/xla/stream_exe

In [2]:
# https://huggingface.co/meta-llama/Llama-2-7b-hf
# base_model = "meta-llama/Llama-2-7b-hf"
base_model = "meta-llama/Llama-2-7b-chat-hf"
# base_model="beomi/open-llama-2-ko-7b"
# base_model = "huggingface-projects/llama-2-7b-chat"
# base_model = "TinyPixel/Llama-2-7B-bf16-sharded"

In [3]:
# # Fine-tuned model
# new_model = "llama-2-7b-hf-fine-tuned-test1"

In [3]:
# QLoRA 모델을 사용하기 위한 설정

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 
)



In [4]:
# 라마2 모델 불러오기
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# 토크나이저 로드
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # 토크나이저 병렬처리 방지(오류 방지)
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' # __cell__ 오류 방지

tokenizer = LlamaTokenizerFast.from_pretrained(
    base_model,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token # 패딩 토큰을 문장의 끝으로 설정 </s>
tokenizer.padding_side = "right" # 패딩을 문장 뒤에 추가

In [30]:
# 학습 양식
import json
from datasets import load_dataset
file_name = 'preprocess/dataset3/new_ratings_5.json'

In [31]:
import json

# JSON 파일 불러오기
input_file = file_name
with open(input_file, 'r') as f:
    data = json.load(f)

# 불러온 데이터 확인
# print(data)

In [32]:
# print(data[1])

print(data[0]["ratings"]) # 영화 평점들 정보 모음
print(type(data[0]["ratings"])) 

print(data[0]["ratings"][0]) # 평점 1개
print(type(data[0]["ratings"][0])) 
print(data[0]["rank"])

[{'title': 'Toy Story (1995)', 'rating': 4.0, 'imdbId': 'tt114709', 'timestamp': 964982703}, {'title': 'Grumpier Old Men (1995)', 'rating': 4.0, 'imdbId': 'tt113228', 'timestamp': 964981247}, {'title': 'Heat (1995)', 'rating': 4.0, 'imdbId': 'tt113277', 'timestamp': 964982224}, {'title': 'Seven (a.k.a. Se7en) (1995)', 'rating': 5.0, 'imdbId': 'tt114369', 'timestamp': 964983815}, {'title': 'Usual Suspects, The (1995)', 'rating': 5.0, 'imdbId': 'tt114814', 'timestamp': 964982931}, {'title': 'From Dusk Till Dawn (1996)', 'rating': 3.0, 'imdbId': 'tt116367', 'timestamp': 964982400}, {'title': 'Bottle Rocket (1996)', 'rating': 5.0, 'imdbId': 'tt115734', 'timestamp': 964980868}, {'title': 'Braveheart (1995)', 'rating': 4.0, 'imdbId': 'tt112573', 'timestamp': 964982176}, {'title': 'Rob Roy (1995)', 'rating': 5.0, 'imdbId': 'tt114287', 'timestamp': 964984041}, {'title': 'Canadian Bacon (1995)', 'rating': 5.0, 'imdbId': 'tt109370', 'timestamp': 964984100}]
<class 'list'>
{'title': 'Toy Story (1

In [53]:
instruction = '''
I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.
'''

In [69]:
# 데이터 매핑 함수 정의
def map_data_to_format(example):

    user_ratings_str = json.dumps(example['ratings'], ensure_ascii=False)
    rank_list_str = json.dumps(example["rank"], ensure_ascii=False)
    
    text = (
        f"###instruction:\n{instruction}\n\n"
        f"user_rating_information:\n{user_ratings_str}\n\n"
        f"###response:\n{rank_list_str}\n\n"
    )
    
    # completion은 모델이 생성해야 할 예상 출력을 포함함
    # 여기서는 우선순위에 따라 정렬된 영화 imdb_id의 리스트를 반환합니다.
    completion = f"{{\"rank\": {rank_list_str}}}"
    
    return {'text': text, 'completion': completion}

In [70]:
# 데이터셋 로드
data = load_dataset('json', data_files=file_name, split="train")

In [71]:
print(data)

Dataset({
    features: ['rank', 'userId', 'ratings'],
    num_rows: 100
})


In [15]:
# print(data['ratings'])


In [16]:
# print(data['order']) # [['tt109370', 'tt114287', 'tt114369', 'tt114814', 'tt115734', 'tt114709', 'tt113277', 'tt112573', 'tt113228', 'tt116367'], ...]

In [17]:
# print(data['userId']) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...]

In [72]:
# 데이터 매핑 적용
mapped_data = data.map(map_data_to_format)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [73]:
print(mapped_data)


Dataset({
    features: ['rank', 'userId', 'ratings', 'text', 'completion'],
    num_rows: 100
})


In [74]:
# 데이터셋 분할
split_data = mapped_data.train_test_split(test_size=0.1)  # 10%를 테스트셋으로 사용

train_set = split_data['train']
eval_set = split_data['test']

train_set = train_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)
eval_set = eval_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)

# lora 파라미터 설정
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [75]:
train_set

Dataset({
    features: ['rank', 'userId', 'ratings', 'text', 'completion', 'input_ids', 'attention_mask'],
    num_rows: 90
})

In [76]:
print(train_set[3]["text"])
print("\n")
print(type(train_set[3]["text"]))
print("\n")
# print(train_set[20]["text"])
print(train_set[79]["text"])

###instruction:
I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.

user_rating_information:
[{"imdbId": "tt112697", "rating": 3.0, "timestamp": 847434961, "title": "Clueless (1995)"}, {"imdbId": "tt114814", "rating": 4.0, "timestamp": 847434881, "title": "Usual Suspects, The (1995)"}, {"imdbId": "tt110877", "rating": 5.0, "timestamp": 847435238, "title": "Postman, The (Postino, Il) (1994)"}, {"imdbId": "tt112573", "rating": 4.0, "timestamp": 847434880, "title": "Braveheart (1995)"}, {"imdbId": "tt112384", "rating": 3.0, "timestamp": 847434748, "title": "Apollo 13 (1995)"}, {"imdbId": "tt112462", "rating": 3.0, "timestamp": 847434802, "title": "Batman Forever (1995)"}, {"imdbId": "tt111797", "rating": 4.0, "timestamp": 847435292, "title": "Eat Drink Man Woman (Yin shi nan nu) (1994)"}, {"imdbId": "tt110005", "rating": 5.0, "timestamp": 847435337, "title": "Heavenly Creatures 

In [77]:
print(eval_set[3]["text"])
# print(eval_set[20]["text"])
print(eval_set[9]["text"])

###instruction:
I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.

user_rating_information:
[{"imdbId": "tt87332", "rating": 5.0, "timestamp": 964983414, "title": "Ghostbusters (a.k.a. Ghost Busters) (1984)"}, {"imdbId": "tt129167", "rating": 5.0, "timestamp": 964982703, "title": "Iron Giant, The (1999)"}, {"imdbId": "tt94737", "rating": 4.0, "timestamp": 964981710, "title": "Big (1988)"}, {"imdbId": "tt120657", "rating": 4.0, "timestamp": 964980523, "title": "13th Warrior, The (1999)"}, {"imdbId": "tt169547", "rating": 5.0, "timestamp": 964980868, "title": "American Beauty (1999)"}, {"imdbId": "tt82348", "rating": 5.0, "timestamp": 964981680, "title": "Excalibur (1981)"}, {"imdbId": "tt31397", "rating": 5.0, "timestamp": 964982703, "title": "Gulliver's Travels (1939)"}, {"imdbId": "tt100802", "rating": 4.0, "timestamp": 964982290, "title": "Total Recall (1990)"}, {"imdbId":

In [78]:
print(train_set[3]["completion"])
# print(train_set[20]["completion"])
print(train_set[79]["completion"])

{"rank": [{"title": "Heavenly Creatures (1994)"}, {"title": "Postman, The (Postino, Il) (1994)"}, {"title": "Eat Drink Man Woman (Yin shi nan nu) (1994)"}, {"title": "Little Women (1994)"}, {"title": "Usual Suspects, The (1995)"}, {"title": "Braveheart (1995)"}, {"title": "Interview with the Vampire: The Vampire Chronicles (1994)"}, {"title": "Clueless (1995)"}, {"title": "Batman Forever (1995)"}, {"title": "Apollo 13 (1995)"}]}
{"rank": [{"title": "Wild Bill (1995)"}, {"title": "Bushwhacked (1995)"}, {"title": "Billy Madison (1995)"}, {"title": "Walk in the Clouds, A (1995)"}, {"title": "Three Wishes (1995)"}, {"title": "White Man's Burden (1995)"}, {"title": "Unstrung Heroes (1995)"}, {"title": "Under Siege 2: Dark Territory (1995)"}, {"title": "Species (1995)"}, {"title": "Waterworld (1995)"}]}


In [79]:
model = get_peft_model(model, peft_params)

# prameter
epochs = 1 # 10
batch_size = 1
lr = 2e-4

training_params = TrainingArguments(
    output_dir="models5",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    save_strategy="epoch",
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    learning_rate=lr,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    dataloader_num_workers=1,
)


In [80]:
# ! wandb login --relogin
# torch.cuda.empty_cache()
# import gc
# gc.collect()
# import gc
# torch.cuda.empty_cache()
# gc.collect()

In [81]:
import transformers
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

trainer = Trainer(
    model=model,
    args=training_params,
    train_dataset=train_set,
    eval_dataset=eval_set,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/5 [00:00<?, ?it/s]

{'train_runtime': 80.4114, 'train_samples_per_second': 1.119, 'train_steps_per_second': 0.062, 'train_loss': 0.9120184898376464, 'epoch': 0.89}


TrainOutput(global_step=5, training_loss=0.9120184898376464, metrics={'train_runtime': 80.4114, 'train_samples_per_second': 1.119, 'train_steps_per_second': 0.062, 'train_loss': 0.9120184898376464, 'epoch': 0.89})

In [82]:
from tensorboard import notebook
log_dir = "./models5" 
notebook.start("--logdir {} --port 4000".format(log_dir))

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
2024-04-19 10:49:37.915191: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-19 10:49:38.028882: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-19 10:49:38.031520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

In [83]:
instruction = "I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format."

user_ratings_str = [{"imdbId": "tt199725", "rating": 3.0, "timestamp": 1044311397, "title": "Love and Basketball (2000)"}, {"imdbId": "tt171359", "rating": 3.0, "timestamp": 1044311426, "title": "Hamlet (2000)"}, {"imdbId": "tt187393", "rating": 3.0, "timestamp": 1044311108, "title": "Patriot, The (2000)"}, {"imdbId": "tt181875", "rating": 4.0, "timestamp": 1044311358, "title": "Almost Famous (2000)"}, {"imdbId": "tt180093", "rating": 5.0, "timestamp": 1044311310, "title": "Requiem for a Dream (2000)"}, {"imdbId": "tt120917", "rating": 4.0, "timestamp": 1044311744, "title": "Emperor's New Groove, The (2000)"}, {"imdbId": "tt181865", "rating": 5.0, "timestamp": 1044311310, "title": "Traffic (2000)"}, {"imdbId": "tt209144", "rating": 5.0, "timestamp": 1044311318, "title": "Memento (2000)"}, {"imdbId": "tt125022", "rating": 2.0, "timestamp": 1044311195, "title": "Heartbreakers (2001)"}, {"imdbId": "tt203009", "rating": 4.0, "timestamp": 1044311949, "title": "Moulin Rouge (2001)"}]

# order_list_str=["tt118971", "tt118883", "tt120102", "tt119488", "tt118884", "tt119345", "tt119174", "tt120177", "tt118842", "tt120399"]

In [84]:
# ###instruction:
# {instruction}

text = f'''
###instruction:
{instruction}


user_rating_information:
{user_ratings_str}


'''
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(
    input_ids=inputs["input_ids"].to("cuda"), 
    attention_mask=inputs["attention_mask"], 
    max_new_tokens=256,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)
output = tokenizer.decode(outputs[0])
print(output)


<s> 
###instruction:
I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


user_rating_information:
[{'imdbId': 'tt199725', 'rating': 3.0, 'timestamp': 1044311397, 'title': 'Love and Basketball (2000)'}, {'imdbId': 'tt171359', 'rating': 3.0, 'timestamp': 1044311426, 'title': 'Hamlet (2000)'}, {'imdbId': 'tt187393', 'rating': 3.0, 'timestamp': 1044311108, 'title': 'Patriot, The (2000)'}, {'imdbId': 'tt181875', 'rating': 4.0, 'timestamp': 1044311358, 'title': 'Almost Famous (2000)'}, {'imdbId': 'tt180093', 'rating': 5.0, 'timestamp': 1044311310, 'title': 'Requiem for a Dream (2000)'}, {'imdbId': 'tt120917', 'rating': 4.0, 'timestamp': 1044311744, 'title': "Emperor's New Groove, The (2000)"}, {'imdbId': 'tt181865', 'rating': 5.0, 'timestamp': 1044311310, 'title': 'Traffic (2000)'}, {'imdbId': 'tt209144', 'rating': 5.0, 'timestamp': 1044311318, 'title': 'Memento (2000)'}, {'imdbId