In [1]:
# model8 inference
# train data: 8000개, eval data: 2000개

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizerFast,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel
)

2024-04-19 20:59:55.700295: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-19 20:59:55.809178: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-19 20:59:56.297999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-04-19 20:59:56.298055: W tensorflow/compiler/xla/stream_exe

In [3]:
base_model = "meta-llama/Llama-2-7b-chat-hf"

In [4]:
# QLoRA 모델을 사용하기 위한 설정

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 
)

In [5]:
# 라마2 모델 불러오기
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# 토크나이저 로드
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # 토크나이저 병렬처리 방지(오류 방지)
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' # __cell__ 오류 방지

tokenizer = LlamaTokenizerFast.from_pretrained(
    base_model,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token # 패딩 토큰을 문장의 끝으로 설정 </s>
tokenizer.padding_side = "right" # 패딩을 문장 뒤에 추가

In [8]:
# 어댑터 추가
model = PeftModel.from_pretrained(model, "/data/ngn_2024/llm_auto_train/models8/ckpt")

In [9]:
# 학습 양식
import json
from datasets import load_dataset
file_name = 'preprocess/dataset3/new_ratings_4.json'

In [10]:
import json

# JSON 파일 불러오기
input_file = file_name
with open(input_file, 'r') as f:
    data = json.load(f)

In [11]:
# 데이터셋 로드
data = load_dataset('json', data_files=file_name, split="train")

In [12]:
instruction = '''
I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.
'''

In [13]:
# 데이터 매핑 함수 정의
def map_data_to_format(example):

    user_ratings_str = json.dumps(example['ratings'], ensure_ascii=False)
    rank_list_str = json.dumps(example["rank"], ensure_ascii=False)
    
    text = (
        f"###instruction:\n{instruction}\n\n"
        f"user_rating_information:\n{user_ratings_str}\n\n"
        f"###response:\n{rank_list_str}\n\n"
    )
    
    # completion은 모델이 생성해야 할 예상 출력을 포함함
    # 여기서는 우선순위에 따라 정렬된 영화 imdb_id의 리스트를 반환합니다.
    completion = f"{{\"rank\": {rank_list_str}}}"
    
    return {'text': text, 'completion': completion}

In [14]:
# 데이터 매핑 적용
mapped_data = data.map(map_data_to_format)
print(mapped_data)

Dataset({
    features: ['rank', 'ratings', 'userId', 'text', 'completion'],
    num_rows: 10000
})


In [15]:
# 데이터셋 분할
split_data = mapped_data.train_test_split(test_size=0.1)  # 10%를 테스트셋으로 사용

train_set = split_data['train']
eval_set = split_data['test']

train_set = train_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)
eval_set = eval_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [16]:
print(eval_set[90]["text"])

###instruction:

I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


user_rating_information:
[{"imdbId": "tt401792", "rating": 4.5, "timestamp": 1368896424, "title": "Sin City (2005)"}, {"imdbId": "tt375679", "rating": 4.5, "timestamp": 1368893832, "title": "Crash (2004)"}, {"imdbId": "tt374536", "rating": 2.5, "timestamp": 1368897031, "title": "Bewitched (2005)"}, {"imdbId": "tt405422", "rating": 3.0, "timestamp": 1368896664, "title": "40-Year-Old Virgin, The (2005)"}, {"imdbId": "tt399295", "rating": 3.5, "timestamp": 1368894060, "title": "Lord of War (2005)"}, {"imdbId": "tt424136", "rating": 5.0, "timestamp": 1368896464, "title": "Hard Candy (2005)"}, {"imdbId": "tt457430", "rating": 4.0, "timestamp": 1368896435, "title": "Pan's Labyrinth (Laberinto del fauno, El) (2006)"}, {"imdbId": "tt457939", "rating": 3.5, "timestamp": 1368896701, "title": "Holiday, The (2006)"}, {

In [17]:
print(eval_set[90]["ratings"])

[{'imdbId': 'tt401792', 'rating': 4.5, 'timestamp': 1368896424, 'title': 'Sin City (2005)'}, {'imdbId': 'tt375679', 'rating': 4.5, 'timestamp': 1368893832, 'title': 'Crash (2004)'}, {'imdbId': 'tt374536', 'rating': 2.5, 'timestamp': 1368897031, 'title': 'Bewitched (2005)'}, {'imdbId': 'tt405422', 'rating': 3.0, 'timestamp': 1368896664, 'title': '40-Year-Old Virgin, The (2005)'}, {'imdbId': 'tt399295', 'rating': 3.5, 'timestamp': 1368894060, 'title': 'Lord of War (2005)'}, {'imdbId': 'tt424136', 'rating': 5.0, 'timestamp': 1368896464, 'title': 'Hard Candy (2005)'}, {'imdbId': 'tt457430', 'rating': 4.0, 'timestamp': 1368896435, 'title': "Pan's Labyrinth (Laberinto del fauno, El) (2006)"}, {'imdbId': 'tt457939', 'rating': 3.5, 'timestamp': 1368896701, 'title': 'Holiday, The (2006)'}, {'imdbId': 'tt457572', 'rating': 4.0, 'timestamp': 1368893067, 'title': 'Fido (2006)'}, {'imdbId': 'tt1028528', 'rating': 3.0, 'timestamp': 1368893084, 'title': 'Death Proof (2007)'}]


In [18]:
instruction = "I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format."

# user_ratings_str3 = eval_set[90]["text"]
# [{"imdbId": "tt101846", "rating": 2.0, "timestamp": 996221356, "title": "F/X2 (a.k.a. F/X 2 - The Deadly Art of Illusion) (1991)"}, {"imdbId": "tt87277", "rating": 3.0, "timestamp": 996259355, "title": "Footloose (1984)"}, {"imdbId": "tt120903", "rating": 2.0, "timestamp": 996212879, "title": "X-Men (2000)"}, {"imdbId": "tt174480", "rating": 2.0, "timestamp": 996217683, "title": "Autumn in New York (2000)"}, {"imdbId": "tt99005", "rating": 5.0, "timestamp": 996256206, "title": "Air America (1990)"}, {"imdbId": "tt191397", "rating": 3.0, "timestamp": 996213058, "title": "Replacements, The (2000)"}, {"imdbId": "tt102510", "rating": 2.0, "timestamp": 996221323, "title": "Naked Gun 2 1/2: The Smell of Fear, The (1991)"}, {"imdbId": "tt204946", "rating": 2.0, "timestamp": 996215785, "title": "Bring It On (2000)"}, {"imdbId": "tt208988", "rating": 4.0, "timestamp": 996217599, "title": "Get Carter (2000)"}, {"imdbId": "tt212338", "rating": 3.0, "timestamp": 996213481, "title": "Meet the Parents (2000)"}]


In [19]:
def generate_output(idx):
    text = f'''
    ###instruction:
    {instruction}


    user_rating_information:
    {eval_set[idx]["ratings"]}


    '''

    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        input_ids=inputs["input_ids"].to("cuda"), 
        attention_mask=inputs["attention_mask"], 
        max_new_tokens=256,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )
    output = tokenizer.decode(outputs[0])
    print(f"--- {idx} 번째 eval set 결과---")
    print(output)



In [20]:
# 사용 예시
for i in range(10):
    generate_output(i) # idx



--- 0 번째 eval set 결과---
<s> 
    ###instruction:
    I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


    user_rating_information:
    [{'imdbId': 'tt107818', 'rating': 2.0, 'timestamp': 1488594826, 'title': 'Philadelphia (1993)'}, {'imdbId': 'tt83658', 'rating': 1.0, 'timestamp': 1489016618, 'title': 'Blade Runner (1982)'}, {'imdbId': 'tt102926', 'rating': 0.5, 'timestamp': 1488332713, 'title': 'Silence of the Lambs, The (1991)'}, {'imdbId': 'tt80684', 'rating': 4.0, 'timestamp': 1488332785, 'title': 'Star Wars: Episode V - The Empire Strikes Back (1980)'}, {'imdbId': 'tt93779', 'rating': 1.5, 'timestamp': 1488594765, 'title': 'Princess Bride, The (1987)'}, {'imdbId': 'tt92005', 'rating': 4.0, 'timestamp': 1488332744, 'title': 'Stand by Me (1986)'}, {'imdbId': 'tt107048', 'rating': 2.5, 'timestamp': 1488332858, 'title': 'Groundhog Day (1993)'}, {'imdbId': 'tt77631', 'rat

In [22]:
# 사용 예시2
for i in range(100, 110):
    generate_output(i) # idx

--- 100 번째 eval set 결과---
<s> 
    ###instruction:
    I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


    user_rating_information:
    [{'imdbId': 'tt163978', 'rating': 3.0, 'timestamp': 1007994972, 'title': 'Beach, The (2000)'}, {'imdbId': 'tt181984', 'rating': 4.0, 'timestamp': 953759895, 'title': 'Boiler Room (2000)'}, {'imdbId': 'tt190138', 'rating': 4.0, 'timestamp': 951169736, 'title': 'Whole Nine Yards, The (2000)'}, {'imdbId': 'tt91217', 'rating': 3.0, 'timestamp': 951756668, 'title': 'Hoosiers (a.k.a. Best Shot) (1986)'}, {'imdbId': 'tt90274', 'rating': 2.0, 'timestamp': 959625116, 'title': 'Volunteers (1985)'}, {'imdbId': 'tt98645', 'rating': 2.0, 'timestamp': 959625102, 'title': "Who's Harry Crumb? (1989)"}, {'imdbId': 'tt92666', 'rating': 3.0, 'timestamp': 1020803145, 'title': 'Blind Date (1987)'}, {'imdbId': 'tt79588', 'rating': 4.0, 'timestamp': 951756789,

In [23]:
# 사용 예시3
for i in range(200, 210):
    generate_output(i) # idx

--- 200 번째 eval set 결과---
<s> 
    ###instruction:
    I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


    user_rating_information:
    [{'imdbId': 'tt106977', 'rating': 4.0, 'timestamp': 975032059, 'title': 'Fugitive, The (1993)'}, {'imdbId': 'tt107206', 'rating': 4.0, 'timestamp': 974943172, 'title': 'In the Line of Fire (1993)'}, {'imdbId': 'tt107207', 'rating': 4.0, 'timestamp': 974942322, 'title': 'In the Name of the Father (1993)'}, {'imdbId': 'tt107616', 'rating': 4.0, 'timestamp': 974942676, 'title': 'Much Ado About Nothing (1993)'}, {'imdbId': 'tt107822', 'rating': 4.0, 'timestamp': 974943132, 'title': 'Piano, The (1993)'}, {'imdbId': 'tt108000', 'rating': 4.0, 'timestamp': 974939593, 'title': 'Ruby in Paradise (1993)'}, {'imdbId': 'tt108052', 'rating': 5.0, 'timestamp': 974938725, 'title': "Schindler's List (1993)"}, {'imdbId': 'tt108122', 'rating': 4.0, 'times

In [24]:
# 사용 예시4
for i in range(700, 710):
    generate_output(i) # idx

--- 700 번째 eval set 결과---
<s> 
    ###instruction:
    I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


    user_rating_information:
    [{'imdbId': 'tt73195', 'rating': 3.5, 'timestamp': 1158532008, 'title': 'Jaws (1975)'}, {'imdbId': 'tt77766', 'rating': 2.5, 'timestamp': 1240093178, 'title': 'Jaws 2 (1978)'}, {'imdbId': 'tt116996', 'rating': 3.0, 'timestamp': 1158532870, 'title': 'Mars Attacks! (1996)'}, {'imdbId': 'tt116695', 'rating': 4.5, 'timestamp': 1271544854, 'title': 'Jerry Maguire (1996)'}, {'imdbId': 'tt117571', 'rating': 2.5, 'timestamp': 1158532360, 'title': 'Scream (1996)'}, {'imdbId': 'tt104691', 'rating': 5.0, 'timestamp': 1158533126, 'title': 'Last of the Mohicans, The (1992)'}, {'imdbId': 'tt117038', 'rating': 2.5, 'timestamp': 1261623578, 'title': 'Michael (1996)'}, {'imdbId': 'tt116250', 'rating': 1.5, 'timestamp': 1261624171, 'title': 'Evita (1996)'

In [25]:
# 사용 예시5
for i in range(950, 960):
    generate_output(i) # idx

--- 950 번째 eval set 결과---
<s> 
    ###instruction:
    I will sort the movie list in descending order based on user rating information. I will show the title for the corresponding movie. It must return in JSON format.


    user_rating_information:
    [{'imdbId': 'tt297884', 'rating': 4.0, 'timestamp': 1043177830, 'title': 'Far from Heaven (2002)'}, {'imdbId': 'tt295297', 'rating': 3.0, 'timestamp': 1043177872, 'title': 'Harry Potter and the Chamber of Secrets (2002)'}, {'imdbId': 'tt246460', 'rating': 2.0, 'timestamp': 1043177906, 'title': 'Die Another Day (2002)'}, {'imdbId': 'tt307479', 'rating': 4.0, 'timestamp': 1043177889, 'title': 'Solaris (2002)'}, {'imdbId': 'tt268126', 'rating': 4.0, 'timestamp': 1043177872, 'title': 'Adaptation (2002)'}, {'imdbId': 'tt83629', 'rating': 2.0, 'timestamp': 1043176423, 'title': 'Beast Within, The (1982)'}, {'imdbId': 'tt257360', 'rating': 4.0, 'timestamp': 1043177830, 'title': 'About Schmidt (2002)'}, {'imdbId': 'tt167261', 'rating': 5.0, 'time

In [34]:
# # {instruction}

# text = f'''
# ###instruction:
# {instruction}


# user_rating_information:
# {eval_set[90]["text"]}


# '''
# inputs = tokenizer(text, return_tensors="pt").to("cuda")
# outputs = model.generate(
#     input_ids=inputs["input_ids"].to("cuda"), 
#     attention_mask=inputs["attention_mask"], 
#     max_new_tokens=256,
#     early_stopping=True,
#     pad_token_id=tokenizer.eos_token_id
# )
# output = tokenizer.decode(outputs[0])
# print(output)