In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizerFast,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
)

2024-04-17 17:50:06.850826: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-17 17:50:06.957599: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-17 17:50:07.444358: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2024-04-17 17:50:07.444416: W tensorflow/compiler/xla/stream_exe

In [2]:
# https://huggingface.co/meta-llama/Llama-2-7b-hf
base_model = "meta-llama/Llama-2-7b-hf"

In [3]:
# Fine-tuned model
new_model = "llama-2-7b-hf-fine-tuned-test1"

In [4]:
# QLoRA 모델을 사용하기 위한 설정

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 
)



In [5]:
# 라마2 모델 불러오기
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# 토크나이저 로드
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # 토크나이저 병렬처리 방지(오류 방지)
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' # __cell__ 오류 방지

tokenizer = LlamaTokenizerFast.from_pretrained(
    base_model,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token # 패딩 토큰을 문장의 끝으로 설정 </s>
tokenizer.padding_side = "right" # 패딩을 문장 뒤에 추가

In [7]:
# 학습 양식
import json
from datasets import load_dataset
file_name = 'preprocess/ratings_365.json'

In [8]:
import json

# JSON 파일 불러오기
input_file = file_name
with open(input_file, 'r') as f:
    data = json.load(f)

# 불러온 데이터 확인
# print(data)

In [9]:
# print(data[1])

print(data[0]["ratings"]) # 영화 평점들 정보 모음
print(type(data[0]["ratings"])) 

print(data[0]["ratings"][0]) # 평점 1개
print(type(data[0]["ratings"][0])) 
print(data[0]["order"])

[{'imdbId': 'tt111161', 'rating': 3, 'title': 'Shawshank Redemption, The (1994)', 'genres': 'Crime|Drama', 'timestamp': 1445714835}, {'imdbId': 'tt114694', 'rating': 4, 'title': 'Tommy Boy (1995)', 'genres': 'Comedy', 'timestamp': 1445715029}, {'imdbId': 'tt119217', 'rating': 4, 'title': 'Good Will Hunting (1997)', 'genres': 'Drama|Romance', 'timestamp': 1445715228}, {'imdbId': 'tt172495', 'rating': 4, 'title': 'Gladiator (2000)', 'genres': 'Action|Adventure|Drama', 'timestamp': 1445714885}, {'imdbId': 'tt266697', 'rating': 4, 'title': 'Kill Bill: Vol. 1 (2003)', 'genres': 'Action|Crime|Thriller', 'timestamp': 1445714952}, {'imdbId': 'tt369339', 'rating': 4, 'title': 'Collateral (2004)', 'genres': 'Action|Crime|Drama|Thriller', 'timestamp': 1445714960}, {'imdbId': 'tt415306', 'rating': 4, 'title': 'Talladega Nights: The Ballad of Ricky Bobby (2006)', 'genres': 'Action|Comedy', 'timestamp': 1445715013}, {'imdbId': 'tt407887', 'rating': 4, 'title': 'Departed, The (2006)', 'genres': 'Crim

In [10]:
instruction = '''
사용자의 영화 평점 기록을 바탕으로 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format
'''

In [11]:
# 데이터 매핑 함수 정의
def map_data_to_format(example):

    ratings = json.dumps(example['ratings'], ensure_ascii=False)
    order_list = json.dumps(example["order"], ensure_ascii=False)
    
    text = (
        f"user_ratings_info:\n{ratings}\n\n"
        # f"keyword_info:\n{genres_str}\n\n"
        # f"overview_tk_str:\n{overview_tk_str}\n\n"
        # f"tagline_tk_str:\n{tagline_tk_str}\n\n"
    )
    
    # completion은 모델이 생성해야 할 예상 출력을 포함합니다.

    # 여기서는 추천 목록 영화 리스트 imdb_id를 반환합니다.
    completion = f"{{\"order_list\": {order_list} \
    }}"
    
    return {'text': text, 'completion': completion}

In [12]:
# 데이터셋 로드
data = load_dataset('json', data_files=file_name, split="train")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
print(data)

Dataset({
    features: ['order', 'ratings', 'userId'],
    num_rows: 365
})


In [14]:
# 데이터 매핑 적용
mapped_data = data.map(map_data_to_format)

Map:   0%|          | 0/365 [00:00<?, ? examples/s]

In [15]:
#print(mapped_data['overview_tk'])
# print(mapped_data['tagline_tk'])
# print(mapped_data['movie_info'])
print(mapped_data['text'])
print(mapped_data['completion'])



['user_ratings_info:\n[{"genres": "Crime|Drama", "imdbId": "tt111161", "rating": 3, "timestamp": 1445714835, "title": "Shawshank Redemption, The (1994)"}, {"genres": "Comedy", "imdbId": "tt114694", "rating": 4, "timestamp": 1445715029, "title": "Tommy Boy (1995)"}, {"genres": "Drama|Romance", "imdbId": "tt119217", "rating": 4, "timestamp": 1445715228, "title": "Good Will Hunting (1997)"}, {"genres": "Action|Adventure|Drama", "imdbId": "tt172495", "rating": 4, "timestamp": 1445714885, "title": "Gladiator (2000)"}, {"genres": "Action|Crime|Thriller", "imdbId": "tt266697", "rating": 4, "timestamp": 1445714952, "title": "Kill Bill: Vol. 1 (2003)"}, {"genres": "Action|Crime|Drama|Thriller", "imdbId": "tt369339", "rating": 4, "timestamp": 1445714960, "title": "Collateral (2004)"}, {"genres": "Action|Comedy", "imdbId": "tt415306", "rating": 4, "timestamp": 1445715013, "title": "Talladega Nights: The Ballad of Ricky Bobby (2006)"}, {"genres": "Crime|Drama|Thriller", "imdbId": "tt407887", "rati

In [16]:
print(mapped_data)


Dataset({
    features: ['order', 'ratings', 'userId', 'text', 'completion'],
    num_rows: 365
})


In [17]:
# 데이터셋 분할
split_data = mapped_data.train_test_split(test_size=0.1)  # 10%를 테스트셋으로 사용

train_set = split_data['train']
eval_set = split_data['test']

train_set = train_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)
eval_set = eval_set.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, return_tensors="pt"), batched=True)

# lora 파라미터 설정
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)


Map:   0%|          | 0/328 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/37 [00:00<?, ? examples/s]

In [18]:
train_set[3]["text"]
train_set[20]["text"]
train_set[120]["text"]

'user_ratings_info:\n[{"genres": "Crime|Mystery|Thriller", "imdbId": "tt114814", "rating": 4, "timestamp": 1513678307, "title": "Usual Suspects, The (1995)"}, {"genres": "Action|Adventure|Sci-Fi", "imdbId": "tt76759", "rating": 5, "timestamp": 1513602000, "title": "Star Wars: Episode IV - A New Hope (1977)"}, {"genres": "Crime|Drama", "imdbId": "tt111161", "rating": 5, "timestamp": 1513601957, "title": "Shawshank Redemption, The (1994)"}, {"genres": "Comedy|Drama|Romance|War", "imdbId": "tt109830", "rating": 5, "timestamp": 1513601965, "title": "Forrest Gump (1994)"}, {"genres": "Drama|War", "imdbId": "tt108052", "rating": 5, "timestamp": 1513601963, "title": "Schindler\'s List (1993)"}, {"genres": "Action|Drama|War", "imdbId": "tt120815", "rating": 5, "timestamp": 1513601986, "title": "Saving Private Ryan (1998)"}, {"genres": "Action|Crime|Drama|Thriller", "imdbId": "tt137523", "rating": 5, "timestamp": 1513601998, "title": "Fight Club (1999)"}, {"genres": "Crime|Drama", "imdbId": "tt

In [19]:
train_set[3]["completion"]
train_set[20]["completion"]
train_set[120]["completion"]

'{"order_list": ["tt3896198", "tt3501632", "tt3315342", "tt1856101", "tt482571", "tt407887", "tt371746", "tt910970", "tt372784", "tt468569", "tt1345836", "tt76759", "tt137523", "tt2582802", "tt2084970", "tt1375666", "tt120815", "tt120689", "tt109830", "tt108052", "tt111161", "tt114814", "tt5052448", "tt5362988", "tt2527336", "tt5013056"]     }'

In [20]:
model = get_peft_model(model, peft_params)

# prameter
epochs = 3 # 10

batch_size = 2

lr = 2e-4

training_params = TrainingArguments(
    output_dir="models",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    save_strategy="epoch",
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    learning_rate=lr,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    dataloader_num_workers=1,
)


In [33]:
# ! wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
Aborted!


In [28]:
import transformers
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

trainer = Trainer(
    model=model,
    args=training_params,
    train_dataset=train_set,
    eval_dataset=eval_set,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/84 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 55.28 GiB. GPU 0 has a total capacity of 23.68 GiB of which 15.10 GiB is free. Process 3874058 has 1.26 GiB memory in use. Including non-PyTorch memory, this process has 6.58 GiB memory in use. Of the allocated memory 5.61 GiB is allocated by PyTorch, and 93.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [35]:
instruction = '''
사용자가 입력한 키워드를 통해 영화를 추천해줍니다. 영화 title 리스트를 생성해주고, 관련한 메타메이터를 보여줍니다. must return json format
'''
key_info_str = {"keywords": ['toy', 'love', 'universe']}
# genres_str={"genres":['animation']}


In [36]:
# ###instruction:
# {instruction}

text = f'''

keyword_info:
{key_info_str}


'''
inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = model.generate(
    input_ids=inputs["input_ids"].to("cuda"), 
    attention_mask=inputs["attention_mask"], 
    max_new_tokens=256,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)
output = tokenizer.decode(outputs[0])
print(output)


<s> 

keyword_info:
{'keywords': ['toy', 'love', 'universe']}









































































































































































!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


wandb: Network error (ReadTimeout), entering retry loop.
