
# LLM에서 질의응답하는 처리하는 단계를 기술(표준)
# UPDATE : 2024.11.06

# py39에서 실행

In [1]:
# 3 다음은 파일("/home/사용자아이디/.env")에 저장된 API_KEY 정보를 자동으로 읽어오는 기능

from dotenv import load_dotenv
load_dotenv("/home/mhkwon/.env")

import os

#HF_TOKEN = "get your token in http://hf.co/settings/tokens"
HF_TOKEN = os.getenv('HF_TOKEN')
print(HF_TOKEN)

from huggingface_hub import login
hf_token = login(token=HF_TOKEN, add_to_git_credential=True)

# 에러가 나면, linux에서 다음 명령어를 실행
# git config --global credential.helper store

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


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
import os
os.getcwd()

'/linuxhdd/mychatbot/my_note'

In [3]:

import torch

print(torch.__version__)

2.6.0.dev20241030+cu121


In [2]:
import transformers

print(transformers.__version__)

4.44.0


In [24]:
#import tracemalloc

#tracemalloc.start()


<em>LLM의 문자질의에 대한 답변을 처리 구조</em>

![qa image](images/qa_arch.JPG)

![qa image](images/qa_process.JPG)



# 필수 버전

!pip install transformers==4.44.0

![qa image](images/error-llama.JPG)

In [7]:
import torch
import gc
import os

def print_gpu_memory():
    allocated = torch.cuda.memory_allocated() / (1024**2)
    cached = torch.cuda.memory_reserved() / (1024**2)
    print(f"Allocated: {allocated:.2f} MB")
    print(f"Cached: {cached:.2f} MB")

import psutil
def print_cpu_memory():
    # gives a single float value
    print('cpu usage(%) :', psutil.cpu_percent())
    # gives an object with many fields
    #print(psutil.virtual_memory())
    # you can convert that object to a dictionary 
    #print(dict(psutil.virtual_memory()._asdict()))
    
    # you can have the percentage of used RAM
    print('memory usage(%) : ', psutil.virtual_memory().percent)
    # you can calculate percentage of available memory
    #print(psutil.virtual_memory().available * 100 / psutil.virtual_memory().total)

    #memory_usage_dict = dict(psutil.virtual_memory()._asdict())
    #memory_usage_percent = memory_usage_dict['percent']
    #print(f"AFTER  CODE: memory_usage_percent: {memory_usage_percent}%")
    
    # current process RAM usage
    pid = os.getpid()
    current_process = psutil.Process(pid)
    current_process_memory_usage_as_KB = current_process.memory_info()[0] / 2.**20
    print(f"Current Process memory KB   : {current_process_memory_usage_as_KB: 9.3f} KB")
    
    return psutil.virtual_memory().available

#print_cpu_memory()

cpu usage(%) : 0.1
memory usage(%) :  2.0
Current Process memory KB   :  3118.465 KB


264841940992

In [8]:
# 1) 표준 템플릿

print_gpu_memory()
print("----------------")
start_memory = print_cpu_memory()

#############################################################
# 0) 선언 부분

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# cpu/gpu를 선택 또는 지정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Detected device:', device)
device = "cpu" 
#print('Using device:', device)

model_id = 'meta-llama/Llama-3.2-1B' # 정상동작,  transformers==4.44.0에선, attention mask 에러
#model_id = 'yanolja/EEVE-Korean-10.8B-v1.0' # 정상동작
#model_id = 'upstage/SOLAR-10.7B-v1.0' #정상동작

#model_id = 'Bllossom/llama-3.2-Korean-Bllossom-3B' # transformers==4.44.0에선, untagged enum ModelWrapper어레
#model_id = 'LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct'
#model_id = 'naver-clova-ix/donut-base-finetuned-docvqa' # 에러 발생
#model_id = 'naver-clova-ix/donut-base-finetuned-cord-v2' # 에러 발생
#model_id = 'migtissera/Trinity-2-Codestral-22B-v0.2'
#model_id = 'BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B'


tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    #device_map=device,
    #trust_remote_code=True,  # exaone only
).to(device)


#############################################################
# 1) prompt과정

# 결과값을 보여주는 template
if tokenizer.chat_template is None:
    tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"

instruction = "철수가 20개의 연필을 가지고 있었는데 영희가 절반을 가져가고, 민수가 남은 5개를 가져갔으면 철수에게 남은 연필의 갯수는 몇개인가요?"

messages = [
    {"role": "user", "content": f"{instruction}"}
    ]


#############################################################
# 2) tokenizer과정
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(device)

# 'unsloth/Llama-3.2-1B-Instruct 사용시에는 다음을 막아야 함.
#if model_id == 'meta-llama/Llama-3.2-1B':
#    model.generation_config.pad_token_id = model.generation_config.eos_token_id
#    model.generation_config.pad_token_id = tokenizer.pad_token_id   # 설정하지 않으면, 무한 CPU 실행

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id  # attention mask 에러 해결

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

#############################################################
# 3) LLM과정

# 실행시간을 측정하는 모듈
import time

start_time = time.time()
outputs = model.generate(
    input_ids,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    #eos_token_id=terminators, 
    pad_token_id = tokenizer.eos_token_id,  # llama 3.2, bllossom
)
end_time = time.time()
print('elapsed time =', end_time - start_time)


#############################################################
# 4) decoder과정
answer = tokenizer.decode(outputs[0])

# 특수 토근을 제거하고, 출력
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))


print("----------------")
print("\nAfter Job:")
print_gpu_memory()
end_memory = print_cpu_memory()
print("memory used = {:,}".format(start_memory - end_memory))

# Clearing cache
gc.collect()
torch.cuda.empty_cache()

Allocated: 0.00 MB
Cached: 0.00 MB
----------------
cpu usage(%) : 0.1
memory usage(%) :  2.0
Current Process memory KB   :  3118.465 KB
Detected device: cuda
elapsed time = 84.71294641494751
Question: A problem in a certain bank requires that the total amount of money in the bank is to be increased by 20% of the current amount. The total amount of money in the bank is 3000. How much money is in the bank now? A. 3000 B. 3500 C. 4000 D. 4500 E. 5000
Answer: C
----------------

After Job:
Allocated: 0.00 MB
Cached: 0.00 MB
cpu usage(%) : 5.9
memory usage(%) :  2.0
Current Process memory KB   :  3193.629 KB
memory used = 75,145,216


In [9]:
import sys

def get_size(obj, seen=None):
    """Recursively finds size of objects"""

    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()

    obj_id = id(obj)
    if obj_id in seen:
        return 0

    #print(type(obj).__name__)
    # Important mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)

    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])

    return size


In [17]:
# CPU usage

print('elapsed time =', end_time - start_time)
print('model =', get_size(model))
print('tokenizer =', get_size(tokenizer))
print('input_ids =', get_size(input_ids))
print('outputs =', get_size(outputs))

#model_id = 'meta-llama/Llama-3.2-1B'
#elapsed time = 160.56574654579163
#model = 1003265
#tokenizer = 4807
#input_ids = 184
#outputs = 184

elapsed time = 160.56574654579163
model = 1003265
tokenizer = 4807
input_ids = 184
outputs = 184


In [10]:
# CUDA usage

print('elapsed time =', end_time - start_time)
print('model =', get_size(model))
print('tokenizer =', get_size(tokenizer))
print('input_ids =', get_size(input_ids))
print('outputs =', get_size(outputs))

#model_id = 'meta-llama/Llama-3.2-1B'
#elapsed time = 0.625960111618042
#model = 1006393
#tokenizer = 4807
#input_ids = 184
#outputs = 184

# 위에서 GPU MEM를 사용한것을 보면, 24GB인데, 실제 model은 1GB이다.
# 이유는, model내부에서 vector 계산을 하면서, 많은 메모리가 소요되었다.

elapsed time = 2.825634479522705
model = 1006393
tokenizer = 4807
input_ids = 184
outputs = 184


In [25]:
#snapshot = tracemalloc.take_snapshot()

#for stat in snapshot.statistics('lineno'):
#    print(stat)
#    print(stat.traceback.format())


![qa image](images/result-llm-general.JPG)

![qa image](images/error-attention-pad.JPG)

In [23]:
# 질의 문자열을 토큰으로 나투어서 숫자로 변환

encoded = tokenizer.encode(instruction)
encoded

[128000,
 107837,
 123503,
 220,
 508,
 123590,
 78453,
 110174,
 18359,
 120693,
 107417,
 103170,
 101603,
 105204,
 20565,
 110217,
 101738,
 18359,
 89946,
 20565,
 35495,
 11,
 107138,
 123503,
 102484,
 34804,
 220,
 20,
 117594,
 89946,
 14705,
 242,
 91040,
 112521,
 24140,
 102244,
 102484,
 34804,
 78453,
 110174,
 21028,
 17196,
 107,
 24140,
 16969,
 113156,
 60861,
 115372,
 36811,
 30]

In [20]:
# 숫자를 다시, 원문자(토큰)로 변환(복구)

import numpy as np

encoded_arr = numpy.array(encoded)
decoded = tokenizer.decode(encoded_arr)
decoded

'<|begin_of_text|>철수가 20개의 연필을 가지고 있었는데 영희가 절반을 가져가고 민수가 남은 5개를 가져갔으면 철수에게 남은 연필의 갯수는 몇개인가요?'

In [21]:
tokenizer

PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-1B', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|reserved_special_token_2|>", rst

In [6]:
# 단순 질의 문자열을 prompt로 변환해서, 전달

messages

[{'role': 'user',
  'content': '철수가 20개의 연필을 가지고 있었는데 영희가 절반을 가져가고 민수가 남은 5개를 가져갔으면 철수에게 남은 연필의 갯수는 몇개인가요?'}]

In [4]:
input_ids

tensor([[112521, 123503,    220,    508, 123590,  78453, 110174,  18359, 120693,
         107417, 103170, 101603, 105204,  20565, 110217, 101738,  18359,  89946,
          20565,  35495,     11, 107138, 123503, 102484,  34804,    220,     20,
         117594,  89946,  14705,    242,  91040, 112521,  24140, 102244, 102484,
          34804,  78453, 110174,  21028,  17196,    107,  24140,  16969, 113156,
          60861, 115372,  36811,     30, 128001]], device='cuda:0')

In [5]:
input_ids.shape

torch.Size([1, 50])

In [6]:
outputs

tensor([[112521, 123503,    220,    508, 123590,  78453, 110174,  18359, 120693,
         107417, 103170, 101603, 105204,  20565, 110217, 101738,  18359,  89946,
          20565,  35495,     11, 107138, 123503, 102484,  34804,    220,     20,
         117594,  89946,  14705,    242,  91040, 112521,  24140, 102244, 102484,
          34804,  78453, 110174,  21028,  17196,    107,  24140,  16969, 113156,
          60861, 115372,  36811,     30, 128001, 128000,  14924,     25,    220,
            508, 123590,  78453, 110174,  18359, 120693, 107417, 103170, 101603,
         105204,  20565, 110217, 101738,  18359,  89946,  20565,  35495,     11,
         107138, 123503, 102484,  34804,    220,     20, 117594,  89946,  14705,
            242,  91040, 112521,  24140, 102244, 102484,  34804,  78453, 110174,
          21028,  17196,    107,  24140,  16969, 113156,  60861, 115372,  36811,
             30,    362,     13,    220,    605,    426,     13,    220,    868,
            356,     13,    

In [7]:
outputs.shape

torch.Size([1, 200])

In [3]:
answer

' 철수가 20개의 연필을 가지고 있었는데 영희가 절반을 가져가고 민수가 남은 5개를 가져갔으면 철수에게 남은 연필의 갯수는 몇개인가요?<|end_of_text|><|begin_of_text|>Question: 20개의 연필을 가지고 있었는데 영희가 절반을 가져가고 민수가 남은 5개를 가져갔으면 철수에게 남은 연필의 갯수는 몇개인가요?Answer: 10개Answer: 10개<|end_of_text|>'