
# RAG + LLM의 template (hf+openai)

> 1) openai에서 huggingface로 전환
> 2) 목적 : langchain기능 사용해보기, web으로 문서파일을 업로드하고, document을 읽어서, split해서, chroma에 추가하는 기능
> 3) 확장해보기 : text, pdf등 문서읽기로 확장
> 4) 선행작업 : HF_TOKEN, OPENAI_API_KEY 등록후 획득

> 아래에는 huggingface와 openai방식으로 2개의 예제 있음.

> 참고: 학원에서 제공해주는 교육 프로그램의 내용
> https://ragmasterpath.oopy.io/

![qa image](images/chromadb.PNG)
![qa image](images/llm.PNG)

In [32]:
from dotenv import load_dotenv
load_dotenv("/home/mhkwon/.env")

import os

#HF_TOKEN = "get your token in http://hf.co/settings/tokens"
HF_TOKEN = os.getenv('HF_TOKEN')
print(HF_TOKEN)

#from huggingface_hub import login
#hf_token = login(token=HF_TOKEN, add_to_git_credential=True)

# 에러가 나면, linux에서 다음 명령어를 실행
# git config --global credential.helper store

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
Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/mhkwon/.cache/huggingface/token
Login successful



# 0) huggingface용 llm template


In [126]:
# 1) 표준 템플릿
# "test_llm_general.ipynb"에서 복사

#############################################################
# 0) 선언 부분

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# cpu/gpu를 선택 또는 지정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = "cpu"
print('Using device:', device)

model_id = 'meta-llama/Llama-3.2-1B' # 에러 발생
#model_id = 'LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct'
#model_id = 'naver-clova-ix/donut-base-finetuned-docvqa' # 에러 발생
#model_id = 'naver-clova-ix/donut-base-finetuned-cord-v2' # 에러 발생
#model_id = 'upstage/SOLAR-10.7B-v1.0'
#model_id = 'yanolja/EEVE-Korean-10.8B-v1.0'
#model_id = 'migtissera/Trinity-2-Codestral-22B-v0.2'
#model_id = 'BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B'


def my_aiquery(model_name, messages):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        #trust_remote_code=True,  # exaone only
    )

    # 결과값을 보여주는 template
    if tokenizer.chat_template is None:
        tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"

    #############################################################
    # 1) prompt과정
    #instruction = "철수가 20개의 연필을 가지고 있었는데 영희가 절반을 가져가고, 민수가 남은 5개를 가져갔으면 철수에게 남은 연필의 갯수는 몇개인가요?"
    #messages = [
    #    {"role": "user", "content": f"{instruction}"}
    #    ]
    
    #############################################################
    # 2) tokenizer과정
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    
    # 'unsloth/Llama-3.2-1B-Instruct 사용시에는 다음을 막아야 함.
    if model_name == 'meta-llama/Llama-3.2-1B':
        model.generation_config.pad_token_id = model.generation_config.eos_token_id
        model.generation_config.pad_token_id = tokenizer.pad_token_id   # 설정하지 않으면, 무한 CPU 실행
    
    #if tokenizer.pad_token is None:
    #    tokenizer.pad_token = tokenizer.eos_token
    #if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    # 에러해결
    # The attention mask is not set and cannot be inferred from input because pad token is same as eos token. 
    # As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
    
    #terminators = [
    #    tokenizer.eos_token_id,
    #    tokenizer.convert_tokens_to_ids("<|eot_id|>")
    #]
    
    #############################################################
    # 3) LLM과정
    
    # 실행시간을 측정하는 모듈
    import time
    
    start_time = time.time()
    outputs = model.generate(
        input_ids,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        #eos_token_id=terminators, 
        pad_token_id = tokenizer.eos_token_id,  # llama 3.2, bllossom
    )
    end_time = time.time()
    print('elapsed time =', end_time - start_time)
    
    
    #############################################################
    # 4) decoder과정
    #answer = tokenizer.decode(outputs[0])
    #print(answer)
    
    # 특수 토근을 제거하고, 출력
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    #print(response)

    return response


Using device: cuda


In [8]:
model.__dir__()

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_beam_search',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_constrained_beam_search',
 '_contrastive_search',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_dola_decoding',
 '_expand_inputs_for_generation',

In [15]:
model.config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 128256
}

In [23]:
model.generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9
}

In [34]:
print(model.get_input_embeddings())
print(model.get_output_embeddings())
print(model.name_or_path)
print(model.main_input_name)
print(model.hf_device_map)

Embedding(128256, 2048)
Linear(in_features=2048, out_features=128256, bias=False)
meta-llama/Llama-3.2-1B
input_ids
{'': 0}


In [None]:
'''
# 사용자질의를 이미 만들어진 chroma db에서 정보를 가져와서, chagpt에 물어보기 방식으로 만든 챗봇
# flask4.py에서 url로 가져와서 만들어진 chroma db를 활용한다.


# cli에서 실행
# python flask5.py 


# 기본 개념 : https://cookbook.chromadb.dev/core/concepts/
# gihub 소스 : https://github.com/chroma-core/chroma


sqlite> .schema collections
CREATE TABLE IF NOT EXISTS "collections" (
    id TEXT PRIMARY KEY, -- unique globally
    name TEXT NOT NULL, -- unique per database
    dimension INTEGER,
    database_id TEXT NOT NULL REFERENCES databases(id) ON DELETE CASCADE,
    UNIQUE (name, database_id)
);
sqlite> 
sqlite> select * from collections;
76e260af-b248-414b-a868-781afb6e59af|langchain|1536|00000000-0000-0000-0000-000000000000


# https://docs.trychroma.com/guides

# langchain
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/

# 전체 기능들:
# https://python.langchain.com/v0.2/docs/how_to/#vector-stores

# 이런 에레가 발생하면, embedding vector가 서로 다르다는 것이다.
InvalidDimensionException: Embedding dimension 768 does not match collection dimensionality 1536

'''


# 1) Huggingface용 chatbot-rag template

[관련 vectorestore 만들고 오기](test_indexing_loader_splitter_chroma-hf.ipynb)

In [129]:
#1) Huggingface용 chatbot-rag template

# "test_indexing_loader_splitter_chroma-hf.ipynb"를 통해서 만들어진
# vectorstore "chroma_store_hf"를 사용한다.


from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate

model_name = 'meta-llama/Llama-3.2-1B'


hf_embeddings = HuggingFaceEmbeddings()
   
vectorstore = Chroma(persist_directory='chroma_store_hf', embedding_function = hf_embeddings)

def build_prompt(query, context):
    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "I am going to ask you a question, which I would like you to answer"
            "based only on the provided context, and not any other information."
            "If there is not enough information in the context to answer the question,"
            'say "I am not sure", then try to make a guess.'
            "Break your answer up into nicely readable paragraphs.",
        ),
        (
            "human", 
            f"The question is {query}. Here is all the context you have:"
            #f'{(" ").join(context)}',
            f'{context}',
        ),
    ])
    #print(prompt)
    return prompt


def get_chatHF_response(query, context, model_name):
    response = my_aiquery(
        model_name=model_name,
        messages=build_prompt(query, context),)
    #print(response)
    return response


while True:
    # Get the user's query
    query = input("Query: ")
    if len(query) == 0:
        print("Please enter a question. Ctrl+C to Quit.\n")
        continue
    if query == 'quit':
        break
    print(f"\nThinking using {model_name}...\n")
    
    # Query the collection to get the 5 most relevant results
    results = vectorstore.search(
        query=query, 
        search_type="similarity", # "mmr", "similarity_score_threshold"
        k=5,
        #include=["documents", "metadatas"]
    )
    
    
    sources = "\n".join(
        [   f"{result.metadata['source']}"
            for result in results 
        ]
    )
    
    # Get the response from GPT
    res = ''
    if len(results) > 0:
        res = results[0].page_content
        
    response = get_chatHF_response(query, res, model_name)  # type: ignore

    # Output, with sources
    print(response)
    print("\n")
    print(f"Source documents:\n{sources}")
    print("----"*20)
    print("\n")


Query:  대한민국의 수도는?


Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4



Thinking using meta-llama/Llama-3.2-1B...

elapsed time = 2.169771194458008
def calculate_sediment_volume(sediment_type, sediment_density, sediment_thickness, sediment_area):
    """
    Calculate the volume of sediment in a given area.

    Args:
        sediment_type (str): The type of sediment (e.g. "sand", "silt", "clay").
        sediment_density (float): The density of the sediment in kg/m^3.
        sediment_thickness (float): The thickness of the sediment in meters.
        sediment_area (float): The area of the sediment in square meters.

    Returns:
        float: The volume of the sediment in cubic meters.
    """
    # Calculate the volume of the sediment in cubic meters
    volume = sediment_area * sediment_density * sediment_thickness

    # Check if the sediment is of a specific type
    if sediment_type == "sand":
        # Calculate the volume of the sand in cubic meters
        sand_volume = volume * 0.5
        # Check if the sand volume is


Source documents:
http

Query:  quit



# 2) OpenAI용 template


[관련 vectorestore 만들고 오기](test_indexing_loader_splitter_chroma.ipynb)


In [31]:
# OpenAI 버전

# "test_indexing_loader_splitter_chroma.ipynb"를 통해서 만들어진 vectorstore를 사용한다.


from langchain_chroma import Chroma
from openai.types.chat import ChatCompletionMessageParam
import openai

model_id = "gpt-3.5-turbo"
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vectorstore_openai = Chroma(persist_directory='chroma_store', embedding_function = embeddings)

def build_prompt_openai(query, context):
    system: ChatCompletionMessageParam = {
        "role": "system",
        "content": "I am going to ask you a question, which I would like you to answer"
        "based only on the provided context, and not any other information."
        "If there is not enough information in the context to answer the question,"
        'say "I am not sure", then try to make a guess.'
        "Break your answer up into nicely readable paragraphs.",
    }
    user: ChatCompletionMessageParam = {
        "role": "user",
        "content": f"The question is {query}. Here is all the context you have: "
        "{context}"
        #f'{(" ").join(context)}',
    }

    return [system, user]

def get_chatGPT_response(query, context, model_id):
    response = openai.chat.completions.create(
        model=model_id,
        messages=build_prompt_openai(query, context),
    )
    #print(response)

    return response.choices[0].message.content  # type: ignore
######

    
vectorstore = Chroma(persist_directory='chroma_store', embedding_function = embeddings)


while True:

    # Get the user's query
    query = input("Query: ")
    if len(query) == 0:
        print("Please enter a question. Ctrl+C to Quit.\n")
        continue
    if query == 'quit':
        break
    print(f"\nThinking using {model_id}...\n")
    
    # Query the collection to get the 5 most relevant results
    results = vectorstore.search(
        query=query, 
        search_type="similarity", # "mmr", "similarity_score_threshold"
        #n_results=5, # 아래 k로 대체
        k=5, 
        #include=["documents", "metadatas"]
    )
    
    sources = "\n".join(
        [   f"{result.metadata['source']}"
            for result in results 
        ]
    )
    
    # Get the response from GPT
    res = ''
    if len(results) > 0:
        res = results[0].page_content
        
    response = get_chatGPT_response(query, res, model_id)  # type: ignore
    
    # Output, with sources
    print(response)
    print("\n")
    print(f"Source documents:\n{sources}")
    print("----"*20)
    print("\n")


Query:  한국의 수도는?



Thinking using gpt-3.5-turbo...

--------------------------------------------------------------------------------
Based on the limited context provided, the question "한국의 수도는?" translates to "What is the capital of South Korea?" The answer to this question is "서울 (Seoul)." Seoul is the capital city of South Korea and the largest metropolis in the country. It serves as the political, cultural, and economic center of South Korea, making it the capital city. Therefore, based on the context given, the answer to the question is 서울 (Seoul).


Source documents:
https://www.etnews.com/20240823000244
https://www.etnews.com/20240823000244
https://www.etnews.com/20240823000244
https://www.etnews.com/20240823000244
https://www.etnews.com/20240823000244




Query:  quit


In [27]:
f'{(" ").join(results)}'

TypeError: sequence item 0: expected str instance, Document found

In [19]:
sources = "\n".join(
    [   f"{result.metadata['source']}"
        for result in results 
    ]
)
sources

'https://www.etnews.com/20240823000244\nhttps://www.etnews.com/20240823000244\nhttps://www.etnews.com/20240823000244\nhttps://www.etnews.com/20240823000244\nhttps://www.etnews.com/20240823000244'

In [9]:
results[0]

Document(metadata={'description': '미래를 보는 창 - 전자신문', 'language': 'ko', 'source': 'https://www.etnews.com/20240823000244', 'title': "카카오브레인, AI 헬스케어 전담 '씨엑스알랩'으로 분할한다 - 전자신문"}, page_content='차 부수고 시민 위협한 20대 음주 운전자뉴스속보SWIT경제전자모빌리티플랫폼/유통과학\xa0정치오피니언국제전국스포츠특집연재라이프연예포토공연전시생활문화비주얼IT이슈플러스Hot 영상뷰포인트인포그래픽부가서비스ConferenceallshowTV시사용어PDF서비스서비스안내신문구독신청콘텐츠구매초판서비스회원서비스내 스크랩이용안내지면광고안내행사문의디지털광고안내이용약관개인정보취급방침고충처리회사소개전자신문전자신문인터넷연혁CI소개회사위치 지면광고안내행사문의디지털광고안내이용약관개인정보취급방침고충처리사이트맵전자신문회사소개주소 : 서울시 서초구 양재대로2길 22-16 호반파크1관대표번호 : 02-2168-9200등록번호 : 서울 아04494등록일자 : 2017년 04월 27일사업자명 : 전자신문인터넷사업자번호 : 107-81-80959발행·편집인: 심규호청소년보호책임자: 김인기Copyright © Electronic Times Internet. All Rights Reserved.')

In [12]:
results[0].metadata

{'description': '미래를 보는 창 - 전자신문',
 'language': 'ko',
 'source': 'https://www.etnews.com/20240823000244',
 'title': "카카오브레인, AI 헬스케어 전담 '씨엑스알랩'으로 분할한다 - 전자신문"}

In [13]:
results[0].metadata['title']

"카카오브레인, AI 헬스케어 전담 '씨엑스알랩'으로 분할한다 - 전자신문"