# Import

In [None]:
import os

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import pandas as pd

from configs import build_exp, EnvDefineUnit
from data_engineering.dataset.precendent import CSVPrecendentDataset, install_pipeline
from data_engineering.prompt_engineering.LLM_template import get_prompt_template
from data_engineering.prompt_engineering.precendent_to_docs import get_prompt_precendent
from data_engineering.prompt_engineering.precendent_to_question import get_prompt_question
from data_engineering.dataset.guideline import PDFDataset
from data_engineering.RAG import build_vectorstore
from model.LLM import load_LLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the environment definition and the experiment configuration.
env = EnvDefineUnit()
config_exp = build_exp('exp_1')

# Paths
path_train = os.path.join(env.PATH_DATA_DIR, config_exp.train)
path_test = os.path.join(env.PATH_DATA_DIR, config_exp.test)

# Directory that holds the guideline files.
pdf_dir = os.path.join(env.PATH_DATA_DIR, 'raw', '건설안전지침')
# os.listdir returns entries in arbitrary order; sort them so the guideline
# files are always listed in the same order across runs and machines.
paths_pdf = [os.path.join(pdf_dir, name) for name in sorted(os.listdir(pdf_dir))]


In [3]:
# Experiment parameters, all read from the experiment configuration.

# Data handling
encoding = config_exp.data_encoding
pipeline = config_exp.data_pipeline

# Prompt template and retrieval-chain selection
prompt_template = config_exp.prompt_template
chain_type1 = config_exp.RAG_chain_type1
chain_type2 = config_exp.RAG_chain_type2

# Model settings (bundled into model_params in the next cell)
model_name = config_exp.model_name
max_new_tokens = config_exp.max_new_tokens
temperature = config_exp.temperature
top_k = config_exp.top_k
top_p = config_exp.top_p

In [4]:
# Bundle the model settings into one dict; it is handed to load_LLM later.
model_params = dict(
    model_name=model_name,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    max_new_tokens=max_new_tokens,
)

In [None]:
# Echo the assembled parameters as a sanity check.
# NOTE(review): the recorded output of this cell shows top_p and top_k as
# one-element tuples ((1.0,) and (-1,)) rather than scalars — possibly a
# stray trailing comma in the experiment config; confirm that load_LLM
# handles these values as intended.
model_params

{'model_name': 'NCSOFT/Llama-VARCO-8B-Instruct',
 'temperature': 0.1,
 'top_p': (1.0,),
 'top_k': (-1,),
 'max_new_tokens': 64}

# Data Load & Pre-processing

In [4]:
# Build the preprocessing pipeline from the configured pipeline setting.
# The setting is read from config_exp rather than from `pipeline` itself:
# this cell rebinds `pipeline`, so re-running it would otherwise feed the
# already-installed pipeline back into install_pipeline.
pipeline = install_pipeline(config_exp.data_pipeline)

In [5]:
# Load the train/test CSVs and the guideline PDFs.
test_data = pd.read_csv(path_test, encoding=encoding)
precendent = pd.read_csv(path_train, encoding=encoding)
# NOTE(review): in the visible cells `guidelines` is only referenced by a
# commented-out vector-store call — confirm it is still needed.
guidelines = PDFDataset(paths_pdf)

# Run both tables through the same preprocessing pipeline.
test_data = pipeline(test_data)
precendent = pipeline(precendent)

# Convert every precedent row with get_prompt_precendent, keeping row order.
precendents = [get_prompt_precendent(record) for _, record in precendent.iterrows()]

# Vector store 생성

In [6]:
# Build the vector store over the precedent documents.
retriever_precendent = build_vectorstore(precendents)
print("벡터스토어 생성 완료")

# TODO: the guideline (PDF) vector store is currently disabled.
# retriever_guidelines = build_vectorstore(guidelines)

# Prompt template.
# The template is looked up from config_exp rather than from
# `prompt_template` itself: this cell rebinds `prompt_template`, so
# re-running it would otherwise pass the already-built template object
# as `exp`.
prompt_template = get_prompt_template(exp=config_exp.prompt_template)
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template.template,
)

  embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)
23322it [00:00, 2995637.84it/s]


벡터스토어 생성 완료
벡터스토어 생성 완료


# Model import

In [7]:

# Load the LLM through load_LLM with the 'load_vllm' option, passing the
# model parameters assembled earlier; the print marks completion.
llm = load_LLM('load_vllm', model_params)
print("모델로드완료")

2025-03-07 09:24:49,736	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 03-07 09:24:49 __init__.py:207] Automatically detected platform cuda.
INFO 03-07 09:24:55 config.py:549] This model supports multiple tasks: {'classify', 'score', 'generate', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-07 09:24:56 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='NCSOFT/Llama-VARCO-8B-Instruct', speculative_config=None, tokenizer='NCSOFT/Llama-VARCO-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=Fa

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:03,  1.00s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:05<00:05,  2.96s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:09<00:03,  3.42s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.73s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:13<00:00,  3.38s/it]



INFO 03-07 09:25:12 model_runner.py:1115] Loading model weights took 5.3129 GB
INFO 03-07 09:25:15 worker.py:267] Memory profiling takes 2.42 seconds
INFO 03-07 09:25:15 worker.py:267] the current vLLM instance can use total_gpu_memory (10.00GiB) x gpu_memory_utilization (0.90) = 9.00GiB
INFO 03-07 09:25:15 worker.py:267] model weights take 5.31GiB; non_torch_memory takes 0.01GiB; PyTorch activation peak memory takes 1.22GiB; the rest of the memory reserved for KV Cache is 2.45GiB.
INFO 03-07 09:25:15 executor_base.py:111] # cuda blocks: 1254, # CPU blocks: 2048
INFO 03-07 09:25:15 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 2.45x
INFO 03-07 09:25:15 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliz

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:22<00:00,  1.53it/s]

INFO 03-07 09:25:38 model_runner.py:1562] Graph capturing finished in 20 secs, took 0.52 GiB
INFO 03-07 09:25:38 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 26.11 seconds
모델로드완료





# RAG chain 생성

In [None]:
# RAG chain over the precedent (DataFrame-based) retriever.
# return_source_documents=True keeps the retrieved documents in the chain
# output next to the generated answer.
chain_df = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever_precendent,
    chain_type=chain_type1,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Inference

In [None]:
from tqdm import tqdm

In [None]:
# Inference: answer every test row with the precedent-based RAG chain.
test_results = []
# iterrows() is a generator without a length, so the row count is passed
# explicitly; without it tqdm cannot show a percentage or an ETA.
for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
    question = get_prompt_question(row)
    result_df = chain_df.invoke(question)

    # TODO: a guideline (PDF) chain could be queried here once it exists.
    # result_pdf = chain_pdf.invoke(question)

    # Only the precedent chain's answer is used; nothing is merged yet.
    final_result = result_df['result']
    test_results.append(final_result)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it, est. speed input: 896.94 toks/s, output: 50.40 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it, est. speed input: 828.30 toks/s, output: 43.03 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it, est. speed input: 825.01 toks/s, output: 47.31 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it, est. speed input: 767.59 toks/s, output: 46.87 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it, est. speed input: 329.22 toks/s, output: 17.34 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it, est. speed input: 877.44 toks/s, output: 43.80 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it, est. speed input: 800.48 toks/s, output: 46.49 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it, est. speed input: 974.37 toks/s, output: 43.58 toks/s]
Processed prompts: 100%|██████████| 1/1 

In [None]:
# Collect the generated answers into a single-column table.
a = pd.DataFrame({'answer': test_results})

In [None]:
# Load the sample result CSV from its fixed workspace path.
# NOTE(review): `b` is not referenced again in the visible cells — confirm
# whether it is still needed.
b = pd.read_csv("/workspace/Storage/hansoldeco3/Data/sample_result.csv")

In [None]:
from wrapup.evaluation import calculate_similarities, scoring

# Reference answers come from the sample test file's
# '재발방지대책 및 향후조치계획' column.
true_sample = pd.read_csv("/workspace/Storage/hansoldeco3/Data/sample/v1/test.csv")
reference_answers = true_sample['재발방지대책 및 향후조치계획']

# Compare the references with the generated answers and report the score.
cossims, jaccardsims = calculate_similarities(reference_answers, a['answer'])
score = scoring(cossims, jaccardsims)
print(score)

0.3558504359625673


# Submission