In [1]:
from llama_cpp import Llama
from transformers import AutoTokenizer

In [2]:
# Directory containing the fine-tuned model artifacts; the tokenizer is
# loaded from it with the Hugging Face AutoTokenizer.
model_path = './new_model_inst-summ'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Constructor arguments for the llama-cpp model, gathered in one place so
# they are easy to inspect and tweak.
llama_init_kwargs = {
    "model_path": './new_model_inst-summ/unsloth.Q4_K_M.gguf',  # location of the downloaded model
    "n_ctx": 2048,
    "n_gpu_layers": 33,  # Number of model layers to offload to GPU
}

model = Llama(**llama_init_kwargs)

llama_model_loader: loaded meta data with 25 key-value pairs and 291 tensors from ./new_model_inst-summ/unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Model
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                       llama.context_length u32              = 8192
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   7:                  llama.feed_forward_lengt

In [5]:
import pandas as pd
import random

# Load the news/summary pairs and keep a seeded 20% subset of the rows.
df = pd.read_csv('./news_summ.csv', encoding='UTF-8')

sampled_df = df.sample(frac=0.2, random_state=42)

news = list(sampled_df['news'])
summ = list(sampled_df['summary'])

print(len(news))
print(len(summ))

# Pick the evaluation subset (up to 200 pairs).
# - The index range comes from the actual list length rather than a
#   hard-coded row count, so a changed CSV cannot cause an IndexError or
#   silently exclude rows.
# - A dedicated seeded generator makes the subset identical on every run
#   without touching the global `random` state.
# - The size is capped so a small dataset does not raise ValueError.
n_eval = min(200, len(news))
random_indices = random.Random(42).sample(range(len(news)), n_eval)

sample_news = [news[i] for i in random_indices]
sample_summ = [summ[i] for i in random_indices]

8561
8561


In [6]:
# System instruction sent with every request. This is runtime text and is
# kept verbatim; it is the same for every article, so it is defined once.
PROMPT = "정확한 챗봇으로서 상대방의 입력에 대해 요약을 하자. 모든 대답은 한국어(Korean)으로 대답해줘."

# Generation settings are identical for every article, so build them once
# instead of on each loop iteration.
generation_kwargs = {
    "max_tokens": 2048,
    "stop": ["<|eot_id|>"],
    "top_p": 0.9,
    "temperature": 0.6,
    "echo": True,  # include the prompt in the returned text
}

# Model-generated summaries, in the same order as `sample_news`.
output_summ = []

for text in sample_news:
    instruction = text

    messages = [
        {"role": "system", "content": PROMPT},
        {"role": "user", "content": f"{instruction}"},
    ]

    # Render the chat messages into a single prompt string (not token ids),
    # ending with the generation prompt for the model's reply.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    response_msg = model(prompt, **generation_kwargs)

    # With echo=True the returned text starts with the prompt; slice it off
    # to keep only the generated part. The result is deliberately NOT stored
    # in `summ`, so the list of reference summaries built when the data was
    # loaded is not overwritten by a single string.
    generated_summ = response_msg['choices'][0]['text'][len(prompt):]

    output_summ.append(generated_summ)


llama_print_timings:        load time =     351.83 ms
llama_print_timings:      sample time =       1.97 ms /    38 runs   (    0.05 ms per token, 19328.59 tokens per second)
llama_print_timings: prompt eval time =     554.97 ms /   805 tokens (    0.69 ms per token,  1450.52 tokens per second)
llama_print_timings:        eval time =     636.20 ms /    37 runs   (   17.19 ms per token,    58.16 tokens per second)
llama_print_timings:       total time =    1207.29 ms /   842 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     351.83 ms
llama_print_timings:      sample time =       2.27 ms /    44 runs   (    0.05 ms per token, 19400.35 tokens per second)
llama_print_timings: prompt eval time =     292.84 ms /   468 tokens (    0.63 ms per token,  1598.12 tokens per second)
llama_print_timings:        eval time =     727.55 ms /    43 runs   (   16.92 ms per token,    59.10 tokens per second)
llama_print_timings:       total time =    1038.37 ms /   511 

In [7]:
import pickle

# Persist the reference summaries and the generated summaries as two pickle
# files. Both files are opened before anything is written.
with open('summ.pkl', 'wb') as reference_file, \
        open('output_summ.pkl', 'wb') as generated_file:

    for payload, sink in (
        (sample_summ, reference_file),
        (output_summ, generated_file),
    ):
        pickle.dump(payload, sink)