In [1]:
import warnings
import gc
warnings.filterwarnings('ignore')

In [2]:
import torch
import evaluate
import pandas as pd
from tqdm import tqdm
from peft import PeftModel, PeftConfig
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tqdm.pandas()
gc.collect()
torch.manual_seed(42)

<torch._C.Generator at 0x230d946e730>

In [3]:
full_data_test = pd.read_csv('../dataset/full_test_data_summarization.csv')

In [3]:
checkpoint = './model_checkpoint/checkpoint-1576'

In [5]:
config = PeftConfig.from_pretrained(checkpoint)
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map={"":0},
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    quantization_config=BitsAndBytesConfig(    
        load_in_4bit=True,
        load_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16
    )
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model.config.pad_token_id = base_model.config.eos_token_id

bin d:\PythonVenv\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
model = PeftModel.from_pretrained(base_model, checkpoint, device_map={"":0})
model.eval()

In [7]:
def create_prompt(sample):
    template = """<s>[INST] Bạn là một trợ lý AI. Bạn sẽ được giao một nhiệm vụ. Hãy tóm lược ngắn gọn nội dung sau bằng tiếng Việt:
{} [/INST] """
    prompt = template.format(sample)
    return prompt

In [8]:
def generate_text(text):
  prompt = create_prompt(text)
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = model.generate(
    **inputs,
    early_stopping=False,
    max_new_tokens=1024,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2
  )
  line = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
  torch.cuda.empty_cache()
  return line.split('[/INST]')[1].strip()

In [9]:
full_data_test['summarization_predictions'] = full_data_test['context'].progress_apply(lambda x: generate_text(x))

  0%|          | 0/6006 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/6006 [00:09<7:34:14,  4.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/6006 [00:18<10:39:52,  6.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/6006 [00:25<11:09:01,  6.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/6006 [00:32<11:30:13,  6.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 6/6006 [00:43<13:48:12,  8.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 7/6006 [00:51<13:20:29,  8.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 8/6006 [00:57<12:39:40,  7.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 9/6006 [01:06<13:03:59,  7.84s/it]Setting `

In [10]:
rouge_metric = evaluate.load("rouge")
rouge_scores = rouge_metric.compute(references=full_data_test['summarization'].tolist(), predictions=full_data_test['summarization_predictions'].tolist(), use_stemmer=True, rouge_types=['rouge1', 'rouge2', 'rougeL'])

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
rouge_scores

{'rouge1': 0.6073709993163005,
 'rouge2': 0.3535813077835422,
 'rougeL': 0.4224855441941776}

In [12]:
full_data_test.to_csv('test_mistral_lora.csv', index=False)

In [3]:
tmp = pd.read_csv('./test_mistral_lora.csv')

In [5]:
tmp1 = tmp[0:1179]

In [6]:
# VLSP dataset
tmp2 = tmp[1179:1427]

In [7]:
# Wikilingua
tmp3= tmp[1427:4095]

In [7]:
tmp4 = tmp[4095:]

In [8]:
# News dataset
tmp5 = pd.concat([tmp1, tmp4], axis=0)

In [9]:
tmp5.shape

(3090, 3)

In [13]:
rouge_metric = evaluate.load("rouge")

In [15]:
rouge_metric.compute(references=tmp2['summarization'].tolist(), predictions=tmp2['summarization_predictions'].tolist(), use_stemmer=True, rouge_types=['rouge1', 'rouge2', 'rougeL'])

{'rouge1': 0.5514556226356194,
 'rouge2': 0.24268137376339052,
 'rougeL': 0.3340222922669962}

In [16]:
rouge_metric.compute(references=tmp3['summarization'].tolist(), predictions=tmp3['summarization_predictions'].tolist(), use_stemmer=True, rouge_types=['rouge1', 'rouge2', 'rougeL'])

{'rouge1': 0.5559046758295272,
 'rouge2': 0.28220273027968323,
 'rougeL': 0.38901493848949387}

In [17]:
rouge_metric.compute(references=tmp5['summarization'].tolist(), predictions=tmp5['summarization_predictions'].tolist(), use_stemmer=True, rouge_types=['rouge1', 'rouge2', 'rougeL'])

{'rouge1': 0.6562189005835162,
 'rouge2': 0.4241398731164239,
 'rougeL': 0.45856896965371063}