In [1]:
import json
import os
import re

import torch
from tqdm import tqdm

In [2]:
from datasets import load_dataset
# Language pair to evaluate; see the CrossSum dataset card for the available language names.
src_lang = "english"
tgt_lang = "thai"
# CrossSum configurations are named "<source>-<target>" (here: "english-thai").
dataset = load_dataset("csebuetnlp/CrossSum", f"{src_lang}-{tgt_lang}")

Found cached dataset cross_sum (/home/alta/summary/pm574/.cache/datasets/cross_sum/english-thai/1.0.0/eb4f77f2fc1b67d7b1b8d20108d04a2916d9c6cbdad1aa984beea9494fe3a12b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['source_url', 'target_url', 'summary', 'text'],
        num_rows: 816
    })
    test: Dataset({
        features: ['source_url', 'target_url', 'summary', 'text'],
        num_rows: 102
    })
    validation: Dataset({
        features: ['source_url', 'target_url', 'summary', 'text'],
        num_rows: 102
    })
})

In [19]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [17]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Tokenizer and model are loaded from the same pretrained checkpoint.
checkpoint = "csebuetnlp/mT5_m2m_crossSum"
# use_fast=False requests the slow (Python) tokenizer.
# NOTE(review): presumably needed because get_lang_id calls the private
# `_convert_token_to_id` method -- confirm before switching to a fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
# Move the weights to the selected device and switch to inference mode in one chain.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device).eval()

In [20]:
# Raw-string pattern: avoids the invalid "\s" escape warning of a plain string literal.
# "\s+" already matches newlines, so a separate "\n+" pass is unnecessary.
_WHITESPACE_RE = re.compile(r"\s+")


def WHITESPACE_HANDLER(k):
    """Strip `k` and collapse every run of whitespace (including newlines) into one space."""
    return _WHITESPACE_RE.sub(" ", k.strip())


def get_lang_id(lang):
    """Return the vocabulary id of the language token registered for `lang`.

    The token is read from index 1 of the model config's
    ``task_specific_params["langid_map"][lang]`` entry and converted with the
    tokenizer. Raises ``KeyError`` if `lang` is not present in the map.
    """
    lang_token = model.config.task_specific_params["langid_map"][lang][1]
    # Private tokenizer method kept as in the original recipe.
    return tokenizer._convert_token_to_id(lang_token)


# Language the summaries are generated in; must be a key of the model's
# "langid_map" and should match `tgt_lang` used when loading the dataset.
target_lang = "thai"

In [34]:
# Generate one summary per test article with beam search.
# `outputs` maps the test-set row index (int) to the decoded summary string.
outputs = {}
test_split = dataset['test']
# The decoder start token depends only on the target language, so it is
# looked up once rather than on every iteration.
decoder_start_token_id = get_lang_id(target_lang)
for idx in tqdm(range(len(test_split))):
    source = test_split[idx]['text']
    # Articles are truncated / padded to exactly 512 tokens.
    encoded = tokenizer(
        [WHITESPACE_HANDLER(source)],
        return_tensors="pt", padding="max_length",
        truncation=True, max_length=512).to(device)

    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        # Inputs are padded to max_length, so hand the tokenizer's padding mask
        # to generate() explicitly instead of relying on it being inferred
        # from the pad token ids.
        attention_mask=encoded["attention_mask"],
        decoder_start_token_id=decoder_start_token_id,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]  # batch of one -> keep the single returned sequence

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    outputs[idx] = summary

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [01:36<00:00,  1.06it/s]


In [36]:
# Persist the generated summaries as a JSON object.
# Note: json.dump writes the integer row indices as string keys, and (with the
# default ensure_ascii=True) escapes non-ASCII characters in the summaries.
output_path = "CrossSum_En2Th_outputs/test.json"
# Create the output directory on first run so a fresh checkout does not fail
# with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(outputs, f)

# Evaluation

In [48]:
import numpy as np
import pythainlp
from rouge_score import rouge_scorer

In [43]:
class ThaiTokenizer:
    """Adapter that exposes PyThaiNLP word segmentation through a ``tokenize(text)`` method.

    Instances are meant to be passed as the ``tokenizer`` argument of the ROUGE scorer.
    """

    def __init__(self, engine="newmm"):
        """Remember the segmentation engine name (``"newmm"`` is the default option)."""
        # Engine name forwarded to the segmentation function on every call.
        self.engine = engine
        # Underlying word segmentation function from PyThaiNLP.
        self.tokenizer = pythainlp.tokenize.word_tokenize

    def tokenize(self, text):
        """Segment ``text`` with the configured engine and return the result."""
        segment = self.tokenizer
        return segment(text, engine=self.engine)
# Thai word segmenter handed to the ROUGE scorer in place of its default tokenizer.
my_tokenizer = ThaiTokenizer(engine="newmm")

# Google's reimplementation -- https://github.com/google-research/google-research/blob/e3d00617cb28064b6e96ab4e2485079f0ca5a763/rouge/rouge_scorer.py#L60
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, tokenizer=my_tokenizer)

In [49]:
# Per-example ROUGE-1 / ROUGE-2 / ROUGE-L F-measures, in test-set order.
R1_arr, R2_arr, RL_arr = [], [], []
for idx in range(len(outputs)):
    reference = dataset['test'][idx]['summary']
    # scorer.score takes the reference first, then the generated summary.
    rouge = scorer.score(reference, outputs[idx])
    R1_arr.append(rouge['rouge1'].fmeasure)
    R2_arr.append(rouge['rouge2'].fmeasure)
    RL_arr.append(rouge['rougeL'].fmeasure)

In [57]:
# Mean F-measure over the test set, as a percentage, one metric per line:
# ROUGE-1, ROUGE-2, ROUGE-L.
for per_example_scores in (R1_arr, R2_arr, RL_arr):
    print("{:.3f}".format(np.mean(per_example_scores)*100))

25.931
7.978
20.065
