In [1]:
import random
import time

from bert import preprocess_function
from datasets import load_dataset
import numpy as np
from metrics import score_generated_sentences
from transformers import BertTokenizer, EncoderDecoderModel
import torch

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.1.crossattention.self.query.weight', 'bert.encoder.layer.4.crossattention.self.key.bias', 'bert.encoder.layer.1.crossattention.self.value.weight', 'bert.encoder.layer.10.crossattention.self.query.bias', 'bert.encoder.layer.9.crossattention.output.dense.bias', 'bert.encoder.layer.6.crossattention.self.key.weight', 'bert.encoder.layer.2.crossattention.self.key.weight', 'bert.encoder.layer.10.crossattention.self.value.bias', 'bert.encoder.layer.9.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.11.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.11.crossattention.output.dense.bias', 'bert.encoder.layer.2.crossattention.self.key.bias', 'bert.encoder.layer.7.crossattention.self.query.bias', 'bert.encoder.layer.10.crossattention.output.LayerNorm.weight', 'bert.encoder.la

In [2]:
tokenizer = BertTokenizer.from_pretrained("./bert-encoder")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("./bert-encoder", "./bert-decoder")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)
device

device(type='cuda', index=0)

In [3]:
ds = load_dataset("csebuetnlp/BanglaParaphrase")
index = random.randint(0, 1000)
tokenized_test = ds["test"].select(range(10000)).map(preprocess_function, batched=True)

def predict(examples):
    input_ids = torch.from_numpy(np.array(examples["input_ids"])).to(device)
    o = model.generate(input_ids=input_ids, max_new_tokens=128)
    os = []
    for i in o:
        os.append(" ".join(tokenizer.decode(i).split("[SEP]")[0].split(" ")[1:]))
    examples["output"] = os
    return examples

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
print("source", ds["test"][index:index + 1]["source"])
print("target", ds["test"][index:index + 1]["target"])
tokenized = tokenizer(ds["test"][index:index + 1]["source"], return_tensors="pt").to(device)
print("tokenized source:", tokenizer.decode(tokenized.input_ids[0]))
tokenized = tokenizer(ds["test"][index:index + 1]["target"], return_tensors="pt").to(device)
print("tokenized target:", tokenizer.decode(tokenized.input_ids[0]))

outputs = model.generate(input_ids=tokenized.input_ids, max_new_tokens=128)
print("output", tokenizer.decode(outputs[0]).split("[SEP]")[0])

source ['অনেকের কাছে শুনেছি, জাহাজে করে অনেকে মুক্তিযোদ্ধাদের ধরে নিয়ে গেছে।']
target ['অনেকের কাছ থেকে শুনেছি, অনেকেই জাহাজে করে মুক্তিবাহিনীকে নিয়ে গেছে।']
tokenized source: [CLS] অনেকের কাছে শনেছি, জাহাজে করে অনেকে মকতিযোদধাদের ধরে নিযে গেছে । [SEP]
tokenized target: [CLS] অনেকের কাছ থেকে শনেছি, অনেকেই জাহাজে করে মকতিবাহিনীকে নিযে গেছে । [SEP]




output [CLS] অনেকে মনে করেছিল যে, অনেক লোক জাহাজে করে মকতিবাহিনীকে নিযে গিযেছে । 


In [5]:
started = time.time()
test = tokenized_test.map(lambda batch: predict(batch), batched=True, batch_size=256)
scores = score_generated_sentences(test["target"], test["output"])

print({x: y for x, y in zip(["bleu_score", "ter_score", "rouge_score"], scores)})
print("scoring time (seconds):", time.time() - started)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'bleu_score': 0.04129314737577141, 'ter_score': 0.9605118795899265, 'rouge_score': 0.049509803921568625}
scoring time (seconds): 144.2633559703827
