In [1]:
from lm_model import LSTMLM
from tokenizer import SentencePieceTokenizer
from data import LibriSpeechTextDataset, get_text_dataloader
import torch
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TOKENIZER_MODEL_FILE_PATH = "./vocabs/librispeech_1024_bpe.model"
DATASET_JSON_FILE_PATH = "./json/librispeech_train-clean-100.json"
LM_MODEL_FILE_PATH = "./artifacts/librispeech-clean-100/6c74f0e19278402b87c4e21a4866e206/artifacts/model_34.pth"
DEVICE = "cuda"

In [3]:
tokenizer = SentencePieceTokenizer(
    model_file_path=TOKENIZER_MODEL_FILE_PATH
)
dataset = LibriSpeechTextDataset(
    tokenizer=tokenizer,
    json_file_path=DATASET_JSON_FILE_PATH
)

In [4]:
with open(LM_MODEL_FILE_PATH, "rb") as f:
    cpt = torch.load(f)
model_state = cpt["model"]
model_args = cpt["model_args"]
language_model = LSTMLM(**model_args).to(DEVICE)
language_model.load_state_dict(model_state)

<All keys matched successfully>

In [21]:
IDX = 30
prompt = dataset[IDX][1][:20].tolist()
prompt = [tokenizer.bos_token_id] + prompt

prompt = torch.tensor(prompt).unsqueeze(0).to(DEVICE)

In [6]:
# greedy decode
hyp_tokens = prompt[0].tolist()
next_token = None
output, hidden = language_model.inference(prompt, None)
output = output[0, -1, :]
next_token = torch.argmax(output, dim=-1)
hyp_tokens.append(next_token.item())
while next_token != tokenizer.eos_token_id:
    output, hidden = language_model.inference(next_token, hidden) #[1, T, D]
    output =  output[0, -1, :]
    next_token = torch.argmax(output, dim=-1)
    hyp_tokens.append(next_token.item())
    print(tokenizer.token_ids_to_text(hyp_tokens), end="\r")
answer_tokens = dataset[IDX][1].tolist()
answer = tokenizer.token_ids_to_text(answer_tokens)
hyp = tokenizer.token_ids_to_text(hyp_tokens)
print(f"answer: {answer}")
print(f"hypothesis: {hyp}")

AttributeError: 'LSTMLM' object has no attribute 'inference'

In [26]:
class Hypotheis:
    def __init__(self, hyp: List[int], next_input: torch.Tensor, hidden, score):
        self.hyp = hyp
        self.next_input = next_input
        self.hidden = hidden
        self.score = score

class BeamSearch:
    def __init__(
            self,
            beam_size: int,
            max_length: int,
            scorer,
    ):
        self.beam_size = beam_size
        self.max_length = max_length
        self.scorer = scorer
    
    def forward(
            self,
            prompt: torch.Tensor,
    ):
        # prompt: [T]
        initial_hypothesis = Hypotheis(prompt.tolist(), prompt, None, 0)
        hypotheses = [initial_hypothesis]
        next_hypotheses = []
        length = prompt.shape[0]
        ended_hypotheses = []
        while length < self.max_length:
            for hypothesis in hypotheses:
                hyp, next_input, hidden, score = hypothesis.hyp, hypothesis.next_input, hypothesis.hidden, hypothesis.score
                output, hidden = self.scorer.score(next_input, hidden) # [1, T, num_tokens]
                output = output[0, -1, :]
                topk = torch.topk(output, self.beam_size)
                for i in range(self.beam_size):
                    new_next_input = topk.indices[i]
                    new_hyp = hyp + [new_next_input.item()]
                    new_score = score + topk.values[i].item()
                    new_hypothesis = Hypotheis(new_hyp, new_next_input, hidden, new_score)
                    next_hypotheses.append(new_hypothesis)
            next_hypotheses = sorted(next_hypotheses, key=lambda x: x.score, reverse=True)[:min(self.beam_size, len(next_hypotheses))]
            if len(next_hypotheses) == 0:
                break
            print(f"length: {length}, {tokenizer.token_ids_to_text(next_hypotheses[0].hyp)}", end="\r")
            next_hypotheses, ended_hypotheses = self.post_process(next_hypotheses, ended_hypotheses)
            hypotheses = next_hypotheses
            next_hypotheses = []
            length += 1
        
        nbest_hypotheses = sorted(ended_hypotheses, key=lambda x: x.score, reverse=True)[:min(self.beam_size, len(ended_hypotheses))]
        return nbest_hypotheses
    
    def post_process(self, next_hypotheses, ended_hypotheses):
        remained_next_hypotheses = []
        for hypothesis in next_hypotheses:
            if hypothesis.next_input == tokenizer.eos_token_id:
                ended_hypotheses.append(hypothesis)
            else:
                remained_next_hypotheses.append(hypothesis)
        return remained_next_hypotheses, ended_hypotheses


In [27]:
beamsearch = BeamSearch(beam_size=10, max_length=100, scorer=language_model)
ended_hyps = beamsearch.forward(prompt[0])

length: 99, thin woman with angles and without curves her dark hair showed some grays and scholarship in the streets of the mountaineer's pennsylvania augustine's husband's augusta's husband's husband's pennsylvania augustine'

In [28]:
for hyp in ended_hyps:
    # normalized score
    hyp.score /= len(hyp.hyp)
    print(f"{tokenizer.token_ids_to_text(hyp.hyp)}: {hyp.score}")

thin woman with angles and without curves her dark hair showed some grays and scholarship: -0.5394296288490296
thin woman with angles and without curves her dark hair showed some grays and scholarship in the streets of the mountaineer's handwriting: -0.9109017410385423
thin woman with angles and without curves her dark hair showed some grays and scholarship in the streets of the mountaineer's pennsylvania: -0.8893814965353899
thin woman with angles and without curves her dark hair showed some grays and scholarship in the streets of the mountaineer's pennsylvania augustine: -0.9395778866985939
thin woman with angles and without curves her dark hair showed some grays and scholarship in the streets of the mountaineer's pennsylvania augusta: -0.9620941595344062
thin woman with angles and without curves her dark hair showed some grays and scholarship in the streets of the mountaineer's pennsylvania augustlihood: -0.9829661327831056
thin woman with angles and without curves her dark hair sho