In [1]:
%run 2_converter.ipynb
%run 3_dataset.ipynb
%run 4_batching.ipynb
%run 7_model.ipynb

[' ', '!', '"', '#', '%', '&', "'", '(', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '=', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', 'Ó', 'ó', 'Ą', 'ą', 'Ć', 'ć', 'Ę', 'ę', 'Ł', 'ł', 'ń', 'Ś', 'ś', 'ź', 'Ż', 'ż']
['KG', 'G', 'SZT', 'L', 'ML']
['C', 'A', 'B', 'G', '0', 't', 'r', 'c', '4', 'a', '8', 'l', 'k']
[' ', '!', '"', '#', '%', '&', "'", '(', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '=', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', 'Ó', 'ó', 

In [2]:
import torch

In [3]:
def load_model(checkpoint_path, dataset, emb_dim=256, num_heads=8, ff_dim=1024, dropout=0.25, num_layers=4, device="cuda"):
    """Build a ``Model`` sized from ``dataset`` and load checkpoint weights into it.

    Args:
        checkpoint_path: Path handed to ``torch.load``; the loaded object is
            passed straight to ``model.load_state_dict``.
        dataset: Iterable of items where ``item[0]`` is the encoded sample and
            ``item[1]["name"]`` the encoded name. It must also expose
            ``name_converter``, ``unit_converter`` and ``tax_converter``
            attributes supporting ``len()``.
        emb_dim, num_heads, ff_dim, dropout, num_layers: Hyperparameters
            forwarded unchanged to ``Model``.
            NOTE(review): these presumably have to match the values used when
            the checkpoint was trained -- confirm against the training notebook.
        device: Device the weights are mapped to and the model is moved to.
            If a CUDA device is requested but CUDA is unavailable, the model
            is loaded on the CPU instead and a warning is emitted.

    Returns:
        The model, on the resolved device, switched to eval mode.

    Raises:
        ValueError: If ``dataset`` yields no items (the maximum lengths cannot
            be determined).
    """
    import warnings

    # A checkpoint saved on a GPU cannot be mapped to "cuda" on a CPU-only
    # machine; degrade gracefully instead of failing inside torch.load.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        warnings.warn(
            f"CUDA device {device!r} requested but CUDA is not available; "
            "loading the model on CPU instead."
        )
        device = "cpu"

    # One pass over the dataset collects both maxima (indexing a dataset item
    # may be expensive, so avoid iterating it twice).
    max_sam_len = 0
    max_nam_len = 0
    num_items = 0
    for item in dataset:
        num_items += 1
        max_sam_len = max(max_sam_len, len(item[0]))
        max_nam_len = max(max_nam_len, len(item[1]["name"]))
    if num_items == 0:
        raise ValueError("dataset is empty; cannot infer maximum sequence lengths")

    model = Model(
        sym_len=len(dataset.name_converter),
        max_sam_len=max_sam_len,
        max_nam_len=max_nam_len,
        unit_cat_len=len(dataset.unit_converter),
        tax_cat_len=len(dataset.tax_converter),
        emb_dim=emb_dim,
        num_heads=num_heads,
        ff_dim=ff_dim,
        dropout=dropout,
        num_layers=num_layers,
    )

    state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()  # inference mode: disables dropout

    return model

In [4]:
def decode_name(token_ids, name_converter):
    """Convert a 1D tensor of token ids into a string.

    Everything up to and including the first ``<BOS>`` id is discarded (if a
    ``<BOS>`` is present), then everything from the first remaining ``<EOS>``
    id onward is discarded (if an ``<EOS>`` is present). The ids that are left
    are handed to ``name_converter.decode_seq``.

    Args:
        token_ids: 1D tensor of ids (anything exposing ``.tolist()``).
        name_converter: Object supporting ``name_converter["<BOS>"]``,
            ``name_converter["<EOS>"]`` and ``decode_seq(list_of_ids)``.

    Returns:
        Whatever ``name_converter.decode_seq`` returns for the trimmed ids.
    """
    ids = token_ids.tolist()

    # Drop the prefix ending at the first BOS marker, when there is one.
    bos_id = name_converter["<BOS>"]
    start = ids.index(bos_id) + 1 if bos_id in ids else 0
    ids = ids[start:]

    # Cut at the first EOS marker that follows, when there is one.
    eos_id = name_converter["<EOS>"]
    stop = ids.index(eos_id) if eos_id in ids else len(ids)

    return name_converter.decode_seq(ids[:stop])


In [5]:
def decode_category(logits, converter):
    """Return the decoded label of the highest-scoring class.

    Args:
        logits: Tensor whose argmax over the last dimension is a single
            element (``.item()`` is called on it).
        converter: Object exposing ``decode(index)``.

    Returns:
        ``converter.decode`` applied to the winning index.
    """
    best_index = logits.argmax(dim=-1)
    return converter.decode(best_index.item())

In [6]:
def decode_regression(x):
    """Map a single-element tensor to ``expm1`` of its value, clamped at zero.

    The scalar is read with ``.item()``, negative values are replaced by 0,
    and ``torch.expm1`` (``exp(v) - 1``) is applied to the result.
    NOTE(review): presumably this inverts a ``log1p`` target transform --
    confirm against the training code.

    Args:
        x: Single-element tensor.

    Returns:
        The transformed value as a Python ``float``.
    """
    raw = x.item()
    # Same selection rule as max(raw, 0): only a strictly negative value is replaced.
    clamped = 0 if 0 > raw else raw
    transformed = torch.expm1(torch.tensor(clamped))
    return float(transformed.item())

In [7]:
def predict_one(model, text, name_converter, unit_converter, tax_converter,
                device="cpu", max_len=50):
    """Predict unit, tax category and numeric fields for one input string.

    No name is generated: the decoder is fed a single ``<BOS>`` token so that
    the multi-task head can be evaluated. The whole forward pass runs under
    ``torch.no_grad()`` because this is inference only.

    Args:
        model: Object exposing ``eval()``, ``encoder_embedding``, ``encoder``,
            ``decoder_embedding``, ``decoder`` and ``multihead``.
            NOTE(review): the model is not moved here; it is assumed to live
            on ``device`` already -- confirm at the call site.
        text: Raw input string, encoded with ``name_converter.encode_seq``.
        name_converter: Provides ``encode_seq`` and the ids of ``"<BOS>"`` and
            ``"<PAD>"`` via item access.
        unit_converter: Provides ``decode(index)`` for the unit head.
        tax_converter: Provides ``decode(index)`` for the tax head.
        device: Device on which the input tensors are created.
        max_len: Currently unused; kept so existing callers keep working.

    Returns:
        dict with keys ``"unit"``, ``"tax"``, ``"quantity"``, ``"amount"``,
        ``"price"`` and ``"total_price"``. The numeric values are the raw head
        outputs converted to ``float``.
    """
    model.eval()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # Encode the sample as a batch of one: shape (1, L).
        enc_tokens = torch.tensor(
            name_converter.encode_seq(text),
            dtype=torch.long,
            device=device
        ).unsqueeze(0)

        # All-False mask of the same shape as the tokens.
        # NOTE(review): presumably True marks padding, so a single unpadded
        # sample masks nothing -- confirm against the encoder.
        sample_mask = torch.zeros(enc_tokens.shape, dtype=torch.bool, device=device)

        # Encoder forward
        enc_emb = model.encoder_embedding(enc_tokens)
        enc_out = model.encoder(enc_emb, sample_mask)

        # -------------------------------------------------------
        # NO NAME GENERATION -- we create a dummy decoder input
        # -------------------------------------------------------
        # A single BOS token is the whole decoder input.
        sos_id = int(name_converter["<BOS>"])
        dec_inp = torch.tensor([[sos_id]], dtype=torch.long, device=device)

        dec_emb = model.decoder_embedding(dec_inp)

        # Masks: name_mask flags PAD positions of the decoder input (trivial
        # for a lone BOS, but the decoder takes it); sample_mask is rebuilt
        # from the first two dimensions of the encoder output.
        name_mask = dec_inp.eq(int(name_converter["<PAD>"]))
        sample_mask = torch.zeros(enc_out.size()[:2], dtype=torch.bool, device=device)

        # Decoder forward
        dec_out = model.decoder(
            decoder_input=dec_emb,
            encoder_output=enc_out,
            name_mask=name_mask,
            sample_mask=sample_mask
        )

        # Multihead (unit/tax/regression)
        out = model.multihead(enc_out, dec_out)

    # Classification heads: pick the highest-scoring class and decode it.
    unit_pred_id = out["unit_logits"].argmax(dim=-1).item()
    tax_pred_id = out["tax_logits"].argmax(dim=-1).item()

    unit_text = unit_converter.decode(unit_pred_id)
    tax_text = tax_converter.decode(tax_pred_id)

    # Regression heads: returned as plain Python floats, no post-processing.
    amount = float(out["amount_pred"].item())
    quantity = float(out["quantity_pred"].item())
    price = float(out["price_pred"].item())
    total = float(out["total_pred"].item())

    return {
        "unit": unit_text,
        "tax": tax_text,
        "quantity": quantity,
        "amount": amount,
        "price": price,
        "total_price": total,
    }

In [8]:
def predict_batch(model, texts, name_converter, unit_converter, tax_converter, device="cuda"):
    """Run ``predict_one`` on every text, one at a time.

    Args:
        model: Model forwarded to ``predict_one``.
        texts: Iterable of input strings.
        name_converter, unit_converter, tax_converter: Converters forwarded
            unchanged to ``predict_one``.
        device: Device forwarded to ``predict_one``.

    Returns:
        List of ``predict_one`` results, in the order of ``texts``.
    """
    predictions = []
    for text in texts:
        prediction = predict_one(
            model, text, name_converter, unit_converter, tax_converter, device=device
        )
        predictions.append(prediction)
    return predictions


In [9]:

# -------------------------------------------------------
# 0. Configuration
# -------------------------------------------------------
DATA_PATH = "../data/data.csv"
CHECKPOINT_PATH = "checkpoint_epoch_619.pt"
SAMPLE_TEXT = "KLCBatontruskawkbana t 1SZT x8,56 8,56C"
# Use the GPU when one is present; otherwise run the same cell on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------------------------------
# 1. Converters (vocabularies are rebuilt from the CSV)
# -------------------------------------------------------
name_symbols = extract_all_symbols(DATA_PATH)
name_converter = Converter(
    symbols=name_symbols,
    special_symbols=["<PAD>", "<BOS>", "<EOS>", "<NONE>"]
)

unit_symbols = extract_all_unique_column_values(DATA_PATH, "unit")
unit_converter = Converter(
    symbols=unit_symbols,
    special_symbols=["<NONE>"]
)

tax_symbols = extract_all_unique_column_values(DATA_PATH, "tax_category")
tax_converter = Converter(
    symbols=tax_symbols,
    special_symbols=["<NONE>"]
)

# -------------------------------------------------------
# 2. Dataset + Collator
# -------------------------------------------------------
dataset = Dataset(
    DATA_PATH,
    name_converter=name_converter,
    unit_converter=unit_converter,
    tax_converter=tax_converter
)

# The dataset is what load_model uses to size the model.
model = load_model(CHECKPOINT_PATH, dataset, device=DEVICE)

# Not used by the single-sample prediction below; kept for cells that batch.
collator = Collator(converter=name_converter)

# -------------------------------------------------------
# 3. Single-sample prediction
# -------------------------------------------------------
result = predict_one(
    model,
    SAMPLE_TEXT,
    name_converter,
    unit_converter,
    tax_converter,
    device=DEVICE
)

print(result)

{'unit': 'SZT', 'tax': 'C', 'quantity': 1.0374729633331299, 'amount': 14.756280899047852, 'price': 9.718778610229492, 'total_price': 9.674680709838867}
