In [17]:
import torch
import pandas as pd

In [18]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing a raw text line with its structured label.

    Each CSV row provides the input text (column ``line``) and the target
    fields: ``name`` (sequence target), ``unit`` and ``tax_category``
    (categorical targets) and the numeric columns listed in
    ``REGRESSION_COLUMNS``.

    The labels are assembled to line up with the intended model outputs:
        {
          "name_logits":   Tensor([seq_len, vocab_size]),  # sequential output (decoder)
          "amount_pred":   Tensor([1]),                    # numeric regression
          "quantity_pred": Tensor([1]),                    # numeric regression
          "unit_logits":   Tensor([num_units]),            # categorical classification
          "price_pred":    Tensor([1]),                    # numeric regression
          "total_pred":    Tensor([1]),                    # numeric regression
          "tax_logits":    Tensor([num_tax_classes]),      # categorical classification
        }

    Args:
        path_to_csv: Path (or file-like object) handed to ``pd.read_csv``.
        name_converter: Object with ``encode_seq(text)``; used for both the
            input line and the ``name`` target.
        unit_converter: Object with ``encode(value)`` for the ``unit`` column.
        tax_converter: Object with ``encode(value)`` for the ``tax_category``
            column.

    NOTE(review): the converters are assumed to return values that
    ``torch.tensor(..., dtype=torch.long)`` accepts (an int for ``encode``, a
    list of ints for ``encode_seq``) — confirm against the Converter class.
    """

    # Numeric target columns; each one also gets a "<column>_present" flag.
    REGRESSION_COLUMNS = ("quantity", "amount", "price", "total_price")

    # Stand-ins stored when a CSV cell is missing or empty.
    NONE_TOKEN = "<NONE>"
    MISSING_NUMBER = -1

    def __init__(self, path_to_csv, name_converter, unit_converter, tax_converter):
        super().__init__()

        self.name_converter = name_converter
        self.unit_converter = unit_converter
        self.tax_converter = tax_converter

        df = pd.read_csv(path_to_csv)

        # ==================== SAMPLES ====================
        self.samples = df["line"].tolist()

        # ==================== LABELS ====================
        # Kept as plain Python values; tensors are only created in __getitem__.
        self.labels = [self._build_label(row) for _, row in df.iterrows()]

    @staticmethod
    def _is_present(value):
        """Return True when a CSV cell holds a real value (not NaN and not "")."""
        return bool(pd.notna(value) and value != "")

    def _build_label(self, row):
        """Turn one CSV row into the raw (not yet encoded) label dict."""
        def value_or(column, fallback):
            cell = row[column]
            return cell if self._is_present(cell) else fallback

        label = {
            # seq2seq
            "name": value_or("name", self.NONE_TOKEN),

            # category
            "unit": value_or("unit", self.NONE_TOKEN),
            "tax_category": value_or("tax_category", self.NONE_TOKEN),
        }

        # regression: the value itself (sentinel when missing) ...
        for column in self.REGRESSION_COLUMNS:
            label[column] = value_or(column, self.MISSING_NUMBER)

        # ... plus a presence bit (needed at inference; during training a mask
        # is applied to the loss). It uses the same presence test as the value
        # above, so the flag can never disagree with the sentinel.
        for column in self.REGRESSION_COLUMNS:
            label[column + "_present"] = self._is_present(row[column])

        return label

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return ``(encoded_sample, encoded_label)`` for row ``item``.

        ``encoded_sample`` is a long tensor built from
        ``name_converter.encode_seq`` applied to the raw line.
        ``encoded_label`` is a dict of tensors: ``name``, ``unit`` and
        ``tax_category`` as long, every regression column as float32 and every
        ``<column>_present`` flag as int8.
        """
        # ==================== SAMPLE ====================
        encoded_sample = torch.tensor(
            self.name_converter.encode_seq(self.samples[item]), dtype=torch.long
        )

        # ==================== LABEL ====================
        label = self.labels[item]

        encoded_label = {
            "name": torch.tensor(
                self.name_converter.encode_seq(label["name"]), dtype=torch.long
            ),
            "unit": torch.tensor(
                self.unit_converter.encode(label["unit"]), dtype=torch.long
            ),
            # The tax category has its own vocabulary, so it must go through
            # tax_converter (it was previously encoded with unit_converter).
            "tax_category": torch.tensor(
                self.tax_converter.encode(label["tax_category"]), dtype=torch.long
            ),
        }

        for column in self.REGRESSION_COLUMNS:
            encoded_label[column] = torch.tensor(label[column], dtype=torch.float32)

        for column in self.REGRESSION_COLUMNS:
            flag = column + "_present"
            encoded_label[flag] = torch.tensor(label[flag], dtype=torch.int8)

        return encoded_sample, encoded_label

In [19]:
# Helpers
def extract_all_symbols(path_to_csv):
    """Return the sorted list of distinct characters used anywhere in the CSV body.

    Every column is read as text (``dtype=str``), so numeric cells contribute
    their digits and punctuation exactly as written in the file. The header
    row is not part of the result.

    Missing cells are skipped. Converting them with ``astype(str)`` would turn
    each one into the literal ``"nan"`` and add the characters ``n`` and ``a``
    to the vocabulary even when they never occur in the data.

    Args:
        path_to_csv: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        list[str]: one-character strings in sorted order; empty if the file
        has no data cells.
    """
    df = pd.read_csv(path_to_csv, dtype=str)

    symbols = set()
    for column in df.columns:
        for cell in df[column].dropna():
            # A string is an iterable of characters, so update() adds each one.
            symbols.update(str(cell))

    return sorted(symbols)

def extract_all_unique_column_values(path_to_csv, column):
    """Collect the distinct non-missing, non-empty values of one CSV column.

    Args:
        path_to_csv: Path (or file-like object) handed to ``pd.read_csv``.
        column: Name of the column to inspect.

    Returns:
        list: the unique values in order of first appearance, with NaN and
        empty-string entries left out.
    """
    values = pd.read_csv(path_to_csv)[column]

    # Keep only entries that are neither NaN nor the empty string.
    keep = values.notna() & (values != "")

    return values[keep].unique().tolist()

In [20]:
# Execute the converter notebook inside this kernel's namespace.
# NOTE(review): presumably this is what defines `Converter` used below — confirm.
%run 2_converter.ipynb

# Line / name: character-level vocabulary taken from every cell of data.csv.
# The same converter is later used for both the raw input line and the "name" target.
name_symbols = extract_all_symbols("data.csv")
name_converter = Converter(symbols=name_symbols, special_symbols=["<PAD>", "<BOS>", "<EOS>", "<NONE>"])
print(name_symbols)

# Unit: one class per distinct value of the "unit" column, plus "<NONE>" for missing cells.
unit_symbols = extract_all_unique_column_values("data.csv", "unit")
unit_converter = Converter(symbols=unit_symbols, special_symbols=["<NONE>"])
print(unit_symbols)

# Tax: one class per distinct value of the "tax_category" column, plus "<NONE>" for missing cells.
tax_symbols = extract_all_unique_column_values("data.csv", "tax_category")
tax_converter = Converter(symbols=tax_symbols, special_symbols=["<NONE>"])
print(tax_symbols)

# Build the dataset from the same CSV the vocabularies were extracted from.
dataset = Dataset("data.csv", name_converter=name_converter, unit_converter=unit_converter, tax_converter=tax_converter)

# Smoke check: show the encoded (sample, label) pair for row 3 as the cell output.
dataset[3]

[' ', '*', ',', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'X', 'Y', 'Z', 'a', 'c', 'd', 'e', 'g', 'k', 'l', 'n', 'o', 's', 't', 'u', 'x', 'y', 'z', 'Ł', 'ł']
['kg', 'g', 'szt']
['A', 'B', 'C', 't']


(tensor([29, 33, 32, 23, 25, 32, 37, 35, 21, 37,  5, 34, 33, 31, 28,  5, 13, 11,
          5, 23,  5, 14,  5, 55, 17,  7, 11, 19,  5, 12, 18,  7, 17, 16, 23]),
 {'name': tensor([29, 33, 32, 23, 25, 32, 37, 35, 21, 37,  5, 34, 33, 31, 28, 24, 33, 35,
          33, 39, 41,  5, 13, 11]),
  'unit': tensor(4),
  'tax_category': tensor(0),
  'quantity': tensor(4.),
  'amount': tensor(-1.),
  'price': tensor(7.1900),
  'total_price': tensor(-1.),
  'quantity_present': tensor(1, dtype=torch.int8),
  'amount_present': tensor(0, dtype=torch.int8),
  'price_present': tensor(1, dtype=torch.int8),
  'total_price_present': tensor(0, dtype=torch.int8)})