In [21]:
import torch
import pandas as pd

In [22]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs a raw text line with its structured labels.

    The CSV is read with ``sep=";"`` and must contain the columns ``line``,
    ``name``, ``unit``, ``tax_category`` and every column listed in
    ``REGRESSION_COLUMNS``.

    ``dataset[i]`` returns ``(encoded_sample, encoded_label)`` where
    ``encoded_sample`` is a ``torch.long`` tensor produced by
    ``name_converter.encode_seq`` and ``encoded_label`` is a dict:

    * ``"name"``          - ``torch.long`` sequence (the only sequential target)
    * ``"unit"``          - ``torch.long`` scalar class id
    * ``"tax_category"``  - ``torch.long`` scalar class id
    * ``<col>``           - ``torch.float32`` scalar for each regression column
    * ``<col>_present``   - ``torch.int8`` flag (1 = value was present in the CSV)

    Missing text/category labels are encoded with the converter's ``"<NONE>"``
    entry; missing numbers are stored as ``MISSING_VALUE`` with the matching
    ``_present`` flag set to 0.

    Args:
        path_to_csv: Path of the ``;``-separated CSV file.
        name_converter: Object providing ``encode_seq(text)`` and
            ``converter["<NONE>"]``; used for both ``line`` and ``name``.
        unit_converter: Object providing ``encode(value)`` and
            ``converter["<NONE>"]``.
        tax_converter: Object providing ``encode(value)`` and
            ``converter["<NONE>"]``.
    """

    # Numeric label columns; shared by __init__ and __getitem__ so they cannot drift apart.
    REGRESSION_COLUMNS = ("quantity", "amount", "price", "total_price")

    # Placeholder stored when a numeric value is absent (the `_present` flag tells them apart).
    MISSING_VALUE = -1

    def __init__(self, path_to_csv, name_converter, unit_converter, tax_converter):
        super().__init__()

        self.name_converter = name_converter
        self.unit_converter = unit_converter
        self.tax_converter = tax_converter

        df = pd.read_csv(path_to_csv, sep=";")

        # ==================== SAMPLES ====================
        self.samples = df["line"].tolist()

        # ==================== LABELS ====================
        # Kept un-encoded here; encoding to tensors happens lazily in __getitem__.
        self.labels = [self._build_label(row) for _, row in df.iterrows()]

    @staticmethod
    def _text_or_none(value):
        """Return `value` unchanged, or None when it is NaN/NA or an empty string."""
        return value if pd.notna(value) and value != "" else None

    def _build_label(self, row):
        """Convert one CSV row into a raw label dict (each number parsed exactly once)."""
        parsed = {col: self.parse_number(row[col]) for col in self.REGRESSION_COLUMNS}

        return {
            # seq2seq
            "name": self._text_or_none(row["name"]),

            # category
            "unit": self._text_or_none(row["unit"]),
            "tax_category": self._text_or_none(row["tax_category"]),

            # regression (values first, then presence flags, to keep the original key order)
            **{
                col: (value if present else self.MISSING_VALUE)
                for col, (value, present) in parsed.items()
            },
            **{col + "_present": present for col, (_, present) in parsed.items()},
        }

    def parse_number(self, x):
        """Parse a possibly comma-decimal number.

        Args:
            x: Any value; it is converted with ``str()`` and stripped first.

        Returns:
            ``(float_value, True)`` on success, ``(None, False)`` when `x` is
            None, blank, one of "nan"/"null"/"none" (case-insensitive), or not
            parseable as a float.
        """
        if x is None:
            return None, False

        text = str(x).strip()

        if text == "" or text.lower() in {"nan", "null", "none"}:
            return None, False

        try:
            # Accept "3,59" as well as "3.59".
            return float(text.replace(",", ".")), True
        except ValueError:
            # Only a malformed number is expected here; anything else should surface.
            return None, False

    def __len__(self):
        """Number of rows (samples) loaded from the CSV."""
        return len(self.samples)

    @staticmethod
    def _encode_category(converter, value):
        """Encode a categorical label, mapping a missing value to the "<NONE>" class."""
        if value is None:
            return converter["<NONE>"]
        return converter.encode(value)

    def __getitem__(self, item):
        """Return ``(encoded_sample, encoded_label)`` for row `item`."""
        # ==================== SAMPLE ====================
        encoded_sample = torch.tensor(
            self.name_converter.encode_seq(self.samples[item]), dtype=torch.long
        )

        # ==================== LABEL ====================
        label = self.labels[item]

        if label["name"] is None:
            # A missing name becomes a one-token sequence holding the "<NONE>" id.
            name_ids = [self.name_converter["<NONE>"]]
        else:
            name_ids = self.name_converter.encode_seq(label["name"])

        encoded_label = {
            "name": torch.tensor(name_ids, dtype=torch.long),
            "unit": torch.tensor(
                self._encode_category(self.unit_converter, label["unit"]),
                dtype=torch.long,
            ),
            "tax_category": torch.tensor(
                self._encode_category(self.tax_converter, label["tax_category"]),
                dtype=torch.long,
            ),
            **{
                col: torch.tensor(label[col], dtype=torch.float32)
                for col in self.REGRESSION_COLUMNS
            },
            **{
                col + "_present": torch.tensor(label[col + "_present"], dtype=torch.int8)
                for col in self.REGRESSION_COLUMNS
            },
        }

        return encoded_sample, encoded_label

In [23]:
# Helpers
def extract_all_symbols(path_to_csv):
    """Collect every distinct character that occurs in any cell of the CSV.

    The file is read with ``sep=";"`` and every column as text. Missing cells
    are blanked before the characters are collected; otherwise each one would be
    stringified to "nan" and leak the letters 'n' and 'a' into the vocabulary.
    The header row is not part of the result.

    Args:
        path_to_csv: Path of the ``;``-separated CSV file.

    Returns:
        Sorted list of single-character strings (empty if the file has no rows).
    """
    df = pd.read_csv(path_to_csv, sep=";", dtype=str)

    # Empty cells are read as NaN even with dtype=str, so blank them explicitly.
    cells = df.fillna("").astype(str)

    symbols = set()
    for column in cells.columns:
        symbols.update("".join(cells[column]))

    return sorted(symbols)

def extract_all_unique_column_values(path_to_csv, column):
    """List the distinct values of one CSV column, skipping missing and empty entries.

    Args:
        path_to_csv: Path of the ``;``-separated CSV file.
        column: Name of the column to inspect.

    Returns:
        The unique values as a list, in order of first appearance.
    """
    values = pd.read_csv(path_to_csv, sep=";")[column]

    # Keep only entries that are neither NaN/NA nor the empty string.
    usable = values.notna() & (values != "")

    return values[usable].unique().tolist()

In [24]:
%run 2_converter.ipynb

# Line
name_symbols = extract_all_symbols("../data/train.csv")
name_converter = Converter(symbols=name_symbols, special_symbols=["<PAD>", "<BOS>", "<EOS>", "<NONE>"])
print(name_symbols)

# Unit
unit_symbols = extract_all_unique_column_values("../data/train.csv", "unit")
unit_converter = Converter(symbols=unit_symbols, special_symbols=["<NONE>"])
print(unit_symbols)

# Tax
tax_symbols = extract_all_unique_column_values("../data/train.csv", "tax_category")
tax_converter = Converter(symbols=tax_symbols, special_symbols=["<NONE>"])
print(tax_symbols)

dataset = Dataset("../data/train.csv", name_converter=name_converter, unit_converter=unit_converter, tax_converter=tax_converter)

dataset[0]

[' ', '!', '"', '#', '%', '&', "'", '(', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '=', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', 'Ó', 'ó', 'Ą', 'ą', 'Ć', 'ć', 'Ę', 'ę', 'Ł', 'ł', 'ń', 'Ś', 'ś', 'ź', 'Ż', 'ż']
['KG', 'G', 'SZT', 'L', 'ML']
['C', 'A', 'B', 'G', '0', 't', 'r', 'c', '4', 'a', '8', 'l', 'k']


(tensor([43, 31, 41, 31,  5, 32, 31, 49, 39, 31,  5, 20, 41, 37,  5, 33,  5, 28,
          5, 82, 22, 15, 24, 28,  5, 22, 21, 15, 22, 20, 33]),
 {'name': tensor([43, 31, 41, 31,  5, 32, 31, 49, 39, 31,  5, 20, 41, 37]),
  'unit': tensor(2),
  'tax_category': tensor(2),
  'quantity': tensor(9.),
  'amount': tensor(1.),
  'price': tensor(3.5900),
  'total_price': tensor(32.3100),
  'quantity_present': tensor(1, dtype=torch.int8),
  'amount_present': tensor(1, dtype=torch.int8),
  'price_present': tensor(1, dtype=torch.int8),
  'total_price_present': tensor(1, dtype=torch.int8)})