In [45]:
import torcharrow as ta
import torch
import black

def pp(s):
    """Pretty-print ``s`` by running its ``repr`` through black's formatter.

    The ``repr`` text is handed to ``black.format_str`` with a default
    ``black.Mode()`` and the result is written to stdout with ``print``.
    Returns ``None``.
    """
    source_text = repr(s)
    formatted = black.format_str(source_text, mode=black.Mode())
    print(formatted)

In [38]:
# Element type for "id_score_list": every row is a list of (id, score) structs.
id_score_dtype = ta.List_(
    ta.Struct([ta.Field("id", ta.int64), ta.Field("score", ta.float32)])
)

# Small demo frame with one column per kind of data shown below: plain and
# nullable ints, lists (with and without nulls), a list of structs, strings,
# a list of strings, and a map.
df = ta.DataFrame(
    {
        "ints": [2, 3, 5, 7],
        "ints_with_null": [1, None, 2, None],
        "list_of_ints": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "list_of_ints_with_null": [[1, 2], [3, None], [None, 6], [7, 8]],
        "id_score_list": ta.Column(
            [[(1, 1.5), (2, 2.5)], [], [(3, 3.5)], [(4, 4.5), (5, 5.5)]],
            dtype=id_score_dtype,
        ),
        "str": ["a", "b", "c", "d"],
        "list_of_str": [["a", "aa"], ["b"], [], ["d", "dd"]],
        "multi_label_map": [
            {"click": 1, "conv": 0},
            {"click": 0, "conv": 0},
            {},
            {"conv": 1},
        ],
    }
)
df

  index    ints    ints_with_null  list_of_ints    list_of_ints_with_null    id_score_list         str    list_of_str    multi_label_map
-------  ------  ----------------  --------------  ------------------------  --------------------  -----  -------------  -----------------------
      0       2                 1  [1, 2]          [1, 2]                    [(1, 1.5), (2, 2.5)]  a      ['a', 'aa']    {'click': 1, 'conv': 0}
      1       3                    [3, 4]          [3, None]                 []                    b      ['b']          {'click': 0, 'conv': 0}
      2       5                 2  [5, 6]          [None, 6]                 [(3, 3.5)]            c      []             {}
      3       7                    [7, 8]          [7, 8]                    [(4, 4.5), (5, 5.5)]  d      ['d', 'dd']    {'conv': 1}
dtype: Struct([Field('ints', int64), Field('ints_with_null', Int64(nullable=True)), Field('list_of_ints', List_(int64)), Field('list_of_ints_with_null', List_(Int64(nullab

## `to_python()` just recovers the original data

After implementing it I realized that it's almost the same as `list(df)` :)

The only difference is that `to_python()` returns namedtuples instead of plain tuples and `OrderedDict`s instead of regular dicts. Maybe we should merge the two?

In [46]:
# Row-wise views of the same frame: plain iteration first, then to_python().
rows_from_iteration = list(df)
pp(rows_from_iteration)
rows_from_to_python = df.to_python()
pp(rows_from_to_python)

[
    (
        2,
        1,
        [1, 2],
        [1, 2],
        [(1, 1.5), (2, 2.5)],
        "a",
        ["a", "aa"],
        {"click": 1, "conv": 0},
    ),
    (3, None, [3, 4], [3, None], [], "b", ["b"], {"click": 0, "conv": 0}),
    (5, 2, [5, 6], [None, 6], [(3, 3.5)], "c", [], {}),
    (7, None, [7, 8], [7, 8], [(4, 4.5), (5, 5.5)], "d", ["d", "dd"], {"conv": 1}),
]

[
    Struct(
        ints=2,
        ints_with_null=1,
        list_of_ints=[1, 2],
        list_of_ints_with_null=[1, 2],
        id_score_list=[Struct(id=1, score=1.5), Struct(id=2, score=2.5)],
        str="a",
        list_of_str=["a", "aa"],
        multi_label_map=OrderedDict([("click", 1), ("conv", 0)]),
    ),
    Struct(
        ints=3,
        ints_with_null=None,
        list_of_ints=[3, 4],
        list_of_ints_with_null=[3, None],
        id_score_list=[],
        str="b",
        list_of_str=["b"],
        multi_label_map=OrderedDict([("click", 0), ("conv", 0)]),
    ),
    Struct(
        ints

## `to_torch()` converts into a very simplified columnar storage using torch.Tensors

Numerical columns just turn into tensors.

Lists become a `PackedList` with `offsets` and `values`. Maps become a `PackedMap` with `offsets`, `keys` and `values`.

In [47]:
# Torch form of a numeric column, a list column, and a list-of-structs column.
for column_name in ("ints", "list_of_ints", "id_score_list"):
    pp(df[column_name].to_torch())

tensor([2, 3, 5, 7])

PackedList(
    offsets=tensor([0, 2, 4, 6, 8], dtype=torch.int32),
    values=tensor([1, 2, 3, 4, 5, 6, 7, 8]),
)

PackedList(
    offsets=tensor([0, 2, 2, 3, 5], dtype=torch.int32),
    values=Struct(
        id=tensor([1, 2, 3, 4, 5]),
        score=tensor([1.5000, 2.5000, 3.5000, 4.5000, 5.5000]),
    ),
)



For nullable columns we wrap the values in `WithPresence`, which pairs them with a boolean `presence` mask.

Those can be nested!

In [49]:
# Torch form of a nullable int column and of a list column with null elements.
for column_name in ("ints_with_null", "list_of_ints_with_null"):
    pp(df[column_name].to_torch())

WithPresence(values=tensor([1, 0, 2, 0]), presence=tensor([True, False, True, False]))

PackedList(
    offsets=tensor([0, 2, 4, 6, 8], dtype=torch.int32),
    values=WithPresence(
        values=tensor([1, 2, 3, 0, 0, 6, 7, 8]),
        presence=tensor([True, True, True, False, False, True, True, True]),
    ),
)



Since PyTorch doesn't have string tensors, string columns are converted to a plain Python `List[str]`.

As a special rule, we also don't use PackedList for lists of strings (as it'd be awkward). This special case is also present in F6 today.

In [50]:
# Torch form of a string column and of a list-of-strings column.
for column_name in ("str", "list_of_str"):
    pp(df[column_name].to_torch())

["a", "b", "c", "d"]

[["a", "aa"], ["b"], [], ["d", "dd"]]



But we do use `PackedMap` for maps even if the keys are strings (though probably no one would want that).

In [51]:
# Torch form of the map column (string keys, integer values).
label_map_column = df["multi_label_map"]
pp(label_map_column.to_torch())

PackedMap(
    offsets=tensor([0, 2, 4, 4, 5], dtype=torch.int32),
    keys=["click", "conv", "click", "conv", "conv"],
    values=tensor([1, 0, 0, 0, 1]),
)



You can convert the entire DataFrame at once!

In [52]:
# Convert every column of the frame in a single call.
frame_as_torch = df.to_torch()
pp(frame_as_torch)

Struct(
    ints=tensor([2, 3, 5, 7]),
    ints_with_null=WithPresence(
        values=tensor([1, 0, 2, 0]), presence=tensor([True, False, True, False])
    ),
    list_of_ints=PackedList(
        offsets=tensor([0, 2, 4, 6, 8], dtype=torch.int32),
        values=tensor([1, 2, 3, 4, 5, 6, 7, 8]),
    ),
    list_of_ints_with_null=PackedList(
        offsets=tensor([0, 2, 4, 6, 8], dtype=torch.int32),
        values=WithPresence(
            values=tensor([1, 2, 3, 0, 0, 6, 7, 8]),
            presence=tensor([True, True, True, False, False, True, True, True]),
        ),
    ),
    id_score_list=PackedList(
        offsets=tensor([0, 2, 2, 3, 5], dtype=torch.int32),
        values=Struct(
            id=tensor([1, 2, 3, 4, 5]),
            score=tensor([1.5000, 2.5000, 3.5000, 4.5000, 5.5000]),
        ),
    ),
    str=["a", "b", "c", "d"],
    list_of_str=[["a", "aa"], ["b"], [], ["d", "dd"]],
    multi_label_map=PackedMap(
        offsets=tensor([0, 2, 4, 4, 5], dtype=torch.int32),


## to be continued...

* specifying the output type, so that we can mix output formats, e.g. convert some columns of the DataFrame but keep others as plain Python values
* reverse conversion from these simple structs back to a DataFrame
* UDFs with automatic conversion back and forth
* explore integration with `__torch_function__`