## 001 - 01  

![Vogon Poetry - Zero Copy data processing over Columnar layouts](./images/vogon-poetry-v02.png)  

_Oh freddled gruntbuggly_,            
&nbsp;&nbsp;&nbsp;&nbsp;_Thy micturations are to me_            
_As plurdled gabbleblotchits on a lurgid bee_.            
&nbsp;&nbsp;&nbsp;&nbsp;_Groop, I implore thee, my foonting turlingdromes_,            
_And hooptiously drangle me with crinkly bindlewurdles_,            
&nbsp;&nbsp;&nbsp;&nbsp;_Or I will rend thee in the gobberwarts_            
_With my blurglecruncheon, see if I don't!_      
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- Prostetnic Vogon Jeltz, in **Douglas Adams' "The Hitchhiker's Guide to the Galaxy"**

Along similar lines, here we go:

#### Columnar Layout  
#### &nbsp;&nbsp;&nbsp;&nbsp;Array of Structs  
#### Struct of Arrays  
#### &nbsp;&nbsp;&nbsp;&nbsp;Metadata Metadata  
#### Zero Copy  
#### &nbsp;&nbsp;&nbsp;&nbsp;Distributed  
#### Relational   
#### &nbsp;&nbsp;&nbsp;&nbsp;Data Engineering    

# **Vogon Poetry**   

In [1]:
# setup/pre-req
import pyarrow as pa
import pyarrow.compute as pc

In [2]:
def buf_addrs(arr):
    """Return the addresses of ``arr``'s buffers as a tuple.

    Walks ``arr.buffers()`` in order. A slot that is ``None`` stays ``None``;
    every other buffer contributes ``int(buffer.address)``.
    """
    return tuple(
        None if buf is None else int(buf.address)
        for buf in arr.buffers()
    )

In [3]:
def table_buf_addrs(t):
    """Map every column name of ``t`` to the buffer addresses of its chunks.

    Returns a dict keyed by column name (in ``t.column_names`` order); each
    value is a list with one ``buf_addrs`` tuple per chunk of that column.
    """
    return {
        col_name: [buf_addrs(piece) for piece in t[col_name].chunks]
        for col_name in t.column_names
    }

In [4]:
def show_aliasing(label, before, after):
    """Print, per column of ``after``, how many chunks reuse baseline buffers.

    Args:
        label: Heading printed before the per-column lines.
        before: Mapping of column name -> list of buffer-address tuples, in
            the shape returned by ``table_buf_addrs`` for the baseline table.
        after: Table to inspect; its addresses come from ``table_buf_addrs``.

    A chunk counts as aliased when its address tuple appears in the baseline
    list stored under the same column name. A column with no baseline entry
    (for example after a rename) is reported with zero aliased chunks rather
    than raising ``TypeError``.
    """
    print(label)
    # The address map does not change inside the loop, so build it once.
    after_addrs = table_buf_addrs(after)
    for col in after.column_names:
        # Missing (or None) baseline -> nothing to alias against.
        baseline = before.get(col) or []
        current = after_addrs[col]
        aliased = sum(1 for addrs in current if addrs in baseline)
        print(col, "aliased_chunks=", aliased, "after_chunks=", len(current))

In [5]:
# Baseline table for the aliasing experiments: ten rows, three columns.
# `base` records the buffer addresses every later result is compared against.
t = pa.table(
    {
        "id": pa.array(range(10), type=pa.int32()),
        "s": pa.array(["v" + str(i % 3) for i in range(10)]),
        "x": pa.array(range(0, 100, 10), type=pa.int32()),
    }
)
base = table_buf_addrs(t)

In [6]:
# Show the recorded baseline addresses next to the table they were taken from.
base, t

({'id': [(None, 1868595986560)],
  's': [(None, 1868596052160, 1868596052288)],
  'x': [(None, 1868595986688)]},
 pyarrow.Table
 id: int32
 s: string
 x: int32
 ----
 id: [[0,1,2,3,4,5,6,7,8,9]]
 s: [["v0","v1","v2","v0","v1","v2","v0","v1","v2","v0"]]
 x: [[0,10,20,30,40,50,60,70,80,90]])

In [7]:
# Projection: keep only "id" and "s", then compare their buffer addresses
# with the baseline.
p = t.select(["id", "s"])
show_aliasing("projection", base, p)


projection
id aliased_chunks= 1 after_chunks= 1
s aliased_chunks= 1 after_chunks= 1


In [8]:
# Row slice (offset 2, length 5); compare its buffer addresses with the baseline.
sl = t.slice(2, 5)
show_aliasing("slice", base, sl)


slice
id aliased_chunks= 1 after_chunks= 1
s aliased_chunks= 1 after_chunks= 1
x aliased_chunks= 1 after_chunks= 1


In [9]:
# Schema-level changes only: new column names (rn) and new key/value schema
# metadata (md). Aliasing is not checked here; show_aliasing looks columns up
# by name, so `rn` would need a baseline keyed by the new names.
rn = t.rename_columns(["id2", "s2", "x2"])
md = t.replace_schema_metadata({b"owner": b"eng", b"purpose": b"bench"})

In [10]:
# Display both derived tables; the column values match the original table.
rn, md

(pyarrow.Table
 id2: int32
 s2: string
 x2: int32
 ----
 id2: [[0,1,2,3,4,5,6,7,8,9]]
 s2: [["v0","v1","v2","v0","v1","v2","v0","v1","v2","v0"]]
 x2: [[0,10,20,30,40,50,60,70,80,90]],
 pyarrow.Table
 id: int32
 s: string
 x: int32
 ----
 id: [[0,1,2,3,4,5,6,7,8,9]]
 s: [["v0","v1","v2","v0","v1","v2","v0","v1","v2","v0"]]
 x: [[0,10,20,30,40,50,60,70,80,90]])

In [11]:
# Reinterpret int32 storage as date32: build a new array over the very same
# validity/data buffers, carrying over length, null count and offset.
a = pa.array([1, 2, 3, 4], type=pa.int32())
validity_buf, data_buf = a.buffers()

date = pa.Array.from_buffers(
    pa.date32(),
    len(a),
    [validity_buf, data_buf],
    null_count=a.null_count,
    offset=a.offset,
)

# Reuse the buf_addrs helper so missing buffers are detected with `is None`,
# as everywhere else in this notebook; a truthiness test would also report a
# zero-length buffer as None. list(...) keeps the printed format unchanged.
print("int32_buffers", list(buf_addrs(a)))
print("date32_buffers", list(buf_addrs(date)))

int32_buffers [None, 1868595986816]
date32_buffers [None, 1868595986816]


In [12]:
# Dictionary-encode a string array with repeated values and print the
# resulting dictionary type.
s = pa.array(["a", "b", "a", "c", "b", "a"])
d = pc.dictionary_encode(s)
print(d.type)

dictionary<values=string, indices=int32, ordered=0>


In [13]:
# Pull the two parts of the dictionary array apart: the per-row codes and
# the dictionary values they point into.
codes = d.indices
dict_values = d.dictionary

In [14]:
# Reassemble a dictionary array from the two parts and print whether the
# buffer addresses of each part are unchanged in the new array.
d2 = pa.DictionaryArray.from_arrays(codes, dict_values)
print("codes_alias", buf_addrs(codes) == buf_addrs(d2.indices))
print("dict_alias", buf_addrs(dict_values) == buf_addrs(d2.dictionary))

codes_alias True
dict_alias True


In [15]:
# int32 scalars (same type as the "id" column) used by the parity test below.
one = pa.scalar(1, type=pa.int32())
zero = pa.scalar(0, type=pa.int32())

In [16]:
# (id & 1) == 0 marks the rows with an even id; idx holds the positions
# where the mask is set.
mask = pc.equal(pc.bit_wise_and(t["id"], one), zero)
idx = pc.indices_nonzero(mask)

In [17]:
# Two representations of the same filter result: the untouched table paired
# with the selected row positions, versus a new table built by take().
logical = (t, idx)  # table + selection vector
phys = t.take(idx)  # materialized compact result

In [18]:
# Report the index type and whether the materialized result still shares
# buffers with the baseline.
print("logical_indices_type", idx.type)
show_aliasing("filter_materialize_take", base, phys)

logical_indices_type uint64
filter_materialize_take
id aliased_chunks= 0 after_chunks= 1
s aliased_chunks= 0 after_chunks= 1
x aliased_chunks= 0 after_chunks= 1


In [19]:
# Sort order as a permutation of row positions ("s" ascending, ties broken
# by "id" descending); take() then builds the reordered table from it.
perm = pc.sort_indices(t, sort_keys=[("s", "ascending"), ("id", "descending")])
sorted_t = t.take(perm)

In [20]:
# Report the permutation's type and whether the sorted table still shares
# buffers with the baseline.
print("perm_type", perm.type)
show_aliasing("sort_materialize_take", base, sorted_t)

perm_type uint64
sort_materialize_take
id aliased_chunks= 0 after_chunks= 1
s aliased_chunks= 0 after_chunks= 1
x aliased_chunks= 0 after_chunks= 1


In [21]:
# Two small tables sharing key column "k"; keys 2 and 3 appear in both,
# 1 only on the left and 4 only on the right.
left = pa.table({"k": [1, 2, 3], "lv": ["a", "b", "c"]})
right = pa.table({"k": [2, 3, 4], "rv": ["B", "C", "D"]})

In [22]:
# Join on the shared key "k". Spelling out the default join type documents why
# the unmatched left row (k=1) is kept with a null "rv" in the output.
j = left.join(right, keys="k", join_type="left outer")
print(j)

pyarrow.Table
k: int64
lv: string
rv: string
----
k: [[2,3,1]]
lv: [["b","c","a"]]
rv: [["B","C",null]]


In [23]:
# Group by "s" and compute, per group, the sum of "x" and the count of "id".
g = t.group_by("s").aggregate([("x", "sum"), ("id", "count")])
print(g)

pyarrow.Table
s: string
x_sum: int64
id_count: int64
----
s: [["v0","v1","v2"]]
x_sum: [[180,120,150]]
id_count: [[4,3,3]]


In [24]:
# Split the table into two halves of five rows each.
t1 = t.slice(0, 5)
t2 = t.slice(5, 5)

In [25]:
# Concatenate the halves and print the "id" chunk counts of both inputs and
# of the result.
cat = pa.concat_tables([t1, t2], promote_options="none")
print([len(t1["id"].chunks), len(t2["id"].chunks), len(cat["id"].chunks)])

[1, 1, 2]


In [26]:
# combine_chunks() merges each column's chunks; print the "id" chunk count
# before and after.
compact = cat.combine_chunks()
print([len(cat["id"].chunks), len(compact["id"].chunks)])

[2, 1]


In [27]:
# Recap. NOTE: this rebinds `sl` and `idx`, which earlier cells already defined.
sl = t.slice(1, 3)  # zero-copy buffers, new metadata object
idx = pc.indices_nonzero(pc.equal(t["id"], 3))  # allocates indices buffer, no base buffer rewrite

In [28]:
# Recap of the sort example: keep the table plus its permutation, or
# materialize the reordered table. NOTE: this rebinds `logical` from an earlier cell.
logical = (t, perm)      # logical sorted view
physical = t.take(perm)  # physical sorted table