## How DataFrames (DF) and DataPipes (DP) can work together

In [1]:
from torch.utils.data import IterDataPipe

In [2]:
# Example IterDataPipe
class ExampleIterPipe(IterDataPipe):
    """Toy IterDataPipe that yields the integers ``0`` .. ``range - 1`` in order.

    Args:
        range: How many consecutive integers to produce (default 20). The
            parameter name shadows the builtin only inside ``__init__``.
    """

    def __init__(self, range=20):
        # Store the requested count; ``__iter__`` reads it on every pass.
        self.range = range

    def __iter__(self):
        # ``range`` here resolves to the builtin; the count lives on ``self``.
        yield from range(self.range)

`to_dataframes_pipe` converts tuples into DataFrames; ideally, dataloaders would load into DataFrames directly.

In [3]:
def get_dataframes_pipe(range = 20, batch_size = 7):
    """Build the demo DataFrame pipe used throughout this notebook.

    Args:
        range: Count forwarded to ``ExampleIterPipe`` (integers ``0 .. range - 1``).
        batch_size: Forwarded unchanged to ``to_dataframes_pipe``.

    Returns:
        The result of ``to_dataframes_pipe`` applied to the ``(i, i % 3)``
        tuples, with columns named ``'i'`` and ``'j'``.
    """
    # Stage 1: the raw integer source.
    source = ExampleIterPipe(range=range)
    # Stage 2: pair every integer with its remainder modulo 3.
    pairs = source.map(lambda i: (i, i % 3))
    # Stage 3: hand the tuples to the DataFrame conversion.
    return pairs.to_dataframes_pipe(columns=['i', 'j'], batch_size=batch_size)

# Build the pipe with the function's defaults (range = 20, batch_size = 7).
dp = get_dataframes_pipe()

# Iterating the DF pipe yields one DataFrame per step; the recorded output
# below shows three frames of 7, 7 and 6 rows.
for i in dp:
    print(i)

   i  j
0  0  0
1  1  1
2  2  2
3  3  0
4  4  1
5  5  2
6  6  0
    i  j
0   7  1
1   8  2
2   9  0
3  10  1
4  11  2
5  12  0
6  13  1
    i  j
0  14  2
1  15  0
2  16  1
3  17  2
4  18  0
5  19  1


Operations over a DF Pipe are captured

In [4]:
dp = get_dataframes_pipe(batch_size = 3)
# Column arithmetic written against the pipe itself and assigned to a new
# column 'y'.
dp['y'] = dp.i * 100 + dp.j - 2.7
# ops_str() prints the captured steps (var_2 .. var_4 in the output below),
# one per operator in the expression above.
print(dp.ops_str())


var_2 = input_var_1.i * 100
var_3 = var_2 + input_var_1.j
var_4 = var_3 - 2.7
input_var_1["y"] = var_4


Captured operations are executed on `__next__` calls of the constructed DataPipe

In [5]:
dp = get_dataframes_pipe(batch_size = 3)
# Define the derived column 'y' on the pipe.
dp['y'] = dp.i * 100 + dp.j - 2.7
# Each DataFrame printed below carries the computed 'y' column alongside
# 'i' and 'j'.
for i in dp:
    print(i)

   i  j      y
0  0  0   -2.7
1  1  1   98.3
2  2  2  199.3
   i  j      y
0  3  0  297.3
1  4  1  398.3
2  5  2  499.3
   i  j      y
0  6  0  597.3
1  7  1  698.3
2  8  2  799.3
    i  j       y
0   9  0   897.3
1  10  1   998.3
2  11  2  1099.3
    i  j       y
0  12  0  1197.3
1  13  1  1298.3
2  14  2  1399.3
    i  j       y
0  15  0  1497.3
1  16  1  1598.3
2  17  2  1699.3
    i  j       y
0  18  0  1797.3
1  19  1  1898.3


`shuffle` is a DataPipe-level operation here, and it changes the order of the DataFrames

In [6]:
dp = get_dataframes_pipe(batch_size = 3)
dp['y'] = dp.i * 100 + dp.j - 2.7
# shuffle() is applied after 'y' is defined. In the recorded output whole
# DataFrames arrive in a different order while the rows inside each frame
# stay together.
dp = dp.shuffle()
for i in dp:
    print(i)

    i  j       y
0  15  0  1497.3
1  16  1  1598.3
2  17  2  1699.3
    i  j       y
0  12  0  1197.3
1  13  1  1298.3
2  14  2  1399.3
    i  j       y
0  18  0  1797.3
1  19  1  1898.3
   i  j      y
0  0  0   -2.7
1  1  1   98.3
2  2  2  199.3
   i  j      y
0  3  0  297.3
1  4  1  398.3
2  5  2  499.3
   i  j      y
0  6  0  597.3
1  7  1  698.3
2  8  2  799.3
    i  j       y
0   9  0   897.3
1  10  1   998.3
2  11  2  1099.3


You can continue mixing DF and DP operations

In [7]:
dp = get_dataframes_pipe(batch_size = 3)
dp['y'] = dp.i * 100 + dp.j - 2.7
# DataPipe-level step: reorder the DataFrames.
dp = dp.shuffle()
# DataFrame-level step on the whole pipe: per the recorded output, 17 is
# subtracted from every column (i, j and y) of each frame.
dp = dp - 17
# DataFrame-level step on a single column: rescale the already-shifted 'y'.
dp['y'] = dp.y * 10000
for i in dp:
    print(i)

    i   j          y
0 -11 -17  5803000.0
1 -10 -16  6813000.0
2  -9 -15  7823000.0
    i   j          y
0 -14 -17  2803000.0
1 -13 -16  3813000.0
2 -12 -15  4823000.0
   i   j           y
0  1 -17  17803000.0
1  2 -16  18813000.0
   i   j           y
0 -8 -17   8803000.0
1 -7 -16   9813000.0
2 -6 -15  10823000.0
   i   j           y
0 -2 -17  14803000.0
1 -1 -16  15813000.0
2  0 -15  16823000.0
   i   j           y
0 -5 -17  11803000.0
1 -4 -16  12813000.0
2 -3 -15  13823000.0
    i   j          y
0 -17 -17  -197000.0
1 -16 -16   813000.0
2 -15 -15  1823000.0


#### Open questions:
 - Does this `batch` API look intuitive?

In [11]:
dp = get_dataframes_pipe(batch_size = 3)
dp['y'] = dp.i * 100 + dp.j - 2.7
dp = dp.shuffle()
dp = dp - 17
dp['y'] = dp.y * 10000
# batch(2) groups the DataFrames themselves: in the recorded output each
# batch is an iterable of up to two frames, and the last batch holds one.
dp = dp.batch(2)
for i,v in enumerate(dp):
    print('batch',i)
    # v is one batch; x is a single DataFrame inside it.
    for x in v:
        print(x)

batch 0
    i   j          y
0 -11 -17  5803000.0
1 -10 -16  6813000.0
2  -9 -15  7823000.0
    i   j          y
0 -14 -17  2803000.0
1 -13 -16  3813000.0
2 -12 -15  4823000.0
batch 1
   i   j           y
0 -2 -17  14803000.0
1 -1 -16  15813000.0
2  0 -15  16823000.0
   i   j           y
0 -8 -17   8803000.0
1 -7 -16   9813000.0
2 -6 -15  10823000.0
batch 2
    i   j          y
0 -17 -17  -197000.0
1 -16 -16   813000.0
2 -15 -15  1823000.0
   i   j           y
0 -5 -17  11803000.0
1 -4 -16  12813000.0
2 -3 -15  13823000.0
batch 3
   i   j           y
0  1 -17  17803000.0
1  2 -16  18813000.0


 - Should we support untyped `concat`?

In [15]:
# Two pipes with different schemas: dp0 yields frames with columns i and j
# only, while dp also carries the derived 'y' column.
dp0 = get_dataframes_pipe(range = 8, batch_size = 4)
dp = get_dataframes_pipe(range = 6, batch_size = 3)
dp['y'] = dp.i * 100 + dp.j - 2.7
# concat() chains the two pipes back to back. Per the recorded output the
# frames pass through unchanged, so the combined stream mixes both schemas.
dp = dp.concat(dp0)
# The index produced by enumerate() was never read, so iterate the pipe
# directly.
for v in dp:
    print(v)

   i  j      y
0  0  0   -2.7
1  1  1   98.3
2  2  2  199.3
   i  j      y
0  3  0  297.3
1  4  1  398.3
2  5  2  499.3
   i  j
0  0  0
1  1  1
2  2  2
3  3  0
   i  j
0  4  1
1  5  2
2  6  0
3  7  1


 - What should unbatching of a DF look like?

In [16]:
dp = get_dataframes_pipe(range = 6, batch_size = 3)
dp['y'] = dp.i * 100 + dp.j - 2.7
# NOTE: as recorded below, this cell fails with
# "TypeError: 'CaptureGetAttr' object is not callable".
# Presumably `dp.unbatch` is captured as a DataFrame attribute access instead
# of resolving to a DataPipe method - TODO confirm against the capture code.
dp = dp.unbatch()
for i in dp:
    print(i)

TypeError: 'CaptureGetAttr' object is not callable