## Preprocess Raw Data

In [1]:
import os

In [2]:
xml_files = []

for dir, _, files in os.walk('data/lineStrokes'):
    for file in files:
        xml_files.append(f'{dir}/{file}')

In [3]:
xml_files

['data/lineStrokes/j06/j06-333/j06-333z-06.xml',
 'data/lineStrokes/j06/j06-333/j06-333z-03.xml',
 'data/lineStrokes/j06/j06-333/j06-333z-02.xml',
 'data/lineStrokes/j06/j06-333/j06-333z-07.xml',
 'data/lineStrokes/j06/j06-333/j06-333z-05.xml',
 'data/lineStrokes/j06/j06-333/j06-333z-01.xml',
 'data/lineStrokes/j06/j06-333/j06-333z-04.xml',
 'data/lineStrokes/j06/j06-287/j06-287z-03.xml',
 'data/lineStrokes/j06/j06-287/j06-287z-01.xml',
 'data/lineStrokes/j06/j06-287/j06-287z-05.xml',
 'data/lineStrokes/j06/j06-287/j06-287z-02.xml',
 'data/lineStrokes/j06/j06-287/j06-287z-04.xml',
 'data/lineStrokes/j06/j06-287/j06-287z-06.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-06.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-07.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-05.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-04.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-03.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-01.xml',
 'data/lineStrokes/j06/j06-308/j06-308z-02.xml',
 'data/lineStrokes/j

In [4]:
len(xml_files)

12195

In [6]:
xml_files[0]

'data/lineStrokes/j06/j06-333/j06-333z-06.xml'

In [7]:
import xml.etree.ElementTree as ET

In [10]:
root = ET.parse(xml_files[0]).getroot()

In [11]:
root

<Element 'WhiteboardCaptureSession' at 0x7f11a042eae0>

In [12]:
root[0]

<Element 'WhiteboardDescription' at 0x7f11a042ee50>

In [13]:
root[1]

<Element 'StrokeSet' at 0x7f11a03c2a40>

In [14]:
for stroke in root[1]:
    print(stroke)

<Element 'Stroke' at 0x7f11a03c2400>
<Element 'Stroke' at 0x7f11a03c2d60>
<Element 'Stroke' at 0x7f11a03c2680>
<Element 'Stroke' at 0x7f11a03c25e0>
<Element 'Stroke' at 0x7f11a03c2ea0>
<Element 'Stroke' at 0x7f11a03c23b0>
<Element 'Stroke' at 0x7f11a03575e0>
<Element 'Stroke' at 0x7f11a0285bd0>
<Element 'Stroke' at 0x7f11a028e180>
<Element 'Stroke' at 0x7f11a028e4a0>
<Element 'Stroke' at 0x7f11a028e7c0>
<Element 'Stroke' at 0x7f11a028eea0>
<Element 'Stroke' at 0x7f11a0294220>
<Element 'Stroke' at 0x7f11a02943b0>
<Element 'Stroke' at 0x7f11a0294810>
<Element 'Stroke' at 0x7f11a0294c70>
<Element 'Stroke' at 0x7f11a029d270>
<Element 'Stroke' at 0x7f11a029d450>
<Element 'Stroke' at 0x7f11a029d630>
<Element 'Stroke' at 0x7f11a029db80>
<Element 'Stroke' at 0x7f11a03f8d60>
<Element 'Stroke' at 0x7f11a03f8400>
<Element 'Stroke' at 0x7f11a03f2360>
<Element 'Stroke' at 0x7f11a03f27c0>
<Element 'Stroke' at 0x7f11a03ec180>
<Element 'Stroke' at 0x7f11a03ec4a0>
<Element 'Stroke' at 0x7f11a03df090>
<

In [23]:
import numpy as np

In [22]:
data = []

In [24]:
temp = ['a', 'b', 'c']
for c, i in enumerate(temp):
    print(c, i)

0 a
1 b
2 c


In [25]:
for file in xml_files[:1]:
    root = ET.parse(file).getroot()

    offset_x = 1e9
    offset_y = 1e9

    for elem in root[0][1:]:
        offset_x = min(offset_x, int(elem.attrib["x"]))
        offset_y = min(offset_y, int(elem.attrib["y"]))

    stroke_data = []
    last_x = 0
    last_y = 0

    for stroke in root[1]:
        for i, point in enumerate(stroke):
            point_data = np.array(
                [
                    int(point.attrib["x"]) - offset_x - last_x,
                    int(point.attrib["y"]) - offset_y - last_y,
                    1 if (i == len(stroke) - 1) else 0,
                ]
            )

            last_x = point_data[0]
            last_y = point_data[1]

            stroke_data.append(point_data)

    stroke_data = np.array(stroke_data)
    data.append(stroke_data)


In [26]:
len(data)

1

In [27]:
data[0]

array([[  61,  235,    1],
       [   9,   25,    0],
       [  60,  231,    0],
       ...,
       [3060,   66,    0],
       [3295,  139,    0],
       [3085,   79,    1]])

In [28]:
data[0].shape

(900, 3)

## Model Training

In [29]:
learning_rate = 0.0001
batch_size = 64
epochs = 10
seq_len = 300

In [30]:
temp = list(range(5))
temp[2:4]

[2, 3]

In [None]:
for epoch in range(epochs):
    for batch_start in range(0, len(data), batch_size):
        x = []
        y = []

        for i in range(batch_size):
            stroke = data[batch_start + i]

            x.append(stroke[0:seq_len])
            y.append(stroke[1:seq_len+1])