# Load our sub-stroke sequence dataset

In [23]:
import numpy as np
import pickle
from collections import Counter

In [3]:
# Load the pickled sub-stroke ID structure that the rest of this notebook
# analyses; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so this should only be
# pointed at a trusted, locally produced file.
with open('../subid_dict.p', 'rb') as fp:
    sid_dict = pickle.load(fp)

In [7]:
def flatten_subid_dict(sid_dict):
    """Flatten the nested sub-stroke ID structure into one list of sequences.

    ``sid_dict`` is walked four levels deep (alphabet -> character ->
    rendition -> stroke). Every level is indexed with the integers
    ``0 .. len(level) - 1``, so nested lists and integer-keyed dicts are
    both accepted.

    Returns a list holding each non-empty stroke sequence, in traversal
    order. The sequences are the original objects, not copies.
    """
    def by_position(container):
        # Index explicitly instead of iterating the container itself:
        # iterating a dict would yield its keys rather than its values.
        return (container[idx] for idx in range(len(container)))

    flat = []
    for alphabet in by_position(sid_dict):
        for char in by_position(alphabet):
            for rendition in by_position(char):
                # Keep only strokes that contain at least one sub-stroke ID.
                flat.extend(
                    stroke for stroke in by_position(rendition)
                    if len(stroke) > 0
                )
    return flat

In [14]:
sid_seqs = flatten_subid_dict(sid_dict)

In [15]:
len(sid_seqs)

43861

In [22]:
sid_seqs[:10]

[[195, 955, 844],
 [831],
 [1096],
 [1101],
 [994],
 [931],
 [350, 794],
 [2, 22, 794],
 [601],
 [994]]

In [18]:
# Concatenate every sub-stroke ID sequence into one flat list of primitive
# IDs, so their frequencies can be counted across the whole dataset.
prim_list = [prim for seq in sid_seqs for prim in seq]

In [21]:
Counter(prim_list).most_common(100)

[(794, 7817),
 (831, 4379),
 (266, 3959),
 (931, 3239),
 (844, 2274),
 (1040, 2267),
 (716, 1845),
 (994, 1260),
 (598, 1201),
 (805, 1045),
 (1187, 1026),
 (472, 973),
 (597, 894),
 (652, 870),
 (706, 853),
 (1132, 766),
 (650, 747),
 (1114, 655),
 (0, 648),
 (112, 627),
 (1, 530),
 (553, 498),
 (1006, 460),
 (2, 430),
 (601, 384),
 (1013, 342),
 (439, 342),
 (977, 339),
 (233, 334),
 (505, 332),
 (5, 285),
 (11, 281),
 (15, 255),
 (1101, 250),
 (10, 235),
 (312, 211),
 (575, 210),
 (600, 194),
 (920, 187),
 (307, 185),
 (8, 178),
 (1088, 174),
 (7, 172),
 (591, 169),
 (6, 156),
 (689, 149),
 (1012, 148),
 (17, 130),
 (543, 121),
 (22, 120),
 (19, 118),
 (29, 107),
 (30, 107),
 (20, 106),
 (3, 104),
 (24, 103),
 (23, 100),
 (21, 99),
 (48, 99),
 (787, 98),
 (28, 98),
 (1061, 96),
 (27, 94),
 (909, 89),
 (53, 84),
 (35, 81),
 (65, 77),
 (995, 76),
 (33, 76),
 (1007, 75),
 (626, 75),
 (9, 74),
 (675, 73),
 (4, 73),
 (55, 70),
 (39, 69),
 (1025, 69),
 (31, 69),
 (793, 68),
 (865, 68),
 (

# Compare to samples from the BPL (Bayesian Program Learning) graphical model

In [24]:
import torch

from pybpl.model.type_dist import CharacterTypeDist, StrokeTypeDist
from pybpl.library import Library

In [25]:
lib = Library()
char_dist = CharacterTypeDist(lib)
stroke_dist = StrokeTypeDist(lib)

In [26]:
# Draw 1000 samples from the character-type distribution and, for each one,
# collect a sub-stroke ID sequence per stroke. The two loop variables are
# plain counters; their values are never read.
sid_seqs1 = []
for nb_characters in range(1000):
    # presumably k is the number of strokes in the sampled character type
    # -- TODO confirm against pybpl
    k = char_dist.sample_k()
    for nb_strokes in range(k):
        # presumably nsub is the sub-stroke count for one stroke, sampled
        # given the stroke count k -- TODO confirm against pybpl
        nsub = stroke_dist.sample_nsub(k)
        ids = stroke_dist.sample_subIDs(nsub)
        # Store as a plain list so each entry has the same list-of-IDs form
        # as the entries of sid_seqs (assumes ids is a 1-D tensor).
        sid_seqs1.append(list(ids.numpy()))

In [28]:
sid_seqs1[:10]

[[127, 304, 551, 849, 756],
 [455],
 [868],
 [437],
 [251],
 [7, 22],
 [223],
 [82, 240, 369, 8],
 [413, 640, 908, 30],
 [10]]

In [29]:
# Concatenate the model-sampled sub-stroke ID sequences into one flat list of
# primitive IDs, mirroring what was done for the dataset sequences.
prim_list1 = [prim for seq in sid_seqs1 for prim in seq]

In [30]:
Counter(prim_list1).most_common(100)

[(0, 107),
 (1, 94),
 (3, 84),
 (2, 71),
 (4, 68),
 (5, 67),
 (9, 66),
 (10, 59),
 (7, 58),
 (8, 56),
 (14, 55),
 (6, 55),
 (12, 50),
 (11, 42),
 (13, 41),
 (15, 32),
 (16, 31),
 (22, 25),
 (41, 19),
 (81, 16),
 (31, 16),
 (18, 15),
 (17, 15),
 (19, 14),
 (26, 14),
 (20, 14),
 (64, 13),
 (74, 13),
 (83, 13),
 (36, 13),
 (133, 13),
 (35, 12),
 (40, 12),
 (29, 12),
 (121, 12),
 (45, 12),
 (37, 12),
 (51, 12),
 (70, 11),
 (87, 11),
 (44, 11),
 (33, 11),
 (68, 11),
 (52, 11),
 (28, 10),
 (171, 10),
 (85, 10),
 (76, 10),
 (168, 10),
 (249, 10),
 (225, 10),
 (38, 10),
 (27, 10),
 (60, 10),
 (24, 10),
 (66, 10),
 (82, 9),
 (180, 9),
 (95, 9),
 (93, 9),
 (47, 9),
 (550, 9),
 (69, 9),
 (23, 9),
 (30, 8),
 (80, 8),
 (163, 8),
 (124, 8),
 (194, 8),
 (86, 8),
 (88, 8),
 (543, 8),
 (146, 8),
 (100, 8),
 (46, 8),
 (138, 8),
 (58, 8),
 (54, 8),
 (331, 8),
 (90, 8),
 (484, 8),
 (322, 8),
 (150, 8),
 (99, 8),
 (321, 8),
 (77, 8),
 (34, 8),
 (173, 8),
 (92, 8),
 (251, 7),
 (640, 7),
 (255, 7),
 (67, 7),