# Load our sub-stroke sequence dataset

In [1]:
import numpy as np
import pickle
from collections import Counter

In [2]:
# Load the pickled sub-stroke ID structure from the parent directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling,
# so only run this on a file obtained from a trusted source.
with open('../subid_dict.p', 'rb') as fp:
    sid_dict = pickle.load(fp)

In [3]:
def flatten_subid_dict(sid_dict):
    """Collect every non-empty sequence from a four-level nested container.

    The levels are walked in the order alphabet -> character -> rendition
    -> stroke. Each level is accessed positionally with the integers
    ``0 .. len(level) - 1``, so any container that supports ``len()`` and
    integer indexing over that range can be used at any level.

    Args:
        sid_dict: The outermost (alphabet-level) container.

    Returns:
        A flat list holding, in traversal order, every innermost sequence
        whose length is greater than zero.
    """
    def children(node):
        # Index by position instead of iterating the container directly,
        # so items are always visited as node[0], node[1], ...
        for idx in range(len(node)):
            yield node[idx]

    flat = []
    for alphabet in children(sid_dict):
        for character in children(alphabet):
            for rendition in children(character):
                # Empty stroke sequences are dropped.
                flat.extend(
                    stroke
                    for stroke in children(rendition)
                    if len(stroke) > 0
                )
    return flat

In [4]:
# Flatten the nested structure into one list of non-empty sequences.
sid_seqs = flatten_subid_dict(sid_dict)

In [5]:
len(sid_seqs)

43861

In [6]:
sid_seqs[:10]

[[195, 955, 844],
 [831],
 [1096],
 [1101],
 [994],
 [931],
 [350, 794],
 [2, 22, 794],
 [601],
 [994]]

In [7]:
# Pool the IDs of every sequence into a single flat list, keeping their order.
prim_list = [sub_id for id_seq in sid_seqs for sub_id in id_seq]

In [8]:
Counter(prim_list).most_common(100)

[(794, 7817),
 (831, 4379),
 (266, 3959),
 (931, 3239),
 (844, 2274),
 (1040, 2267),
 (716, 1845),
 (994, 1260),
 (598, 1201),
 (805, 1045),
 (1187, 1026),
 (472, 973),
 (597, 894),
 (652, 870),
 (706, 853),
 (1132, 766),
 (650, 747),
 (1114, 655),
 (0, 648),
 (112, 627),
 (1, 530),
 (553, 498),
 (1006, 460),
 (2, 430),
 (601, 384),
 (1013, 342),
 (439, 342),
 (977, 339),
 (233, 334),
 (505, 332),
 (5, 285),
 (11, 281),
 (15, 255),
 (1101, 250),
 (10, 235),
 (312, 211),
 (575, 210),
 (600, 194),
 (920, 187),
 (307, 185),
 (8, 178),
 (1088, 174),
 (7, 172),
 (591, 169),
 (6, 156),
 (689, 149),
 (1012, 148),
 (17, 130),
 (543, 121),
 (22, 120),
 (19, 118),
 (29, 107),
 (30, 107),
 (20, 106),
 (3, 104),
 (24, 103),
 (23, 100),
 (21, 99),
 (48, 99),
 (787, 98),
 (28, 98),
 (1061, 96),
 (27, 94),
 (909, 89),
 (53, 84),
 (35, 81),
 (65, 77),
 (995, 76),
 (33, 76),
 (1007, 75),
 (626, 75),
 (9, 74),
 (675, 73),
 (4, 73),
 (55, 70),
 (39, 69),
 (1025, 69),
 (31, 69),
 (793, 68),
 (865, 68),
 (

# Compare to samples from the BPL graphical model

In [9]:
import torch

from pybpl.model.type_dist import CharacterTypeDist, StrokeTypeDist
from pybpl.library import Library

In [10]:
# Build the pybpl library with its default arguments and the two type
# distributions that are sampled from below.
# NOTE(review): presumably these are the BPL priors over whole characters
# and over individual strokes; confirm against the pybpl documentation.
lib = Library()
char_dist = CharacterTypeDist(lib)
stroke_dist = StrokeTypeDist(lib)

In [13]:
# Sample sub-stroke ID sequences from the model, one sequence per stroke,
# so they can be compared with the dataset sequences loaded earlier.
n_characters = 20000  # how many characters to sample

sid_seqs1 = []
for _char_idx in range(n_characters):
    # k drives the inner loop: one ID sequence is drawn per stroke.
    k = char_dist.sample_k()
    for _stroke_idx in range(k):
        # presumably the number of sub-strokes for this stroke given k;
        # confirm against pybpl's StrokeTypeDist.sample_nsub
        nsub = stroke_dist.sample_nsub(k)
        ids = stroke_dist.sample_subIDs(nsub)
        # tolist() stores plain Python ints rather than NumPy scalar
        # objects, so the sequences print as ordinary integer lists.
        sid_seqs1.append(ids.tolist())

In [14]:
len(sid_seqs1)

48564

In [15]:
sid_seqs1[:10]

[[354, 1058, 180, 1016],
 [184],
 [413, 456, 1],
 [351],
 [81, 20, 42],
 [447, 710],
 [70],
 [76, 1114],
 [300],
 [10]]

In [16]:
# Pool the IDs of every sampled sequence into a single flat list, in order.
prim_list1 = [sub_id for id_seq in sid_seqs1 for sub_id in id_seq]

In [17]:
Counter(prim_list1).most_common(100)

[(0, 1731),
 (1, 1585),
 (3, 1548),
 (2, 1508),
 (4, 1319),
 (6, 1301),
 (5, 1267),
 (9, 1218),
 (7, 1201),
 (8, 1120),
 (10, 1017),
 (14, 871),
 (12, 849),
 (13, 803),
 (16, 705),
 (18, 569),
 (15, 523),
 (11, 510),
 (17, 339),
 (20, 300),
 (19, 300),
 (24, 290),
 (22, 261),
 (41, 248),
 (31, 242),
 (26, 223),
 (36, 220),
 (37, 218),
 (23, 210),
 (25, 206),
 (21, 204),
 (34, 195),
 (33, 191),
 (45, 183),
 (38, 181),
 (35, 180),
 (28, 176),
 (76, 175),
 (44, 175),
 (50, 175),
 (70, 173),
 (52, 173),
 (88, 172),
 (43, 171),
 (29, 170),
 (47, 167),
 (68, 167),
 (100, 167),
 (32, 162),
 (27, 160),
 (60, 160),
 (46, 160),
 (63, 157),
 (40, 156),
 (81, 151),
 (71, 145),
 (112, 144),
 (77, 143),
 (39, 141),
 (48, 140),
 (64, 139),
 (30, 138),
 (72, 138),
 (87, 137),
 (90, 136),
 (57, 136),
 (106, 135),
 (82, 134),
 (58, 133),
 (138, 133),
 (66, 133),
 (65, 132),
 (54, 130),
 (89, 129),
 (133, 128),
 (119, 127),
 (99, 127),
 (121, 125),
 (83, 125),
 (160, 123),
 (53, 123),
 (159, 121),
 (107,