#### Описание данных

Данные предоставляются в формате JSON на каждой строке следующего содержания:

*   word – таргет (слово, которое хотел ввести пользователь)
*   curve
    *   x – массив координат X
    *   y – массив координат Y
    *   t – массив временных меток (время в миллисекундах)
    *   grid – формат раскладки
        *   grid\_name – название раскладки (default или extra)
        *   width – ширина раскладки
        *   height – высота раскладки
        *   keys – массив клавиш
            *   label – символ клавиши
            *   hitbox – область нажатия
                *   x – координата X верхнего левого угла
                *   y – координата Y верхнего левого угла
                *   w – ширина области
                *   h – высота области

У клавиш `shift`, `enter`, `space`, `globe`, `toNumberState`, `backspace` нет ключа 'label'; вместо него есть ключ 'action', значение которого совпадает с названием клавиши

Было проверено, что:
* В train ровно 6000000 примеров
* Все примеры имеют line_data['curve']['grid']['grid_name'] либо 'default', либо 'extra', другого точно нет
* Все раскладки с одним именем одинаковы. То есть достаточно хранить один набор клавиш для каждой раскладки. Чтобы получить корректные hitbox'ы для данного примера, нужно лишь умножить x и w на width_factor, а y и h – на height_factor, где:
    * height_factor = grid_height / template_grid_height
    * width_factor = grid_width / template_grid_width

**TODO**
* проверить распределение ширин и высот
* проверить распределение слов и процент покрытия, относительно всего словаря

**Questions**
* если масштабировать клавиатуры, нужно ли масштабировать время?


In [None]:
# n_lines = sum(1 for i in open(train_dataset_path, 'rb'))
# >>>> 6000000

In [None]:
# from tqdm import tqdm
# gridnames = set()

# with open(train_dataset_path, encoding="utf-8") as f:
#     for line in tqdm(f, total = n_lines):
#         line_data = json.loads(line)
#         gridnames.add(line_data['curve']['grid']['grid_name'])

# print(gridnames)

# >>>> {'default', 'extra'}

In [5]:
import json
from typing import List, Set, Optional, Dict
import os

from tqdm import tqdm

In [52]:
# Read at most N raw JSONL lines from the head of the train set for a quick look.
N = 10
with open("../../data/data/result_noctx_10k/train.jsonl", encoding="utf-8") as f:
    # Pairing the file iterator with range(N) caps the read at N lines and,
    # unlike N readline() calls, stops at EOF instead of appending '' entries
    # (which would later break json.loads).
    lines = [line for _, line in zip(range(N), f)]

In [38]:
lines

['{"word":"силе","curve":{"x":[306,306,307,316,337,374,440,487,548,583,619,656,688,703,725,743,755,764,771,773,770,758,707,656,626,529,470,418,394,327,327],"y":[398,398,398,395,391,386,389,397,410,415,410,398,383,374,353,332,316,302,286,281,272,260,235,214,202,158,127,97,82,43,43],"t":[0,7,24,62,64,81,108,125,142,158,175,194,208,224,241,258,275,293,308,325,341,359,375,394,411,426,443,459,475,494,499],"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":99,"h":154}},{"label":"ц","hitbox":{"x":98,"y":15,"w":99,"h":154}},{"label":"у","hitbox":{"x":196,"y":15,"w":100,"h":154}},{"label":"к","hitbox":{"x":295,"y":15,"w":99,"h":154}},{"label":"е","hitbox":{"x":393,"y":15,"w":99,"h":154}},{"label":"н","hitbox":{"x":491,"y":15,"w":99,"h":154}},{"label":"г","hitbox":{"x":589,"y":15,"w":99,"h":154}},{"label":"ш","hitbox":{"x":687,"y":15,"w":99,"h":154}},{"label":"щ","hitbox":{"x":785,"y":15,"w":100,"h":154}},{"label":"з","hitbox":{"x":884,"y":15,"w":99,"h":154}},{"lab

In [58]:
train_examples = list(json.loads(line) for line in lines)

In [74]:
curves = [train_example["curve"] for train_example in train_examples]

for curve in curves:
    assert len(curve["x"]) == len(curve["y"]) == len(curve["t"])

In [75]:
grids = [curve["grid"] for curve in curves]

In [78]:
grids[0]['grid_name']

'default'

In [76]:
for grid in grids:
    print(grid)

[{'width': 1080,
  'height': 667,
  'keys': [{'label': 'й', 'hitbox': {'x': 0, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'ц', 'hitbox': {'x': 98, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'у', 'hitbox': {'x': 196, 'y': 15, 'w': 100, 'h': 154}},
   {'label': 'к', 'hitbox': {'x': 295, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'е', 'hitbox': {'x': 393, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'н', 'hitbox': {'x': 491, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'г', 'hitbox': {'x': 589, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'ш', 'hitbox': {'x': 687, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'щ', 'hitbox': {'x': 785, 'y': 15, 'w': 100, 'h': 154}},
   {'label': 'з', 'hitbox': {'x': 884, 'y': 15, 'w': 99, 'h': 154}},
   {'label': 'х', 'hitbox': {'x': 982, 'y': 15, 'w': 98, 'h': 154}},
   {'label': 'ф', 'hitbox': {'x': 0, 'y': 169, 'w': 99, 'h': 154}},
   {'label': 'ы', 'hitbox': {'x': 98, 'y': 169, 'w': 99, 'h': 154}},
   {'label': 'в', 'hitbox': {'x': 196, 'y': 169, 'w': 100, 'h':

Давайте сгруппируем данные по названиям grid'ов и проверим, совпадают ли раскладки внутри каждой группы. Ожидается, что есть два типа grid'ов и что все grid'ы одного типа полностью совпадают.

In [3]:
train_dataset_path = "../../data/data/result_noctx_10k/train.jsonl"

In [85]:
n_lines = 6000000

In [88]:
# from tqdm import tqdm
# gridnames = set()

# with open(train_dataset_path, encoding="utf-8") as f:
#     for line in tqdm(f, total = n_lines):
#         line_data = json.loads(line)
#         gridnames.add(line_data['curve']['grid']['grid_name'])

# print(gridnames)

# >>>> {'default', 'extra'}

100%|██████████| 6000000/6000000 [1:36:43<00:00, 1033.81it/s]  


In [89]:
gridnames = {'default', 'extra'}

{'default', 'extra'}

*   grid\_name – название раскладки (default или extra)
*   width – ширина раскладки
*   height – высота раскладки
*   keys – массив клавиш
    *   label – символ клавиши
    *   hitbox – область нажатия
        *   x – координата X верхнего левого угла
        *   y – координата Y верхнего левого угла
        *   w – ширина области
        *   h – высота области

In [132]:
def is_same_deformed_grid(grid1: dict,
                          grid2: dict,
                          verbose: bool = False,
                          verbose_different: bool = False) -> bool:
    """
    Check whether grid2 is grid1 stretched along the x and y axes.

    Every hitbox of grid1 is scaled by
    (grid2['width'] / grid1['width'], grid2['height'] / grid1['height'])
    and compared with the hitbox at the same position in grid2['keys'].
    Only hitboxes are compared; labels/actions are not looked at.

    Args:
        grid1: grid dict with 'width', 'height' and 'keys'
            (each key has a 'hitbox' with 'x', 'y', 'w', 'h').
        grid2: grid dict of the same structure.
        verbose: print the actual and the expected hitbox of every
            inspected key.
        verbose_different: print the hitboxes only for a mismatching key.

    Returns:
        True if both grids have the same number of keys and every hitbox
        of grid2 matches the scaled hitbox of grid1, False otherwise.

    Raises:
        ZeroDivisionError: if grid1 has zero width or height.
    """
    import math  # function-scope import keeps this cell runnable on its own

    w_factor = grid2['width'] / grid1['width']
    h_factor = grid2['height'] / grid1['height']
    # Hitbox field -> factor applied to grid1's value (order: x, w, y, h).
    field_factors = (('x', w_factor), ('w', w_factor),
                     ('y', h_factor), ('h', h_factor))

    def print_all_hitbox_info(hb1, hb2):
        for field, factor in field_factors:
            print(f'{field}: ', hb2[field], hb1[field] * factor)
        print()

    keys1, keys2 = grid1['keys'], grid2['keys']

    for key1, key2 in zip(keys1, keys2):
        hb1 = key1['hitbox']
        hb2 = key2['hitbox']

        # The scaled value is a float, so exact `==` may fail because of
        # rounding in the division/multiplication; compare with a tolerance.
        keys_same = all(
            math.isclose(hb2[field], hb1[field] * factor)
            for field, factor in field_factors
        )

        if verbose or (verbose_different and not keys_same):
            print_all_hitbox_info(hb1, hb2)

        if not keys_same:
            return False

    # zip() stops at the shorter list, so grids where one key list is a
    # prefix of the other must be rejected explicitly.
    return len(keys1) == len(keys2)

In [133]:
grid_templates = {gridname: None for gridname in gridnames}
grids_that_differ = {gridname: [] for gridname in gridnames}

# grids contain several examples of grids

for grid in grids:
    g_name = grid['grid_name']
    if grid_templates[g_name] is None:
        grid_templates[g_name] = grid
    
    if not is_same_deformed_grid(grid, grid_templates[g_name], verbose=True):
        grids_that_differ[g_name].append(grid)

x:  0 0.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  98 98.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  196 196.0
w:  100 100.0
y:  15 15.0
h:  154 154.0

x:  295 295.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  393 393.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  491 491.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  589 589.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  687 687.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  785 785.0
w:  100 100.0
y:  15 15.0
h:  154 154.0

x:  884 884.0
w:  99 99.0
y:  15 15.0
h:  154 154.0

x:  982 982.0
w:  98 98.0
y:  15 15.0
h:  154 154.0

x:  0 0.0
w:  99 99.0
y:  169 169.0
h:  154 154.0

x:  98 98.0
w:  99 99.0
y:  169 169.0
h:  154 154.0

x:  196 196.0
w:  100 100.0
y:  169 169.0
h:  154 154.0

x:  295 295.0
w:  99 99.0
y:  169 169.0
h:  154 154.0

x:  393 393.0
w:  99 99.0
y:  169 169.0
h:  154 154.0

x:  491 491.0
w:  99 99.0
y:  169 169.0
h:  154 154.0

x:  589 589.0
w:  99 99.0
y:  169 169.0
h:  154 154.0

x:  687 687.0
w:  99 99.0
y:  169 169.

In [134]:
gridnames = {'default', 'extra'}
n_lines = 6000000

def check_all_keyboards_same(data_path: str,
                             gridnames: Set[str],
                             n_lines:int,
                             verbose_different: bool = True):
    """
    Scan a JSONL dataset and collect grids that are not a scaled copy
    of the first grid seen with the same name.

    Args:
        data_path: path to a JSONL file; each line holds
            line_data['curve']['grid'].
        gridnames: all grid names expected in the file; an unknown name
            raises KeyError.
        n_lines: number of lines, used only as the progress-bar total.
        verbose_different: forwarded to is_same_deformed_grid.

    Returns:
        Dict mapping grid name -> list of (line index, grid) for every grid
        that is_same_deformed_grid reports as different from the template.
    """
    templates = dict.fromkeys(gridnames)  # name -> first grid seen (None until then)
    mismatches = {name: [] for name in gridnames}

    with open(data_path, encoding="utf-8") as f:
        for line_idx, raw_line in tqdm(enumerate(f), total = n_lines):
            grid = json.loads(raw_line)['curve']['grid']
            name = grid['grid_name']

            template = templates[name]
            if template is None:
                # The first grid with this name becomes the reference.
                template = templates[name] = grid

            if is_same_deformed_grid(grid,
                                     template,
                                     verbose_different = verbose_different):
                continue
            mismatches[name].append((line_idx, grid))

    return mismatches

grids_that_differ = check_all_keyboards_same(train_dataset_path, gridnames, n_lines, verbose_different = True)

100%|██████████| 6000000/6000000 [25:59<00:00, 3847.57it/s] 


In [136]:
from typing import Dict, Tuple
from collections import defaultdict

def get_distributions(data_path: str,
                      gridnames: Set[str],
                      n_lines:int) -> Tuple[Dict[str, Dict[int, int]], Dict[str, Dict[int, int]]]:
    """
    Count how often each grid width and height occurs, per grid name.

    Args:
        data_path: path to a JSONL file; each line holds
            line_data['curve']['grid'].
        gridnames: all grid names expected in the file; an unknown name
            raises KeyError.
        n_lines: number of lines, used only as the progress-bar total.

    Returns:
        (width_nums, height_nums): two dicts mapping
        grid name -> {value: number of occurrences}.
    """
    width_nums = {}
    height_nums = {}
    for name in gridnames:
        width_nums[name] = defaultdict(int)
        height_nums[name] = defaultdict(int)

    with open(data_path, encoding="utf-8") as f:
        for raw_line in tqdm(f, total = n_lines):
            grid = json.loads(raw_line)['curve']['grid']
            name = grid['grid_name']

            width_nums[name][grid['width']] += 1
            height_nums[name][grid['height']] += 1

    return width_nums, height_nums

width_nums, height_nums = get_distributions(train_dataset_path, gridnames, n_lines)

100%|██████████| 6000000/6000000 [18:27<00:00, 5416.80it/s] 


In [137]:
print(width_nums)

{'extra': defaultdict(<class 'int'>, {1080: 373660}), 'default': defaultdict(<class 'int'>, {1080: 5626340})}


In [138]:
print(height_nums)

{'extra': defaultdict(<class 'int'>, {667: 373660}), 'default': defaultdict(<class 'int'>, {667: 5626340})}


In [3]:
def is_same_grid(grid1: dict, grid2: dict) -> bool:
    """
    Given two grids (data['curve']['grid']),
    returns True if they are exactly the same.

    Compared are: 'grid_name', 'height', 'width', the number of keys and,
    position by position, each key's 'label' or 'action' and its hitbox
    ('x', 'y', 'w', 'h'). The key order is assumed to be the same in both
    grids.

    Raises:
        AssertionError: if a key of grid1 has a set of dict keys other than
            {'label', 'hitbox'} or {'action', 'hitbox'}.
    """
    for field in ('grid_name', 'height', 'width'):
        if grid1[field] != grid2[field]:
            return False

    keys1, keys2 = grid1['keys'], grid2['keys']
    # zip() below stops at the shorter list; without this check a grid whose
    # keys are a prefix of the other grid's keys would be reported as equal.
    if len(keys1) != len(keys2):
        return False

    possible_dict_keys_sets = [{'label', 'hitbox'}, {'action', 'hitbox'}]

    for key1, key2 in zip(keys1, keys2):
        # I suppose that even the order of keys is the same.
        # If not, I will sort keys with lambda key: key['label']
        key1_dict_keys = set(key1.keys())
        assert key1_dict_keys in possible_dict_keys_sets, f"Unexpected key1_dict_keys: {key1_dict_keys}"

        if key1_dict_keys != set(key2.keys()):
            return False

        # After the assert exactly one of 'label' / 'action' is present.
        id_field = 'label' if 'label' in key1_dict_keys else 'action'
        if key1[id_field] != key2[id_field]:
            return False

        hb1, hb2 = key1['hitbox'], key2['hitbox']
        for hb_dict_key in ['x', 'y', 'w', 'h']:
            if hb1[hb_dict_key] != hb2[hb_dict_key]:
                return False

    return True

In [4]:
def compare_all_grids_same(datapaths: List[str],
                           gridnames: Set[str],
                           n_lines_list: Optional[List[Optional[int]]] = None,
                           verbose_different: bool = True):
    """
    Check across several JSONL files that all grids sharing a name are
    exactly equal to the first grid seen with that name.

    Args:
        datapaths: paths to JSONL files; each line holds
            line_data['curve']['grid'].
        gridnames: all grid names expected in the files; an unknown name
            raises KeyError.
        n_lines_list: per-file line counts used only as progress-bar totals.
            May be None or shorter than datapaths; missing entries are
            treated as an unknown total (None).
        verbose_different: print a message for every differing grid.

    Returns:
        Dict mapping grid name -> list of (datapath, line index, grid) for
        every grid that is_same_grid reports as different from the template.
    """
    grid_templates = {gridname: None for gridname in gridnames}
    grids_that_differ = {gridname: [] for gridname in gridnames}

    # The default None cannot be zipped, and a short list would make zip()
    # silently skip the trailing files, so pad the totals with None.
    totals = list(n_lines_list) if n_lines_list is not None else []
    totals += [None] * (len(datapaths) - len(totals))

    for datapath, n_lines in zip(datapaths, totals):
        with open(datapath, encoding="utf-8") as f:
            for i, line in tqdm(enumerate(f), total = n_lines):
                line_data = json.loads(line)

                grid = line_data['curve']['grid']
                g_name = grid['grid_name']

                if grid_templates[g_name] is None:
                    # The first grid with this name becomes the reference.
                    grid_templates[g_name] = grid

                if not is_same_grid(grid,
                                    grid_templates[g_name]):
                    grids_that_differ[g_name].append((datapath, i, grid))
                    if verbose_different:
                        print(f"Grid {g_name} differs in {datapath} at line {i}")

    return grids_that_differ

In [5]:
train_dataset_path = "../../data/data/result_noctx_10k/train.jsonl"
valid_dataset_path = "../../data/data/result_noctx_10k/valid.jsonl"
test_dataset_path = "../../data/data/result_noctx_10k/test.jsonl"

datapaths = [train_dataset_path, valid_dataset_path, test_dataset_path]

n_train_lines = 6000000
gridnames = {'default', 'extra'}

grids_that_differ = compare_all_grids_same(datapaths,
                                           gridnames,
                                           n_lines_list = [n_train_lines, None, None],
                                           verbose_different = True)

  0%|          | 252/6000000 [00:00<40:00, 2499.24it/s]

100%|██████████| 6000000/6000000 [33:44<00:00, 2963.53it/s]  
10000it [00:03, 2626.72it/s]
10000it [00:03, 3197.56it/s]


In [7]:
grids_that_differ

{'default': [], 'extra': []}

In [4]:
def compare_all_grids_same_using_strs(datapaths: List[str],
                                      end_strs: List[str],
                                      n_lines_list: List[int],
                                      verbose_different: bool = True):
    """
    Fast grid check that avoids JSON parsing: every raw line must end with
    one of the known serialized grid strings.

    Args:
        datapaths: paths to JSONL files.
        end_strs: allowed line endings (serialized grids, including the
            closing braces and the trailing newline as they appear in the
            file).
        n_lines_list: per-file line counts used only as progress-bar totals
            (an entry may be None).
        verbose_different: print a message for every non-matching line.

    Returns:
        List of (datapath, line index, line) for every line that ends with
        none of end_strs.
    """
    # str.endswith accepts a tuple, so all allowed endings are checked
    # rather than only the first two.
    suffixes = tuple(end_strs)

    lines_with_different_grids = []
    for datapath, n_lines in zip(datapaths, n_lines_list):
        with open(datapath, encoding="utf-8") as f:
            for i, line in tqdm(enumerate(f), total = n_lines):
                if line.endswith(suffixes):
                    continue
                if verbose_different:
                    print(f"Grid differs in {datapath} at line {i}")
                # Collect regardless of verbosity: the flag only controls
                # printing, not the returned result.
                lines_with_different_grids.append((datapath, i, line))
    return lines_with_different_grids

In [5]:
train_dataset_path = "../../data/data/result_noctx_10k/train.jsonl"
valid_dataset_path = "../../data/data/result_noctx_10k/valid.jsonl"
test_dataset_path = "../../data/data/result_noctx_10k/test.jsonl"

datapaths = [train_dataset_path, valid_dataset_path, test_dataset_path]
n_train_lines = 6000000

compare_all_grids_same_using_strs(
    datapaths,
    end_strs=[
        '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":99,"h":154}},{"label":"ц","hitbox":{"x":98,"y":15,"w":99,"h":154}},{"label":"у","hitbox":{"x":196,"y":15,"w":100,"h":154}},{"label":"к","hitbox":{"x":295,"y":15,"w":99,"h":154}},{"label":"е","hitbox":{"x":393,"y":15,"w":99,"h":154}},{"label":"н","hitbox":{"x":491,"y":15,"w":99,"h":154}},{"label":"г","hitbox":{"x":589,"y":15,"w":99,"h":154}},{"label":"ш","hitbox":{"x":687,"y":15,"w":99,"h":154}},{"label":"щ","hitbox":{"x":785,"y":15,"w":100,"h":154}},{"label":"з","hitbox":{"x":884,"y":15,"w":99,"h":154}},{"label":"х","hitbox":{"x":982,"y":15,"w":98,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":99,"h":154}},{"label":"ы","hitbox":{"x":98,"y":169,"w":99,"h":154}},{"label":"в","hitbox":{"x":196,"y":169,"w":100,"h":154}},{"label":"а","hitbox":{"x":295,"y":169,"w":99,"h":154}},{"label":"п","hitbox":{"x":393,"y":169,"w":99,"h":154}},{"label":"р","hitbox":{"x":491,"y":169,"w":99,"h":154}},{"label":"о","hitbox":{"x":589,"y":169,"w":99,"h":154}},{"label":"л","hitbox":{"x":687,"y":169,"w":99,"h":154}},{"label":"д","hitbox":{"x":785,"y":169,"w":100,"h":154}},{"label":"ж","hitbox":{"x":884,"y":169,"w":99,"h":154}},{"label":"э","hitbox":{"x":982,"y":169,"w":98,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":120,"h":154}},{"label":"я","hitbox":{"x":119,"y":323,"w":94,"h":154}},{"label":"ч","hitbox":{"x":212,"y":323,"w":95,"h":154}},{"label":"с","hitbox":{"x":306,"y":323,"w":94,"h":154}},{"label":"м","hitbox":{"x":399,"y":323,"w":95,"h":154}},{"label":"и","hitbox":{"x":493,"y":323,"w":94,"h":154}},{"label":"т","hitbox":{"x":586,"y":323,"w":95,"h":154}},{"label":"ь","hitbox":{"x":680,"y":323,"w":94,"h":154}},{"label":"б","hitbox":{"x":773,"y":323,"w":95,"h":154}},{"label":"ю","hitbox":{"x":867,"y":323,"w":95,"h":154}},{"action":"backspace","hitbox":{"x":961,"y":323,"w":119,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":
140,"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"default"}}}\n',
        '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":91,"h":154}},{"label":"ц","hitbox":{"x":90,"y":15,"w":91,"h":154}},{"label":"у","hitbox":{"x":180,"y":15,"w":91,"h":154}},{"label":"к","hitbox":{"x":270,"y":15,"w":91,"h":154}},{"label":"е","hitbox":{"x":360,"y":15,"w":91,"h":154}},{"label":"н","hitbox":{"x":450,"y":15,"w":91,"h":154}},{"label":"г","hitbox":{"x":540,"y":15,"w":91,"h":154}},{"label":"ш","hitbox":{"x":630,"y":15,"w":91,"h":154}},{"label":"щ","hitbox":{"x":720,"y":15,"w":91,"h":154}},{"label":"з","hitbox":{"x":810,"y":15,"w":91,"h":154}},{"label":"х","hitbox":{"x":900,"y":15,"w":91,"h":154}},{"label":"ё","hitbox":{"x":990,"y":15,"w":90,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":91,"h":154}},{"label":"ы","hitbox":{"x":90,"y":169,"w":91,"h":154}},{"label":"в","hitbox":{"x":180,"y":169,"w":91,"h":154}},{"label":"а","hitbox":{"x":270,"y":169,"w":91,"h":154}},{"label":"п","hitbox":{"x":360,"y":169,"w":91,"h":154}},{"label":"р","hitbox":{"x":450,"y":169,"w":91,"h":154}},{"label":"о","hitbox":{"x":540,"y":169,"w":91,"h":154}},{"label":"л","hitbox":{"x":630,"y":169,"w":91,"h":154}},{"label":"д","hitbox":{"x":720,"y":169,"w":91,"h":154}},{"label":"ж","hitbox":{"x":810,"y":169,"w":91,"h":154}},{"label":"э","hitbox":{"x":900,"y":169,"w":91,"h":154}},{"label":"ъ","hitbox":{"x":990,"y":169,"w":90,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":91,"h":154}},{"label":"я","hitbox":{"x":90,"y":323,"w":91,"h":154}},{"label":"ч","hitbox":{"x":180,"y":323,"w":91,"h":154}},{"label":"с","hitbox":{"x":270,"y":323,"w":91,"h":154}},{"label":"м","hitbox":{"x":360,"y":323,"w":91,"h":154}},{"label":"и","hitbox":{"x":450,"y":323,"w":91,"h":154}},{"label":"т","hitbox":{"x":540,"y":323,"w":91,"h":154}},{"label":"ь","hitbox":{"x":630,"y":323,"w":91,"h":154}},{"label":"б","hitbox":{"x":720,"y":323,"w":91,"h":154}},{"label":"ю","hitbox":{"x":810,"y":323,"w":91,"h":154}},{"label":"?","hitbox":{"x":900,"y":323,"w":91,"h":154}},{"act
ion":"backspace","hitbox":{"x":990,"y":323,"w":90,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"extra"}}}\n'
    ],
    n_lines_list = [n_train_lines, None, None],
)

  0%|          | 5141/6000000 [00:00<01:56, 51409.31it/s]

100%|██████████| 6000000/6000000 [02:06<00:00, 47248.93it/s]
10000it [00:00, 37453.52it/s]
10000it [00:00, 43668.42it/s]


[]

In [39]:
def create_dataset_with_gridname_instead_of_grid(data_paths: List[str],
                                                 out_paths: List[str],
                                                 n_lines_list: List[int]):
    """
    Rewrite JSONL datasets, replacing each record's full
    ['curve']['grid'] object with just ['curve']['grid_name'].

    Args:
        data_paths: input JSONL files.
        out_paths: output files, one per input; each must not exist yet.
        n_lines_list: per-file line counts used only as progress-bar totals.

    Raises:
        AssertionError: if the three lists differ in length or an output
            file already exists.
    """
    assert len(data_paths) == len(out_paths) == len(n_lines_list)

    for src_path, dst_path, total in zip(data_paths, out_paths, n_lines_list):
        assert not os.path.exists(dst_path), f"File {dst_path} already exists!"
        with open(src_path, encoding="utf-8") as src, open(dst_path, 'w', encoding="utf-8") as dst:
            for _, raw_line in tqdm(enumerate(src), total = total):
                record = json.loads(raw_line)
                curve = record['curve']

                # Drop the bulky grid description and keep only its name.
                curve['grid_name'] = curve.pop('grid')['grid_name']

                # Compact separators and raw (non-escaped) Cyrillic.
                dst.write(json.dumps(record,
                                     ensure_ascii=False,
                                     separators=(',', ':')))
                dst.write('\n')

In [40]:
train_dataset_path = "../../data/data/result_noctx_10k/train.jsonl"
valid_dataset_path = "../../data/data/result_noctx_10k/valid.jsonl"
test_dataset_path = "../../data/data/result_noctx_10k/test.jsonl"

datapaths = [train_dataset_path, valid_dataset_path, test_dataset_path]
n_train_lines = 6000000

json_grid_str_to_gridname = {
    '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":99,"h":154}},{"label":"ц","hitbox":{"x":98,"y":15,"w":99,"h":154}},{"label":"у","hitbox":{"x":196,"y":15,"w":100,"h":154}},{"label":"к","hitbox":{"x":295,"y":15,"w":99,"h":154}},{"label":"е","hitbox":{"x":393,"y":15,"w":99,"h":154}},{"label":"н","hitbox":{"x":491,"y":15,"w":99,"h":154}},{"label":"г","hitbox":{"x":589,"y":15,"w":99,"h":154}},{"label":"ш","hitbox":{"x":687,"y":15,"w":99,"h":154}},{"label":"щ","hitbox":{"x":785,"y":15,"w":100,"h":154}},{"label":"з","hitbox":{"x":884,"y":15,"w":99,"h":154}},{"label":"х","hitbox":{"x":982,"y":15,"w":98,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":99,"h":154}},{"label":"ы","hitbox":{"x":98,"y":169,"w":99,"h":154}},{"label":"в","hitbox":{"x":196,"y":169,"w":100,"h":154}},{"label":"а","hitbox":{"x":295,"y":169,"w":99,"h":154}},{"label":"п","hitbox":{"x":393,"y":169,"w":99,"h":154}},{"label":"р","hitbox":{"x":491,"y":169,"w":99,"h":154}},{"label":"о","hitbox":{"x":589,"y":169,"w":99,"h":154}},{"label":"л","hitbox":{"x":687,"y":169,"w":99,"h":154}},{"label":"д","hitbox":{"x":785,"y":169,"w":100,"h":154}},{"label":"ж","hitbox":{"x":884,"y":169,"w":99,"h":154}},{"label":"э","hitbox":{"x":982,"y":169,"w":98,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":120,"h":154}},{"label":"я","hitbox":{"x":119,"y":323,"w":94,"h":154}},{"label":"ч","hitbox":{"x":212,"y":323,"w":95,"h":154}},{"label":"с","hitbox":{"x":306,"y":323,"w":94,"h":154}},{"label":"м","hitbox":{"x":399,"y":323,"w":95,"h":154}},{"label":"и","hitbox":{"x":493,"y":323,"w":94,"h":154}},{"label":"т","hitbox":{"x":586,"y":323,"w":95,"h":154}},{"label":"ь","hitbox":{"x":680,"y":323,"w":94,"h":154}},{"label":"б","hitbox":{"x":773,"y":323,"w":95,"h":154}},{"label":"ю","hitbox":{"x":867,"y":323,"w":95,"h":154}},{"action":"backspace","hitbox":{"x":961,"y":323,"w":119,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,
"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"default"}}}\n' : "default",
    '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":91,"h":154}},{"label":"ц","hitbox":{"x":90,"y":15,"w":91,"h":154}},{"label":"у","hitbox":{"x":180,"y":15,"w":91,"h":154}},{"label":"к","hitbox":{"x":270,"y":15,"w":91,"h":154}},{"label":"е","hitbox":{"x":360,"y":15,"w":91,"h":154}},{"label":"н","hitbox":{"x":450,"y":15,"w":91,"h":154}},{"label":"г","hitbox":{"x":540,"y":15,"w":91,"h":154}},{"label":"ш","hitbox":{"x":630,"y":15,"w":91,"h":154}},{"label":"щ","hitbox":{"x":720,"y":15,"w":91,"h":154}},{"label":"з","hitbox":{"x":810,"y":15,"w":91,"h":154}},{"label":"х","hitbox":{"x":900,"y":15,"w":91,"h":154}},{"label":"ё","hitbox":{"x":990,"y":15,"w":90,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":91,"h":154}},{"label":"ы","hitbox":{"x":90,"y":169,"w":91,"h":154}},{"label":"в","hitbox":{"x":180,"y":169,"w":91,"h":154}},{"label":"а","hitbox":{"x":270,"y":169,"w":91,"h":154}},{"label":"п","hitbox":{"x":360,"y":169,"w":91,"h":154}},{"label":"р","hitbox":{"x":450,"y":169,"w":91,"h":154}},{"label":"о","hitbox":{"x":540,"y":169,"w":91,"h":154}},{"label":"л","hitbox":{"x":630,"y":169,"w":91,"h":154}},{"label":"д","hitbox":{"x":720,"y":169,"w":91,"h":154}},{"label":"ж","hitbox":{"x":810,"y":169,"w":91,"h":154}},{"label":"э","hitbox":{"x":900,"y":169,"w":91,"h":154}},{"label":"ъ","hitbox":{"x":990,"y":169,"w":90,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":91,"h":154}},{"label":"я","hitbox":{"x":90,"y":323,"w":91,"h":154}},{"label":"ч","hitbox":{"x":180,"y":323,"w":91,"h":154}},{"label":"с","hitbox":{"x":270,"y":323,"w":91,"h":154}},{"label":"м","hitbox":{"x":360,"y":323,"w":91,"h":154}},{"label":"и","hitbox":{"x":450,"y":323,"w":91,"h":154}},{"label":"т","hitbox":{"x":540,"y":323,"w":91,"h":154}},{"label":"ь","hitbox":{"x":630,"y":323,"w":91,"h":154}},{"label":"б","hitbox":{"x":720,"y":323,"w":91,"h":154}},{"label":"ю","hitbox":{"x":810,"y":323,"w":91,"h":154}},{"label":"?","hitbox":{"x":900,"y":323,"w":91,"h":154}},{"action"
:"backspace","hitbox":{"x":990,"y":323,"w":90,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"extra"}}}\n' : "extra"
}

create_dataset_with_gridname_instead_of_grid(
    datapaths,
    out_paths = ['edit_train.jsonl', 'edit_valid.jsonl', 'edit_test.jsonl'],
    n_lines_list=[n_train_lines, None, None]
)

100%|██████████| 6000000/6000000 [44:02<00:00, 2271.00it/s]  
10000it [00:03, 2804.05it/s]
10000it [00:04, 2197.46it/s]


In [10]:
def create_dataset_with_gridname_instead_of_grid_primitive(data_paths: List[str],
                                                 out_paths: List[str],
                                                 n_lines_list: List[int],
                                                 json_grid_str_to_gridname: Dict[str, str]):
    """
    Replace the serialized grid at the end of every line with its grid name
    using plain string operations (no JSON parsing).

    Each line is cut at the first occurrence of '"grid"'; the tail is looked
    up in json_grid_str_to_gridname and replaced by '"grid_name":"<name>"'
    followed by two closing braces and a newline.

    Args:
        data_paths: input JSONL files.
        out_paths: output files, one per input. They are opened in append
            mode, so existing content is kept and new lines are added.
        n_lines_list: per-file line counts used only as progress-bar totals.
        json_grid_str_to_gridname: maps the exact line tail (from '"grid"'
            to the trailing newline) to a grid name; an unknown tail raises
            KeyError.

    Raises:
        AssertionError: if the three lists differ in length.
    """
    assert len(data_paths) == len(out_paths) == len(n_lines_list)

    for src_path, dst_path, total in zip(data_paths, out_paths, n_lines_list):
        with open(src_path, encoding="utf-8") as src, open(dst_path, 'a', encoding="utf-8") as dst:
            for _, raw_line in tqdm(enumerate(src), total = total):
                # Split the line right before the '"grid"' key.
                cut = raw_line.find('"grid"')
                head = raw_line[:cut]
                grid_json = raw_line[cut:]

                gridname = json_grid_str_to_gridname[grid_json]

                # The two braces close the 'curve' object and the record.
                tail = '"grid_name":"{}"'.format(gridname) + '}}\n'
                dst.write(head + tail)

In [None]:
train_dataset_path = "../../data/data/result_noctx_10k/train.jsonl"
valid_dataset_path = "../../data/data/result_noctx_10k/valid.jsonl"
test_dataset_path = "../../data/data/result_noctx_10k/test.jsonl"

datapaths = [train_dataset_path, valid_dataset_path, test_dataset_path]
n_train_lines = 6000000

json_grid_str_to_gridname = {
    '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":99,"h":154}},{"label":"ц","hitbox":{"x":98,"y":15,"w":99,"h":154}},{"label":"у","hitbox":{"x":196,"y":15,"w":100,"h":154}},{"label":"к","hitbox":{"x":295,"y":15,"w":99,"h":154}},{"label":"е","hitbox":{"x":393,"y":15,"w":99,"h":154}},{"label":"н","hitbox":{"x":491,"y":15,"w":99,"h":154}},{"label":"г","hitbox":{"x":589,"y":15,"w":99,"h":154}},{"label":"ш","hitbox":{"x":687,"y":15,"w":99,"h":154}},{"label":"щ","hitbox":{"x":785,"y":15,"w":100,"h":154}},{"label":"з","hitbox":{"x":884,"y":15,"w":99,"h":154}},{"label":"х","hitbox":{"x":982,"y":15,"w":98,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":99,"h":154}},{"label":"ы","hitbox":{"x":98,"y":169,"w":99,"h":154}},{"label":"в","hitbox":{"x":196,"y":169,"w":100,"h":154}},{"label":"а","hitbox":{"x":295,"y":169,"w":99,"h":154}},{"label":"п","hitbox":{"x":393,"y":169,"w":99,"h":154}},{"label":"р","hitbox":{"x":491,"y":169,"w":99,"h":154}},{"label":"о","hitbox":{"x":589,"y":169,"w":99,"h":154}},{"label":"л","hitbox":{"x":687,"y":169,"w":99,"h":154}},{"label":"д","hitbox":{"x":785,"y":169,"w":100,"h":154}},{"label":"ж","hitbox":{"x":884,"y":169,"w":99,"h":154}},{"label":"э","hitbox":{"x":982,"y":169,"w":98,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":120,"h":154}},{"label":"я","hitbox":{"x":119,"y":323,"w":94,"h":154}},{"label":"ч","hitbox":{"x":212,"y":323,"w":95,"h":154}},{"label":"с","hitbox":{"x":306,"y":323,"w":94,"h":154}},{"label":"м","hitbox":{"x":399,"y":323,"w":95,"h":154}},{"label":"и","hitbox":{"x":493,"y":323,"w":94,"h":154}},{"label":"т","hitbox":{"x":586,"y":323,"w":95,"h":154}},{"label":"ь","hitbox":{"x":680,"y":323,"w":94,"h":154}},{"label":"б","hitbox":{"x":773,"y":323,"w":95,"h":154}},{"label":"ю","hitbox":{"x":867,"y":323,"w":95,"h":154}},{"action":"backspace","hitbox":{"x":961,"y":323,"w":119,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,
"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"default"}}}\n' : "default",
    '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":91,"h":154}},{"label":"ц","hitbox":{"x":90,"y":15,"w":91,"h":154}},{"label":"у","hitbox":{"x":180,"y":15,"w":91,"h":154}},{"label":"к","hitbox":{"x":270,"y":15,"w":91,"h":154}},{"label":"е","hitbox":{"x":360,"y":15,"w":91,"h":154}},{"label":"н","hitbox":{"x":450,"y":15,"w":91,"h":154}},{"label":"г","hitbox":{"x":540,"y":15,"w":91,"h":154}},{"label":"ш","hitbox":{"x":630,"y":15,"w":91,"h":154}},{"label":"щ","hitbox":{"x":720,"y":15,"w":91,"h":154}},{"label":"з","hitbox":{"x":810,"y":15,"w":91,"h":154}},{"label":"х","hitbox":{"x":900,"y":15,"w":91,"h":154}},{"label":"ё","hitbox":{"x":990,"y":15,"w":90,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":91,"h":154}},{"label":"ы","hitbox":{"x":90,"y":169,"w":91,"h":154}},{"label":"в","hitbox":{"x":180,"y":169,"w":91,"h":154}},{"label":"а","hitbox":{"x":270,"y":169,"w":91,"h":154}},{"label":"п","hitbox":{"x":360,"y":169,"w":91,"h":154}},{"label":"р","hitbox":{"x":450,"y":169,"w":91,"h":154}},{"label":"о","hitbox":{"x":540,"y":169,"w":91,"h":154}},{"label":"л","hitbox":{"x":630,"y":169,"w":91,"h":154}},{"label":"д","hitbox":{"x":720,"y":169,"w":91,"h":154}},{"label":"ж","hitbox":{"x":810,"y":169,"w":91,"h":154}},{"label":"э","hitbox":{"x":900,"y":169,"w":91,"h":154}},{"label":"ъ","hitbox":{"x":990,"y":169,"w":90,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":91,"h":154}},{"label":"я","hitbox":{"x":90,"y":323,"w":91,"h":154}},{"label":"ч","hitbox":{"x":180,"y":323,"w":91,"h":154}},{"label":"с","hitbox":{"x":270,"y":323,"w":91,"h":154}},{"label":"м","hitbox":{"x":360,"y":323,"w":91,"h":154}},{"label":"и","hitbox":{"x":450,"y":323,"w":91,"h":154}},{"label":"т","hitbox":{"x":540,"y":323,"w":91,"h":154}},{"label":"ь","hitbox":{"x":630,"y":323,"w":91,"h":154}},{"label":"б","hitbox":{"x":720,"y":323,"w":91,"h":154}},{"label":"ю","hitbox":{"x":810,"y":323,"w":91,"h":154}},{"label":"?","hitbox":{"x":900,"y":323,"w":91,"h":154}},{"action"
:"backspace","hitbox":{"x":990,"y":323,"w":90,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"extra"}}}\n' : "extra"
}

create_dataset_with_gridname_instead_of_grid_primitive(
    datapaths,
    out_paths = ['dangerous_train.jsonl', 'dangerous_valid.jsonl', 'dangerous_test.jsonl'],
    n_lines_list=[n_train_lines, None, None],
    json_grid_str_to_gridname = json_grid_str_to_gridname
)

In [None]:
# Given a huge dataset (text_file with 6e6 lines)
# create n files with 6e6//n lines each by shuffling
# original dataset and splitting it into n parts

def create_split_dataset(data_path: str,
                         n_lines,
                         dataset_size: int,
                         seed: int = 42):
    """Check that a dataset can be cut into equally sized parts.

    NOTE(review): this function is unfinished. It only validates the
    arguments and computes how many parts there would be; ``data_path``
    and ``seed`` are not used yet, and nothing is read, shuffled or
    written. It always returns ``None``.

    Args:
        data_path: Path to the dataset file (currently unused).
        n_lines: Number of lines each part should contain.
        dataset_size: Total number of lines in the dataset.
        seed: Seed intended for shuffling (currently unused).

    Raises:
        AssertionError: If ``dataset_size`` is not a multiple of ``n_lines``.
    """
    assert not dataset_size % n_lines, "dataset_size must be divisible by n_lines"

    # Number of equally sized parts; not consumed anywhere yet.
    n_datasets = dataset_size // n_lines

    

In [25]:
from typing import Set


def get_unique_chars(vocab_path: str) -> Set[str]:
    """Collect every distinct character that occurs in a vocabulary file.

    The file is decoded as UTF-8. Line-break characters are not filtered
    out, so they are part of the result whenever the file contains them.
    """
    unique_chars: Set[str] = set()
    with open(vocab_path, encoding="utf-8") as vocab_file:
        # Feed the file line by line; each line contributes its characters.
        for text_line in vocab_file:
            unique_chars.update(text_line)
    return unique_chars

In [26]:
# Forward slashes resolve on both Windows and POSIX, and match the other
# dataset paths used in this notebook; the backslash form only worked on Windows.
vocab_path = "../../data/data_separated_grid/voc.txt"
voc = get_unique_chars(vocab_path)

In [27]:
print(sorted(list(voc)))

['\n', '-', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я']


In [28]:
'ё' in voc

False

То есть у нас есть все буквы, кроме ё, а также "-"

Наш словарь букв должен содержать все буквы, кроме ё, а также ["-", "<sos>", "<eos>", "<pad>"]

Насчет pad не уверен: если использовать LSTM, он не должен встречаться, а если использовать трансформер, то он может быть заменен на какой-нибудь dummy, потому что маска все равно исключает его из рассмотрения

In [29]:
import numpy as np
def get_max_output_len(vocab_path: str):
    """Load a newline-separated vocabulary and locate its longest entry.

    Returns a 3-tuple ``(vocab, argmax_index, max_len)``:

    * ``vocab`` - list of entries obtained by splitting the file on line
      breaks (a trailing line break yields a final empty string);
    * ``argmax_index`` - index of the first longest entry, as returned by
      ``np.argmax``;
    * ``max_len`` - length of that longest entry.
    """
    with open(vocab_path, encoding="utf-8") as vocab_file:
        words = vocab_file.read().split('\n')

    word_lengths = [len(word) for word in words]
    longest_idx = np.argmax(np.array(word_lengths))
    return words, longest_idx, max(word_lengths)

In [30]:
vocab, argmax_word_len, max_word_len = get_max_output_len(vocab_path)

In [31]:
max_word_len

34

In [32]:
vocab[argmax_word_len]

'информационно-телекоммуникационной'

In [71]:
max_seq_len = max_word_len + 2  # with start and end tokens

In [67]:
# The line break only separates vocabulary entries; drop it so it is not
# counted as a token. discard() is a no-op when the character is absent.
voc.discard('\n')

In [69]:
n_tokens = len(voc) + 2  # with start and end tokens

In [72]:
n_tokens, max_seq_len

(35, 36)

In [15]:
train_dataset_path = "../../data/data/result_noctx_10k/train.jsonl"
valid_dataset_path = "../../data/data/result_noctx_10k/valid.jsonl"
test_dataset_path = "../../data/data/result_noctx_10k/test.jsonl"

In [7]:
from typing import List, Optional

def get_trj_seq_lens(dataset_path: str, total: Optional[int] = None):
    """Return the trajectory length of every example in a JSONL dataset.

    Each line is parsed as JSON and measured by the number of elements in
    its ``['curve']['x']`` array. ``total`` is only forwarded to tqdm as
    the expected number of lines for the progress bar.
    """
    with open(dataset_path, encoding="utf-8") as dataset_file:
        return [
            len(json.loads(raw_line)['curve']['x'])
            for raw_line in tqdm(dataset_file, total = total)
        ]
            

In [10]:
train_seq_lens = get_trj_seq_lens(train_dataset_path, total = 6_000_000)

100%|██████████| 6000000/6000000 [15:31<00:00, 6441.15it/s] 


In [16]:
val_seq_lens = get_trj_seq_lens(valid_dataset_path)
test_seq_lens = get_trj_seq_lens(test_dataset_path)

10000it [00:02, 4784.61it/s]
10000it [00:01, 7352.93it/s]


In [17]:
max(train_seq_lens), max(val_seq_lens), max(test_seq_lens)

(299, 294, 283)

In [18]:
min(train_seq_lens), min(val_seq_lens), min(test_seq_lens)

(4, 4, 4)

In [43]:
# creates a sample dataset from the first `sample_size` lines of a dataset (e.g. 1000 lines of train)

def create_sample_dataset(dataset_path: str, sample_size: int, out_path: str):
    """Copy the first ``sample_size`` lines of ``dataset_path`` to ``out_path``.

    If the source has fewer lines than ``sample_size``, all of them are
    copied. Refuses to run when ``out_path`` already exists.

    Raises:
        AssertionError: If ``out_path`` already exists.
    """
    assert not os.path.exists(out_path), f"File {out_path} already exists!"
    with open(dataset_path, encoding="utf-8") as src:
        with open(out_path, 'w', encoding="utf-8") as dst:
            numbered_lines = enumerate(src)
            for line_no, text in tqdm(numbered_lines, total = sample_size):
                # Stop once the requested number of lines has been written.
                if line_no == sample_size:
                    break
                dst.write(text)

            

In [21]:
create_sample_dataset("../../data/data_separated_grid/train.jsonl", 1000, "../../data/data_separated_grid/sample_deleteme.jsonl")

100%|██████████| 1000/1000 [00:00<00:00, 27781.45it/s]


In [40]:
from typing import Dict, Optional
import json

def split_dataset_by_grid(dataset_path: str,
                          gridname_to_outpath: Dict[str, str],
                          total: Optional[int] = None):
    """Route every line of a JSONL dataset to a file chosen by its grid name.

    Each input line is parsed as JSON, its ``['curve']['grid_name']`` value
    is looked up in ``gridname_to_outpath``, and the unmodified line is
    appended to the corresponding file.

    Every output file is opened once, on first use, and kept open until
    the whole input has been processed. Reopening the file for each of the
    millions of input lines dominated the running time.

    Args:
        dataset_path: Path to the input JSONL file.
        gridname_to_outpath: Mapping from grid name to output file path.
        total: Expected number of input lines, forwarded to tqdm for the
            progress bar.

    Raises:
        ValueError: If any of the output paths already exists.
        KeyError: If a line has a grid name missing from
            ``gridname_to_outpath``.
    """
    # Local import so the cell stays self-contained.
    from contextlib import ExitStack

    for out_path in gridname_to_outpath.values():
        if os.path.exists(out_path):
            raise ValueError(f"File {out_path} already exists!")

    # Keyed by path (not grid name) so that grid names mapped to the same
    # file share a single handle and their lines keep the input order.
    open_outputs = {}
    with ExitStack() as stack, open(dataset_path, encoding="utf-8") as f:
        for line in tqdm(f, total = total):
            line_data = json.loads(line)
            grid_name = line_data['curve']['grid_name']
            out_path = gridname_to_outpath[grid_name]
            out_f = open_outputs.get(out_path)
            if out_f is None:
                # Opened lazily: a grid name that never occurs creates no file.
                out_f = stack.enter_context(
                    open(out_path, 'a', encoding="utf-8"))
                open_outputs[out_path] = out_f
            out_f.write(line)

In [42]:
# Input: the train set in which every example carries only a grid name.
train_dataset_path = "../../data/data_separated_grid/train.jsonl"

# One output file per keyboard layout name.
gridname_to_outpath = {
    "default": "../../data/data_separated_grid/train__default_only.jsonl",
    "extra": "../../data/data_separated_grid/train__extra_only.jsonl"
}

# `total` is only the expected line count for the progress bar.
split_dataset_by_grid(train_dataset_path, gridname_to_outpath, total = 6_000_000)

100%|██████████| 6000000/6000000 [1:15:08<00:00, 1330.80it/s] 


In [44]:
create_sample_dataset(
    "../../data/data_separated_grid/train__default_only.jsonl",
    1000,
    "../../data/data_separated_grid/sample_deleteme__default_only.jsonl")

100%|██████████| 1000/1000 [00:00<00:00, 55549.28it/s]
