In [1]:
# Reload imported project modules automatically before each cell runs,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# !git clone https://github.com/proshian/yandex-cup-2023-ml-neuroswipe.git
# %cd yandex-cup-2023-ml-neuroswipe
# ! git checkout datasetv4

In [3]:
# !pip install dvc --quiet
# !pip install dvc_gdrive --quiet

In [4]:
# %cd /kaggle/working/yandex-cup-2023-ml-neuroswipe
# ! git pull
# ! git checkout datasetv4

In [11]:
# Move into the repository's src directory (presumably so the project modules can be imported).
# NOTE(review): this absolute path only exists on Kaggle; the recorded output shows the
# command failing on a local Windows checkout.
%cd /kaggle/working/yandex-cup-2023-ml-neuroswipe/src

[WinError 3] Системе не удается найти указанный путь: '/kaggle/working/yandex-cup-2023-ml-neuroswipe/src'
c:\Users\proshian\Documents\yandex-cup-2023-ml-neuroswipe\src


  bkms = self.shell.db.get('bookmarks', {})


In [2]:
############# Script arguments emulation #############
# These constants stand in for the arguments a standalone script would receive.

# Key into GRID_NAME_TO_DS_PATHS ("default" or "extra"); also the grid looked up in gridname_to_grid.json.
GRID_NAME = "default"
# NOTE(review): not referenced in the visible part of the notebook - presumably the DataLoader batch size.
BATCH_SIZE = 320
# NOTE(review): only mentioned by a commented-out cell that would override the paths on Kaggle.
IN_KAGGLE = False
# Passed to init_random_seed().
RANDOM_SEED = 12

# Directory holding the dataset .jsonl files, voc.txt and gridname_to_grid.json.
DATA_ROOT = "../data/data_separated_grid"
# NOTE(review): not referenced in the visible part of the notebook - presumably where trained models are stored.
MODELS_DIR = "../data/trained_models/m1"

In [3]:
import os
import json
import typing as tp
import traceback
from datetime import datetime
import copy

import torch
# import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
from torch.utils.tensorboard import SummaryWriter


from model import SwipeCurveTransformer, get_m1_bigger_model
from tokenizers import CharLevelTokenizerv2, KeyboardTokenizerv1
from tokenizers import ALL_CYRILLIC_LETTERS_ALPHABET_ORD
from dataset import CurveDataset, CollateFn
from word_generators import GreedyGenerator
from nearest_key_lookup import ExtendedNearestKeyLookup
from transforms import TransformerInputOutputGetter, InitTransform, GetItemTransform

In [4]:
################ Other constants ####################
# Maps a grid name to the paths of its train ("train") and validation ("val")
# .jsonl files, both located under DATA_ROOT. Indexed with GRID_NAME when the
# datasets are built.
GRID_NAME_TO_DS_PATHS = {
    "extra": {
        "train": os.path.join(DATA_ROOT, "train__extra_only_no_errors__2023_11_01__19_49_14.jsonl"),
        "val": os.path.join(DATA_ROOT, "valid__in_train_format__extra_only.jsonl")
    },
    "default": {
        "train": os.path.join(DATA_ROOT, "train__default_only_no_errors__2023_10_31__03_26_16.jsonl"),
        "val": os.path.join(DATA_ROOT, "valid__in_train_format__default_only.jsonl")
    }
}

In [5]:
# if IN_KAGGLE:
#     DATA_ROOT = "/kaggle/input/neuroswipe-defualt-only-v1"
#     MODELS_DIR = ""

In [5]:
def init_random_seed(value):
    """
    Seed every random number generator the notebook may draw from.

    Seeds Python's built-in `random` module, NumPy's global RNG and PyTorch
    (CPU generator plus all CUDA devices) with the same value so that a
    re-run with the same seed draws the same random numbers.

    Args:
        value: integer seed shared by all generators.
    """
    # Local import: `random` is not imported in the notebook's visible import cell.
    import random

    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    # Seed every GPU rather than only the current one. Per the PyTorch docs
    # this call is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(value)
    # Deliberately left disabled, as before:
    # torch.backends.cudnn.deterministic = True

In [6]:
def get_grid(grid_name: str, grids_path: str) -> dict:
    """
    Read the JSON file at `grids_path` and return the entry stored under `grid_name`.

    Args:
        grid_name: key of the wanted grid in the JSON object.
        grids_path: path to a UTF-8 JSON file mapping grid names to grids.

    Returns:
        The value found under `grid_name`.

    Raises:
        KeyError: if `grid_name` is not a key of the loaded JSON object.
    """
    with open(grids_path, "r", encoding="utf-8") as grids_file:
        gridname_to_grid = json.load(grids_file)
    return gridname_to_grid[grid_name]

In [7]:
from typing import List, Dict, Tuple, Optional, Set

def get_gridname_to_out_of_bounds_coords_dict(
        data_paths: List[str], gridname_to_wh: dict,
        total: Optional[int] = None
        ) -> Dict[str, Set[Tuple[int, int]]]:
    """
    Collect, per grid, every curve point that lies outside the grid rectangle.

    Each file in `data_paths` is read line by line as JSON. A line must hold a
    'curve' object with 'grid_name', 'x' and 'y'. A point (x, y) counts as out
    of bounds when x < 0, x >= width, y < 0 or y >= height, where
    (width, height) = gridname_to_wh[grid_name].

    Args:
        data_paths: paths of the .jsonl files to scan.
        gridname_to_wh: maps a grid name to its (width, height).
        total: passed to tqdm as the expected line count; the same value is
            used for every file.

    Returns:
        A dict mapping each grid name of `gridname_to_wh` to the set of
        out-of-bounds (x, y) tuples seen for that grid (empty set if none).
    """
    offgrid_points = {name: set() for name in gridname_to_wh}

    for path in data_paths:
        with open(path, "r", encoding="utf-8") as swipes_file:
            for raw_line in tqdm(swipes_file, total=total):
                curve = json.loads(raw_line)['curve']
                name = curve['grid_name']
                width, height = gridname_to_wh[name]
                outside = offgrid_points[name]
                for point in zip(curve['x'], curve['y']):
                    px, py = point
                    if px < 0 or px >= width or py < 0 or py >= height:
                        outside.add(point)
    return offgrid_points

In [8]:
def get_datasets(grid_name: str, grid_name_to_grid_path: str,
                 train_data_path: str, val_data_path: str,
                 nearest_key_candidates: tp.Set[str],
                 kb_tokenizer: KeyboardTokenizerv1,
                 word_char_tokenizer: CharLevelTokenizerv2,
                 train_total: int = 5_237_584,
                 val_total: int = 9_416,
                 ) -> tuple[CurveDataset, CurveDataset]:
    """
    Build the train and validation CurveDatasets for a single keyboard grid.

    Steps: load the grid, collect every out-of-bounds curve point found in
    both data files, build an ExtendedNearestKeyLookup from the grid, the
    candidate keys and those points, then create one
    TransformerInputOutputGetter that both datasets share as `transform`.

    Args:
        grid_name: key of the grid inside the JSON file at `grid_name_to_grid_path`.
        grid_name_to_grid_path: path to the JSON file mapping grid names to grids.
        train_data_path: path of the training .jsonl file.
        val_data_path: path of the validation .jsonl file.
        nearest_key_candidates: forwarded to ExtendedNearestKeyLookup.
        kb_tokenizer: forwarded to the transform as `kb_tokenizer`.
        word_char_tokenizer: forwarded to the transform as `word_tokenizer`.
        train_total: `total` handed to the train CurveDataset. The default is
            the value that used to be hard-coded here; the original inline
            comment gives 349_172 for the "extra" grid.
        val_total: `total` handed to the validation CurveDataset. The default
            is the previously hard-coded value.

    Returns:
        (train_dataset, val_dataset)
    """
    gridname_to_grid = {grid_name: get_grid(grid_name, grid_name_to_grid_path)}

    gname_to_wh = {
        gname: (grid['width'], grid['height'])
        for gname, grid in gridname_to_grid.items()
    }

    print("Accumulating out-of-bounds coordinates...")
    # `total` only feeds the progress bar of the scan; 6_000_000 is a rough
    # figure applied to both files, so the bars never reach 100%.
    gname_to_out_of_bounds = get_gridname_to_out_of_bounds_coords_dict(
        [train_data_path, val_data_path], gname_to_wh, total=6_000_000
    )

    print("Creating ExtendedNearestKeyLookups...")
    gridname_to_nkl = {
        gname: ExtendedNearestKeyLookup(grid, nearest_key_candidates, gname_to_out_of_bounds[gname])
        for gname, grid in gridname_to_grid.items()
    }

    # One transform instance is shared by the train and validation datasets.
    transformer_in_out_getter = TransformerInputOutputGetter(
        grid_name_to_nk_lookup=gridname_to_nkl,
        grid_name_to_wh=gname_to_wh,
        kb_tokenizer=kb_tokenizer,
        word_tokenizer=word_char_tokenizer,
        include_time=False,
        include_velocities=True,
        include_accelerations=True
    )

    print("Creating datasets...")
    train_ds = CurveDataset(
        data_path=train_data_path,
        transform=transformer_in_out_getter,
        total=train_total,
    )

    val_ds = CurveDataset(
        data_path=val_data_path,
        transform=transformer_in_out_getter,
        total=val_total,
    )

    return train_ds, val_ds

In [9]:
# Seed the random number generators before any dataset objects are created.
init_random_seed(RANDOM_SEED)

In [11]:
# TODO: pickle (cache) the datasets - creating train_dataset takes
# around 20 minutes on every fresh run.

# Tokenizer for keyboard keys, and the character-level word tokenizer
# constructed from the voc.txt path under DATA_ROOT.
kb_tokenizer = KeyboardTokenizerv1()
voc_path=os.path.join(DATA_ROOT, "voc.txt")
word_char_tokenizer = CharLevelTokenizerv2(voc_path)

# Build train/val datasets for the grid selected by GRID_NAME.
train_dataset, val_dataset = get_datasets(
    grid_name=GRID_NAME,
    grid_name_to_grid_path=os.path.join(DATA_ROOT, "gridname_to_grid.json"),
    train_data_path = GRID_NAME_TO_DS_PATHS[GRID_NAME]['train'],
    val_data_path = GRID_NAME_TO_DS_PATHS[GRID_NAME]['val'],
    nearest_key_candidates = ALL_CYRILLIC_LETTERS_ALPHABET_ORD,
    kb_tokenizer=kb_tokenizer,
    word_char_tokenizer=word_char_tokenizer,
)

Accumulating out-of-bounds coordinates...


 87%|████████▋ | 5237584/6000000 [04:18<00:37, 20274.45it/s]
  0%|          | 9416/6000000 [00:00<04:45, 20953.72it/s]


Creating ExtendedNearestKeyLookups...
Creating datasets...


100%|██████████| 5237584/5237584 [05:14<00:00, 16649.41it/s]
100%|██████████| 9416/9416 [00:00<00:00, 17750.98it/s]


In [20]:
# Inspect the key label stored for the point (624, -24) on the 'default' grid;
# y is negative, so this point is out of bounds.
train_dataset.transform.get_encoder_feats.grid_name_to_nk_lookup['default'].extended_coord_to_kb_label[624, -24]

'г'

In [10]:
# Number of repetitions used by the lookup loops in the following cells.
n_iters = 100000

In [50]:
# Call the 'default' grid lookup n_iters times on (624, -24), a point with negative y
# (presumably a rough timing probe for an out-of-bounds point - TODO confirm).
for i in range(n_iters):
    train_dataset.transform.get_encoder_feats.grid_name_to_nk_lookup['default'](624, -24)

In [51]:
# Same repeated lookup for (1, 1)
# (presumably an in-bounds point, for comparison - TODO confirm).
for i in range(n_iters):
    train_dataset.transform.get_encoder_feats.grid_name_to_nk_lookup['default'](1, 1)

In [52]:
# Same repeated lookup for (-1111, -1111), far outside the grid
# (presumably a point absent from the collected out-of-bounds set - TODO confirm).
for i in range(n_iters):
    train_dataset.transform.get_encoder_feats.grid_name_to_nk_lookup['default'](-1111, -1111)

In [57]:
# Fetch every item by index to gauge per-item throughput; the recorded run was
# interrupted manually at roughly 188 it/s.
for i in tqdm(range(len(train_dataset))):
    train_dataset[i]

  0%|          | 16512/5237584 [01:27<7:42:23, 188.19it/s] 


KeyboardInterrupt: 

In [10]:
from dataset import NeuroSwipeDatasetv3

In [11]:
def get_datasets_old(grid_name: str, grid_name_to_grid_path: str,
                 train_data_path: str, val_data_path: str,
                 ds_kwargs: dict, kb_tokenizer: KeyboardTokenizerv1,
                 word_char_tokenizer: CharLevelTokenizerv2,
                 train_total: int = 5_237_584,
                 val_total: int = 9_416,
                 ) -> tuple[NeuroSwipeDatasetv3, NeuroSwipeDatasetv3]:
    """
    Build train and validation datasets with the older NeuroSwipeDatasetv3 class.

    Args:
        grid_name: key of the grid inside the JSON file at `grid_name_to_grid_path`.
        grid_name_to_grid_path: path to the JSON file mapping grid names to grids.
        train_data_path: path of the training .jsonl file.
        val_data_path: path of the validation .jsonl file.
        ds_kwargs: extra keyword arguments forwarded unchanged to both
            NeuroSwipeDatasetv3 constructors. It must not contain the keys
            passed explicitly below.
        kb_tokenizer: forwarded to the datasets as `kb_tokenizer`.
        word_char_tokenizer: forwarded to the datasets as `word_tokenizer`.
        train_total: `total` handed to the train dataset. The default is the
            value that used to be hard-coded here; the original inline comment
            gives 349_172 for the "extra" grid.
        val_total: `total` handed to the validation dataset. The default is
            the previously hard-coded value.

    Returns:
        (train_dataset, val_dataset)
    """
    # Only the requested grid is loaded and handed to the datasets.
    gridname_to_grid = {grid_name: get_grid(grid_name, grid_name_to_grid_path)}

    train_ds = NeuroSwipeDatasetv3(
        data_path=train_data_path,
        gridname_to_grid=gridname_to_grid,
        kb_tokenizer=kb_tokenizer,
        word_tokenizer=word_char_tokenizer,
        total=train_total,
        **ds_kwargs
    )

    val_ds = NeuroSwipeDatasetv3(
        data_path=val_data_path,
        gridname_to_grid=gridname_to_grid,
        kb_tokenizer=kb_tokenizer,
        word_tokenizer=word_char_tokenizer,
        total=val_total,
        **ds_kwargs
    )

    return train_ds, val_ds

In [19]:
# Keyword arguments forwarded unchanged to both NeuroSwipeDatasetv3 constructors
# by get_datasets_old. The time/velocity/acceleration flags match the values
# given to the transform in get_datasets.
DS_KWARGS = dict(
    include_time = False,
    include_velocities = True,
    include_accelerations = True,
    has_target=True,
    has_one_grid_only=True,
    include_grid_name=False,
    keyboard_selection_set=set(ALL_CYRILLIC_LETTERS_ALPHABET_ORD)
)


# NOTE(review): this rebinds train_dataset / val_dataset, replacing the
# CurveDataset objects created by the earlier cell.
train_dataset, val_dataset = get_datasets_old(
    grid_name=GRID_NAME,
    grid_name_to_grid_path=os.path.join(DATA_ROOT, "gridname_to_grid.json"),
    train_data_path = GRID_NAME_TO_DS_PATHS[GRID_NAME]['train'],
    val_data_path = GRID_NAME_TO_DS_PATHS[GRID_NAME]['val'],
    ds_kwargs=DS_KWARGS,
    kb_tokenizer=kb_tokenizer,
    word_char_tokenizer=word_char_tokenizer,
)

  0%|          | 20988/5237584 [00:03<16:33, 5252.13it/s]


KeyboardInterrupt: 

In [65]:
# Fetch every item by index to gauge per-item throughput of whichever dataset
# train_dataset currently refers to; the recorded run was interrupted manually
# at roughly 697 it/s.
for i in tqdm(range(len(train_dataset))):
    train_dataset[i]

  0%|          | 1984/5237584 [00:02<2:05:15, 696.63it/s] 


KeyboardInterrupt: 

In [11]:
def get_datasets_new(grid_name: str, grid_name_to_grid_path: str,
                 train_data_path: str, val_data_path: str,
                 nearest_key_candidates: tp.Set[str],
                 kb_tokenizer: KeyboardTokenizerv1,
                 word_char_tokenizer: CharLevelTokenizerv2,
                 train_total: int = 5_237_584,
                 val_total: int = 9_416,
                 ) -> tuple[CurveDataset, CurveDataset]:
    """
    Build the train and validation CurveDatasets using the split transforms.

    Same pipeline as `get_datasets`, except that the single transform is
    replaced by two objects handed to CurveDataset: an InitTransform (given
    the nearest-key lookups and the keyboard tokenizer) as `init_transform`,
    and a GetItemTransform (given the grid sizes, the word tokenizer and the
    feature flags) as `get_item_transform`.

    Args:
        grid_name: key of the grid inside the JSON file at `grid_name_to_grid_path`.
        grid_name_to_grid_path: path to the JSON file mapping grid names to grids.
        train_data_path: path of the training .jsonl file.
        val_data_path: path of the validation .jsonl file.
        nearest_key_candidates: forwarded to ExtendedNearestKeyLookup.
        kb_tokenizer: forwarded to InitTransform as `kb_tokenizer`.
        word_char_tokenizer: forwarded to GetItemTransform as `word_tokenizer`.
        train_total: `total` handed to the train CurveDataset. The default is
            the value that used to be hard-coded here; the original inline
            comment gives 349_172 for the "extra" grid.
        val_total: `total` handed to the validation CurveDataset. The default
            is the previously hard-coded value.

    Returns:
        (train_dataset, val_dataset)
    """
    gridname_to_grid = {grid_name: get_grid(grid_name, grid_name_to_grid_path)}

    gname_to_wh = {
        gname: (grid['width'], grid['height'])
        for gname, grid in gridname_to_grid.items()
    }

    print("Accumulating out-of-bounds coordinates...")
    # `total` only feeds the progress bar of the scan; 6_000_000 is a rough
    # figure applied to both files, so the bars never reach 100%.
    gname_to_out_of_bounds = get_gridname_to_out_of_bounds_coords_dict(
        [train_data_path, val_data_path], gname_to_wh, total=6_000_000
    )

    print("Creating ExtendedNearestKeyLookups...")
    gridname_to_nkl = {
        gname: ExtendedNearestKeyLookup(grid, nearest_key_candidates, gname_to_out_of_bounds[gname])
        for gname, grid in gridname_to_grid.items()
    }

    # Both transform objects are shared by the train and validation datasets.
    init_transform = InitTransform(
        grid_name_to_nk_lookup=gridname_to_nkl,
        kb_tokenizer=kb_tokenizer,
    )

    get_item_transform = GetItemTransform(
        grid_name_to_wh=gname_to_wh,
        word_tokenizer=word_char_tokenizer,
        include_time=False,
        include_velocities=True,
        include_accelerations=True,
    )

    print("Creating datasets...")
    train_ds = CurveDataset(
        data_path=train_data_path,
        init_transform=init_transform,
        get_item_transform=get_item_transform,
        total=train_total,
    )

    val_ds = CurveDataset(
        data_path=val_data_path,
        init_transform=init_transform,
        get_item_transform=get_item_transform,
        total=val_total,
    )

    return train_ds, val_ds

In [12]:
# TODO: pickle (cache) the datasets - creating train_dataset takes
# around 20 minutes on every fresh run.

# Tokenizer for keyboard keys, and the character-level word tokenizer
# constructed from the voc.txt path under DATA_ROOT.
kb_tokenizer = KeyboardTokenizerv1()
voc_path=os.path.join(DATA_ROOT, "voc.txt")
word_char_tokenizer = CharLevelTokenizerv2(voc_path)

# Build train/val datasets with the split init/get-item transforms.
# NOTE(review): this rebinds train_dataset / val_dataset created by earlier cells.
train_dataset, val_dataset = get_datasets_new(
    grid_name=GRID_NAME,
    grid_name_to_grid_path=os.path.join(DATA_ROOT, "gridname_to_grid.json"),
    train_data_path = GRID_NAME_TO_DS_PATHS[GRID_NAME]['train'],
    val_data_path = GRID_NAME_TO_DS_PATHS[GRID_NAME]['val'],
    nearest_key_candidates = ALL_CYRILLIC_LETTERS_ALPHABET_ORD,
    kb_tokenizer=kb_tokenizer,
    word_char_tokenizer=word_char_tokenizer,
)

Accumulating out-of-bounds coordinates...


 87%|████████▋ | 5237584/6000000 [06:06<00:53, 14308.43it/s]
  0%|          | 9416/6000000 [00:01<12:00, 8309.69it/s] 


Creating ExtendedNearestKeyLookups...
Creating datasets...


100%|██████████| 5237584/5237584 [18:01<00:00, 4841.60it/s]
100%|██████████| 9416/9416 [00:01<00:00, 5385.97it/s]


In [14]:
# Iterate over the dataset to gauge per-item throughput with the split
# transforms; the recorded run was interrupted manually at roughly 988 it/s.
for el in tqdm(train_dataset):
    pass

  X, Y, T = (torch.tensor(arr) for arr in (X, Y, T))
  1%|          | 47360/5237584 [00:47<1:27:33, 987.89it/s] 


KeyboardInterrupt: 