In [1]:
%load_ext autoreload
%autoreload 2

In [7]:
import os
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, IterableDataset
from tqdm import tqdm
import numpy as np

from model import SwipeCurveTransformer, get_m1_model
from tokenizers import CharLevelTokenizerv2, KeyboardTokenizerv1
from dataset import NeuroSwipeDatasetv2
from word_generators import GreedyGenerator
from word_generation_v2 import predict_greedy_raw_multiproc, predict_greedy_raw

In [8]:
# Switch between the Kaggle environment and a local checkout.
IN_KAGGLE = False

if IN_KAGGLE:
    DATA_ROOT = "/kaggle/input/yandex-cup-playground"
    MODELS_DIR = ""
    # Later cells build checkpoint paths from MODELS_ROOT, so it must exist in
    # this branch as well; without it they fail with NameError on Kaggle.
    # TODO confirm where the checkpoints actually live on Kaggle.
    MODELS_ROOT = MODELS_DIR
else:
    DATA_ROOT = "../data/data_separated_grid"
    # MODELS_DIR = "../data/trained_models/m1"
    MODELS_ROOT = "../data/trained_models"

In [9]:
def get_grid(grid_name: str, grids_path: str) -> dict:
    """Read the JSON file at ``grids_path`` and return the entry stored under ``grid_name``.

    The file is decoded as UTF-8. A missing ``grid_name`` key raises ``KeyError``.
    """
    with open(grids_path, mode="r", encoding="utf-8") as grids_file:
        all_grids = json.load(grids_file)
    return all_grids[grid_name]

In [10]:
MAX_TRAJ_LEN = 299

# Both keyboard grids ("default" and "extra") are read from the same JSON file.
grid_name_to_grid_path = os.path.join(DATA_ROOT, "gridname_to_grid.json")
grid_name_to_grid = {
    grid_name: get_grid(grid_name, grid_name_to_grid_path)
    for grid_name in ("default", "extra")
}

# Tokenizers shared by the validation and test datasets.
kb_tokenizer = KeyboardTokenizerv1()
word_char_tokenizer = CharLevelTokenizerv2(os.path.join(DATA_ROOT, "voc.txt"))
keyboard_selection_set = set(kb_tokenizer.i2t)

# Validation data: built with has_target=True.
val_path = os.path.join(DATA_ROOT, "valid__in_train_format.jsonl")
val_dataset = NeuroSwipeDatasetv2(
    data_path=val_path,
    gridname_to_grid=grid_name_to_grid,
    kb_tokenizer=kb_tokenizer,
    max_traj_len=MAX_TRAJ_LEN,
    word_tokenizer=word_char_tokenizer,
    include_time=False,
    include_velocities=True,
    include_accelerations=True,
    has_target=True,
    has_one_grid_only=False,
    include_grid_name=True,
    keyboard_selection_set=keyboard_selection_set,
    total=10_000,
)

# Test data: same settings as validation except has_target=False.
test_path = os.path.join(DATA_ROOT, "test.jsonl")
test_dataset = NeuroSwipeDatasetv2(
    data_path=test_path,
    gridname_to_grid=grid_name_to_grid,
    kb_tokenizer=kb_tokenizer,
    max_traj_len=MAX_TRAJ_LEN,
    word_tokenizer=word_char_tokenizer,
    include_time=False,
    include_velocities=True,
    include_accelerations=True,
    has_target=False,
    has_one_grid_only=False,
    include_grid_name=True,
    keyboard_selection_set=keyboard_selection_set,
    total=10_000,
)

100%|██████████| 10000/10000 [00:01<00:00, 6920.25it/s]
100%|██████████| 10000/10000 [00:02<00:00, 4257.13it/s]


In [15]:
from torch.utils.data import Dataset

class NeuroSwipeGridSubset(Dataset):
    """View over ``dataset`` restricted to the samples of a single keyboard grid.

    Every sample of the wrapped dataset must be a sequence whose last element
    is the grid name (the notebook's datasets yield
    ``(model_inputs, target, grid_name)``). Samples are returned unchanged.

    Attributes:
        dataset: the wrapped dataset.
        grid_name: the grid name samples are filtered by.
        grid_name_idxs: indices into ``dataset`` of the matching samples, in
            ascending order. Useful for mapping subset positions back to
            positions in the full dataset.
    """

    def __init__(self, dataset: Dataset, grid_name: str):
        self.dataset = dataset
        self.grid_name = grid_name
        # One full pass over the wrapped dataset; the result is cached.
        self.grid_name_idxs = self._get_grid_name_idxs()

    def _get_grid_name_idxs(self):
        """Return the indices of wrapped-dataset samples whose grid name matches."""
        # Only the trailing element (the grid name) is inspected, so the filter
        # does not depend on how many items the model-input tuple contains.
        return [
            i
            for i, sample in enumerate(self.dataset)
            if sample[-1] == self.grid_name
        ]

    def __len__(self):
        """Number of samples that belong to ``grid_name``."""
        return len(self.grid_name_idxs)

    def __getitem__(self, idx):
        """Return the ``idx``-th matching sample of the wrapped dataset."""
        return self.dataset[self.grid_name_idxs[idx]]

In [18]:
# Split the validation and test datasets by keyboard grid ("default" first, then "extra").
val_default_dataset, val_extra_dataset = (
    NeuroSwipeGridSubset(val_dataset, layout) for layout in ("default", "extra")
)

test_default_dataset, test_extra_dataset = (
    NeuroSwipeGridSubset(test_dataset, layout) for layout in ("default", "extra")
)

In [28]:
def remove_duplicates(preds):
    """Return the items of ``preds`` as a list with repeats dropped.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(preds))


def get_metric(preds_list, ref):
    """Compute the mean weighted hit score of candidate lists against targets.

    For every ``(preds, target)`` pair the candidates are de-duplicated
    (first occurrence wins, order kept). The score of the pair is the weight
    of the rank at which ``target`` appears: 1, 0.1, 0.09 and 0.08 for ranks
    1 to 4. Candidates beyond the fourth unique one are ignored, and a line
    with fewer than four candidates is scored on what it has.

    Args:
        preds_list: sequence of candidate lists, one per sample.
        ref: iterable of targets, aligned with ``preds_list``.

    Returns:
        The sum of per-line scores divided by ``len(preds_list)``, or ``0.0``
        when ``preds_list`` is empty.
    """
    weights = (1, 0.1, 0.09, 0.08)

    # Nothing to score; also avoids a division by zero below.
    if len(preds_list) == 0:
        return 0.0

    total = 0
    for preds, target in zip(preds_list, ref):
        # dict keys are unique and keep insertion order.
        unique_preds = dict.fromkeys(preds)
        # zip stops after the last weight, so extra candidates cannot index
        # past the end of the weight table.
        total += sum(
            weight
            for weight, pred in zip(weights, unique_preds)
            if pred == target
        )

    return total / len(preds_list)

In [30]:
from typing import Callable, Dict, List


def get_targets(dataset: NeuroSwipeDatasetv2) -> List[str]:
    """Collect the target element of every sample in ``dataset``, in iteration order.

    Each sample must unpack as ``((a, b, c, d, e), target, grid_name)``.
    """
    return [target for (_, _, _, _, _), target, _ in dataset]

def evaluate_model_greedy(val_dataset: NeuroSwipeDatasetv2,
                          model: nn.Module,
                          grid_name: str,
                          targets: List[str],
                          word_char_tokenizer: CharLevelTokenizerv2,
                          device: torch.device):
    """
    Score ``model`` on ``val_dataset`` using greedy generation.

    ``grid_name`` must be "extra" or "default"; the generator built for
    ``model`` is registered under that name before prediction.

    Returns a ``(metric, predictions)`` pair, where the metric is
    ``get_metric(predictions, targets)``.
    """
    assert grid_name in ("extra", "default")

    # Switch to eval mode and move the model to the requested device.
    model.eval()
    model.to(device)

    generators_by_grid = {
        grid_name: GreedyGenerator(model, word_char_tokenizer, device),
    }
    predictions = predict_greedy_raw(val_dataset, generators_by_grid)
    score = get_metric(predictions, targets)
    return score, predictions


def evaluate_weights_greedy(val_dataset: NeuroSwipeDatasetv2,
                            model_getter: Callable,
                            weights_path: str,
                            grid_name: str,
                            targets: List[str],
                            word_char_tokenizer: CharLevelTokenizerv2,
                            device: torch.device):
    """
    Build a model from a checkpoint and score it with greedy generation.

    The model is obtained as ``model_getter(device, weights_path)`` and passed
    to ``evaluate_model_greedy``; its ``(metric, predictions)`` pair is
    returned.
    """
    loaded_model = model_getter(device, weights_path)
    score, predictions = evaluate_model_greedy(
        val_dataset,
        loaded_model,
        grid_name,
        targets,
        word_char_tokenizer,
        device,
    )
    return score, predictions


In [45]:
# def get_i_to_grid_name(dataset: NeuroSwipeDatasetv2):
#     i_to_grid_name = []
#     for i, data in tqdm(enumerate(dataset), total=len(dataset)):
#         (_, _, _, _, _), _, grid_name = data
#         i_to_grid_name.append(grid_name)
#     return i_to_grid_name


# def combine_preds(i_to_grid_name, default_preds, extra_preds):
#     preds = []
#     default_i = 0
#     extra_i = 0
#     for i, grid_name in enumerate(i_to_grid_name):
#         if grid_name == "default":
#             preds.append(default_preds[default_i])
#             default_i += 1
#         elif grid_name == "extra":
#             preds.append(extra_preds[extra_i])
#             extra_i += 1
#         else:
#             raise ValueError(f"Unknown grid_name: {grid_name}")
        
#     return preds
        

In [None]:
def combine_preds(default_preds,
                  extra_preds,
                  default_idxs,
                  extra_idxs):
    """Merge per-grid predictions back into the order of the full dataset.

    Args:
        default_preds: predictions for the "default" grid samples, in subset order.
        extra_preds: predictions for the "extra" grid samples, in subset order.
        default_idxs: for each item of ``default_preds``, its position in the
            full dataset.
        extra_idxs: for each item of ``extra_preds``, its position in the
            full dataset.

    Returns:
        A list of length ``len(default_preds) + len(extra_preds)`` holding each
        prediction at its full-dataset position.

    Raises:
        ValueError: if a prediction list and its index list differ in length.
    """
    if len(default_preds) != len(default_idxs):
        raise ValueError(
            f"default_preds has {len(default_preds)} items "
            f"but default_idxs has {len(default_idxs)}")
    if len(extra_preds) != len(extra_idxs):
        raise ValueError(
            f"extra_preds has {len(extra_preds)} items "
            f"but extra_idxs has {len(extra_idxs)}")

    preds = [None] * (len(default_preds) + len(extra_preds))

    # The k-th prediction of a subset belongs at the k-th index of that subset.
    # The index values address the combined list, not the per-grid lists.
    for idxs, grid_preds in ((default_idxs, default_preds),
                             (extra_idxs, extra_preds)):
        for i, pred in zip(idxs, grid_preds):
            preds[i] = pred

    return preds


In [31]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [31]:
# Load one checkpoint for the "default" grid and wrap it in a greedy generator.
grid_name = "default"
model_getter = get_m1_model
weights_path = os.path.join(
    MODELS_ROOT,
    "m1_v2",
    "m1_v2__2023_11_09__10_36_02__0.14229_default_switch_0.pt",
)
model = model_getter(device, weights_path)
grid_name_to_greedy_generator = {
    grid_name: GreedyGenerator(model, word_char_tokenizer, device),
}

In [38]:
# Qualitative check: print target / greedy prediction pairs for the first
# n_examples samples of the default-grid validation subset.
greedy_generator = GreedyGenerator(model, word_char_tokenizer, device)

n_examples = 40

print("{:<20} {:<20}".format("target", "prediction"))
print("-"*31)

for i, data in enumerate(val_default_dataset):
    # Check the limit before printing so exactly n_examples rows are shown
    # (checking after the print produced n_examples + 1 rows).
    if i >= n_examples:
        break

    # The sample's grid name gets its own variable: unpacking it into
    # `grid_name` would overwrite the notebook-level `grid_name` that later
    # cells pass to evaluate_model_greedy.
    (xyt, kb_tokens, dec_in_char_seq, traj_pad_mask, word_pad_mask), target, sample_grid_name = data

    pred = greedy_generator(xyt, kb_tokens, traj_pad_mask)

    # NOTE(review): the original comment here said that stripping works only
    # because real words never contain those characters. It presumably
    # referred to a strip() call that is no longer present; the generator
    # output is printed as-is.

    # Count the non-padded positions and drop one.
    # Presumably the dropped position is the end-of-sequence token; TODO confirm.
    target_len = int(torch.sum(~word_pad_mask)) - 1
    target = word_char_tokenizer.decode(target[:target_len])
    print("{:<20} {:<20}".format(target, pred))

target               prediction          
-------------------------------
на                   на                  
все                  все                 
добрый               добрый              
девочка              девочка             
сказала              сказала             
скинь                скинь               
геев                 геев                
тобой                тобой               
была                 быстра              
да                   да                  
муж                  маж                 
щас                  щас                 
она                  она                 
проблема             проблема            
билайн               билайн              
уже                  уже                 
раньше               раньше              
рам                  нам                 
щас                  щас                 
купил                купил               
ты                   ты                  
зовут                зовут               
ко

In [40]:
# Reference targets for each validation subset, in subset order.
val_default_targets, val_extra_targets = (
    get_targets(subset) for subset in (val_default_dataset, val_extra_dataset)
)

In [42]:
# Greedy evaluation of the loaded model on the default-grid validation subset.
mmr, preds = evaluate_model_greedy(
    val_dataset=val_default_dataset,
    model=model,
    grid_name=grid_name,
    targets=val_default_targets,
    word_char_tokenizer=word_char_tokenizer,
    device=device,
)

  2%|▏         | 187/9416 [00:18<15:21, 10.02it/s]


KeyboardInterrupt: 

In [None]:
# Show the metric returned by evaluate_model_greedy.
# NOTE(review): `mmr` is only defined if the evaluation cell ran to completion.
print(mmr)

In [43]:
# Inspect a slice of the prediction lists (items 200 to 249).
print(preds[200:250])

[['скинь'], ['мазора'], ['то'], ['анюта'], ['звони'], ['лесник'], ['минут'], ['забрала'], ['на'], ['обуд'], ['завтра'], ['такими'], ['давай'], ['посади'], ['бон'], ['даже'], ['перчатка'], ['работа'], ['никого'], ['отресли'], ['не'], ['раз'], ['блин'], ['пока'], ['ну'], ['тогда'], ['башка'], ['был'], ['продал'], ['хочу'], ['хорошая'], ['кофе'], ['быть'], ['ты'], ['стиревем'], ['мойкой'], ['мы'], ['но'], ['мо'], ['нету'], ['ну'], ['так'], ['ты'], ['закрой'], ['сейчас'], ['пойми'], ['что'], ['поровну'], ['это'], ['не']]


In [36]:
# Hand-recorded scores keyed by checkpoint path.
# NOTE(review): presumably validation metric values on the "default" grid,
# with paths relative to the models directory; confirm before relying on them.
{"m1_v2/best_model__2023_11_09__10_36_02__0.14229_default_switch_2_try_2.pt": 0.8512107051826678,
 "m1_bigger/m1_bigger_v2__2023_11_10__13_38_32__0.50552_default_l2_5e-05_ls0.045_switch_0.pt": 0.810429056924384,
 "m1_bigger/m1_bigger_v2__2023_11_10__16_36_38__0.49848_default_l2_5e-05_ls0.045_switch_0.pt": 0.818500424808836}

{'m1_v2/best_model__2023_11_09__10_36_02__0.14229_default_switch_2_try_2.pt': 0.8512107051826678}

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
# Load one checkpoint for the "default" grid and wrap it in a greedy generator.
grid_name = "default"
weights_path = os.path.join(
    MODELS_ROOT,
    "m1_v2",
    "m1_v2__2023_11_09__10_36_02__0.14229_default_switch_0.pt",
)
model = get_m1_model(weights_path=weights_path, device=device)
grid_name_to_greedy_generator = {
    grid_name: GreedyGenerator(model, word_char_tokenizer, device),
}

In [29]:
# Greedy prediction over the default-grid validation subset with 4 workers.
predictions = predict_greedy_raw_multiproc(
    val_default_dataset,
    grid_name_to_greedy_generator,
    num_workers=4,
)

100%|██████████| 9416/9416 [11:31<00:00, 13.61it/s]


In [28]:
# Same prediction run through the non-multiprocessing predict_greedy_raw.
predictions = predict_greedy_raw(
    val_default_dataset,
    grid_name_to_greedy_generator,
)

  1%|          | 68/9416 [00:07<17:22,  8.97it/s]


KeyboardInterrupt: 

# Let's create a submission