In [1]:
import numpy as np
import pandas as pd
import re
import torch
from torch import nn
import os
import sklearn
import json
from tqdm.notebook import tqdm
import Levenshtein

# ordinal encoding

In [2]:
path_to_data_folder = "../data"

In [3]:
with open(path_to_data_folder + '/character_to_prediction_index.json') as json_file:
    CHAR2ORD = json.load(json_file)
    
ORD2CHAR = {j:i for i,j in CHAR2ORD.items()}
    
display(pd.Series(CHAR2ORD).to_frame('Ordinal Encoding'))

Unnamed: 0,Ordinal Encoding
,0
!,1
#,2
$,3
%,4
&,5
',6
(,7
),8
*,9


# hyperparameters

In [4]:
#IS_INTERACTIVE = os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive'
SEED = 1337
DEBUG = True
N_UNIQUE_CHARACTERS = len(CHAR2ORD) + 1 + 1 + 1 + 1#
PAD_TOKEN = len(CHAR2ORD) # Padding
SOS_TOKEN = len(CHAR2ORD) + 1 # Start Of Sentence
EOS_TOKEN = len(CHAR2ORD) + 2 # End Of Sentence
NAN_TOKEN = len(CHAR2ORD) + 3
BATCH_SIZE = 64
NUM_EPOCHS = 2 #if IS_INTERACTIVE else 100
NUM_WARMUP_EPOCHS = 10
WEIGHT_DECAY = 0.05
NUM_WORKERS = 2
TRAIN_MODEL = True
LOAD_WEIGHTS = False
MAX_LR = 1e-3
WARMUP_METHOD = 'exp'
USE_VAL = True
MAX_PHRASE_LENGTH = 33

In [5]:
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f27e85bb650>

# data manipulation

In [6]:
ORD2CHAR[PAD_TOKEN] = "<PAD>"
ORD2CHAR[SOS_TOKEN] = "<SOS>"
ORD2CHAR[EOS_TOKEN] = "<EOS>"
ORD2CHAR[NAN_TOKEN] = "<NAN>"
CHAR2ORD["<PAD>"] = PAD_TOKEN
ORD2CHAR["<SOS>"] = SOS_TOKEN
ORD2CHAR["<EOS>"] = EOS_TOKEN
CHAR2ORD["<NAN>"] = NAN_TOKEN

# Load data

In [7]:
if USE_VAL:
    # TRAIN
    X_train = np.load(path_to_data_folder + '/X_train.npy')
    y_train = np.load(path_to_data_folder + '/y_train.npy')[:,:MAX_PHRASE_LENGTH]
    N_TRAIN_SAMPLES = len(X_train)
    # VAL
    X_val = np.load(path_to_data_folder + '/X_val.npy')
    y_val = np.load(path_to_data_folder + '/y_val.npy')[:,:MAX_PHRASE_LENGTH]
    N_VAL_SAMPLES = len(X_val)
    # Shapes
    print(f'X_train shape: {X_train.shape}, X_val shape: {X_val.shape}')
# Train On All Data
else:
    # TRAIN
    X_train = np.load(path_to_data_folder + '/X.npy')
    y_train = np.load(path_to_data_folder + '/y.npy')[:,:MAX_PHRASE_LENGTH]
    N_TRAIN_SAMPLES = len(X_train)
    print(f'X_train shape: {X_train.shape}')

X_train shape: (54719, 128, 164), X_val shape: (7236, 128, 164)


in X i've right hand, left hand, lips coords(x,y) for i in range(number of frames)

and in y i have char for X hands and lips position

the main problem is that there are 128 frames and only 31 letters, so i gotta combine some frames that represent the same letter

In [8]:
y_train.shape

(54719, 33)

# Baseline

In [9]:
kernel_size = 2
stride = 2
n_embd = 512

In [10]:
class SignRecognition(nn.Module):
    def __init__(self, frames, kernel_size=2, stride=2):
        super().__init__()
        self.conv1 = nn.Conv2d(frames, frames // 4, kernel_size=kernel_size, stride=stride)
        self.bn1 = nn.BatchNorm2d(frames // 4)
        self.lin1 = nn.Linear(41, 128)
        self.lin2 = nn.Linear(128, N_UNIQUE_CHARACTERS)
        self.gelu = torch.nn.GELU()
        self.softmax = nn.Softmax(dim=-1)
        

    def forward(self, x):
        x = self.conv1(x) # 128 41 1
        x = self.gelu(self.bn1(x))
        x = x.squeeze(dim=-1)
        x = self.lin1(x)
        x = self.gelu(x)
        x = self.lin2(x)
        x = self.softmax(x)
        return x

In [11]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
config

  from .autonotebook import tqdm as notebook_tqdm


MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-zh-en",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 65000,
  "decoder_vocab_size": 65001,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "extra_pos_embeddings": 65001,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "m

In [12]:
config.vocab_size = N_UNIQUE_CHARACTERS
config.max_length = MAX_PHRASE_LENGTH
config.max_position_embeddings = MAX_PHRASE_LENGTH
config.pad_token_id = PAD_TOKEN
config.bos_token_id = SOS_TOKEN # <SOS>
config.eos_token_id = EOS_TOKEN
config.decoder_start_token_id = SOS_TOKEN
config.decoder_vocab_size = N_UNIQUE_CHARACTERS
config.extra_pos_embeddings = MAX_PHRASE_LENGTH
config.forced_eos_token_id = EOS_TOKEN
config.d_model = 128 # this is n_embd

In [13]:
config

MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-zh-en",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      65000
    ]
  ],
  "bos_token_id": 60,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 128,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 60,
  "decoder_vocab_size": 63,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 61,
  "extra_pos_embeddings": 33,
  "forced_eos_token_id": 61,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_len

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en", config=config)
transformer = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en", config=config, ignore_mismatched_sizes=True)

Some weights of MarianMTModel were not initialized from the model checkpoint at Helsinki-NLP/opus-mt-zh-en and are newly initialized because the shapes did not match:
- final_logits_bias: found shape torch.Size([1, 65001]) in the checkpoint and torch.Size([1, 63]) in the model instantiated
- model.shared.weight: found shape torch.Size([65001, 512]) in the checkpoint and torch.Size([63, 128]) in the model instantiated
- model.encoder.embed_tokens.weight: found shape torch.Size([65001, 512]) in the checkpoint and torch.Size([63, 128]) in the model instantiated
- model.encoder.embed_positions.weight: found shape torch.Size([512, 512]) in the checkpoint and torch.Size([33, 128]) in the model instantiated
- model.encoder.layers.0.self_attn.k_proj.weight: found shape torch.Size([512, 512]) in the checkpoint and torch.Size([128, 128]) in the model instantiated
- model.encoder.layers.0.self_attn.k_proj.bias: found shape torch.Size([512]) in the checkpoint and torch.Size([128]) in the model ins

In [15]:
def reset_model_weights(layer):
    if hasattr(layer, 'reset_parameters'):
        print(f"weights were resetted for {layer}")
        layer.reset_parameters()
    else:
        if hasattr(layer, 'children'):
            for child in layer.children():
                reset_model_weights(child)

In [16]:
reset_model_weights(transformer)

weights were resetted for Embedding(63, 128, padding_idx=59)
weights were resetted for Embedding(63, 128, padding_idx=59)
weights were resetted for MarianSinusoidalPositionalEmbedding(33, 128)
weights were resetted for Linear(in_features=128, out_features=128, bias=True)
weights were resetted for Linear(in_features=128, out_features=128, bias=True)
weights were resetted for Linear(in_features=128, out_features=128, bias=True)
weights were resetted for Linear(in_features=128, out_features=128, bias=True)
weights were resetted for LayerNorm((128,), eps=1e-05, elementwise_affine=True)
weights were resetted for Linear(in_features=128, out_features=2048, bias=True)
weights were resetted for Linear(in_features=2048, out_features=128, bias=True)
weights were resetted for LayerNorm((128,), eps=1e-05, elementwise_affine=True)
weights were resetted for Linear(in_features=128, out_features=128, bias=True)
weights were resetted for Linear(in_features=128, out_features=128, bias=True)
weights were 

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
transformer = transformer.to(device)
cnn = SignRecognition(128).to(device)

In [19]:
class Tokenizer:
    def __init__(self):
        pass
    def encode(out: torch.tensor) -> str:
        """
        encode output of model into text
        """
        text = []
        for x in out:
            token = x.argmax().item()
            if token == PAD_TOKEN:
                continue # what if pad token will be in the middle of sentence
            text.append(ORD2CHAR[token])
        return ''.join(text)
    
    def decode(string: str) -> torch.tensor:
        """
        decode string into vocab size space so that i can put it as my target while training
        """
        out = []
        for symbol in string:
            out.append(CHAR2ORD[symbol])
        
        return torch.tensor(out)        


In [20]:
tokenizer = Tokenizer()

# Preprocess targets

In [21]:
X_train = torch.from_numpy(X_train)
y_train = torch.from_numpy(y_train).type(torch.LongTensor)
if USE_VAL:
    X_val = torch.from_numpy(X_val)
    y_val = torch.from_numpy(y_val).type(torch.LongTensor)

In [22]:
num_examples, frames, features = X_train.shape
X_train = X_train.view(num_examples, frames, features // 2, 2)
X_train.shape

torch.Size([54719, 128, 82, 2])

In [23]:
if USE_VAL:
    num_examples, frames, features = X_val.shape
    X_val = X_val.view(num_examples, frames, features // 2, 2)
    X_val.shape

In [24]:
y_train = y_train.type(torch.LongTensor)
if USE_VAL:
    y_val = y_val.type(torch.LongTensor)

In [25]:
decoder_input_ids_train = torch.zeros_like(y_train)
decoder_input_ids_train.shape

torch.Size([54719, 33])

In [26]:
for i in range(len(y_train)):
    decoder_input_ids_train[i] = torch.concat((torch.tensor([SOS_TOKEN]), y_train[i][:-1]))

In [27]:
if USE_VAL:
    decoder_input_ids_val = torch.zeros_like(y_val)
    decoder_input_ids_val.shape

In [28]:
if USE_VAL:
    for i in range(len(y_val)):
        decoder_input_ids_val[i] = torch.concat((torch.tensor([SOS_TOKEN]), y_val[i][:-1]))

In [29]:
decoder_input_ids_train = decoder_input_ids_train.type(torch.LongTensor)
if USE_VAL:
    decoder_input_ids_val = decoder_input_ids_val.type(torch.LongTensor)

# Creating Dataloader

In [30]:
from torch.utils.data import DataLoader


In [31]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels, decoder_input_ids):
        self.data = data
        self.labels = labels
        self.decoder_input_ids = decoder_input_ids

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.labels[index]
        decoder_input_ids = self.decoder_input_ids[index]
        return x, y, decoder_input_ids


In [32]:
train_dataset = CustomDataset(X_train, y_train, decoder_input_ids_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)

In [33]:
val_dataset = CustomDataset(X_val, y_val, decoder_input_ids_val)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)

# Levenshtein distance

In [34]:
import time

In [35]:
def encode_target(target: torch.tensor):
    answer = []
    for x in target:
        if x != PAD_TOKEN:
            answer.append(ORD2CHAR[x.item()])
    return "".join(answer)


In [36]:
def metric(pred: torch.tensor, target:torch.tensor):
    # shouldn't count pad token 
    # if i would use it in model
    # since levenshtein distance is not linear i'll return N and D of each batch and then sum them
    D = 0
    N = 0
    for i in range(len(pred)):# through batches
        p = encode(pred[i])
        t = encode_target(target[i])
        distance = Levenshtein.distance(p, t)
        
        D += distance
        N += len(p) + len(t)
    
    return N, D


In [37]:
# # testing metric
# for idx, (x, y) in enumerate(train_dataloader):
#     out = model(x)
#     print(metric(out, y))
#     if idx > 5:
#         break

# train parameters(optimizer, loss, etc)

In [38]:
loss_fn_cnn = nn.NLLLoss()
transformer_optimizer = torch.optim.AdamW(
    transformer.parameters(),
    lr = 1e-6
)
cnn_optimizer = torch.optim.AdamW(
    cnn.parameters(),
    lr = 1e-6
)
device = "cuda" if torch.cuda.is_available() else "cpu"
cnn = cnn.to(device)
transformer = transformer.to(device)

In [39]:
from accelerate import Accelerator

accelerator = Accelerator()
transformer, transformer_optimizer, train_dataloader, val_dataloader = accelerator.prepare(
    transformer, transformer_optimizer, train_dataloader, val_dataloader
)

# Train loop

In [40]:
# for g in optimizer.param_groups:
#     g['lr'] = 1e-3

overfitting on single example

real training starts

In [41]:
epoch_losses = []
for epoch in range(NUM_EPOCHS):
    losses = []
    # loop = tqdm(train_dataloader, leave=False)
    #train_NDs = [] # array with pairs (N, D) 
    for (x, y, decoder_input_ids) in train_dataloader:
        x = x.to(device)
        y = y.to(device)
        decoder_input_ids = decoder_input_ids.to(device)

        signs = cnn(x)
        input_ids = signs.argmax(dim=-1)
        out = transformer(input_ids, labels=y, decoder_input_ids=decoder_input_ids)
        
        loss_transformer = out.loss
        accelerator.backward(loss_transformer)
        transformer_optimizer.step()
        transformer_optimizer.zero_grad()
        
        loss_cnn = loss_fn_cnn(signs, y)
        loss_cnn.backward()
        cnn_optimizer.step()
        cnn_optimizer.zero_grad()

        losses.append([loss_transformer.item(), loss_cnn.item()])
        
        
        
        # if epoch % 5 == 0:
        #     train_NDs.append(metric(out, y))
        
    
    transformer_mean_loss = sum(losses, key=lambda x: x[0]) / len(losses)
    cnn_mean_loss = sum(losses, key=lambda x: x[1]) / len(losses)
    epoch_losses.append([transformer_mean_loss, cnn_mean_loss])
    if epoch % 2 == 0:
        print(f"epoch №{epoch + 1}\n transformer_mean_loss = {transformer_mean_loss}\cnn_mean_loss = {cnn_mean_loss}")


        # validation score 
    # CHANGE NAME FOR LOOP!!!!!!!!!!!!!!!!!!!!!
        # loop = tqdm(val_dataloader, leave=False)
        # for (x, y) in loop:
        #     x = x.to(device)
        #     y = y.to(device)
        #     with torch.no_grad():
        #         out = model(x)
        #     loss = loss_fn(out.view(BATCH_SIZE, -1, 31), y)
        #     val_losses.append(loss.item())
        
        # print(f"Validation loss on epoch №{epoch + 1} = {sum(val_losses) / len(val_losses)}\nbtw train loss = {mean_loss}")
        
        
    #val_losses = []
    # if epoch % 5 == 0:
    #     # compute levenshtein distance on validation data
    #     loop = tqdm(val_dataloader, leave=False)
    #     val_NDs = []
    #     for (x,y) in loop:
    #         x = x.to(device)
    #         y = y.to(device)
    #         with torch.no_grad():
    #             out = model(x)
    #         val_NDs.append(metric(out, y))
            
    #     train_distance = 1 - sum(x[1] for x in train_NDs) / sum(x[0] for x in train_NDs)
    #     val_distance = 1 - sum(x[1] for x in val_NDs) / sum(x[0] for x in val_NDs)
        
        
        
    #     print(f"epoch№ {epoch + 1}\ntrain_distance = {train_distance}\nval_distance = {val_distance}\n\n")
            
        


0 32
0 33


In [None]:
соеденить две модельки в sequential и перестать возиться с разными лоссами

In [None]:
product = 1
for x in y.shape:
    product *= x
product

In [None]:
encode(out[5])

In [None]:
Yt_train = y.type(torch.LongTensor).cuda()

In [None]:
loss(torch.transpose(out, 1, 2), Yt_train)

In [None]:
out[0]

In [1]:
from transformers import  AutoModelForSeq2SeqLM, AutoTokenizer
mname = "Helsinki-NLP/opus-mt-ru-fr"
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSeq2SeqLM.from_pretrained(mname)



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch

In [58]:
input = "Здарова, как жизнь?"
input_ids = tokenizer.encode(input, return_tensors="pt")
out = model(input_ids, decoder_input_ids=torch.tensor([[0, 321]]))
out

Seq2SeqLMOutput(loss=None, logits=tensor([[[ -1.3973, -12.1024,  -3.6717,  ..., -12.1039, -12.1037,   0.0000],
         [ -4.4700, -14.4538,  -6.5072,  ..., -14.4591, -14.4613,   0.0000]]],
       grad_fn=<AddBackward0>), past_key_values=((tensor([[[[ 0.1202,  0.9309, -0.6240,  ..., -0.9568,  1.5552, -1.4000],
          [ 0.6468,  1.0183, -0.5855,  ..., -1.3328,  1.7627, -1.8537]],

         [[ 1.7458, -0.8240, -1.6842,  ..., -1.3492,  1.9502,  0.6336],
          [ 1.1572, -0.2653, -0.7925,  ..., -1.7627,  3.7647,  1.7054]],

         [[ 0.2190, -0.2175,  0.0326,  ..., -0.8406,  0.7209,  0.0771],
          [-0.3864, -0.0734,  0.2335,  ..., -0.1937,  0.9352, -0.5549]],

         ...,

         [[-2.8143, -1.4174,  1.4170,  ..., -0.9163,  0.2304,  0.8446],
          [-3.0297, -2.4983,  1.4386,  ..., -1.1109,  0.8090,  0.9012]],

         [[-0.8551,  1.0457,  2.2835,  ..., -0.0140,  1.2346,  0.8761],
          [-0.6907,  1.0890,  1.3075,  ..., -1.4660,  0.5024,  1.6909]],

         [[-3.5

In [59]:
q = out.logits.argmax(dim=-1)

In [60]:
tokenizer.decode(q[0])

'<pad> <pad>'

In [61]:
q

tensor([[64376, 64376]])

In [63]:
tokenizer.decode(torch.tensor(321))

'должны'

In [57]:
for i in range(out.logits.shape[-1]):
    if tokenizer.decode(torch.tensor(i)) == "<SOS":
        print(i)

In [65]:
tokenizer.decode(model.generate(tokenizer.encode("Здарова как жизнь?", return_tensors="pt")))