In [1]:
import torch
import torchaudio

from torch.optim import AdamW
from torch.utils.data import DataLoader

from torch import nn

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from sklearn.model_selection import KFold, StratifiedKFold

from transformers import PreTrainedTokenizerFast

from transformers import get_cosine_schedule_with_warmup

from modules.Transformer.model import Transformer
from modules.Transformer.train import train_epoch, eval_epoch
from modules.dataset import AudioDataset

import random
import os


import wandb

torch.cuda.is_available()

True

In [2]:
path_to_tokenizer = './tokenizer.json'
path_to_data = './audio_dataset/'

data = pd.read_csv(os.path.join(path_to_data,'df.csv'), usecols=['text','status','path','rate','duration','frames'])
data = data[data.status=='APPROVED'].reset_index(drop=True)
del data['status']
data.text = data.text.apply(lambda x: "".join([char for char in x if char.isalpha() or char==' ']).lower())
data.duration.max()

train_data = data.iloc[:1000]
valid_data = data.iloc[70000:]

In [3]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.displot(data.text.str.len())
# plt.show()dd
# sns.displot(data.duration)
# plt.show()

In [4]:
np.percentile(data.text.str.len(), 99.5)

135.0

In [5]:
def weights_init_xavier(m):
    '''
    Xavier uniform
    '''
    classname = m.__class__.__name__

    if classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = PreTrainedTokenizerFast(tokenizer_file=path_to_tokenizer, 
                                    padding_side ='right',
                                    bos_token = '[SOS]',
                                    eos_token = '[EOS]',
                                    pad_token = '[PAD]',
                                    unk_token = '[UNK]',
                                    mask_token = '[MASK]')


train_dataset = AudioDataset(train_data, path_to_data, tokenizer, 
                             n_fft=512,
                             n_mels=40, 
                             center=True, 
                             max_tokenized_length=100, 
                             max_audio_len=25, 
                             sr=16000)
valid_dataset = AudioDataset(valid_data, path_to_data, tokenizer, 
                             n_fft=512, 
                             n_mels=128, 
                             center=True, 
                             max_tokenized_length=100, 
                             max_audio_len=25, 
                             sr=16000)
model = Transformer(vocab_size=len(tokenizer),
                    n_mels=40,
                    enc_seq_len=25, 
                    dec_seq_len=100,
                    hidden_dim=16, 
                    enc_num_layers=2, 
                    dec_num_layers=2, 
                    num_heads=3, 
                    ff_dim=128, 
                    device=device,
                    dropout=0.0, 
                    sr=16000, 
                    n_fft=512,
                    padding_idx=tokenizer.pad_token_id)


        #.bias.data.fill_(0.01)
model.apply(weights_init_xavier)

1563 40


Transformer(
  (vgg): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU()
    (7): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (encoder): Encoder(
    (lin_in): Linear(in_features=40, out_features=40, bias=True)
    (norm_in): LayerNorm((40,), eps=1e-05, elementwise_affine=True)
    (pe): TrainablePositionalEncoding()
    (layers): ModuleList(
      (0-1): 2 x EncoderLayer(
        (attention): MHA(
          (dropout): Dropout(p=0.0, inplace=False)
          (heads): ModuleList(
            (0-2): 3 x MHAHead(
              (v): Linear(in_features=4

n_fft=1024, win_lenght=1024, hop_lenght=256, n_mels=64, center=True):

In [7]:
torch.cuda.empty_cache()

In [8]:
from torchmetrics.functional import word_error_rate
from torchmetrics.functional.classification import multiclass_accuracy

In [9]:
def cross_validation(model, 
                     dataset, 
                     loss_function,
                     strat_array=None,
                     device=torch.device("cuda"),
                     random_state: int=69, 
                     shuffle: bool=True, 
                     n_folds: int=10, 
                     epochs: int=5, 
                     lr: float=1e-3,
                     start_fold: int=0, 
                     batch_size: int=4,
                     iters_to_accumulate=None,
                     n_accumulated_grads: int = 0):
    random.seed(random_state),
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)
    
    loss_function.to(device)
    if strat_array:
        kfold = StratifiedKFold(n_folds, shuffle=shuffle, random_state=random_state)
        split = kfold.split(dataset, strat_array)
    else: 
        kfold = KFold(n_folds, shuffle=shuffle, random_state=random_state)
        split = kfold.split(dataset)

    for fold, (train_ids, eval_ids) in enumerate(split):
        if fold >= start_fold:
            print(f'FOLD {fold}')
            print('--------------------------------')
            run = wandb.init(
                name=f"fold_{fold}",
                project=f"asr_fold_{fold}",
                config={ 
                         "random_state": random_state, 
                         "shuffle": shuffle,
                         "epochs": epochs, 
                         "learning_rate": lr,
                         "batch_size": batch_size,
                         "iters_to_accumulate": iters_to_accumulate
                        })

            optimizer = AdamW(model.parameters(), lr=lr
#                 [{"params": model.encoder.parameters(), "lr": 1e-4},
#                 {"params": model.decoder.parameters(), "lr": 1e-3},]
        )

            train_subsampler = torch.utils.data.Subset(dataset,  train_ids)
            train_loader = torch.utils.data.DataLoader(
                          train_subsampler, 
                          batch_size=batch_size,
                          shuffle=shuffle, drop_last=True)

            eval_subsampler = torch.utils.data.Subset(dataset,  eval_ids)
            eval_loader = torch.utils.data.DataLoader(
                          eval_subsampler,
                          batch_size=batch_size,
                          shuffle=shuffle, drop_last=True)
            
            total_steps = len(train_loader) * epochs 

            scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps = len(train_loader)*2, # Default value in run_glue.py
                                                    num_training_steps = total_steps)

            #scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, total_iters=0)
            for epoch_i in range(epochs):
                train_metrics, t_preds = train_epoch(model, train_loader, dataset.tokenizer, loss_function, optimizer, scheduler, device)
                eval_metrics, preds = eval_epoch(model, eval_loader, dataset.tokenizer, loss_function, device)
                print(f"EPOCH: {epoch_i}")
                print(train_metrics)
                print(eval_metrics)
                print(t_preds)
                print(preds)
                run.log(train_metrics)
                run.log(eval_metrics)
                            
            run.finish()
        break

In [10]:
string = tokenizer.encode("я люблю дашу")
tokenizer.decode(string, skip_special_tokens=True)

'я люблю дашу'

In [11]:
tokenizer.decode(16)

'l'

In [12]:
train_dataset[1]

{'text': 'этот процесс так захватывает человека что он отвлекается от переживаний переключаясь на чтото иное',
 'encoded_text': tensor([  1,  93, 184, 103, 209,  86,  69,  81, 104, 204, 107, 196,  85, 188,
          82,  91, 188, 234, 220, 208, 205,  74, 101, 244,  78, 121, 245,  66,
         207, 190,  69, 237,  78, 103, 227, 182,  70,  72, 188, 186, 102, 227,
         182,  74,  75,  94, 238,  95,  81, 106, 202,  87, 184, 195,  72, 183,
         112,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
           2,   2]),
 'spectre': tensor([[[0.0000e+00, 2.6849e-03, 6.1225e-02,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 2.8886e-03, 2.6989e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 1.9776e-03, 2.2738e-01,  ..., 0.0000e+00,
           0.0000e+00

In [None]:
cross_validation(model = model,
                 dataset=train_dataset, 
                 loss_function=nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id), 
                 device=torch.device("cuda"),
                 random_state=69,
                 shuffle=True,
                 batch_size=16,
                 lr=3e-3)

FOLD 0
--------------------------------


[34m[1mwandb[0m: Currently logged in as: [33mshockless[0m ([33mfedor-avilov[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 5.189449599811009, 'Train Word Accuracy': -0.41356360912323, 'Train Accuracy': 0.0}
{'Val Loss': 4.783752838770549, 'Val Word Accuracy': 0.0, 'Val Accuracy': 0.0}
сссссссссссссссссссссссссссссссссссссссссссссс
сссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссссс


  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

EPOCH: 1
{'Train Loss': 4.712824259485517, 'Train Word Accuracy': 0.0, 'Train Accuracy': 0.0}
{'Val Loss': 4.741640329360962, 'Val Word Accuracy': -0.009872078895568848, 'Val Accuracy': 0.0}
сссссссссссссссссссссссссссссссссссссссссссссссисссссс
мсисссисисссисисисисисе се се се свса а са


  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

EPOCH: 2
{'Train Loss': 4.465691404683249, 'Train Word Accuracy': -0.03625190258026123, 'Train Accuracy': 0.0}
{'Val Loss': 5.129363854726155, 'Val Word Accuracy': -2.0308568477630615, 'Val Accuracy': 0.0}
в тотоадкисссасущнщсиасусений суассенй яссдужинесснсии сенв ть й сиздсеза ежа жасенй й и
в ский сь сущизизизим в саням в санях в сал и посанях санях ский саны й санях й й ский ский санях й й й й й й й саны ы ы


  0%|          | 0/56 [00:00<?, ?it/s]