In [1]:
import torch
import torchaudio

from torch.optim import AdamW
from torch.utils.data import DataLoader

from torch import nn

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from sklearn.model_selection import KFold, StratifiedKFold

from transformers import PreTrainedTokenizerFast

from transformers import get_cosine_schedule_with_warmup

from modules.model import Transformer, train_epoch, eval_epoch
from modules.dataset import AudioDataset
from modules.tokenizer import tokenize

import random
import os

# Last expression of the cell, so its value is shown as the cell output:
# True when PyTorch can see a CUDA device, False otherwise.
torch.cuda.is_available()

True

In [2]:
path_to_tokenizer = './tokenizer.json'
path_to_data = './audio_dataset/'

# Read only the columns used below from the dataset index file.
csv_path = os.path.join(path_to_data, 'df.csv')
wanted_columns = ['text', 'status', 'path', 'rate', 'duration', 'frames']
data = pd.read_csv(csv_path, usecols=wanted_columns)

# Keep rows whose status is APPROVED, renumber them from 0, then drop the
# status column since every remaining row carries the same value.
approved_mask = data['status'] == 'APPROVED'
data = data.loc[approved_mask].reset_index(drop=True)
data = data.drop(columns='status')

# Normalise transcripts: keep alphabetic characters and spaces, lower-case the rest.
data['text'] = data['text'].apply(
    lambda x: "".join(filter(lambda char: char.isalpha() or char == ' ', x)).lower()
)
# Value is discarded here (not the last expression of the cell); kept from the original.
data['duration'].max()

# Positional split: first 10000 rows for training, the following 10000 for validation.
train_data, valid_data = data.iloc[:10000], data.iloc[10000:20000]

In [3]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.displot(data.text.str.len())
# plt.show()
# sns.displot(data.duration)
# plt.show()

In [4]:
# 99.5th percentile of transcript length in characters (displayed as the cell output).
np.percentile(data['text'].str.len(), q=99.5)

135.0

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = PreTrainedTokenizerFast(tokenizer_file=path_to_tokenizer,
                                    padding_side='right',
                                    bos_token='[SOS]',
                                    eos_token='[EOS]',
                                    pad_token='[PAD]',
                                    unk_token='[UNK]',
                                    mask_token='[MASK]')

# Hyperparameters that were previously repeated as literals in the two dataset
# constructors and in the model constructor. Defining them once keeps the
# three call sites from drifting apart.
SAMPLE_RATE = 16000
N_FFT = 1024
N_MELS = 64
MAX_AUDIO_LEN = 25
MAX_TOKENIZED_LENGTH = 100

# Identical feature-extraction settings for the train and validation datasets.
dataset_kwargs = dict(n_fft=N_FFT,
                      n_mels=N_MELS,
                      center=True,
                      max_tokenized_length=MAX_TOKENIZED_LENGTH,
                      max_audio_len=MAX_AUDIO_LEN,
                      sr=SAMPLE_RATE)

train_dataset = AudioDataset(train_data, path_to_data, tokenizer, **dataset_kwargs)
valid_dataset = AudioDataset(valid_data, path_to_data, tokenizer, **dataset_kwargs)

model = Transformer(vocab_size=len(tokenizer),
                    n_mels=N_MELS,
                    # NOTE(review): the original passed the literals 25 and 100 here, equal to
                    # the datasets' max_audio_len / max_tokenized_length. They are presumably
                    # coupled -- confirm against modules.model before changing either.
                    enc_seq_len=MAX_AUDIO_LEN,
                    dec_seq_len=MAX_TOKENIZED_LENGTH,
                    hidden_dim=32,
                    enc_num_layers=2,
                    dec_num_layers=2,
                    # NOTE(review): 32 is not divisible by 3. torch.nn.MultiheadAttention would
                    # reject this combination; confirm how the project Transformer splits heads.
                    num_heads=3,
                    ff_dim=128,
                    r_dim=100,
                    device=device,
                    dropout=0.1,
                    sr=SAMPLE_RATE,
                    n_fft=N_FFT)

n_fft=1024, win_lenght=1024, hop_lenght=256, n_mels=64, center=True):

In [6]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not currently using.
torch.cuda.empty_cache()

In [7]:
from torchmetrics.functional import word_error_rate
from torchmetrics.functional.classification import multiclass_accuracy

In [8]:
def train_epoch(model, data_loader, tokenizer, loss_function, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader`` and return averaged metrics.

    Each batch is a dict; the keys read here are ``'encoded_text'``, ``'spectre'``,
    ``'ohe_text'`` (tensors, moved to ``device``) and ``'text'`` (reference
    transcripts compared against the decoded predictions).

    Args:
        model: Module called as ``model(batch)``; must return logits whose last
            dimension is the vocabulary and whose dims 1 and 2 can be swapped.
        data_loader: Sized iterable of batch dicts.
        tokenizer: Object with ``decode(ids)`` turning one row of token ids into text.
        loss_function: Called as ``loss_function(logits.transpose(1, 2), targets)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the model and batch tensors are moved to.

    Returns:
        dict with ``"Train Loss"`` (mean batch loss), ``"Train WAcc"`` (1 minus the
        mean per-batch word error rate; negative when WER exceeds 1) and
        ``"Train Accuracy"`` (mean per-batch fraction of exact transcript matches).

    Raises:
        ValueError: If ``data_loader`` has no batches.
    """
    model.to(device)
    model.train()

    dl_size = len(data_loader)
    if dl_size == 0:
        raise ValueError("data_loader is empty: cannot compute epoch metrics")

    total_train_loss = 0
    preds = []
    targets = []

    for batch in tqdm(data_loader):
        batch['encoded_text'] = batch['encoded_text'].to(device)
        batch['spectre'] = batch['spectre'].to(device)
        batch['ohe_text'] = batch['ohe_text'].to(device)

        optimizer.zero_grad()
        logits = model(batch)

        # Greedy decode of the training logits, kept only for metric reporting.
        # NOTE(review): decode() is called without skip_special_tokens, so special
        # tokens stay in the text compared with batch['text'] -- confirm this is intended.
        pred = logits.argmax(dim=-1).to('cpu')
        preds.append([tokenizer.decode(i) for i in pred])
        targets.append(batch['text'])

        # NOTE(review): a bare squeeze() also removes a batch dimension of size 1;
        # safe only while every batch holds more than one sample.
        loss = loss_function(logits.transpose(1, 2), batch['encoded_text'].squeeze())
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    acc_t = 0
    wer = 0
    for batch_preds, batch_targets in zip(preds, targets):
        n_samples = len(batch_preds)
        n_exact = sum(int(p == t) for p, t in zip(batch_preds, batch_targets))
        # Divide by the real batch size instead of a leaked loop variable.
        acc_t += n_exact / n_samples if n_samples else 0.0
        wer += word_error_rate(batch_preds, batch_targets)
    acc_t = acc_t / dl_size
    wer = wer / dl_size

    metrics = {
        "Train Loss": total_train_loss / dl_size,
        "Train WAcc": 1 - wer.item(),
        # Fixed: report the epoch average, not the accuracy of the last batch only.
        "Train Accuracy": acc_t,
    }

    return metrics


In [13]:
def eval_epoch(model, data_loader,tokenizer, loss_function, device):
    """Evaluate ``model`` over ``data_loader`` without gradient tracking.

    Each batch is a dict; the keys read here are ``'encoded_text'``, ``'spectre'``,
    ``'ohe_text'`` (tensors, moved to ``device``) and ``'text'`` (reference
    transcripts compared against the decoded predictions).

    Args:
        model: Object exposing ``predict(batch, bos_id, eos_id, device)`` that
            returns ``(token_id_rows, logits)``.
        data_loader: Sized iterable of batch dicts.
        tokenizer: Provides ``bos_token_id``, ``eos_token_id`` and ``decode(ids)``.
        loss_function: Called as ``loss_function(logits.transpose(1, 2), targets)``.
        device: Device the model and batch tensors are moved to.

    Returns:
        ``(metrics, sample)`` where ``metrics`` holds ``"Val Loss"`` (mean batch
        loss), ``"Val WAcc"`` (1 minus the mean per-batch word error rate) and
        ``"Val Accuracy"`` (mean per-batch fraction of exact transcript matches),
        and ``sample`` is the decoded prediction of the last sample of the last batch.

    Raises:
        ValueError: If ``data_loader`` has no batches.
    """
    model.to(device)
    model.eval()

    dl_size = len(data_loader)
    if dl_size == 0:
        raise ValueError("data_loader is empty: cannot compute epoch metrics")

    total_val_loss = 0
    preds = []
    targets = []

    for batch in tqdm(data_loader):
        batch['encoded_text'] = batch['encoded_text'].to(device)
        batch['spectre'] = batch['spectre'].to(device)
        batch['ohe_text'] = batch['ohe_text'].to(device)

        with torch.no_grad():
            pred, logits = model.predict(batch,tokenizer.bos_token_id, tokenizer.eos_token_id,device)
            preds.append([tokenizer.decode(i) for i in pred])
            targets.append(batch['text'])

            # NOTE(review): a bare squeeze() also removes a batch dimension of size 1;
            # safe only while every batch holds more than one sample.
            loss = loss_function(logits.transpose(1, 2), batch['encoded_text'].squeeze())
        total_val_loss += loss.item()

    acc_t = 0
    wer = 0
    for batch_preds, batch_targets in zip(preds, targets):
        n_samples = len(batch_preds)
        n_exact = sum(int(p == t) for p, t in zip(batch_preds, batch_targets))
        # Divide by the real batch size instead of a leaked loop variable.
        acc_t += n_exact / n_samples if n_samples else 0.0
        wer += word_error_rate(batch_preds, batch_targets)
    acc_t = acc_t / dl_size
    wer = wer / dl_size

    metrics = {
        "Val Loss": total_val_loss / dl_size,
        "Val WAcc": 1 - wer.item(),
        # Fixed: report the epoch average, not the accuracy of the last batch only.
        "Val Accuracy": acc_t,
    }

    return metrics, preds[-1][-1]

In [14]:
def cross_validation(model, 
                     dataset, 
                     loss_function,
                     strat_array=None,
                     device=torch.device("cuda"),
                     random_state: int=69, 
                     shuffle: bool=True, 
                     n_folds: int=10, 
                     epochs: int=5, 
                     lr: float=1e-6,
                     start_fold: int=0, 
                     batch_size: int=4,
                     iters_to_accumulate=None,
                     n_accumulated_grads: int = 0):
    """Train and evaluate ``model`` on one fold of a K-fold split of ``dataset``.

    The dataset is split with ``KFold`` (or ``StratifiedKFold`` when
    ``strat_array`` is given). The first fold whose index is ``>= start_fold``
    is trained for ``epochs`` epochs with ``train_epoch`` / ``eval_epoch``,
    printing the metrics after each epoch; the function then stops. Only a
    single fold is run per call, and the same ``model`` instance is used
    (it is not re-initialised here).

    Args:
        model: Model passed through to ``train_epoch`` / ``eval_epoch``.
        dataset: Indexable, sized dataset exposing a ``tokenizer`` attribute.
        loss_function: Loss module; moved to ``device``.
        strat_array: Optional labels for stratified splitting (list or array).
        device: Device used for the loss, model and batches.
        random_state: Seed for ``random``, NumPy, PyTorch and the fold split.
        shuffle: Whether to shuffle the split and both data loaders.
        n_folds: Number of folds.
        epochs: Epochs to train on the selected fold.
        lr: NOTE(review): currently NOT applied -- the optimizer is built with
            AdamW's default learning rate, exactly as in the original code
            where ``lr = lr`` was commented out. Confirm before wiring it in,
            since the declared default (1e-6) would change training.
        start_fold: Index of the fold to run.
        batch_size: Batch size of both loaders.
        iters_to_accumulate: Unused.
        n_accumulated_grads: Unused.

    Returns:
        None. Metrics and one sample prediction are printed per epoch.
    """
    # Seed every RNG involved (the original had a stray trailing comma here).
    random.seed(random_state)
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)
    
    loss_function.to(device)

    # scikit-learn raises ValueError if random_state is set while shuffle=False.
    split_seed = random_state if shuffle else None

    # `is not None` instead of truthiness: `if array:` raises for NumPy arrays.
    if strat_array is not None and len(strat_array) > 0:
        kfold = StratifiedKFold(n_folds, shuffle=shuffle, random_state=split_seed)
        split = kfold.split(dataset, strat_array)
    else: 
        kfold = KFold(n_folds, shuffle=shuffle, random_state=split_seed)
        split = kfold.split(dataset)

    for fold, (train_ids, eval_ids) in enumerate(split):
        # Skip folds before the requested one. Previously the unconditional
        # `break` fired on fold 0, so any start_fold > 0 trained nothing.
        if fold < start_fold:
            continue

        print(f'FOLD {fold}')
        print('--------------------------------')

        optimizer = AdamW(
            model.parameters(),
            #lr = lr,
        )

        train_subsampler = torch.utils.data.Subset(dataset, train_ids)
        train_loader = torch.utils.data.DataLoader(
            train_subsampler,
            batch_size=batch_size,
            shuffle=shuffle, drop_last=True)

        # NOTE(review): drop_last=True also discards the trailing partial
        # evaluation batch, so a few eval samples are never scored.
        eval_subsampler = torch.utils.data.Subset(dataset, eval_ids)
        eval_loader = torch.utils.data.DataLoader(
            eval_subsampler,
            batch_size=batch_size,
            shuffle=shuffle, drop_last=True)

        # The scheduler is stepped once per batch inside train_epoch.
        total_steps = len(train_loader) * epochs

        scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)

        for epoch_i in range(epochs):
            train_metrics = train_epoch(model, train_loader, dataset.tokenizer, loss_function, optimizer, scheduler, device)
            eval_metrics, preds = eval_epoch(model, eval_loader, dataset.tokenizer, loss_function, device)
            print(f"EPOCH: {epoch_i}")
            print(train_metrics)
            print(eval_metrics)
            print(preds)

        # Only a single fold is run per call (same as the original for start_fold=0).
        break

In [15]:
# Run one cross-validation fold on the training dataset.
# Use the `device` selected in the setup cell (CUDA when available, else CPU)
# instead of hard-coding "cuda", so the model and the loop agree on the device
# and the notebook still runs on a CPU-only machine.
cross_validation(model=model,
                 dataset=train_dataset,
                 loss_function=nn.CrossEntropyLoss(),
                 device=device,
                 random_state=69,
                 shuffle=True,
                 batch_size=16)

FOLD 0
--------------------------------


  0%|          | 0/562 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 1.161193019236534, 'Train WAcc': -6.985538482666016, 'Train Accuracy': 0.0}
{'Val Loss': 30.14263423796623, 'Val WAcc': -7.106573104858398, 'Val Accuracy': 0.0}
[SOS] [SOS] [UNK] [MASK] [MASK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ª [MASK] à [MASK] â ã ä ã ã ã ã ã ã ã ã ã ð ã ã ã ã ã ã ã ã ª ã ã ã ã ã ư ã ã ª ä ư ã à ж ã ä à ª â à ã ä ư ư [SOS] ã [SOS] ð ã à ã ã ä [UNK] ж à à ã ã ª ã [UNK] ª


  0%|          | 0/562 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]


KeyboardInterrupt

