In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import sys

WORKSPACE_DIR = 'college/595_final_project'
WORKSPACE_PATH = os.path.join('drive', 'My Drive', WORKSPACE_DIR)
sys.path.append(WORKSPACE_PATH)
print(WORKSPACE_PATH)

drive/My Drive/college/595_final_project


In [4]:
!pip install datasets



In [5]:
import torch
import torch.nn as nn
from torch import optim
from tqdm import tqdm
import torch.nn.functional as F
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import random
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
from datasets import load_dataset
timit_dataset = load_dataset('timit_asr', cache_dir="drive/My Drive/college/595_final_project/data")

Reusing dataset timit_asr (drive/My Drive/college/595_final_project/data/timit_asr/clean/2.0.1/5bebea6cd9df0fc2c8c871250de23293a94c1dc49324182b330b6759ae6718f8)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# GLOBALS
MAX_LEN = 72

# MODEL HYPERPARAMS
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# TRAINING PARAMS
BATCH_SIZE = 128
N_EPOCHS = 35

# REGULARIZATION PARAMATERS
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
CLIP = 1

In [8]:
def map_phonemes_to_words(dp):
  word_idx = 0
  phoneme_idx = 0
  mappings = []
  for word_stop_time in dp['word_detail']['stop']:
    while word_stop_time >= dp['phonetic_detail']['stop'][phoneme_idx]:
      mappings.append(word_idx)
      phoneme_idx += 1
    word_idx += 1

  while phoneme_idx < len(dp['phonetic_detail']['utterance']):
    mappings.append(word_idx-1)
    phoneme_idx += 1

  phonetic_len = len(dp['phonetic_detail']['utterance'])
  assert len(mappings) == len(dp['phonetic_detail']['utterance']), f'ASSERT FAILED: mapping len: {len(mappings)} vs phoneme len: {phonetic_len}'
  return mappings

In [9]:
def preprocess_timit_data(data):
  pairs = []
  for dp in data:
    word_seq = dp['word_detail']['utterance']
    phoneme_seq = dp['phonetic_detail']['utterance']
    phoneme_word_mappings = map_phonemes_to_words(dp)
    pairs.append((word_seq, phoneme_seq, phoneme_word_mappings))
  return pairs

train_pairs = preprocess_timit_data(timit_dataset['train'])
# val_pairs = preprocess_timit_data(timit_dataset['train'])[0:100]
test_pairs = preprocess_timit_data(timit_dataset['test'])

# dev_train_pairs = preprocess_timit_data(timit_dataset['train'])[:100]
# dev_val_pairs = preprocess_timit_data(timit_dataset['train'])[0:10]

In [10]:
from token_encoder import TokenEncoder, build_io_token_encodings

word_encoder = TokenEncoder(MAX_LEN)
phoneme_encoder = TokenEncoder(MAX_LEN)
build_io_token_encodings(word_encoder, phoneme_encoder, train_pairs)
print(f'training points: {len(train_pairs)}, num input tokens: {word_encoder.n_tokens}, num output tokens: {phoneme_encoder.n_tokens}')

training points: 4620, num input tokens: 4897, num output tokens: 65


In [11]:
def vectorize_seq(token_encoder, sequence):
  indexes = [token_encoder.bos_token_id] + [token_encoder.get_token_index(token) for token in sequence] + [token_encoder.eos_token_id]
  if len(indexes) < MAX_LEN:
    indexes = indexes + [token_encoder.pad_token_id] * (MAX_LEN - len(indexes))
  else:
    indexes = indexes[:MAX_LEN]
  return torch.tensor(indexes, dtype=torch.long, device=DEVICE)

def vectorize_mappings(indexes):
  indexes = [0] + [i + 1 for i in indexes] + [indexes[-1]+1]
  if len(indexes) < MAX_LEN:
    indexes = indexes + [MAX_LEN+1] * (MAX_LEN - len(indexes))
  else:
    indexes = indexes[:MAX_LEN]
  return torch.tensor(indexes, dtype=torch.long, device=DEVICE)


def vectorize_pair(pair):
  word_vector = vectorize_seq(word_encoder, pair[0])
  phoneme_vector = vectorize_seq(phoneme_encoder, pair[1])
  phone_word_mapping_vector = vectorize_mappings(pair[2])
  return word_vector, phoneme_vector, phone_word_mapping_vector

In [12]:
from torch.utils.data import TensorDataset, DataLoader

def build_dataset(pairs):
  word_vecs = torch.ones((len(pairs), MAX_LEN), dtype=torch.long)
  phoneme_vecs = torch.ones((len(pairs), MAX_LEN), dtype=torch.long)
  phoneme_word_mapping_vecs = torch.ones((len(pairs), MAX_LEN), dtype=torch.long)
  for idx, pair in enumerate(pairs):
    word_vec, phoneme_vec, mapping_vec = vectorize_pair(pair)
    word_vecs[idx] = word_vec
    phoneme_vecs[idx] = phoneme_vec
    phoneme_word_mapping_vecs[idx] = mapping_vec

  return TensorDataset(word_vecs, phoneme_vecs, phoneme_word_mapping_vecs)

In [13]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

In [14]:
def train(model, iterator, optimizer, vocab_criterion, index_critereon, clip):    
  model.train()
  
  epoch_loss = 0
  for i, batch in enumerate(iterator):
      
    src = batch[0].to(DEVICE).permute(1, 0) # (seq_len, batch_size)
    trg = batch[1].to(DEVICE).permute(1, 0) # (seq_len, batch_size)
    true_mappings = batch[2].to(DEVICE).permute(1, 0) # (seq_len, batch_size)

    
    optimizer.zero_grad()
    
    output, output_mappings = model(src, trg)
    
    #trg = [trg len, batch size]
    #output = [trg len, batch size, output dim]
    #output_mappings = [seq_len, batch_size, MAX_LEN]

    
    output_dim = output.shape[-1]
    
    output = output[1:].reshape(-1, output_dim)
    trg = trg[1:].reshape(-1)

    output_mappings = output_mappings[1:].reshape(-1, model.max_len)
    true_mappings = true_mappings[1:].reshape(-1)     

    
    #trg = [(trg len - 1) * batch size]
    #output = [(trg len - 1) * batch size, output dim]
    
    loss = vocab_criterion(output, trg) # + index_critereon(output_mappings, true_mappings)
    
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    
    optimizer.step()
    
    epoch_loss += loss.item()
        
  return epoch_loss / len(iterator)


In [22]:
from nltk.translate import bleu
from nltk.translate.bleu_score import SmoothingFunction

def edit_score(pred, truth, padding_token = None):
    # if the lists are padded, then remove the padding before calculating edit distance
    if padding_token:
        if padding_token in pred:
            pred = pred[:pred.index(padding_token)]

        if padding_token in truth:
            truth = truth[:pred.index(padding_token)]

        print(pred, truth)

    m = len(pred)
    n = len(truth)

    # Create a table to store results of subproblems
    dp = [[0 for x in range(n + 1)] for x in range(m + 1)]
  
    # Fill d[][] in bottom up manner
    for i in range(m + 1):
        for j in range(n + 1):
  
            # If first list is empty, only option is to
            # insert all elements of second list
            if i == 0:
                dp[i][j] = j    # Min. operations = j
  
            # If second list is empty, only option is to
            # remove all elements of second list
            elif j == 0:
                dp[i][j] = i    # Min. operations = i
  
            # If last elements are same, ignore last char
            # and recur for remaining list

            elif pred[i - 1] == truth[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
  
            # If last elements are different, consider all
            # possibilities and find minimum
            else:
                dp[i][j] = 1 + min(dp[i][j - 1],        # Insert
                                   dp[i - 1][j],        # Remove
                                   dp[i - 1][j - 1])    # Replace
  
    # normalize the edit distance by dividing the number edit distance by the the length of truth
    return dp[m][n] / n

def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

def accuracy_fn(ref, pred, index=None):
  if index:
    ref = ref[: min(len(ref), index)]
    pred = pred[: min(len(pred), index)]
  if len(ref) > len(pred):
    pred = pred + [-1] * (len(ref) - len(pred))
  if len(pred) > len(ref):
    pred = pred[: len(pred)]
  return sum(1 for x,y in zip(ref, pred) if x == y) / len(ref)   

def evaluate(model, dataloader, vocab_criterion, index_critereon):

  smoothie = SmoothingFunction().method4
  
  bleu_score = 0
  phoneme_accuracy = 0
  vocab_loss = 0
  index_loss = 0
  num_datapoints = 0
  phoneme_edit = 0

  model.eval()
  with torch.no_grad():
    for i, batch in enumerate(dataloader):
      src = batch[0].to(DEVICE).permute(1, 0) # (seq_len, batch_size)
      trg = batch[1].to(DEVICE).permute(1, 0) # (seq_len, batch_size)
      true_mappings = batch[2].to(DEVICE).permute(1, 0)

      output, output_mappings = model(src, trg, 0) #turn off teacher forcing
      output_dim = output.shape[-1]
      
      vocab_loss += vocab_criterion(output[1:].reshape(-1, output_dim), trg[1:].reshape(-1))
      index_loss += index_critereon(output_mappings[1:].reshape(-1, model.max_len), true_mappings[1:].reshape(-1)     )

      output = output[1:].permute(1, 0, 2) # (batch_size, seq_len, output_vocab_size)
      output = output.argmax(-1) # (seq_len, batch_size)
      trg = trg[1:].permute(1, 0) # (batch_size, seq_len)

      output_mappings =  output_mappings[1:].permute(1, 0, 2)
      output_mappings = output_mappings.argmax(-1) # (seq_len, batch_size)
      trg_mappings = batch[2].to(DEVICE)[1:]


      for j in range(len(batch[0])):
        num_datapoints += 1

        predicted_sequence = phoneme_encoder.decode_sequence(output[j].tolist())
        target_sequence = phoneme_encoder.decode_sequence(trg[j].tolist())[1:]

        if phoneme_encoder.eos_token in predicted_sequence:
          predicted_sequence = predicted_sequence[:predicted_sequence.index(phoneme_encoder.eos_token)][1:]
        target_len = len(target_sequence)
        if phoneme_encoder.eos_token in target_sequence:
          target_len = target_sequence.index(phoneme_encoder.eos_token)
          target_sequence = target_sequence[:target_len]  
        bleu_score += bleu([target_sequence], predicted_sequence, smoothing_function=smoothie)
        phoneme_accuracy += accuracy_fn(target_sequence, predicted_sequence)
        # print('target_seq: ', phoneme_encoder.decode_sequence(trg[j].tolist())[1:])
        phoneme_edit += edit_score(predicted_sequence, target_sequence)
  
  return {
      'phoneme_acc': phoneme_accuracy/num_datapoints, 
      'phoneme_bleu': bleu_score/num_datapoints,
      'phoneme_edit': phoneme_edit/num_datapoints,
      'vocab_loss': vocab_loss.cpu().numpy() / len(dataloader)
  }


def generate_samples(model, dataloader):

  smoothie = SmoothingFunction().method4

  samples = []

  model.eval()
  with torch.no_grad():
    for i, batch in enumerate(dataloader):
      src = batch[0].to(DEVICE).permute(1, 0) # (seq_len, batch_size)
      trg = batch[1].to(DEVICE).permute(1, 0) # (seq_len, batch_size)
      true_mappings = batch[2].to(DEVICE).permute(1, 0)

      output, output_mappings = model(src, trg, 0) #turn off teacher forcing
      output_dim = output.shape[-1]

      output = output[1:].permute(1, 0, 2) # (batch_size, seq_len, output_vocab_size)
      output = output.argmax(-1) # (seq_len, batch_size)
      trg = trg[1:].permute(1, 0) # (batch_size, seq_len)

      output_mappings =  output_mappings[1:].permute(1, 0, 2)
      output_mappings = output_mappings.argmax(-1) # (seq_len, batch_size)
      trg_mappings = batch[2].to(DEVICE)[1:]


      for j in range(len(batch[0])):

        predicted_sequence = phoneme_encoder.decode_sequence(output[j].tolist())[1:]
        target_sequence = phoneme_encoder.decode_sequence(trg[j].tolist())[1:]

        if phoneme_encoder.eos_token in predicted_sequence:
          predicted_sequence = predicted_sequence[:predicted_sequence.index(phoneme_encoder.eos_token)]
        target_len = len(target_sequence)
        if phoneme_encoder.eos_token in target_sequence:
          target_len = target_sequence.index(phoneme_encoder.eos_token)
          target_sequence = target_sequence[:target_len]   

        bleu_score = bleu([target_sequence], predicted_sequence, smoothing_function=smoothie)
        phoneme_accuracy = accuracy_fn(target_sequence, predicted_sequence)
        phoneme_edit = edit_score(target_sequence, predicted_sequence)

        samples.append({
            'target': target_sequence,
            'predicted': predicted_sequence,
            'phoneme_accuracy': phoneme_accuracy,
            'phoneme_edit': phoneme_edit,
            'bleu_score': bleu_score
        })        
  
  return samples


In [23]:
train_dataset = build_dataset(train_pairs)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
eval_dataset = build_dataset(test_pairs)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE) 

In [24]:
from phoneme_lstm import Encoder, Decoder, Seq2Seq, Attention

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(word_encoder.n_tokens, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(phoneme_encoder.n_tokens, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn, MAX_LEN+1)

model = Seq2Seq(enc, dec, MAX_LEN+1, DEVICE).to(DEVICE)

model.apply(init_weights)
optimizer = optim.Adam(model.parameters())
vocab_criterion = nn.CrossEntropyLoss(ignore_index = phoneme_encoder.pad_token_id)
index_criterion = nn.CrossEntropyLoss(ignore_index = MAX_LEN+1)

In [25]:
best_valid_loss = float('inf')

train_losses = []
test_losses = []

test_phoneme_accs = []

epochs = []

for epoch in tqdm(range(N_EPOCHS)):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, vocab_criterion, index_criterion, CLIP)
    metrics = evaluate(model, eval_loader, vocab_criterion, index_criterion)
    valid_loss = metrics['vocab_loss']
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    # keep track of numbers we want to plot
    train_losses.append(train_loss)
    test_losses.append(valid_loss)
    test_phoneme_accs.append(metrics['phoneme_acc'])
    epochs.append(epoch)

    
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  3%|▎         | 1/35 [00:35<19:54, 35.14s/it]

Epoch: 01 | Time: 0m 35s
	Train Loss: 3.847 | Train PPL:  46.868
	 Val. Loss: 3.709 |  Val. PPL:  40.809


  6%|▌         | 2/35 [01:10<19:22, 35.23s/it]

Epoch: 02 | Time: 0m 35s
	Train Loss: 3.419 | Train PPL:  30.537
	 Val. Loss: 3.754 |  Val. PPL:  42.712


  9%|▊         | 3/35 [01:45<18:48, 35.26s/it]

Epoch: 03 | Time: 0m 35s
	Train Loss: 3.098 | Train PPL:  22.157
	 Val. Loss: 3.609 |  Val. PPL:  36.921


 11%|█▏        | 4/35 [02:20<18:12, 35.23s/it]

Epoch: 04 | Time: 0m 35s
	Train Loss: 2.898 | Train PPL:  18.143
	 Val. Loss: 3.623 |  Val. PPL:  37.456


 14%|█▍        | 5/35 [02:55<17:34, 35.16s/it]

Epoch: 05 | Time: 0m 35s
	Train Loss: 2.804 | Train PPL:  16.516
	 Val. Loss: 3.626 |  Val. PPL:  37.561


 17%|█▋        | 6/35 [03:30<16:54, 34.98s/it]

Epoch: 06 | Time: 0m 34s
	Train Loss: 2.780 | Train PPL:  16.118
	 Val. Loss: 3.644 |  Val. PPL:  38.252


 20%|██        | 7/35 [04:05<16:19, 34.99s/it]

Epoch: 07 | Time: 0m 35s
	Train Loss: 2.742 | Train PPL:  15.519
	 Val. Loss: 3.647 |  Val. PPL:  38.352


 23%|██▎       | 8/35 [04:40<15:45, 35.00s/it]

Epoch: 08 | Time: 0m 35s
	Train Loss: 2.697 | Train PPL:  14.843
	 Val. Loss: 3.686 |  Val. PPL:  39.866


 26%|██▌       | 9/35 [05:15<15:11, 35.05s/it]

Epoch: 09 | Time: 0m 35s
	Train Loss: 2.680 | Train PPL:  14.583
	 Val. Loss: 3.710 |  Val. PPL:  40.873


 29%|██▊       | 10/35 [05:50<14:34, 34.99s/it]

Epoch: 10 | Time: 0m 34s
	Train Loss: 2.662 | Train PPL:  14.319
	 Val. Loss: 3.691 |  Val. PPL:  40.075


 31%|███▏      | 11/35 [06:25<14:00, 35.03s/it]

Epoch: 11 | Time: 0m 35s
	Train Loss: 2.647 | Train PPL:  14.115
	 Val. Loss: 3.729 |  Val. PPL:  41.627


 34%|███▍      | 12/35 [07:01<13:27, 35.10s/it]

Epoch: 12 | Time: 0m 35s
	Train Loss: 2.593 | Train PPL:  13.364
	 Val. Loss: 3.825 |  Val. PPL:  45.832


 37%|███▋      | 13/35 [07:36<12:52, 35.13s/it]

Epoch: 13 | Time: 0m 35s
	Train Loss: 2.562 | Train PPL:  12.965
	 Val. Loss: 3.804 |  Val. PPL:  44.878


 40%|████      | 14/35 [08:11<12:18, 35.15s/it]

Epoch: 14 | Time: 0m 35s
	Train Loss: 2.510 | Train PPL:  12.300
	 Val. Loss: 3.844 |  Val. PPL:  46.719


 43%|████▎     | 15/35 [08:46<11:42, 35.13s/it]

Epoch: 15 | Time: 0m 35s
	Train Loss: 2.473 | Train PPL:  11.858
	 Val. Loss: 3.844 |  Val. PPL:  46.727


 46%|████▌     | 16/35 [09:21<11:06, 35.05s/it]

Epoch: 16 | Time: 0m 34s
	Train Loss: 2.420 | Train PPL:  11.243
	 Val. Loss: 3.894 |  Val. PPL:  49.129


 49%|████▊     | 17/35 [09:56<10:30, 35.03s/it]

Epoch: 17 | Time: 0m 34s
	Train Loss: 2.357 | Train PPL:  10.555
	 Val. Loss: 3.895 |  Val. PPL:  49.178


 51%|█████▏    | 18/35 [10:31<09:54, 34.98s/it]

Epoch: 18 | Time: 0m 34s
	Train Loss: 2.253 | Train PPL:   9.518
	 Val. Loss: 3.971 |  Val. PPL:  53.017


 54%|█████▍    | 19/35 [11:06<09:19, 34.97s/it]

Epoch: 19 | Time: 0m 34s
	Train Loss: 2.185 | Train PPL:   8.895
	 Val. Loss: 4.009 |  Val. PPL:  55.092


 57%|█████▋    | 20/35 [11:41<08:44, 34.98s/it]

Epoch: 20 | Time: 0m 35s
	Train Loss: 2.121 | Train PPL:   8.335
	 Val. Loss: 4.018 |  Val. PPL:  55.602


 60%|██████    | 21/35 [12:16<08:10, 35.05s/it]

Epoch: 21 | Time: 0m 35s
	Train Loss: 2.039 | Train PPL:   7.682
	 Val. Loss: 4.099 |  Val. PPL:  60.282


 63%|██████▎   | 22/35 [12:51<07:35, 35.03s/it]

Epoch: 22 | Time: 0m 34s
	Train Loss: 1.969 | Train PPL:   7.161
	 Val. Loss: 4.094 |  Val. PPL:  59.973


 66%|██████▌   | 23/35 [13:26<07:00, 35.03s/it]

Epoch: 23 | Time: 0m 35s
	Train Loss: 1.851 | Train PPL:   6.364
	 Val. Loss: 4.137 |  Val. PPL:  62.639


 69%|██████▊   | 24/35 [14:01<06:24, 35.00s/it]

Epoch: 24 | Time: 0m 34s
	Train Loss: 1.806 | Train PPL:   6.089
	 Val. Loss: 4.220 |  Val. PPL:  68.024


 71%|███████▏  | 25/35 [14:36<05:49, 34.98s/it]

Epoch: 25 | Time: 0m 34s
	Train Loss: 1.731 | Train PPL:   5.649
	 Val. Loss: 4.248 |  Val. PPL:  69.975


 74%|███████▍  | 26/35 [15:11<05:14, 34.99s/it]

Epoch: 26 | Time: 0m 34s
	Train Loss: 1.662 | Train PPL:   5.272
	 Val. Loss: 4.240 |  Val. PPL:  69.418


 77%|███████▋  | 27/35 [15:46<04:40, 35.04s/it]

Epoch: 27 | Time: 0m 35s
	Train Loss: 1.643 | Train PPL:   5.169
	 Val. Loss: 4.243 |  Val. PPL:  69.590


 80%|████████  | 28/35 [16:21<04:05, 35.10s/it]

Epoch: 28 | Time: 0m 35s
	Train Loss: 1.551 | Train PPL:   4.718
	 Val. Loss: 4.244 |  Val. PPL:  69.661


 83%|████████▎ | 29/35 [16:56<03:30, 35.04s/it]

Epoch: 29 | Time: 0m 34s
	Train Loss: 1.543 | Train PPL:   4.679
	 Val. Loss: 4.299 |  Val. PPL:  73.615


 86%|████████▌ | 30/35 [17:31<02:54, 34.95s/it]

Epoch: 30 | Time: 0m 34s
	Train Loss: 1.542 | Train PPL:   4.673
	 Val. Loss: 4.357 |  Val. PPL:  78.015


 89%|████████▊ | 31/35 [18:06<02:19, 35.00s/it]

Epoch: 31 | Time: 0m 35s
	Train Loss: 1.472 | Train PPL:   4.359
	 Val. Loss: 4.399 |  Val. PPL:  81.402


 91%|█████████▏| 32/35 [18:41<01:45, 35.03s/it]

Epoch: 32 | Time: 0m 35s
	Train Loss: 1.480 | Train PPL:   4.392
	 Val. Loss: 4.401 |  Val. PPL:  81.515


 94%|█████████▍| 33/35 [19:16<01:09, 34.98s/it]

Epoch: 33 | Time: 0m 34s
	Train Loss: 1.429 | Train PPL:   4.173
	 Val. Loss: 4.376 |  Val. PPL:  79.555


 97%|█████████▋| 34/35 [19:51<00:35, 35.04s/it]

Epoch: 34 | Time: 0m 35s
	Train Loss: 1.366 | Train PPL:   3.918
	 Val. Loss: 4.389 |  Val. PPL:  80.554


100%|██████████| 35/35 [20:26<00:00, 35.05s/it]

Epoch: 35 | Time: 0m 35s
	Train Loss: 1.299 | Train PPL:   3.667
	 Val. Loss: 4.512 |  Val. PPL:  91.096





In [26]:
test_dataset = build_dataset(test_pairs)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
evaluate(model, eval_loader, vocab_criterion, index_criterion)

{'phoneme_acc': 0.11418641978358571,
 'phoneme_bleu': 0.2620238258227077,
 'phoneme_edit': 0.21276660128620858,
 'vocab_loss': 4.511916841779437}

In [None]:
import matplotlib.pyplot as plt

plt.plot(epochs, train_losses, 'g', label='Training loss')
plt.plot(epochs, test_losses, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.plot(epochs, test_phoneme_accs, 'b', label='Validation accuracy')
plt.title('Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
from collections import defaultdict

samples = generate_samples(model, eval_loader)
len_perf_mapping = defaultdict(lambda: [])


for sample in samples:
  len_perf_mapping[len(sample['target'])].append(sample['phoneme_accuracy'])

lengths = []
accs = []

for key, val in len_perf_mapping.items():

  lengths.append(key)
  accs.append(sum(val)/len(val))


In [None]:
plt.bar(lengths, accs, color='b')

plt.title('Phoneme length vs Accuracy')
plt.xlabel('Lengths')
plt.ylabel('Accuracy')
plt.show()