In [1]:
from __future__ import print_function, division,unicode_literals
import os
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
import librosa

from io import open
import unicodedata
import string
import re
import random

from tqdm.auto import tqdm
import json

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn as nn
from torch import optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data.dataloader import default_collate


import torch.nn.functional as Fi


import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

print(torch.__version__)
print(torchaudio.__version__)


import string

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

1.9.0
0.9.0


In [2]:
table_trans = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}


path ="/Users/dami.osoba/work/bawk/small_dataset/small/CV_unpacked/cv-corpus-6.1-2020-12-11/en/validated.tsv"
meta = pd.read_csv(path,sep="\t")
meta_path = meta.set_index('path')

def read_manifest(path):
    manifest = []
    with open(path, 'r') as f:
        for line in tqdm(f, desc="Reading manifest data"):
            line = line.replace("\n", "")
            data = json.loads(line)
            manifest.append(data)
    return manifest
train_manifest_path = '/Users/dami.osoba/work/bawk/src/data/commonvoice_train_manifest.json'
train_manifest_data = read_manifest(train_manifest_path)
# keep audio < 4s
train_text = [data['text'] for data in train_manifest_data if data['duration']<=4]
train_path = [data['audio_filepath'] for data in train_manifest_data if data['duration']<=4]
train_path_pd = pd.DataFrame(train_path,columns=['train_path'])

# remove unicode
sentences = [c.encode(encoding="ascii",errors="ignore").decode().translate(table_trans) for c in train_text]
char_dict = sorted(list(set([b for a in sentences for b in a]))) +['EOS','SOS','PAD']
char_index = {a:char_dict.index(a) for a in char_dict}
dictOfindex = {char_dict.index(a):a for a in char_dict}
# char_index['EOS'] = len(char_dict)
# char_index['SOS'] = len(char_dict)+1
# char_index['PAD'] = len(char_dict)+2

Reading manifest data: 0it [00:00, ?it/s]

# Create voice dataset

In [83]:
class VoiceDataset(Dataset):
    def __init__(self, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
#         self.landmarks_frame = pd.read_csv(csv_file)
#         self.root_dir = root_dir
        self.transform = transform
        self.path_frame = train_path_pd

    def __len__(self):
        return len(self.path_frame)

    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

#         waveform, _ = librosa.load(self.path_frame.loc[idx][0],sr=16000)
        waveform, _ = torchaudio.load(self.path_frame.loc[idx][0],)
        label = self.path_frame.loc[idx][0].split("/")[-1].split("wav")[0]+"mp3"
        # transcription for audio
        trans = meta_path.loc[label]['sentence']
#         if len(trans.split(" ")) == 1:
#             waveform = torch.cat([waveform,waveform],dim=1)
#             trans = trans +" "+trans
#             print(trans)
        # encode to ascii
        trans = trans.encode(encoding="ascii",errors="ignore").decode().translate(table_trans).lower()
        chars =[b for a in trans for b in a]
        coded = [28]+[char_dict.index(a) for a in chars]+[27]

        sample = {'waveform': waveform, 'transcription': coded,'sentence':trans}

        if self.transform:
            sample = self.transform(sample)

        return sample

# Create FFT transform

In [84]:
window_size = 25/1000
stride = 10/1000
sample_rate = 16000
n_fft =int(window_size *sample_rate)
win_length = None
hop_length = int(sample_rate*stride)
n_mels = 80
max_time = 4


mel_spectrogram = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
    norm='slaney',
    onesided=True,
    n_mels=n_mels,
    mel_scale="htk",
)

# melspec = mel_spectrogram(waveform)


class MelSpec(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """
    def __init__(self):
        
        self.window_size = 25/1000
        self.stride = 10/1000
        self.sample_rate = 16000
        self.n_fft =int(self.window_size *self.sample_rate)
        self.win_length = None
        self.hop_length = int(self.sample_rate*self.stride)
        self.n_mels = 80
        self.max_time = 4
        pass
#         assert isinstance(output_size, (int, tuple))
#         self.output_size = output_size

    def mel_spectrogram(self,a):
        mel_spec = T.MelSpectrogram(
                        sample_rate=self.sample_rate,
                        n_fft=self.n_fft,
                        hop_length=self.hop_length,
                        center=True,
                        pad_mode="reflect",
                        power=2.0,
                        norm='slaney',
                        onesided=True,
                        n_mels=self.n_mels,
                        mel_scale="htk")
        return mel_spec(a)
        

    def __call__(self, sample):
        waveform, transcription,sentence = sample['waveform'], sample['transcription'],sample['sentence']
        #zero pad waveform
        zero_pad = torch.zeros(1, sample_rate*max_time- waveform.size()[1])
        padding = torch.cat([waveform,zero_pad],1)
        # get spectrogram
        wave_spec = self.mel_spectrogram(waveform)
        wave_spec = wave_spec.swapaxes(1,2)
        #change transcription list to tensor
        transcription = torch.tensor(transcription, dtype=torch.long, device=device)

        return {'waveform': wave_spec, 'transcription': transcription, 'sentence':sentence}
    
transformed_dataset = VoiceDataset(transform = MelSpec())

for i in range(len(transformed_dataset)):
    sample = transformed_dataset[i]

    print(i, sample['waveform'].size(), sample['sentence'])

    if i == 3:
        break

0 torch.Size([1, 226, 80]) that sobered him a little
1 torch.Size([1, 363, 80]) open confession is good for the soul
2 torch.Size([1, 344, 80]) but the englishman was exultant
3 torch.Size([1, 274, 80]) i am following my destiny


In [5]:
dictOfindex = { i : char_dict[i] for i in range(0, len(char_dict) ) }
dictOfchar = { char_dict[i]:i for i in range(0, len(char_dict) ) }

In [85]:
def pad_sequence(batch):
    # Make all tensor in a batch the same length by padding with zeros
    batch = [item.t() for item in batch]    
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0)
    return batch


def pad_collate(batch):
    max_input_len = float('-inf')
    max_target_len = float('-inf')

    for elem in batch:
        feature = elem['waveform']
        feature = feature.squeeze()
        trn = elem['transcription']
        max_input_len = max_input_len if max_input_len > feature.shape[0] else feature.shape[0]
        max_target_len = max_target_len if max_target_len > len(trn) else len(trn)

    for i, elem in enumerate(batch):
        f = elem['waveform']
        trn = elem['transcription']
        sentence = elem['sentence']
        f = f.squeeze()
        input_length = f.shape[0]
        input_dim = f.shape[1]
        # print('f.shape: ' + str(f.shape))
        feature = np.zeros((max_input_len, input_dim), dtype=np.float32)
        feature[:f.shape[0], :f.shape[1]] = f
        trn = np.pad(trn, (0, max_target_len - len(trn)), 'constant', constant_values=29)
        batch[i] = (feature, trn, input_length,sentence)
        # print('feature.shape: ' + str(feature.shape))
        # print('trn.shape: ' + str(trn.shape))

    batch.sort(key=lambda x: x[2], reverse=True)

    return default_collate(batch)



def collate_fn(batch):

    # A data tuple has the form:
    # waveform, sample_rate, label, speaker_id, utterance_number

    tensors, targets,sentence = [], [],[]

    # Gather in lists, and encode labels as indices
    for a in batch:
        tensors += [a['waveform']]
        targets += [a['transcription']]
        sentence += [a['sentence']]
                   
    # Group the list of tensors into a batched tensor
    tensors = tensors
#     targets = torch.stack(targets)
    targets = pad_sequence(targets)

    return tensors, targets,sentence


train_loader = DataLoader(transformed_dataset, batch_size=10,collate_fn=pad_collate,
                        shuffle=True, num_workers=0)

iterator = iter(train_loader)
x_batch,y,input_lengths,sentence = iterator.next()
print(x_batch,y,input_lengths,sentence)

tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [1.0571e-07, 3.7094e-07, 4.8095e-07,  ..., 2.9530e-07,
          1.9673e-07, 7.4557e-08],
         [5.0575e-08, 1.7748e-07, 4.0925e-07,  ..., 2.9312e-08,
          9.3720e-09, 1.3724e-08],
         [1.4446e-09, 5.0692e-09, 4.5073e-09,  ..., 6.0489e-10,
          1.2518e-09, 5.0188e-10]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [5.6637e-09, 1.9875e-08, 1.1874e-08,  ..., 1.1692e-11,
          2.9177e-11, 9.1143e-11],
         ...,
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.000

In [86]:
y

tensor([[28, 20,  8,  5,  0,  9, 14, 20,  5, 18,  6,  1,  3,  5,  0, 23,  1, 19,
          0,  9, 14,  4,  5,  3,  9, 16,  8,  5, 18,  1,  2, 12,  5, 27, 29, 29,
         29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29, 29, 29, 29, 29],
        [28, 20,  8,  5,  0, 12, 15, 18, 18, 25,  0,  4, 18,  9, 22,  5, 18,  0,
         25,  1, 23, 14,  5,  4,  0,  1, 14,  4,  0,  4,  5,  3,  9,  4,  5,  4,
          0, 20, 15,  0,  8,  5,  1,  4,  0,  6, 15, 18,  0, 20,  8,  5,  0, 14,
          5, 24, 20,  0, 13, 15, 20,  5, 12, 27],
        [28, 19,  8,  5,  0, 12,  5,  1, 18, 14,  5,  4,  0,  8, 15, 23,  0, 20,
         15,  0, 16, 12,  1, 25,  0, 20,  8,  5,  0, 16,  9,  1, 14, 15,  0,  9,
         14,  0,  1,  0, 13, 21, 19,  9,  3,  0, 19,  3,  8, 15, 15, 12, 27, 29,
         29, 29, 29, 29, 29, 29, 29, 29, 29, 29],
        [28,  2,  1,  4,  8,  1, 13,  0,  1, 12, 19, 15,  0, 16, 21,  2, 12,  9,
         19,  8,  5,  4,  0, 19, 15, 13,

In [7]:
d1 = transformed_dataset[10]['waveform']

In [8]:
d1.shape

torch.Size([1, 363, 80])

In [36]:
n1 = nn.GRU(80,10)
n2 = nn.GRU(10, 10)
n3 = nn.GRU(10, 10)

In [37]:
o1,o2 = n1(x_batch)
b1,b2 = n2(o1)

In [60]:
hmm = nn.GRU(80,10,bidirectional=True)
hmm2 = nn.GRU(20,5,bidirectional=True)
input_x = x_batch.size(1)
enc_len = x_batch.size(2)
total_length = x_batch.size(1)
packed_input = pack_padded_sequence(x_batch, input_lengths, batch_first=True)
hah, oo= hmm(packed_input,oo)
wq,tt = hmm2(hah)
output, _ = pad_packed_sequence(wq, batch_first=True, total_length=total_length)


In [61]:
x_batch.shape

torch.Size([10, 399, 80])

In [62]:
output.shape

torch.Size([10, 399, 10])

In [66]:
y

tensor([[ 9, 20,  0,  3,  1, 14,  0,  8, 15, 23,  5, 22,  5, 18,  0,  2,  5,  0,
         19, 12,  1,  9, 14,  0, 23,  9, 20,  8,  0,  1,  0, 19,  9, 12, 22,  5,
         18,  0,  2, 21, 12, 12,  5, 20],
        [ 9, 12, 12,  0, 15, 23, 14,  0, 20,  8,  1, 20,  0,  8,  5, 19,  0, 23,
          9, 20,  8,  0,  1,  0,  8,  1, 18, 19,  8,  0, 19,  5, 20, 29, 29, 29,
         29, 29, 29, 29, 29, 29, 29, 29],
        [ 4, 15,  0, 18,  9,  7,  8, 20,  0,  1, 14,  4,  0,  6,  5,  1, 18,  0,
         14, 15,  0, 13,  1, 14, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29, 29, 29],
        [ 1,  0, 13,  1, 14,  0,  9, 19,  0, 11, 14, 15, 23, 14,  0,  2, 25,  0,
         20,  8,  5,  0,  3, 15, 13, 16,  1, 14, 25,  0,  8,  5,  0, 11,  5,  5,
         16, 19, 29, 29, 29, 29, 29, 29],
        [20,  8,  1, 20,  0, 19, 15,  2,  5, 18,  5,  4,  0,  8,  9, 13,  0,  1,
          0, 12,  9, 20, 20, 12,  5, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 2

In [70]:
def pad_list(xs, pad_value):
        # From: espnet/src/nets/e2e_asr_th.py: pad_list()
        n_batch = len(xs)
        max_len = max(x.size(0) for x in xs)
        pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
        for i in range(n_batch):
            pad[i, :xs[i].size(0)] = xs[i]
        return pad

In [71]:
ys = [yi[yi != 29] for yi in y]  # parse padded ys
# prepare input and output word sequences with sos/eos IDs
eos = ys[0].new([27])
sos = ys[0].new([28])
ys_in = [y for y in ys]
ys_out = [y for y in ys]
# padding for ys with -1
# pys: utt x olen
ys_in_pad = pad_list(ys_in, 27)
ys_out_pad = pad_list(ys_out, 29)
assert ys_in_pad.size() == ys_out_pad.size()
batch_size = ys_in_pad.size(0)
output_length = ys_in_pad.size(1)

In [81]:
y

tensor([[ 9, 20,  0,  3,  1, 14,  0,  8, 15, 23,  5, 22,  5, 18,  0,  2,  5,  0,
         19, 12,  1,  9, 14,  0, 23,  9, 20,  8,  0,  1,  0, 19,  9, 12, 22,  5,
         18,  0,  2, 21, 12, 12,  5, 20],
        [ 9, 12, 12,  0, 15, 23, 14,  0, 20,  8,  1, 20,  0,  8,  5, 19,  0, 23,
          9, 20,  8,  0,  1,  0,  8,  1, 18, 19,  8,  0, 19,  5, 20, 29, 29, 29,
         29, 29, 29, 29, 29, 29, 29, 29],
        [ 4, 15,  0, 18,  9,  7,  8, 20,  0,  1, 14,  4,  0,  6,  5,  1, 18,  0,
         14, 15,  0, 13,  1, 14, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 29, 29, 29, 29, 29],
        [ 1,  0, 13,  1, 14,  0,  9, 19,  0, 11, 14, 15, 23, 14,  0,  2, 25,  0,
         20,  8,  5,  0,  3, 15, 13, 16,  1, 14, 25,  0,  8,  5,  0, 11,  5,  5,
         16, 19, 29, 29, 29, 29, 29, 29],
        [20,  8,  1, 20,  0, 19, 15,  2,  5, 18,  5,  4,  0,  8,  9, 13,  0,  1,
          0, 12,  9, 20, 20, 12,  5, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29, 29, 2

In [82]:
ys_in_pad

tensor([[ 9, 20,  0,  3,  1, 14,  0,  8, 15, 23,  5, 22,  5, 18,  0,  2,  5,  0,
         19, 12,  1,  9, 14,  0, 23,  9, 20,  8,  0,  1,  0, 19,  9, 12, 22,  5,
         18,  0,  2, 21, 12, 12,  5, 20],
        [ 9, 12, 12,  0, 15, 23, 14,  0, 20,  8,  1, 20,  0,  8,  5, 19,  0, 23,
          9, 20,  8,  0,  1,  0,  8,  1, 18, 19,  8,  0, 19,  5, 20, 27, 27, 27,
         27, 27, 27, 27, 27, 27, 27, 27],
        [ 4, 15,  0, 18,  9,  7,  8, 20,  0,  1, 14,  4,  0,  6,  5,  1, 18,  0,
         14, 15,  0, 13,  1, 14, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
         27, 27, 27, 27, 27, 27, 27, 27],
        [ 1,  0, 13,  1, 14,  0,  9, 19,  0, 11, 14, 15, 23, 14,  0,  2, 25,  0,
         20,  8,  5,  0,  3, 15, 13, 16,  1, 14, 25,  0,  8,  5,  0, 11,  5,  5,
         16, 19, 27, 27, 27, 27, 27, 27],
        [20,  8,  1, 20,  0, 19, 15,  2,  5, 18,  5,  4,  0,  8,  9, 13,  0,  1,
          0, 12,  9, 20, 20, 12,  5, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
         27, 27, 27, 2

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.0, bidirectional=True):
        super(Encoder, self).__init__()
        
        self.rnn = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout,
                          bidirectional=bidirectional)
        self.rnn2 = nn.GRU(hidden_size, hidden_size)
        

    def forward(self, input_x, enc_len):
        total_length = input_x.size(1)  # get the max sequence length
        # print('total_length: ' + str(total_length))
        # print('input_x.size(): ' + str(input_x.size()))
        packed_input = pack_padded_sequence(input_x, enc_len, batch_first=True)
        # print('enc_len: ' + str(enc_len))
        packed_output, hidden = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=total_length)
        return output, hidden
    
hmm = nn.GRU(80,10,bidirectional=True)
input_x = x_batch.size(1)
enc_len = x_batch.size(2)
total_length =x_batch.size(1)
packed_input = pack_padded_sequence(x_batch, input_lengths, batch_first=True)
hah, _= hmm(packed_input)
output, _ = pad_packed_sequence(hah, batch_first=True, total_length=total_length)


In [12]:
output.shape

torch.Size([10, 399, 20])

In [16]:
encoder = Encoder(80, 30, 1, dropout=0.0, bidirectional=False)

gg, oo = encoder(x_batch,input_lengths)

In [17]:
gg.shape

torch.Size([10, 399, 30])

In [18]:
oo.shape

torch.Size([1, 10, 30])

In [191]:
encoder = Encoder(80, 30, 1, dropout=0.0, bidirectional=True)
decoder = Decoder(vocab_size=29, embedding_dim=15, hidden_size=30, num_layers=1)

In [115]:
y.shape

torch.Size([10, 63])

In [192]:
ww, wo ,wi = decoder(y,gg,oo)

In [193]:
ww.shape, wo.shape,wi.shape

(torch.Size([640, 29]), torch.Size([640]), torch.Size([10, 64]))

In [223]:
hmm = Fi.cross_entropy(ww, wo,
                                  ignore_index=29,
                                  reduction='none')

In [211]:
hmm.mean()

tensor(1.9204, grad_fn=<MeanBackward0>)

In [221]:
hmm

tensor(3.3766, grad_fn=<NllLossBackward>)

In [226]:
hmm[hmm >0].shape

torch.Size([364])

In [222]:
1229/3.3766

363.9755967541314

In [157]:
# rnny = nn.ModuleList()
rnny = nn.LSTMCell(15, 30,)

In [188]:
em = decoder.embedding(y)

IndexError: index out of range in self

ln = em.size(1)

In [160]:
wah = rnny(em[:,0,:],(oo[0],oo[1]))

In [163]:
wah[2].shape

IndexError: tuple index out of range

In [148]:
em[:,i,:].shape

torch.Size([10, 15])

In [146]:
for i in range(em.size(1)):
    h,c = rnny(em[:,i,:])

In [121]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        # Hyper parameters
        # embedding + output
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # rnn
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.encoder_hidden_size = hidden_size  # must be equal now
        # Components
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.ModuleList()
        self.gru += [nn.LSTMCell(self.embedding_dim, self.hidden_size)]
        self.mlp = nn.Sequential(
            nn.Linear(self.encoder_hidden_size,
                      self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.vocab_size))

    def zero_state(self, encoder_padded_outputs, H=None):
        N = encoder_padded_outputs.size(0)
        H = self.hidden_size if H == None else H
        return encoder_padded_outputs.new_zeros(N, H)
    
    def pad_list(self,xs, pad_value):
        # From: espnet/src/nets/e2e_asr_th.py: pad_list()
        n_batch = len(xs)
        max_len = max(x.size(0) for x in xs)
        pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
        for i in range(n_batch):
            pad[i, :xs[i].size(0)] = xs[i]
        return pad
    
    def get_pads(self,padded_input):
        PAD_token = dictOfchar['PAD']
        EOS_token = dictOfchar['EOS']
        SOS_token = dictOfchar['SOS']
        ys = [y[y != PAD_token] for y in padded_input]  # parse padded ys
        # prepare input and output word sequences with sos/eos IDs
#         eos = ys[0].new([EOS_token])
#         sos = ys[0].new([SOS_token])
#         ys_in = [torch.cat([sos, y], dim=0) for y in ys]
#         ys_out = [torch.cat([y, eos], dim=0) for y in ys]
        # padding for ys with -1
        # pys: utt x olen
        ys_in_pad = self.pad_list(ys, EOS_token)
        ys_out_pad = self.pad_list(ys, PAD_token)
        assert ys_in_pad.size() == ys_out_pad.size()
        
        return ys_in_pad, ys_out_pad

    def forward(self, padded_input, encoder_padded_outputs,encoder_hidden):
        """
        Args:
            padded_input: N x To
            # encoder_hidden: (num_layers * num_directions) x N x H
            encoder_padded_outputs: N x Ti x H
        Returns:
        """
        # *********Get Input and Output
        # from espnet/Decoder.forward()
        
#         ys_in_pad, ys_out_pad = self.get_pads(padded_input)
        
        ys_in_pad = padded_input
        ys_out_pad = padded_input

        batch_size = ys_in_pad.size(0)
        output_length = ys_in_pad.size(1)
             
        # *********Init decoder rnn
        h_list = encoder_hidden[0]
        c_list = encoder_hidden[1]

        y_all = []
        

        # **********LAS: 1. decoder rnn 2. attention 3. concate and MLP
        embedded = self.embedding(ys_in_pad)
        for t in range(output_length):
            # step 1. decoder RNN: s_i = RNN(s_i−1,y_i−1,c_i−1)

            rnn_input = embedded[:, t, :]
            h_list, c_list = self.gru[0](
                rnn_input, (h_list, c_list))
            rnn_output = h_list # below unsqueeze: (N x H) -> (N x 1 x H)
            mlp_input = rnn_output
            predicted_y_t = self.mlp(mlp_input)
            y_all.append(predicted_y_t)

        y_all = torch.stack(y_all, dim=1)  # N x To x C

        # **********Cross Entropy Loss
        # F.cross_entropy = NLL(log_softmax(input), target))
        y_all = y_all.view(batch_size * output_length, self.vocab_size)
#         ce_loss = F.cross_entropy(y_all, ys_out_pad.view(-1),
#                                   ignore_index=PAD_token,
#                                   reduction='mean')

        return y_all, ys_out_pad.view(-1)


In [299]:
EOS_token

27

In [88]:
PAD_token = dictOfchar['PAD']
EOS_token = dictOfchar['EOS']
SOS_token = dictOfchar['SOS']


def train(features, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_tensor = features[0]
    target_tensor = features[1]
    input_length = features[2]
    
    batch_size = input_tensor.size(0)
    
    encoder_output, encoder_hidden = encoder(input_tensor,input_length)
    pred,actual = decoder(target_tensor,encoder_output,encoder_hidden)
    loss = criterion(pred,actual,ignore_index=PAD_token,reduction='mean')
    
    
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [89]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [116]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    
    lns = len(transformed_dataset)

    encoder_optimizer = optim.ADAM(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.ADAM(decoder.parameters(), lr=learning_rate)
                      
#     criterion = nn.NLLLoss()
    criterion = Fi.cross_entropy
    


    for i in range(1, n_iters):
        rand_sampler = torch.utils.data.RandomSampler(transformed_dataset, num_samples=10, replacement=True)
        train_sampler = DataLoader(transformed_dataset, batch_size=11, sampler=rand_sampler,collate_fn=pad_collate)
        iterator = iter(train_sampler)        
        
        features = iterator.next()            
        loss = train(features, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, i/ n_iters),
                                         i, i / n_iters * 100, print_loss_avg))

        if i % plot_every == 0:
            plot_loss_avg = plot_loss_total / (plot_every*1.0)
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
    
    


# Train model

In [91]:
rand_sampler = torch.utils.data.RandomSampler(transformed_dataset, num_samples=10, replacement=True)
train_sampler = DataLoader(transformed_dataset, batch_size=10, sampler=rand_sampler,collate_fn=pad_collate)

In [None]:
DataLoader(transformed_dataset, batch_size=10,collate_fn=pad_collate,
                        shuffle=True, num_workers=0)

<function iter>

In [99]:
trainIters(encoder,decoder, 1000, print_every=1000, plot_every=2000,learning_rate=0.01)

NameError: name 'decoder' is not defined

In [122]:
encoder = Encoder(80, 30, 1, dropout=0.05, bidirectional=True)
decoder = Decoder(vocab_size=30, embedding_dim=15, hidden_size=30, num_layers=1)

trainIters(encoder,decoder, 1000, print_every=100, plot_every=2000,learning_rate=0.01)

0m 37s (- 5m 40s) (100 10%) 3.3931
1m 14s (- 4m 58s) (200 20%) 3.3420
1m 50s (- 4m 17s) (300 30%) 3.2889
2m 25s (- 3m 38s) (400 40%) 3.2373
3m 1s (- 3m 1s) (500 50%) 3.1831
3m 37s (- 2m 25s) (600 60%) 3.1304
4m 13s (- 1m 48s) (700 70%) 3.0799
4m 49s (- 1m 12s) (800 80%) 3.0354
5m 25s (- 0m 36s) (900 90%) 2.9940


In [118]:
def evaluate(encoder, decoder, features, max_length=100):
    with torch.no_grad():
        input_tensor = samp[0]
        target_tensor = samp[1]
        input_length = samp[2]
        decoded_words = []

        encoder_output, encoder_hidden = encoder(input_tensor,input_length)
        dec_input = torch.tensor([[SOS_token]], device=device)  # SOS
        h_list = encoder_hidden[0] 
        c_list = encoder_hidden[1]

        for c in range(max_length):
            decoder_input = decoder.embedding(dec_input)
#             print(decoder_input.shape)
            rng = decoder_input.size(1)
            for rn in range(rng):
                h_list, c_list = decoder.gru[0](decoder_input[:,rn,:], (h_list, c_list))
            mlp_input = h_list
            predicted_y_t = decoder.mlp(mlp_input)
            local_scores = Fi.log_softmax(predicted_y_t, dim=1)
            # topk scores
            topv, topi  = torch.topk(local_scores,1,dim=1)
            output_probs = torch.exp(local_scores)
#             yay = torch.distributions.categorical.Categorical(output_probs)
#             topi = yay.sample().reshape(1,1)
#             torch.cat([dec_input,topi],dim=1)

            if topi.item() == EOS_token:
                decoded_words.append('EOS')
                break
            else:
                decoded_words.append(dictOfindex[topi.item()])
            dec_input = topi
    return decoded_words

In [369]:
PATH = "/Users/dami.osoba/work/bawk/models/dec_model_new"
torch.save(decoder, PATH)

In [370]:
PATH = "/Users/dami.osoba/work/bawk/models/enc_model_new"
torch.save(encoder, PATH)

In [50]:
enmodel = torch.load("/Users/dami.osoba/work/bawk/models/enc_model")
enmodel.eval()

decmodel = torch.load("/Users/dami.osoba/work/bawk/models/dec_model")
decmodel.eval()

AttnDecoderRNN(
  (embedding): Embedding(29, 20)
  (attn): Linear(in_features=40, out_features=401, bias=True)
  (attn_combine): Linear(in_features=40, out_features=20, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(20, 20)
  (out): Linear(in_features=20, out_features=29, bias=True)
)

In [119]:
samp_1 = DataLoader(transformed_dataset, batch_size=1,collate_fn=pad_collate,
                        shuffle=True, num_workers=0)
iterator = iter(samp_1) 
samp = iterator.next()
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        iterator = iter(samp_1) 
        samp = iterator.next()
        actual = samp[3]
        output_words = evaluate(encoder, decoder, samp, max_length=100)
        output_sentence = ' '.join(output_words[1:-1])
        print(actual, '<', output_sentence)
        print('')

In [350]:
samp[3]

('leko resigned when checkmate was threatened',)

In [120]:
evaluateRandomly(encoder, decoder, n=10)

('oh no answered the woodman',) < SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

('if a pen has no ink its broken',) < SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

('they are not listed here',) < SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SO

In [79]:
train_text[23]

'that might be all right.'

In [77]:
predict(train_path[23],train_text[23],enmodel,decmodel)

that might be all right < SOSthe dige tone<EOS>



In [76]:
def predict(wav_path,transcription,encoder,decoder):
    # Use the model to predict the label of the waveform
    waveform, _ = torchaudio.load(wav_path)
    sentence = transcription.encode(encoding="ascii",errors="ignore").decode().translate(table_trans)
    chars =[b for a in sentence for b in a]
    coded = [28]+[char_dict.index(a) for a in chars]+[27]
    sample['waveform'] =waveform
    sample['transcription'] = coded
    sample['sentence'] = sentence
    transformer = MelSpec()
    mels =transformer(sample)
    ex =mels['waveform']
    
    output_words, attentions,_ = evaluate(encoder, decoder, ex)
    output_sentence = ''.join(output_words)
    return output_sentence
    

In [67]:
def evaluate(encoder, decoder, tens, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tens
        input_length = input_tensor.size(2)
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        encoder_output, encoder_hidden = encoder(input_tensor.reshape(1,1,80*401), encoder_hidden,MAX_LENGTH)


        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden
        decoder_output = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention,decoder_probs = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            yay = torch.distributions.categorical.Categorical(decoder_probs)
            topi = yay.sample()
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(dictOfindex[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1],decoder_output

In [34]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        choice = np.random.randint(200)
        print(choice)
        actual = transformed_dataset[choice]['sentence']
        ex = transformed_dataset[choice]['waveform']
        output_words, attentions,_ = evaluate(encoder, decoder, ex)
        output_sentence = ''.join(output_words)
        print(actual, '<', output_sentence)
        print('')

# Really bad model

In [39]:
evaluateRandomly(encoder1, attn_decoder1)


168
you said it to me too < SOSthet hy one ill ster a troulenpf<EOS>

183
now look what youve done < SOScoof honing ipler<EOS>

13
two < SOSyo lrlieb cowiovere zoon<EOS>

62
he disappeared into the tent < SOShegiver a of a the marze her erpeesen up eaniap browndoyrewid for adyorcrawntent witt dearntnged on u<EOS>

166
ill talk to you tonight < SOSyou poet fee hy odous courldengtas i sotellndingty on feny ontouenilhan<EOS>

27
how did it happen < SOSthee loong uperee right spigel ven<EOS>

157
richard has gone camping by himself < SOShe it fakrieve auntpy a seaits salingedirs t neees feccid<EOS>

170
what shall we do now < SOSwhot mir they inrey<EOS>

152
good weekend is edited by amelia lester < SOSjex do ge goes lhth<EOS>

11
say what you have got to say < SOSser<EOS>

