#### Importing libraries

In [1]:
import os
import numpy as np
import pickle
import tensorflow as tf
import time
import random
from tensorflow.keras.layers import Dense,GRU,LSTM,Masking,Embedding

#### Loading data stored in .pkl and .npy files

In [2]:
def load_pkl(path):
    """
    Read a pickled object back from disk.

    Args:
        path: location of the pickle file; it is opened in binary read mode.

    Returns:
        The Python object deserialized from the file.
    """
    with open(path, mode='rb') as handle:
        return pickle.load(handle)

# Audio inputs, deserialized from a pickle file (the path has no extension).
# Later cells bind this to `X` and call `.toarray()` on each item.
audio_sequence_padded=load_pkl('processed_numpy/audio')

In [3]:
# Text targets, loaded from .npy files; `txt_sequence_padded` is bound to `y` further down.
# NOTE(review): 'txtx.npy' may be a typo for 'txt.npy' — confirm the file name on disk.
txt_sequence_padded=np.load('processed_numpy/txtx.npy')
# Presumably the unpadded length of each text sequence; not referenced in the visible cells.
txt_sequence_length=np.load('processed_numpy/txt_length.npy')

In [4]:
# unique_chars=load_pkl('processed_numpy/unique_chars')
# Lookup tables between integer ids and characters: later cells index `ind2char`
# with target/predicted ids and read `char2ind['<SOS>']` for the start token.
ind2char=load_pkl('processed_numpy/ind2char')
char2ind=load_pkl('processed_numpy/char2ind')

#### Defining Model

In [5]:
#input_shape=(2,1628,494)
class Encoder(tf.keras.Model):
    """
    GRU encoder over padded input feature sequences.

    Time steps whose features are all 0.0 are masked before the GRU so that padding
    does not influence the returned outputs or the final state.

    Args:
        enc_units: number of GRU units (width of every output/state vector).
        batch_sz: batch size used by ``initialize_hidden_state``.
    """
    def __init__(self,enc_units,batch_sz):
        super(Encoder,self).__init__()
        self.batch_sz=batch_sz
        self.enc_units=enc_units
        # Created once here rather than on every forward pass; Masking holds no weights.
        self.masking=Masking(mask_value=0.0)
        # dropout / recurrent_dropout only take effect when the layer runs in training mode.
        self.gru=GRU(self.enc_units,dropout=0.3,recurrent_dropout=0.3,return_sequences=True,return_state=True)

    def call(self,x,hidden,training=None):
        """
        Encode a batch of sequences.

        Args:
            x: input features, (batch, time, features).
            hidden: initial GRU state, (batch, enc_units).
            training: optional Keras training flag, forwarded to the GRU so that its
                dropout is applied when the caller passes ``training=True``. The
                default ``None`` leaves the decision to Keras, as before.

        Returns:
            (output, hidden): per-step outputs (batch, time, enc_units) and the final
            state (batch, enc_units).
        """
        masked=self.masking(x)
        output,hidden=self.gru(masked,initial_state=hidden,training=training)        #TRY WITH LSTM only cell_state will be added
        return output,hidden

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_sz, enc_units)."""
        return tf.zeros((self.batch_sz,self.enc_units))

# Notebook-wide encoder instance used by the training cells: 256 GRU units, batch size 2.
encoder=Encoder(256,2)
# Commented-out smoke test of the encoder output shapes.
# NOTE(review): `s` (the sample input) is not defined in the visible cells.
#sample_input
# sample_hidden=encoder.initialize_hidden_state()
# sample_output,sample_hidden=encoder(s,sample_hidden)
# print(sample_output.shape)
# print(sample_hidden.shape)



In [6]:
class BahdanauAttention(tf.keras.layers.Layer):
    """
    Additive attention: scores every time step of ``values`` against ``query``.

    Args:
        units: width of the two projection layers applied to the query and the values.
    """
    def __init__(self,units):
        super().__init__()
        self.w1=Dense(units)   # projects the query
        self.w2=Dense(units)   # projects the values
        self.v=Dense(1)        # collapses each projected step to one score

    def call(self,query,values):
        """
        Args:
            query: (batch, features), e.g. the decoder hidden state.
            values: (batch, time, features), e.g. the encoder outputs.

        Returns:
            (context_vector, attention_weights): the weighted sum of ``values`` over
            the time axis, (batch, features), and the weights themselves,
            (batch, time, 1).
        """
        # Insert a time axis so the query broadcasts against every step of `values`.
        expanded_query=tf.expand_dims(query,1)            # (batch, 1, features)
        projected_query=self.w1(expanded_query)           # (batch, 1, units)
        projected_values=self.w2(values)                  # (batch, time, units)
        energies=self.v(tf.nn.tanh(projected_query+projected_values))   # (batch, time, 1)

        # Normalize across the time axis so the weights of each sample sum to 1.
        attention_weights=tf.nn.softmax(energies,axis=1)

        # Weight each step, then sum the time axis away.
        context_vector=tf.reduce_sum(attention_weights*values,axis=1)   # (batch, features)

        return context_vector,attention_weights

# Stand-alone attention layer with 256 projection units. The Decoder below builds its
# own BahdanauAttention, so this instance only serves the commented-out shape check.
attention_layer=BahdanauAttention(256)
# attention_result,attention_weights=attention_layer(sample_hidden,sample_output)
# print(attention_result.shape)
# print(attention_weights.shape)

In [7]:
# Size of the decoder vocabulary: embedding input dimension and width of the output logits.
len_unique_chars=30

class Decoder(tf.keras.Model):
    """
    Single-step GRU decoder with Bahdanau attention over the encoder outputs.

    Args:
        dec_units: width of the embedding, the GRU and the attention projections.
        batch_sz: stored on the instance; not used by ``call``.
    """
    def __init__(self,dec_units,batch_sz):
        super(Decoder,self).__init__()
        self.batch_sz=batch_sz
        self.dec_units=dec_units
        #Forward through unidirectional GRU
        self.embedding=tf.keras.layers.Embedding(len_unique_chars,dec_units)
        self.gru=GRU(dec_units,return_sequences=True,return_state=True)
        self.fc=Dense(len_unique_chars)

        #used for attention
        self.attention=BahdanauAttention(self.dec_units)

    def call(self,x,hidden,enc_output):
        """
        Decode one time step.

        Args:
            x: token ids for this step, (batch, 1).
            hidden: previous decoder state, (batch, dec_units). It is the attention
                query and also the GRU's initial state, so its width must equal
                ``dec_units``.
            enc_output: encoder outputs, (batch, time, features).

        Returns:
            (logits, state, attention_weights): unnormalized scores
            (batch, len_unique_chars), the new GRU state (batch, dec_units) and the
            attention weights (batch, time, 1).
        """
        context_vector,attention_weights=self.attention(hidden,enc_output)
        # context_vector: (batch, features); attention_weights: (batch, time, 1)
        x=self.embedding(x)                                          # (batch, 1, dec_units)
        # Prepend the context to the embedded token: (batch, 1, features + dec_units)
        x=tf.concat([tf.expand_dims(context_vector,1),x],axis=-1)
        # Carry the previous state into the GRU. Without initial_state the GRU would
        # restart from zeros on every step and `hidden` would only influence attention.
        output,state=self.gru(x,initial_state=hidden)                # (batch, 1, dec_units)
        # Drop the length-1 time axis: (batch, dec_units)
        output=tf.reshape(output,(-1,output.shape[2]))
        # Project to vocabulary logits: (batch, len_unique_chars)
        x=self.fc(output)
        return x,state,attention_weights

# Notebook-wide decoder instance used by the training cells: 256 units, batch size 2.
decoder=Decoder(256,2)
# Commented-out shape check.
# NOTE(review): it prints `sample_decoder_output`, but the call above it assigns `prediction`.
# prediction,dec_hidden,_=decoder(tf.random.uniform((2,1)),sample_hidden,sample_output)
# print(sample_decoder_output.shape)

#### Defining the loss function and optimizer

In [8]:
# clip = 50.0
# Probability that a training batch is decoded with teacher forcing (read by train()).
teacher_forcing_ratio=0.3
learning_rate = 0.0001
# decoder_learning_ratio = 5.0
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated and
# is rejected by newer Keras releases.
optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate)

# SparseCategoricalCrossentropy expects integer class ids as targets and float scores as predictions.
# from_logits=True: the model's output layer has no softmax, so the loss receives raw,
#   unnormalized scores (logits) and applies the normalization itself.
# reduction='none': keep one loss value per position so loss_function can mask padding
#   before averaging.
loss_object=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

def loss_function(real, pred):
    """
    Padding-masked sparse categorical cross-entropy.

    Args:
        real: integer class ids; positions equal to 0 are treated as padding. May carry
            a trailing singleton axis, e.g. shape (batch, 1).
        pred: unnormalized scores (logits) with the class axis last, e.g. (batch, classes).

    Returns:
        Scalar tensor: the mean of the per-position losses, where padded positions
        contribute 0 (they still count in the denominator).
    """
    real = tf.convert_to_tensor(real)
    # loss_object yields one value per position, i.e. without a trailing singleton
    # axis. Drop that axis from the targets too so the mask lines up element-wise;
    # otherwise (batch,) * (batch, 1) broadcasts to (batch, batch) and a single padded
    # sample rescales the loss of every sample instead of masking only its own.
    if len(real.shape) == len(pred.shape) and real.shape[-1] == 1:
        real = tf.squeeze(real, axis=-1)

    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

#### Saving model checkpoint

In [9]:
# Keep checkpoints next to the notebook; the previous value pointed at the filesystem
# root, which is normally not writable.
checkpoint_dir='./training_checkpoints'
# Join directory and file prefix as two path components. Concatenating them first
# produced '<dir>ckpt' as one name, so checkpoints landed beside the directory
# rather than inside it.
checkpoint_prefix=os.path.join(checkpoint_dir,"ckpt")
# Tracks the optimizer and both models so that checkpoint.save() stores all of them.
checkpoint=tf.train.Checkpoint(optimizer=optimizer,
                              encoder=encoder,
                              decoder=decoder)

In [10]:
# Batching bookkeeping shared by batch_generator, train and train_iters.
batch_size=2
X,y=audio_sequence_padded,txt_sequence_padded
sample_index=np.arange(len(X))
# Ceiling division: a trailing partial batch still counts as a batch.
number_of_batches=-(-len(X)//batch_size)
# The first 20% of the batches (rounded down) are held out for validation;
# train_iters trains on the batches that follow them.
val_batches=int(np.floor(0.2*number_of_batches))
train_batches=number_of_batches-val_batches

def batch_generator(batch):
    """
    Materialize batch number ``batch`` from the notebook-level ``X`` and ``y``.

    Args:
        batch: zero-based batch index.

    Returns:
        (x_batch, y_batch): float32 array stacking ``X[i].toarray()`` for every sample
        in the batch, and int32 array stacking the matching ``y[i]`` rows.
    """
    first=batch_size*batch
    if batch == (number_of_batches-1):
        # Last batch: take whatever samples remain.
        batch_index=sample_index[first:]
    else:
        batch_index=sample_index[first:first+batch_size]

    x_batch=np.array([X[i].toarray() for i in batch_index]).astype('float32')
    y_batch=np.array([y[i] for i in batch_index]).astype('int32')

    return x_batch,y_batch

In [17]:
# X1,Y1=batch_generator(1349)
# print(X1.dtype)
# print(Y1.dtype)
# Y1.shape
# X1.shape
# dummy_loss=train(X1,Y1,encoder,decoder,char2ind,optimizer)

In [11]:
# @tf.function

# x_batch dtype=float32 and y_batch dtype=int32 (as cast by batch_generator); number_of_batches: 1352, total samples: 2703
def train(x_batch,y_batch,encoder,decoder,char2ind,optimizer):
    """
    Run one optimization step on a single batch and return its average step loss.

    Args:
        x_batch: encoder inputs, (batch, time, features).
        y_batch: integer target ids, (batch, target_len). Decoding starts from
            ``char2ind['<SOS>']`` and targets are read from column 1 onwards, so
            column 0 is assumed to hold the <SOS> id — TODO confirm against the
            preprocessing.
        encoder: encoder model exposing ``enc_units``; called as ``encoder(x, hidden)``.
        decoder: decoder model; called as ``decoder(token, hidden, encoder_outputs)``.
        char2ind: mapping that contains the '<SOS>' key.
        optimizer: optimizer used to apply the gradients.

    Returns:
        The summed step losses divided by ``target_len - 1``.
    """
    # Take the batch size from the data: the final batch of the dataset can be
    # smaller than the configured batch size.
    n_samples=x_batch.shape[0]
    n_steps=y_batch.shape[1]
    loss=0

    with tf.GradientTape() as tape:

        #Forward pass through encoder, starting from an all-zero state
        enc_hidden=tf.zeros((n_samples,encoder.enc_units))
        encoder_outputs,encoder_hidden=encoder(x_batch,enc_hidden)

        #create initial decoder input(starting with SOS_token for each sent)
        decoder_input=tf.expand_dims([char2ind['<SOS>']]*n_samples,1)

        #set initial decoder hidden state to encoders final hidden state
        decoder_hidden=encoder_hidden

        #Determine if we are using teacher forcing for this batch
        use_teacher_forcing=random.random() < teacher_forcing_ratio

        #Forward batch of seq one time step at time through decoder
        for t in range(1,n_steps):
            decoder_output,decoder_hidden,_=decoder(decoder_input,decoder_hidden,encoder_outputs)

            # The output produced from the token at position t-1 is scored against the
            # token at position t, in both decoding modes. Targets are passed as a 1-D
            # vector so the padding mask in loss_function lines up with the
            # per-sample losses.
            real=y_batch[:,t]
            loss+=loss_function(real,decoder_output)

            if use_teacher_forcing:
                # Teacher forcing: next input is current target
                decoder_input=y_batch[:,t].reshape(-1,1)
            else:
                # Free running: next input is the model's own best guess
                decoder_input=np.argmax(decoder_output,1).reshape(-1,1)

    batch_loss=loss/int(n_steps-1)

    # Perform backpropagation
    variables=encoder.trainable_variables + decoder.trainable_variables
    gradients=tape.gradient(loss,variables)
    optimizer.apply_gradients(zip(gradients,variables))

    return batch_loss

#### Start training

In [14]:
def train_iters(train_batches,val_batches,encoder,decoder,char2ind,ind2char,optimizer):
    """
    Train for one pass over the training batches, printing progress and sample decodes.

    Batches ``val_batches .. val_batches+train_batches-1`` are used for training;
    batches ``0 .. val_batches-1`` are sampled at random for the greedy-decoding
    printouts. Checkpoints are saved on the last two training batches and the
    averaged losses are plotted at the end.

    Args:
        train_batches: number of training batches to run.
        val_batches: number of leading batches reserved for validation.
        encoder: encoder model exposing ``enc_units``.
        decoder: decoder model.
        char2ind: mapping that contains the '<SOS>' key.
        ind2char: mapping from integer id to character.
        optimizer: optimizer handed to ``train``.
    """
    # Imported here, before any training work: the notebook's import cell does not
    # bring in matplotlib, so the closing plt.plot call used to fail with NameError
    # only after the whole training run had finished.
    import matplotlib.pyplot as plt

    # Id that ends greedy decoding; the original comment calls it <EOS>.
    # NOTE(review): presumably equal to char2ind's end-of-sequence entry — confirm.
    eos_id=2
    print_every=50
    plot_every=100
    # Running sums together with the number of batches they cover, so the reported
    # averages stay correct for the first (single-batch) window as well.
    print_loss,print_count=0,0
    plot_loss,plot_count=0,0
    plot_loss_list=[]
    #TRY ADDING EPOCHS
    start=time.time()
    for train_batch in range(train_batches):
        x_train,y_train=batch_generator(val_batches+train_batch)
        train_loss=train(x_train,y_train,encoder,decoder,char2ind,optimizer)
        print_loss+=train_loss
        print_count+=1
        plot_loss+=train_loss
        plot_count+=1

        if train_batch % print_every == 0:
            print_loss_avg = print_loss / print_count
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f};time:{}".format(train_batch, train_batch / train_batches * 100, print_loss_avg,time.time()-start))
            print('_'*30)
            print_loss,print_count=0,0

        if train_batch % plot_every == 0:
            plot_loss_list.append(float(plot_loss/plot_count))
            plot_loss,plot_count=0,0

        # Save on the last two training batches.
        if train_batch > train_batches-3:
            checkpoint.save(file_prefix=checkpoint_prefix)

        # NOTE(review): this validation pass runs after EVERY training batch and
        # decodes `val_batches` randomly drawn batches each time. That is very
        # expensive; confirm whether it was meant to run once after the loop or only
        # every `print_every` batches.
        for _ in range(val_batches):
            val_batch=random.randrange(0,val_batches)
            x_val,y_val=batch_generator(val_batch)
            n_val=x_val.shape[0]
            # Reference transcripts with padding (id 0) removed.
            inputs=[[ind2char[each] for each in row if each!=0] for row in y_val]
            outputs=[[] for _ in range(n_val)]

            enc_hidden=tf.zeros((n_val,encoder.enc_units))
            encoder_outputs,encoder_hidden=encoder(x_val,enc_hidden)

            decoder_input=tf.expand_dims([char2ind['<SOS>']]*n_val,1)
            decoder_hidden=encoder_hidden

            # Greedy decoding: feed the arg-max prediction back in as the next input.
            for t in range(y_val.shape[1]):
                decoder_output,decoder_hidden,_=decoder(decoder_input,decoder_hidden,encoder_outputs)
                topi = np.argmax(decoder_output,1).reshape(-1,1)

                # Stop once every sequence in the batch has predicted the end token.
                if np.all(topi==eos_id):
                    break

                decoder_input = topi

                for decoded,token in zip(outputs,topi[:,0]):
                    decoded.append(ind2char[token])

            for position,chars in enumerate(inputs,start=1):
                print('input{}:{}'.format(position,''.join(chars)))
            for position,chars in enumerate(outputs,start=1):
                print('output{}:{}'.format(position,''.join(chars)))
            print('time:{}'.format(time.time()-start))

    plt.plot(plot_loss_list)

In [19]:
# Kick off training with the notebook-level models, vocabularies and optimizer.
train_iters(
    train_batches=train_batches,
    val_batches=val_batches,
    encoder=encoder,
    decoder=decoder,
    char2ind=char2ind,
    ind2char=ind2char,
    optimizer=optimizer,
)

In [15]:
# def eval_example(number):
#     Xval_exp=X[number].toarray()
#     Xval_exp=np.array(Xval_exp)
#     Xval_exp=Xval_exp.astype('float32')
    
#     Yval_exp=y[number]
#     Yval_exp=np.array(Yval_exp)
#     Yval_exp=Yval_exp.astype('int32')
    
#     return Xval_exp,Yval_exp

# Xval_exppp,Yval_exppp = eval_example(3)   
# Yval_exppp.shape

In [13]:
# For brevity, this error message is generated when there is not enough memory to handle the batch size.
# ERROR:InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run Cast: Dst tensor is not initialized. [Op:Cast]

#### Loading a .flac audio file

In [None]:
# Listen to one LibriSpeech utterance inside the notebook.
import IPython.display as ipd
import librosa
# sr=16000 is passed as the requested sample rate; the call returns the samples
# together with the sample rate, which is reused for playback below.
samples,sample_rate=librosa.load('LibriSpeech/dev-clean/2078/142845/2078-142845-0000.flac',sr=16000)

# Last expression of the cell, so the audio player is rendered as the cell output.
ipd.Audio(samples, rate=sample_rate)