# **DISCLAIMER: THE SKELETON OF THE CODE AND VARIOUS FUNCTIONS/CLASSES OF THE CODE ARE BASED UPON EXAMPLE CODE GIVEN IN RECITATION 8 PART 1 AND PART 2**

# **Importing all Relevant Libraries and Mounting my Drive**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from matplotlib.pyplot import *
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
from torchvision import transforms
from torchvision import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss
from functools import partial
from dataclasses import dataclass
from collections import OrderedDict
import os
from PIL import Image
import torchvision.transforms.functional as TF
import time
from sklearn.svm import SVC
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
cuda = torch.cuda.is_available()
cuda

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
!kaggle competitions download -c 11-785-fall-20-homework-4-part-2

In [None]:
!unzip -q dev.npy.zip
!unzip -q test.npy.zip
!unzip -q train.npy.zip
!unzip -q train_transcripts.npy.zip

# **Loading the Data**

In [None]:
def load_data():
    """Load the speech features and transcripts from the working directory.

    Every file is read with ``np.load(..., allow_pickle=True,
    encoding='bytes')``.

    Returns:
        A 5-tuple ``(speech_train, speech_valid, speech_test,
        transcript_train, transcript_valid)`` holding whatever ``np.load``
        returns for each file, in that order.
    """
    def _read(path):
        return np.load(path, allow_pickle=True, encoding='bytes')

    speech_files = ('train.npy', 'dev.npy', 'test.npy')
    transcript_files = ('./train_transcripts.npy', './dev_transcripts.npy')

    # Speech arrays are read first, then the transcripts.
    speech = tuple(_read(name) for name in speech_files)
    transcripts = tuple(_read(name) for name in transcript_files)
    return speech + transcripts


def transform_letter_to_index(transcript, letter_list):
    """Convert word-level transcripts into lists of character indices.

    Each utterance is a sequence of words (``bytes`` or ``str``). The words are
    joined with single spaces, every character is replaced by its position in
    ``letter_list``, and the result is wrapped as ``<sos> ... <eos>``.

    Args:
        transcript: Iterable of utterances, each an iterable of words.
        letter_list: Vocabulary; a symbol's position is its index. It must
            contain ``'<sos>'`` and ``'<eos>'`` (and ``' '`` for utterances
            with more than one word).

    Returns:
        A list with one list of integer indices per utterance.

    Raises:
        KeyError: If a character (or ``'<sos>'`` / ``'<eos>'``) is missing
            from ``letter_list``.
    """
    letter2index = {letter: index for index, letter in enumerate(letter_list)}
    sos, eos = letter2index['<sos>'], letter2index['<eos>']

    indexed = []
    for utterance in transcript:
        # Words may be raw bytes (np.load with encoding='bytes') or plain str.
        words = [
            word.decode('utf-8') if isinstance(word, bytes) else str(word)
            for word in utterance
        ]
        sentence = ' '.join(words)
        indexed.append([sos] + [letter2index[ch] for ch in sentence] + [eos])
    return indexed




class Speech2TextDataset(data.Dataset):
    """Dataset pairing speech utterances with their index-encoded transcripts.

    Args:
        speech: Indexable collection of per-utterance feature arrays (each must
            support ``.astype``).
        text: Indexable collection of integer label sequences, one per
            utterance. Required when ``isTrain`` is true.
        isTrain: When true, ``__getitem__`` returns ``(speech, text)``;
            otherwise it returns only the speech tensor.

    Raises:
        ValueError: If ``isTrain`` is true but no ``text`` was given.
    """

    def __init__(self, speech, text=None, isTrain=True):
        if isTrain and text is None:
            raise ValueError("text must be provided when isTrain is True")
        self.speech = speech
        self.isTrain = isTrain
        # Always define the attribute so later access cannot raise AttributeError.
        self.text = text

    def __len__(self):
        return len(self.speech)

    def __getitem__(self, index):
        features = torch.tensor(self.speech[index].astype(np.float32))
        if not self.isTrain:
            return features
        return features, torch.tensor(self.text[index], dtype=torch.long)


def collate_train(batch_data):
    """Pad a batch of ``(speech, text)`` pairs for training.

    The first label token of every transcript is dropped; it is assumed to be
    the ``<sos>`` marker, which is an input to the decoder rather than a
    prediction target.

    Args:
        batch_data: Non-empty list of ``(speech, text)`` tensor pairs, where
            ``speech`` is ``(frames, features)`` and ``text`` is a 1-D tensor
            of label indices.

    Returns:
        ``(inputs_pad, targets_pad, i_lens, t_lens)``:
            * ``inputs_pad``: time-major ``(T, B, C)`` zero-padded speech.
            * ``targets_pad``: batch-major ``(B, L)`` targets, padded with 0.
            * ``i_lens``: ``(B,)`` long tensor of unpadded speech lengths.
            * ``t_lens``: ``(B,)`` long tensor of unpadded target lengths.
    """
    from torch.nn.utils.rnn import pad_sequence

    inputs = [speech for speech, _ in batch_data]
    targets = [text[1:] for _, text in batch_data]

    i_lens = torch.tensor([len(speech) for speech in inputs], dtype=torch.long)
    t_lens = torch.tensor([len(text) for text in targets], dtype=torch.long)

    # Speech stays time-major (batch_first=False); targets are batch-major so
    # they can be indexed as (batch, step).
    inputs_pad = pad_sequence(inputs)
    targets_pad = pad_sequence(targets, batch_first=True)
    return inputs_pad, targets_pad, i_lens, t_lens


def collate_test(batch_data):
    """Pad a batch of unlabeled speech utterances for inference.

    Args:
        batch_data: Non-empty list of ``(frames, features)`` speech tensors.

    Returns:
        ``(inputs_pad, i_lens)`` where ``inputs_pad`` is the time-major
        ``(T, B, C)`` zero-padded batch and ``i_lens`` is a ``(B,)`` long
        tensor with each utterance's unpadded length.
    """
    from torch.nn.utils.rnn import pad_sequence

    inputs = list(batch_data)
    i_lens = torch.tensor([len(speech) for speech in inputs], dtype=torch.long)
    inputs_pad = pad_sequence(inputs)
    return inputs_pad, i_lens


# **Model Architecture- Based on Recitation 8 part 2 framework**

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils as utils

# Device string for tensors and modules: 'cuda' when CUDA is available, otherwise 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class Attention(nn.Module):
    '''
    Dot-product attention over the encoder outputs.

        energy    = bmm(key, query)
        attention = softmax(energy), with padded time steps masked out
        context   = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query: (N, key_size) decoder state used as the query
        :param key: (N, T, key_size) encoder key projection
        :param value: (N, T, value_size) encoder value projection
        :param lens: (N,) number of valid time steps per batch element
        :return: (context, attention) with shapes (N, value_size) and (N, T)
        '''
        # Energy must come from the keys, not the values.
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)

        # True where a time step lies beyond the sequence's real length.
        steps = torch.arange(key.size(1), device=key.device)
        mask = steps.unsqueeze(0) >= lens.to(key.device).unsqueeze(1)

        # A large negative energy gives padded steps ~zero weight after softmax.
        energy = energy.masked_fill(mask, -1e9)
        attention = nn.functional.softmax(energy, dim=1)

        out = torch.bmm(attention.unsqueeze(1), value).squeeze(1)
        return out, attention


class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM.
    Utterances can be hundreds to thousands of frames long. The paper reports
    that a plain LSTM encoder converges slowly and gives inferior results,
    mainly because AttendAndSpell cannot extract the relevant information from
    so many input steps. Each pBLSTM layer therefore halves the time
    resolution by concatenating every pair of adjacent frames before its
    BiLSTM.

    :param input_dim: feature size AFTER pairing, i.e. twice the feature size
                      of the incoming sequence
    :param hidden_dim: hidden size per direction (output has 2 * hidden_dim)
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)

    def forward(self, x):
        '''
        :param x: PackedSequence with feature size input_dim / 2
        :return output: PackedSequence with half the time steps and feature
                        size 2 * hidden_dim
        :raises ValueError: if the longest sequence has fewer than two frames
        '''
        x_pad, x_lens = utils.rnn.pad_packed_sequence(x, batch_first=True)
        batch_size, max_len, feat_dim = x_pad.shape
        if max_len < 2:
            raise ValueError("pBLSTM needs at least two time steps to pair frames")

        # Drop a trailing odd frame so the frames pair up exactly.
        even_len = max_len - (max_len % 2)
        x_pad = x_pad[:, :even_len, :]

        # Merge each pair of consecutive frames into a single, wider frame.
        x_pad = x_pad.reshape(batch_size, even_len // 2, feat_dim * 2)

        # Lengths halve too; keep at least one step because packing rejects 0.
        x_lens = torch.clamp(x_lens // 2, min=1)

        x_pack = utils.rnn.pack_padded_sequence(x_pad, x_lens, batch_first=True, enforce_sorted=False)
        out, _ = self.blstm(x_pack)
        return out


class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key and value.
    Key and value are linear projections of the output of the pBLSTM stack.

    :param input_dim: feature size of each speech frame
    :param hidden_dim: hidden size per direction of every (p)BLSTM layer
    :param value_size: output size of the value projection
    :param key_size: output size of the key projection
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)

        # Every bidirectional layer emits hidden_dim*2 features and each pBLSTM
        # concatenates two frames, so each pBLSTM receives hidden_dim*4.
        self.pBLSTMs = nn.Sequential(
            pBLSTM(hidden_dim*4, hidden_dim),
            pBLSTM(hidden_dim*4, hidden_dim),
            pBLSTM(hidden_dim*4, hidden_dim)
        )

        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    def forward(self, x, lens):
        '''
        :param x: (T, N, input_dim) zero-padded, time-major speech batch
        :param lens: (N,) unpadded length of every utterance
        :return: (keys, value), padded time-major projections of the final
                 pBLSTM output with feature sizes key_size and value_size
        '''
        # pack_padded_sequence requires the lengths to live on the CPU.
        if torch.is_tensor(lens):
            lens = lens.cpu()
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)

        # Reduce the time resolution with the pyramidal layers.
        outputs = self.pBLSTMs(outputs)

        linear_input, _ = utils.rnn.pad_packed_sequence(outputs)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value


class Decoder(nn.Module):
    '''
    Character decoder. Each step of the forward loop handles a single time
    step, so LSTMCell is used instead of LSTM. The output of the second
    LSTMCell is the query for the attention module; when attention is enabled
    its context replaces the per-step encoder value.

    :param vocab_size: number of output symbols
    :param hidden_dim: embedding size and hidden size of the first LSTMCell
    :param value_size: feature size of the encoder values / context
    :param key_size: feature size of the encoder keys (and of the query)
    :param isAttended: use attention instead of values[i] as the context
    :param sos_index: vocabulary index of the start-of-sequence symbol fed at
                      the first step
    '''
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False, sos_index=33):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)
        self.sos_index = sos_index

    def forward(self, key, values, text=None, isTrain=True, rate=0.01, isGumbel= False, lens=None):
        '''
        :param key: (T, N, key_size) output of the encoder key projection
        :param values: (T, N, value_size) output of the encoder value projection
        :param text: (N, text_len) target indices; required when isTrain is True.
                     text[:, i] is the symbol to predict at step i, so it must
                     not start with <sos>
        :param isTrain: train (teacher forcing available) or inference mode
        :param rate: probability, per training step, of feeding the model's own
                     previous prediction instead of the ground-truth symbol
        :param isGumbel: when the model's own prediction is fed during
                         training, sample it with Gumbel-softmax instead of argmax
        :param lens: (N,) valid encoder steps per batch element, used only for
                     attention masking; defaults to T for every element
        :return predictions: (N, steps, vocab_size) unnormalised character scores
        :raises ValueError: if isTrain is True and text is None
        '''
        seq_len, batch_size = key.shape[0], key.shape[1]
        device = key.device

        if isTrain:
            if text is None:
                raise ValueError("text is required when isTrain is True")
            max_len = text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = 250

        if self.isAttended:
            # The attention module works on batch-major tensors.
            key_bf = key.transpose(0, 1)
            values_bf = values.transpose(0, 1)
            if lens is None:
                lens = torch.full((batch_size,), seq_len, dtype=torch.long)
            lens = lens.to(device)
            # No attention has been computed yet, so start from a zero context.
            context = values.new_zeros(batch_size, values.shape[2])

        predictions = []
        hidden_states = [None, None]
        prediction = None
        sos = torch.full((batch_size,), self.sos_index, dtype=torch.long, device=device)

        for i in range(max_len):
            if not self.isAttended:
                # Without attention the context is values[i]; stop once the
                # encoder frames run out to avoid indexing past the end.
                if i >= seq_len:
                    break
                context = values[i, :, :]

            if i == 0:
                char_embed = self.embedding(sos)
            elif isTrain and torch.rand(1).item() >= rate:
                # Teacher forcing: feed the previous ground-truth symbol.
                char_embed = embeddings[:, i - 1, :]
            elif isTrain and isGumbel:
                # Soft one-hot sample projected onto the embedding table.
                soft_one_hot = nn.functional.gumbel_softmax(prediction, dim=-1)
                char_embed = soft_one_hot.mm(self.embedding.weight)
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            inp = torch.cat([char_embed, context], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])

            # The second cell's output is the attention query.
            output = hidden_states[1][0]
            if self.isAttended:
                context, _ = self.attention(output, key_bf, values_bf, lens)

            prediction = self.character_prob(torch.cat([output, context], dim=1))
            predictions.append(prediction.unsqueeze(1))

        return torch.cat(predictions, dim=1)


class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper that runs the
    Encoder and feeds its key/value outputs to the Decoder.

    :param input_dim: feature size of each speech frame
    :param vocab_size: number of output symbols
    :param hidden_dim: hidden size shared by encoder and decoder
    :param value_size: size of the encoder value projection
    :param key_size: size of the encoder key projection
    :param isAttended: whether the decoder should use attention
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Seq2Seq, self).__init__()
        # Forward the size and attention options so they actually take effect.
        self.encoder = Encoder(input_dim, hidden_dim, value_size, key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size, key_size, isAttended)

    def forward(self, speech_input, speech_len, text_input=None, isTrain=True):
        '''
        :param speech_input: padded speech batch passed to the encoder
        :param speech_len: unpadded length of every utterance
        :param text_input: target indices, used only when isTrain is True
        :param isTrain: train or inference mode
        :return predictions: the decoder's character predictions
        '''
        key, value = self.encoder(speech_input, speech_len)
        if isTrain:
            return self.decoder(key, value, text_input)
        return self.decoder(key, value, text=None, isTrain=False)


# **Train/Test Functions - Based on Recitation 8 part 1 and part 2 framework/code**

In [None]:
import time
import torch


# Device string for tensors and modules: 'cuda' when CUDA is available, otherwise 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def train(model, train_loader, criterion, optimizer, epoch):
    """Run one training epoch and return the average loss per target token.

    Args:
        model: Module called as ``model(speech, speech_lens, text)`` and
            returning ``(N, steps, vocab)`` scores.
        train_loader: Iterable of ``(speech, text, speech_lens, text_lens)``
            batches with ``text`` shaped ``(N, L)``.
        criterion: Loss called on ``(N * steps, vocab)`` scores and
            ``(N * steps,)`` targets. It must return one loss per element
            (``reduction='none'``) so that padded positions can be masked.
        optimizer: Optimizer over ``model``'s parameters.
        epoch: Epoch number, used only in the progress message.

    Returns:
        Sum of the unpadded token losses divided by the number of unpadded
        tokens (0.0 if the loader produced no tokens).
    """
    model.train()
    model.to(DEVICE)
    start = time.time()

    # Seeding is left to the caller: re-seeding here would replay the same
    # random draws in every epoch.
    total_loss = 0.0
    total_tokens = 0

    for speech, text, speech_lens, text_lens in train_loader:
        speech = speech.to(DEVICE)
        text = text.to(DEVICE)
        text_lens = text_lens.to(DEVICE)

        optimizer.zero_grad()
        predictions = model(speech, speech_lens, text)

        # The model may emit fewer steps than the padded target length.
        steps = predictions.size(1)
        targets = text[:, :steps]

        # True for real target tokens, False for padding.
        mask = torch.arange(steps, device=DEVICE).unsqueeze(0) < text_lens.unsqueeze(1)
        n_tokens = int(mask.sum().item())
        if n_tokens == 0:
            continue

        token_loss = criterion(predictions.reshape(-1, predictions.size(2)), targets.reshape(-1))
        batch_loss = (token_loss.view(targets.shape) * mask).sum()

        # Normalise by the number of real tokens, not by the padded size.
        loss = batch_loss / n_tokens
        loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_tokens += n_tokens

    avg_loss = total_loss / total_tokens if total_tokens else 0.0
    end = time.time()
    print('Epoch', epoch, 'train loss:', avg_loss, 'time (s):', end - start)
    return avg_loss

def test(model, test_loader, epoch):
    """Greedy-decode every batch and, when labels are present, score it.

    Args:
        model: Module called as ``model(speech, speech_lens, isTrain=False)``
            and returning ``(N, steps, vocab)`` scores.
        test_loader: Iterable of either ``(speech, speech_lens)`` batches
            (unlabeled) or ``(speech, text, speech_lens, text_lens)`` batches
            (labeled).
        epoch: Epoch number; accepted for a uniform call signature, unused.

    Returns:
        ``(accuracy, pred_text, target_text)``:
            * ``accuracy``: fraction of target tokens matched at the same
              position by the prediction; 0.0 when no labels were seen.
            * ``pred_text``: one list of predicted indices per utterance,
              cut at the first ``<eos>`` / ``<pad>``.
            * ``target_text``: the matching target lists (empty if unlabeled).
    """
    model.eval()
    model.to(DEVICE)

    # Symbols that terminate a decoded sequence.
    stop_tokens = {LETTER_LIST.index('<eos>'), LETTER_LIST.index('<pad>')}

    def _cut(sequence):
        """Return ``sequence`` up to (excluding) the first stop token."""
        for position, token in enumerate(sequence):
            if token in stop_tokens:
                return sequence[:position]
        return sequence

    correct = 0
    total = 0
    pred_text = []
    target_text = []

    with torch.no_grad():
        for batch in test_loader:
            if len(batch) == 4:
                feats, labels, feat_lens, _ = batch
            else:
                feats, feat_lens = batch
                labels = None

            outputs = model(feats.to(DEVICE), feat_lens, isTrain=False)
            decoded = outputs.argmax(dim=-1).cpu().tolist()

            for row, sequence in enumerate(decoded):
                prediction = _cut(sequence)
                pred_text.append(prediction)
                if labels is None:
                    continue
                target = _cut(labels[row].tolist())
                target_text.append(target)
                correct += sum(p == t for p, t in zip(prediction, target))
                total += len(target)

    # Hand the model back in training mode.
    model.train()
    accuracy = correct / total if total else 0.0
    return accuracy, pred_text, target_text

# **Training and Testing the Network - Based on recitation 8 part 1 and 2 code**

In [None]:


# Device string for tensors and modules: 'cuda' when CUDA is available, otherwise 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Character vocabulary: a symbol's position in this list is its integer index
# ('<pad>' is 0, '<sos>' is 33 and '<eos>' is 34).
LETTER_LIST = ['<pad>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', \
               'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']

def main():
    """Build the model and data loaders, then train and decode for each epoch."""
    model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=128)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Per-element losses so that padded positions can be masked before averaging.
    criterion = nn.CrossEntropyLoss(reduction='none')
    nepochs = 25
    batch_size = 64 if DEVICE == 'cuda' else 1

    speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
    character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
    # TODO: the validation split is encoded here but not yet wired into a
    # dataset / loader / validation pass.
    character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

    train_dataset = Speech2TextDataset(speech_train, character_text_train)
    test_dataset = Speech2TextDataset(speech_test, None, False)

    # DataLoader is reached through the `data` module the file imports.
    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train)
    test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test)

    for epoch in range(nepochs):
        train(model, train_loader, criterion, optimizer, epoch)
        test(model, test_loader, epoch)

# Run the pipeline only when this module is executed as the main program.
if __name__ == '__main__':
    main()

# **Developing Attention Graphs- Based on recitation 8 part 1 code**

In [None]:
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import numpy as np
def plot_attn_flow(attn_mask, path):
    """Save ``attn_mask`` as an image at ``path`` using the 'hot' colormap.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can keep working with it.
    """
    plt.imsave(fname=path, arr=attn_mask, cmap='hot')
    return plt

def plot_grad_flow(named_parameters, path):
    """Plot the mean and max absolute gradient of every non-bias parameter.

    Parameters that do not require gradients, whose name contains ``"bias"``,
    or that have no gradient yet are skipped.

    Args:
        named_parameters: Iterable of ``(name, parameter)`` pairs, e.g.
            ``model.named_parameters()`` after a backward pass.
        path: Destination file for the saved figure.

    Returns:
        ``(plt, max_grads)`` where ``max_grads`` holds one 0-dim CPU tensor
        per plotted parameter.
    """
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in named_parameters:
        # A parameter that has not taken part in backward() has grad None.
        if p.requires_grad and "bias" not in n and p.grad is not None:
            grad = p.grad.detach().abs().cpu()
            layers.append(n)
            ave_grads.append(grad.mean())
            max_grads.append(grad.max())

    # Matplotlib needs plain numbers, not tensors.
    positions = np.arange(len(max_grads))
    plt.bar(positions, [g.item() for g in max_grads], alpha=0.1, lw=1, color="c")
    plt.bar(positions, [g.item() for g in ave_grads], alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    # Save before showing: show() may release the figure, which would leave
    # savefig() writing an empty image.
    plt.savefig(path)
    plt.show()
    return plt, max_grads


# **DISCLAIMER: THE SKELETON OF THE CODE AND VARIOUS FUNCTIONS/CLASSES OF THE CODE ARE BASED UPON EXAMPLE CODE GIVEN IN RECITATION 8 PART 1 AND PART 2**