In [None]:
import os
os.environ['KAGGLE_USERNAME'] = "ruoxinhuang" 
os.environ['KAGGLE_KEY'] = "f9d6d5aade71452503766f80812da73d"
!kaggle competitions download -c 11-785-fall-20-homework-4-part-2
!unzip test.npy.zip
!unzip train.npy.zip
!unzip dev.npy.zip
!unzip train_transcripts.npy

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset 
import torch.nn as nn

def load_data():
    '''
    Read the five ``.npy`` files used by the notebook from the working directory.

    :return: tuple ``(speech_train, speech_valid, speech_test,
             transcript_train, transcript_valid)`` in exactly that order.

    NOTE(review): the files are loaded with ``allow_pickle=True``, which can
    execute arbitrary code for a malicious file - only load trusted data.
    '''
    def _read(path):
        # Same loader settings for every file.
        return np.load(path, allow_pickle=True, encoding='bytes')

    speech_files = ('train.npy', 'dev.npy', 'test.npy')
    transcript_files = ('./train_transcripts.npy', './dev_transcripts.npy')

    speech = [_read(path) for path in speech_files]
    transcripts = [_read(path) for path in transcript_files]
    return (*speech, *transcripts)


'''
Transforms alphabetical input to numerical input, replace each letter by its corresponding 
index from letter_list
'''
def transform_letter_to_index(transcript, letter_list):
    '''
    Convert transcripts into lists of vocabulary indices.

    Each sentence is a sequence of words. Words are joined by the index of
    ``' '`` and every sentence is terminated by the index of ``'<eos>'``.
    Both indices are looked up in ``letter_list`` rather than hardcoded, so the
    encoding stays correct if the vocabulary order changes.

    :param transcript: iterable of sentences; each sentence is an iterable of
                       words given as ``bytes`` (decoded as UTF-8) or ``str``
    :param letter_list: vocabulary; must contain ``' '`` and ``'<eos>'``
    :return: list with one list of int indices per sentence; an empty sentence
             yields ``[<eos> index]``
    :raises KeyError: if a character (or ``' '`` / ``'<eos>'``) is missing
                      from ``letter_list``
    '''
    # Later duplicates win, matching create_dictionaries().
    l2i = {letter: idx for idx, letter in enumerate(letter_list)}
    space_idx = l2i[' ']
    eos_idx = l2i['<eos>']

    output = []
    for sentence in transcript:
        arr = []
        for i, word in enumerate(sentence):
            if i > 0:
                # Separator between consecutive words only.
                arr.append(space_idx)
            chars = word.decode("utf-8") if isinstance(word, bytes) else word
            arr.extend(l2i[character] for character in chars)

        # Always terminate, including for an empty sentence.
        arr.append(eos_idx)
        output.append(arr)
    return output



'''
Optional, create dictionaries for letter2index and index2letter transformations
'''
def create_dictionaries(letter_list):
    '''
    Build the two lookup tables for a vocabulary.

    :param letter_list: sequence of symbols; position in the sequence is the index
    :return: ``(letter2index, index2letter)``; if a symbol occurs more than
             once, ``letter2index`` keeps its last position
    '''
    index2letter = dict(enumerate(letter_list))
    letter2index = {symbol: position for position, symbol in index2letter.items()}
    return letter2index, index2letter


class Speech2TextDataset(Dataset):
    '''
    Dataset pairing utterances with (optionally) their index-encoded transcripts.

    In training mode each item is ``(speech, speech_len, text, text_len)``;
    otherwise it is ``(speech, speech_len)``. ``speech`` is converted to a
    float32 tensor and ``text`` to a tensor built from the stored index list.
    '''
    def __init__(self, speech, text=None, isTrain=True):
        '''
        :param speech: indexable collection of per-utterance numpy arrays
        :param text: indexable collection of index lists, or None when there
                     are no transcripts
        :param isTrain: when true, items also carry the transcript
        '''
        self.speech = speech
        self.isTrain = isTrain
        # Always define the attribute so a missing transcript produces a clear
        # error instead of an AttributeError deep inside a DataLoader worker.
        self.text = text

    def __len__(self):
        # len() works for numpy arrays as well as plain lists.
        return len(self.speech)

    def __getitem__(self, index):
        utterance = torch.tensor(self.speech[index].astype(np.float32))
        if not self.isTrain:
            return utterance, len(utterance)

        if self.text is None:
            raise ValueError("Speech2TextDataset was created with isTrain=True but without text")
        transcript = self.text[index]
        return utterance, len(utterance), torch.tensor(transcript), len(transcript)


def collate_train(batch):
    '''
    Collate ``(speech, speech_len, text, text_len)`` samples into one batch.

    :param batch: list of 4-element samples
    :return: ``(X_pad, X_len, Y_pad, Y_len)`` - speech padded with 0.0 and
             text padded with 0 (both batch-first), plus the two lists of
             original lengths
    '''
    # Transpose the list of samples into one list per field.
    speech, speech_lens, text, text_lens = (
        [sample[field] for sample in batch] for field in range(4)
    )

    pad = nn.utils.rnn.pad_sequence
    padded_speech = pad(speech, batch_first=True, padding_value=0.0)
    padded_text = pad(text, batch_first=True, padding_value=0)

    return padded_speech, speech_lens, padded_text, text_lens


def collate_test(batch):
    '''
    Collate ``(speech, speech_len)`` samples into one batch.

    :param batch: list of samples whose first two elements are the utterance
                  tensor and its length
    :return: ``(X_pad, X_len)`` - utterances padded with 0.0 (batch-first) and
             the list of original lengths
    '''
    utterances, lengths = [], []
    for sample in batch:
        utterances.append(sample[0])
        lengths.append(sample[1])

    padded = nn.utils.rnn.pad_sequence(utterances, batch_first=True, padding_value=0.0)
    return padded, lengths

In [None]:
import time
import torch
### Add Your Other Necessary Imports Here! ###

# Device the model and every batch are moved to.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def train(model, train_loader, criterion, optimizer, epoch, log_every=100):
    '''
    Run one training epoch.

    :param model: called as ``model(X, X_lens, Y, isTrain=True)``; its output
                  is permuted with ``(0, 2, 1)`` before being given to the criterion
    :param train_loader: iterable of ``(X, X_lens, Y, Y_lens)`` batches
    :param criterion: loss called as ``criterion(outputs, Y)``
    :param optimizer: optimizer stepped once per batch
    :param epoch: epoch number, used only in the progress output
    :param log_every: print the running mean loss every this many batches;
                      a falsy value disables printing
    :return: mean batch loss over the epoch (0.0 for an empty loader)
    '''
    model.train()
    model.to(DEVICE)

    epoch_loss, epoch_batches = 0.0, 0
    window_loss, window_batches = 0.0, 0
    for batch_num, (X, X_lens, Y, Y_lens) in enumerate(train_loader):
        X = X.to(DEVICE)
        Y = Y.to(DEVICE)

        optimizer.zero_grad()

        outputs = model(X, X_lens, Y, isTrain=True)
        # Swap the last two axes so the criterion receives the layout it is
        # given in the original code: outputs.permute(0, 2, 1) against Y.
        outputs = outputs.permute(0, 2, 1)
        loss = criterion(outputs, Y)
        loss.backward()
        # Clip the global gradient norm to 2 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        epoch_batches += 1
        window_loss += batch_loss
        window_batches += 1

        if log_every and batch_num % log_every == 0:
            # Divide by the number of batches actually accumulated; dividing by
            # a fixed 100 under-reported the very first value by a factor of 100.
            print(f"epoch {epoch} batch {batch_num}: loss {window_loss / window_batches:.4f}")
            window_loss, window_batches = 0.0, 0

    return epoch_loss / epoch_batches if epoch_batches else 0.0
    


In [None]:
import torch
import torch.nn as nn
import torch.nn.utils as utils

# 'cuda' when a CUDA device is available, otherwise 'cpu'. Same expression as
# the DEVICE assignment in the training cell; re-evaluated here.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

import random


class Attention(nn.Module):
    '''
    Dot-product attention over the encoder outputs:
        energy = bmm(key, query)
        attention = softmax(energy) over the time axis, padded frames masked out
        context = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()
        self.softmax = nn.Softmax(1)

    def forward(self, query, key, value, lens):
        '''
        :param query: (batch_size, key_size) decoder state used as the query
        :param key: (batch_size, max_len, key_size) key projection from the encoder
        :param value: (batch_size, max_len, value_size) value projection from the encoder
        :param lens: (batch_size,) number of valid frames per sequence (tensor or list)
        :return context: (batch_size, value_size) attended context
        '''
        # Squeeze only the singleton axis created by bmm. A bare squeeze() also
        # removes the batch axis when batch_size == 1 and breaks the softmax.
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)

        # Build the padding mask on the same device as the encoder outputs
        # instead of relying on a module-level DEVICE constant.
        lens = torch.as_tensor(lens, device=key.device)
        positions = torch.arange(key.shape[1], device=key.device)
        mask = positions.unsqueeze(0) >= lens.unsqueeze(1)

        # Padded frames get -inf energy so their softmax weight is exactly 0.
        energy = energy.masked_fill(mask, float('-inf'))
        attention = self.softmax(energy)
        context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)
        return context


class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM layer.

    Adjacent pairs of time steps are concatenated along the feature axis, so
    the sequence length is halved and the feature size doubled, and the result
    is fed to a single-layer bidirectional LSTM.
    '''
    def __init__(self, input_dim, hidden_dim):
        '''
        :param input_dim: feature size AFTER pairing (twice the incoming feature size)
        :param hidden_dim: hidden size per direction of the LSTM
        '''
        super().__init__()
        self.blstm = nn.LSTM(input_dim, hidden_dim, num_layers=1,
                             batch_first=True, bidirectional=True)

    def forward(self, x, lens):
        '''
        :param x: (N, T, D) batch-first padded input
        :param lens: per-sequence lengths; halved with floor division before packing
        :return: tuple ``(output, output_lens)`` from ``pad_packed_sequence``;
                 ``output`` is (N, T // 2, 2 * hidden_dim)
        '''
        n_seqs, n_frames, n_feats = x.shape
        n_pairs = n_frames // 2

        # Keep an even number of frames (drops the last one when T is odd),
        # then merge each pair of frames into a single, wider frame.
        paired = x[:, :2 * n_pairs, :].reshape(n_seqs, n_pairs, 2 * n_feats)

        packed = utils.rnn.pack_padded_sequence(
            paired, lengths=lens // 2, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.blstm(packed)
        return utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        


class Encoder(nn.Module):
    '''
    Encoder: a 2-layer BiLSTM followed by three pBLSTM layers, then two linear
    projections of the final outputs that produce the attention keys and values.
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128):
        '''
        :param input_dim: feature size of each input frame
        :param hidden_dim: hidden size per direction of every LSTM
        :param value_size: output size of the value projection
        :param key_size: output size of the key projection
        '''
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=2, bidirectional=True, batch_first=True)

        # Each pBLSTM receives pairs of bidirectional outputs: 2 frames * 2 directions.
        self.pblstm1 = pBLSTM(hidden_dim * 4, hidden_dim)
        self.pblstm2 = pBLSTM(hidden_dim * 4, hidden_dim)
        self.pblstm3 = pBLSTM(hidden_dim * 4, hidden_dim)

        # The two sizes were swapped here, which only worked because both
        # defaults are 128.
        self.key_network = nn.Linear(hidden_dim * 2, key_size)
        self.value_network = nn.Linear(hidden_dim * 2, value_size)

    def forward(self, x, lens):
        '''
        :param x: (N, T, input_dim) batch-first padded utterances
        :param lens: per-utterance lengths for ``pack_padded_sequence``
        :return: ``(keys, value, out_lens)`` where ``out_lens`` are the lengths
                 returned by the last pBLSTM layer
        '''
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(rnn_inp)
        out, out_lens = utils.rnn.pad_packed_sequence(out, batch_first=True)

        # Each pBLSTM halves the time axis.
        out, out_lens = self.pblstm1(out, out_lens)
        out, out_lens = self.pblstm2(out, out_lens)
        out, out_lens = self.pblstm3(out, out_lens)

        keys = self.key_network(out)
        value = self.value_network(out)

        return keys, value, out_lens


class Decoder(nn.Module):
    '''
    Character-level decoder: two stacked LSTMCells unrolled one step at a time.

    At every step the previous character embedding is concatenated with the
    current context and fed to the first cell. The hidden state of the second
    cell is the attention query, and the character scores come from the
    concatenation of that hidden state with the new context.

    ``self.tf`` is the probability, during training, of feeding the decoder its
    OWN previous prediction instead of the ground-truth character.
    '''
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True,
                 sos_index=33, max_decode_len=600):
        '''
        :param vocab_size: number of output symbols
        :param hidden_dim: embedding size and hidden size of the first LSTMCell
        :param value_size: size of the encoder values / context vector
        :param key_size: size of the encoder keys and hidden size of the second LSTMCell
        :param isAttended: use attention; otherwise the context at step ``i`` is
                           the encoder value at frame ``i`` (clamped to the last frame)
        :param sos_index: vocabulary index of the start-of-sentence symbol
        :param max_decode_len: number of decoding steps when ``isTrain`` is false
        '''
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.sos_index = sos_index
        self.max_decode_len = max_decode_len
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)
        self.tf = 0.05

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)
        # Tie the output projection to the embedding once, here, instead of
        # re-assigning it on every decoding step inside forward(). Tying is
        # only possible when both weights have the same shape.
        if key_size + value_size == hidden_dim:
            self.character_prob.weight = self.embedding.weight

    def forward(self, key, values, lens, text=None, isTrain=True):
        '''
        :param key: (N, T, key_size) output of the encoder key projection (batch-first)
        :param values: (N, T, value_size) output of the encoder value projection (batch-first)
        :param lens: (N,) number of valid encoder frames per sequence
        :param text: (N, text_len) target indices; required when ``isTrain`` is true
        :param isTrain: train mode (teacher forcing, ``text_len`` steps) or
                        eval mode (greedy, ``max_decode_len`` steps)
        :return predictions: (N, steps, vocab_size) unnormalised character scores
        '''
        batch_size = key.shape[0]

        if isTrain:
            if text is None:
                raise ValueError("text is required when isTrain is True")
            max_len = text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = self.max_decode_len

        predictions = []
        hidden_states = [None, None]
        # One-hot <sos> "prediction" so that step 0 embeds the start symbol.
        prediction = torch.zeros((batch_size, self.vocab_size), device=key.device)
        prediction[:, self.sos_index] = 1
        context = values[:, 0, :]
        last_frame = values.shape[1] - 1

        for i in range(max_len):
            # The ground truth is only available in training and from step 1
            # on; it is used with probability 1 - self.tf.
            use_ground_truth = isTrain and i > 0 and random.random() > self.tf
            if use_ground_truth:
                char_embed = embeddings[:, i - 1, :]
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            inp = torch.cat([char_embed, context], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])
            hidden_states[1] = self.lstm2(hidden_states[0][0], hidden_states[1])

            # The hidden state of the second cell is the attention query.
            output = hidden_states[1][0]
            if self.isAttended:
                context = self.attention(output, key, values, lens)
            else:
                # No attention: step through the encoder frames and stay on the
                # last one when the transcript is longer than the encoder output.
                context = values[:, min(i, last_frame), :]

            prediction = self.character_prob(torch.cat([output, context], dim=1))
            predictions.append(prediction.unsqueeze(1))

        return torch.cat(predictions, dim=1)


class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper that runs the Encoder
    and feeds its keys, values and lengths to the Decoder.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        '''
        :param input_dim: feature size of each speech frame
        :param vocab_size: number of output symbols
        :param hidden_dim: hidden size shared by encoder and decoder
        :param value_size: size of the attention values
        :param key_size: size of the attention keys
        :param isAttended: whether the decoder uses attention
        '''
        super(Seq2Seq, self).__init__()
        # Forward the size and attention options; they were accepted but
        # silently ignored, so both sub-modules always used their defaults.
        self.encoder = Encoder(input_dim, hidden_dim, value_size=value_size, key_size=key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size=value_size, key_size=key_size,
                               isAttended=isAttended)

    def forward(self, speech_input, speech_len, text_input=None, isTrain=True):
        '''
        :param speech_input: padded batch of utterances
        :param speech_len: per-utterance lengths
        :param text_input: target indices, used only when ``isTrain`` is true
        :param isTrain: train mode or eval mode
        :return: the decoder's predictions
        '''
        key, value, lens = self.encoder(speech_input, speech_len)
        if isTrain:
            return self.decoder(key, value, lens, text_input)
        return self.decoder(key, value, lens, text=None, isTrain=False)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Output vocabulary; a symbol's position in this list is its integer index.
# Positions the rest of the notebook relies on:
#   0 = '<pad>', 32 = ' ', 33 = '<sos>', 34 = '<eos>'.
LETTER_LIST = ['<pad>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', \
               'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']

In [None]:
# Load the speech arrays and transcripts from the .npy files in the working directory.
speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()
# Encode the train and dev transcripts as lists of LETTER_LIST indices.
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

In [None]:
# input_dim=40: assumes each speech frame has 40 features - TODO confirm against the data.
model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=256, isAttended=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Index 0 is '<pad>' in LETTER_LIST and is also the value collate_train pads
# transcripts with, so padded target positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
nepochs = 60
# Batches of 64 on GPU; a single utterance per batch on CPU.
batch_size = 64 if DEVICE == 'cuda' else 1

In [None]:
train_dataset = Speech2TextDataset(speech_train, character_text_train)
# The validation utterances must be paired with the validation transcripts;
# they were being paired with the training transcripts.
val_dataset = Speech2TextDataset(speech_valid, character_text_valid)
# No transcripts for the test split: items are (speech, speech_len) only.
test_dataset = Speech2TextDataset(speech_test, None, False)

# Only the training data is shuffled; test order must be kept because the
# submission ids are the positions of the predictions.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_train)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test)

In [None]:
# model.decoder.tf is the probability that, during training, the decoder is fed
# its own previous prediction instead of the ground-truth character. It is
# raised by 0.01 before every epoch.
# NOTE: this cell is not idempotent - re-running it keeps increasing tf from
# its current value and keeps training the same model.
for epoch in range(nepochs):
    model.decoder.tf +=0.01  #this is commented out in the beginning
    train(model, train_loader, criterion, optimizer, epoch)

In [None]:
def test(model, test_loader):
    ### Write your test code here! ###
    model.eval()
    result = []
    for X, X_lens in test_loader:
        X = X.to(DEVICE)

        outputs = model(X, X_lens, isTrain=False)
        indexes = torch.argmax(outputs, dim=-1).cpu()
        
        for i in indexes:
          string = ''
          for j in i:
            if j == 34:
              break
            string += i2l[j.item()]
        
          result.append(string)
      
    return result

In [None]:
l2i, i2l = create_dictionaries(LETTER_LIST)
# Predictions for the submission come from the test split. val_loader yields
# 4-element batches (it uses collate_train) and holds the dev utterances, not
# the ones the submission ids refer to.
result = test(model, test_loader)

In [None]:
import csv

# newline='' is required by the csv module: without it, platforms that
# translate line endings write an extra blank line after every row.
with open('submission.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(('id', 'label'))
    # The id is the position of the prediction in `result`.
    for i, row in enumerate(result):
        csvwriter.writerow((i, row))

In [None]:
# The google.colab module is only importable inside Google Colab.
from google.colab import files
# Hands the submission file written by the previous cell to files.download.
files.download("submission.csv")