In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
import torchvision
from torchvision import transforms
import math
from PIL import Image
from torchsummary import summary 
from tqdm import trange 
import glob
import os
import unicodedata
import string
import time 
import random 

%matplotlib inline

# GPU still outperfoms CPU for no vivid reason, that's why we leave this line

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
# strip removes spaces at the start and the end of each line
# split separates the string by the provided delimeter and returns a list of strings

data = open('train.txt', encoding='utf-8').read().strip().split('\n')

print(data[0]) # to ensure we did everything right 

# hardcoding this is a bad practice that leads to a higher probability of missing something and takes a HUGE amount
# of time, but I'll leave it as it is for now 

chardict =  {'': 0, '.': 1, "'": 2, '-':3, 'A': 4, 'B': 5, 'C': 6, 'D': 7, 'E': 8, 'F': 9, 'G': 10, 'H': 11, 'I': 12, 'J': 13, 'K': 14, 'L': 15, 'M': 16, 'N': 17, 'O': 18, 'P': 19, 'Q': 20, 'R': 21, 'S': 22, 'T': 23, 'U': 24, 'V': 25, 'W': 26, 'X': 27, 'Y': 28, 'Z': 29}
phonedict = {'AA' : 0, 'AE' : 1, 'AH' : 2, 'AO' : 3, 'AW' : 4, 'AY' : 5, 'B' : 6, 'CH' : 7, 'D' : 8, 'DH' : 9, 'EH' : 10, 'ER' : 11, 'EY' : 12, 'F' : 13, 'G' : 14, 'HH' : 15, 'IH' : 16, 'IY' : 17, 'JH' : 18, 'K' : 19, 'L' : 20, 'M' : 21, 'N' : 22, 'NG' : 23, 'OW' : 24, 'OY' : 25, 'P' : 26, 'R' : 27, 'S' : 28, 'SH' : 29, 'T' : 30, 'TH' : 31, 'UH' : 32, 'UW' : 33, 'V' : 34, 'W' : 35, 'Y' : 36, 'Z' : 37, 'ZH' : 38}

# we'll use them later for evaluation and submission

rev_chardict = {v:k for k,v in chardict.items()}
rev_phonedict = {v:k for k,v in phonedict.items()}

# adding 2 for SOS and EOS tokens

n_chars = len(chardict) + 2
n_phones = len(phonedict) + 2
n_words = len(data)

# some useful functions for preparing our data to feed the network
# note that we use one-hot encoding to get rid of nonexistent correlations that appear when we use indexing
# I actually wanted to try embeddings, yet character-level embeddings doesn't seem to make much sense 

def wordToTensor(line):
    tensor = torch.zeros(len(line) + 1, 1, n_chars)
    for li, letter in enumerate(line):
        tensor[li][0][chardict[letter] + 2] = 1
    tensor[len(line)][0][1] = 1
    return tensor

def phoneToTensor(line):
    tensor = torch.zeros(len(line), 1, n_phones)
    linelist = line.split('_')
    for li, letter in enumerate(linelist):
        tensor[li][0][phonedict[letter] + 2] = 1
    tensor[len(linelist)][0][1] = 1
    return tensor

def pairTensor(i):
    linelist = data[i].split(' ')
    return (wordToTensor(linelist[0]), phoneToTensor(linelist[1]))

# I'm terribly sorry even this is hardcoded, I experimented with cutting off words by their size to reduce training
# time. Actuallly it didn't give any valuable boost, so I left the whole dataset without trimming. 

max_wordlen = 36
max_phonelen = 20

data = [word for word in data if len(word.split(' ')[0]) <= max_wordlen]

for word in data:
    max_phonelen = max(max_phonelen, len(word.split(' ')[1]))
    
print(max_phonelen)

LEMIEUX L_AH_M_Y_UW
79


In [6]:
n_hidden = 256 # it sure helped to squeeze another several percent when i doubled hidden size (128->256)

In [7]:
# A few words about encoder and decoder: they are done in a simplest possible manner, yet some tweaking was very 
# helpful (actually it was the thing that gave the most significant improvements, wonder how much better would 
# tweaked attention model be)
# The first point: LSTM > GRU (yet training time is somewhat longer)
# Second point: bidirectional LSTM > LSTM (that too gave some significant boost, multyplying training time by 
# something like 1.5 
# Third point: dropout is crucial, and 0.5 > 0.2 (I was afraid of underfitting so I started with a lighter dropout)
# Fourth point: 2-layer LSTM (GRU) gave better results than 1-layer and 3-layer.

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.gru = nn.LSTM(n_chars, hidden_size, batch_first = True, bidirectional = True, num_layers = 2, dropout = 0.5)

    def forward(self, inp, hidden):
        output, hidden = self.gru(inp.view(1, 1, -1), hidden)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(6, 1, self.hidden_size, device=device), torch.zeros(6, 1, self.hidden_size, device=device))
    
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.gru = nn.LSTM(output_size, hidden_size, batch_first = True, bidirectional = True, num_layers = 2, dropout = 0.5)
        self.out = nn.Linear(hidden_size * 2, hidden_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inp, hidden):
        output = inp.view(1, 1, -1)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
encoder = EncoderRNN(n_chars, n_hidden).to(device)
decoder = DecoderRNN(n_hidden, n_phones).to(device)

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001)

# At first I forgot about lr decay and had mediocre results. Adam learning rate decay has proven to be on of the 
# most effective (yet absolutely unintuitive) practices. My guess is that it is because of the instability Adam
# has closer to the local minima. By manually reducing learning rate we make the extent of instability lower 
# (because learning rate in Adam is adaptive and we can only adjust the upper bound for neural net adjustments)

encoder_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(encoder_optimizer, gamma=0.85)
decoder_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(decoder_optimizer, gamma=0.85)

criterion = nn.NLLLoss()

In [9]:
# We use full teacher forcing for the sake of simplicity, so we actually use encoder only to initialize decoder 
# hidden state. By feeding the decoder SOS tokemn, we signal it that we expect it to generate a transcription for 
# us.

def train(input_tensor, target_tensor, max_length=max_wordlen):
    criterion = nn.NLLLoss()
    encoder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0.0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        
    sos = torch.zeros(1, 1, n_phones)
    sos[0][0][0] = 1
    sos = sos.to(device)
    
    decoder_input = sos

    decoder_hidden = encoder_hidden
    
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)
        loss += criterion(decoder_output, torch.max(target_tensor[di], 1)[1])
        decoder_input = target_tensor[di]

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [10]:
# Quite useful functions to use for actual epoch time estimation.

import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [11]:
# Our first train function takes one word, so we use a wrapper for it to train a whole epoch.

def trainIters(encoder, decoder, print_every=1000, plot_every=100):
    encoder.train()
    decoder.train()
    
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  
    
    training_pairs = [pairTensor(i)
                      for i in tqdm(range(len(data)))]
    
    random.shuffle(training_pairs)

    for iter in range(1, len(data) + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor.to(device), target_tensor.to(device))
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / len(data)),
                                         iter, iter / len(data) * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [12]:
# The plotting doesn't actually work for whatever reason.

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [13]:
n_epochs = 1 # I trained for around 10 to 20 epochs, setting 1 for the sake of demonstration

for i in range(n_epochs):
    encoder_lr_scheduler.step()
    decoder_lr_scheduler.step()
    print("Epoch %d" % (i + 1))
    trainIters(encoder, decoder, print_every=5000)

Epoch 1


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 10s (- 65m 19s) (5000 6%) 1.3671
8m 14s (- 60m 17s) (10000 12%) 1.0127
12m 19s (- 56m 0s) (15000 18%) 0.8134
16m 23s (- 51m 47s) (20000 24%) 0.6915
20m 25s (- 47m 33s) (25000 30%) 0.5782
24m 28s (- 43m 24s) (30000 36%) 0.4877
28m 38s (- 39m 26s) (35000 42%) 0.4404
33m 2s (- 35m 40s) (40000 48%) 0.3984
37m 29s (- 31m 48s) (45000 54%) 0.3666
41m 32s (- 27m 34s) (50000 60%) 0.3473
45m 39s (- 23m 24s) (55000 66%) 0.3237
49m 44s (- 19m 13s) (60000 72%) 0.3081
53m 50s (- 15m 4s) (65000 78%) 0.2902
57m 56s (- 10m 55s) (70000 84%) 0.2923
62m 1s (- 6m 46s) (75000 90%) 0.2727
66m 6s (- 2m 38s) (80000 96%) 0.2689
Epoch 2


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 13s (- 66m 5s) (5000 6%) 0.2386
8m 19s (- 60m 54s) (10000 12%) 0.2394
12m 26s (- 56m 34s) (15000 18%) 0.2300
16m 30s (- 52m 10s) (20000 24%) 0.2280
20m 35s (- 47m 56s) (25000 30%) 0.2254
24m 40s (- 43m 45s) (30000 36%) 0.2218
28m 46s (- 39m 37s) (35000 42%) 0.2184
32m 53s (- 35m 31s) (40000 48%) 0.2149
36m 58s (- 31m 22s) (45000 54%) 0.2088
41m 4s (- 27m 16s) (50000 60%) 0.2130
45m 9s (- 23m 8s) (55000 66%) 0.2058
49m 14s (- 19m 1s) (60000 72%) 0.2110
53m 19s (- 14m 55s) (65000 78%) 0.2046
57m 27s (- 10m 49s) (70000 84%) 0.1956
61m 33s (- 6m 43s) (75000 90%) 0.2000
65m 39s (- 2m 37s) (80000 96%) 0.1915
Epoch 3


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 13s (- 66m 8s) (5000 6%) 0.1723
8m 19s (- 60m 58s) (10000 12%) 0.1763
12m 26s (- 56m 33s) (15000 18%) 0.1756
16m 32s (- 52m 16s) (20000 24%) 0.1744
20m 37s (- 47m 59s) (25000 30%) 0.1719
24m 42s (- 43m 49s) (30000 36%) 0.1702
28m 48s (- 39m 39s) (35000 42%) 0.1702
32m 55s (- 35m 33s) (40000 48%) 0.1741
37m 2s (- 31m 25s) (45000 54%) 0.1699
41m 7s (- 27m 18s) (50000 60%) 0.1697
45m 14s (- 23m 11s) (55000 66%) 0.1692
49m 22s (- 19m 5s) (60000 72%) 0.1659
53m 28s (- 14m 57s) (65000 78%) 0.1678
57m 34s (- 10m 51s) (70000 84%) 0.1644
61m 40s (- 6m 44s) (75000 90%) 0.1696
65m 45s (- 2m 37s) (80000 96%) 0.1655
Epoch 4


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 14s (- 66m 22s) (5000 6%) 0.1510
8m 22s (- 61m 15s) (10000 12%) 0.1506
12m 27s (- 56m 39s) (15000 18%) 0.1452
16m 33s (- 52m 18s) (20000 24%) 0.1491
20m 41s (- 48m 9s) (25000 30%) 0.1465
24m 47s (- 43m 56s) (30000 36%) 0.1446
28m 53s (- 39m 46s) (35000 42%) 0.1423
32m 58s (- 35m 36s) (40000 48%) 0.1443
37m 5s (- 31m 28s) (45000 54%) 0.1483
41m 11s (- 27m 21s) (50000 60%) 0.1417
45m 19s (- 23m 14s) (55000 66%) 0.1479
49m 25s (- 19m 6s) (60000 72%) 0.1453
53m 31s (- 14m 59s) (65000 78%) 0.1487
57m 39s (- 10m 52s) (70000 84%) 0.1439
61m 43s (- 6m 44s) (75000 90%) 0.1401
65m 50s (- 2m 37s) (80000 96%) 0.1472
Epoch 5


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 14s (- 66m 19s) (5000 6%) 0.1283
8m 22s (- 61m 17s) (10000 12%) 0.1293
12m 27s (- 56m 40s) (15000 18%) 0.1262
16m 34s (- 52m 22s) (20000 24%) 0.1316
21m 0s (- 48m 53s) (25000 30%) 0.1333
25m 6s (- 44m 30s) (30000 36%) 0.1275
29m 11s (- 40m 11s) (35000 42%) 0.1277
33m 17s (- 35m 57s) (40000 48%) 0.1277
37m 23s (- 31m 44s) (45000 54%) 0.1311
41m 30s (- 27m 33s) (50000 60%) 0.1278
45m 38s (- 23m 23s) (55000 66%) 0.1299
49m 42s (- 19m 13s) (60000 72%) 0.1264
53m 48s (- 15m 3s) (65000 78%) 0.1270
57m 55s (- 10m 55s) (70000 84%) 0.1268
62m 1s (- 6m 46s) (75000 90%) 0.1322
66m 8s (- 2m 38s) (80000 96%) 0.1288
Epoch 6


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 15s (- 66m 37s) (5000 6%) 0.1170
8m 19s (- 60m 55s) (10000 12%) 0.1153
12m 26s (- 56m 33s) (15000 18%) 0.1154
16m 34s (- 52m 21s) (20000 24%) 0.1180
20m 41s (- 48m 9s) (25000 30%) 0.1143
24m 48s (- 43m 58s) (30000 36%) 0.1158
28m 55s (- 39m 49s) (35000 42%) 0.1214
33m 0s (- 35m 38s) (40000 48%) 0.1138
37m 6s (- 31m 29s) (45000 54%) 0.1184
41m 14s (- 27m 22s) (50000 60%) 0.1148
45m 21s (- 23m 14s) (55000 66%) 0.1179
49m 29s (- 19m 7s) (60000 72%) 0.1160
53m 35s (- 15m 0s) (65000 78%) 0.1120
57m 43s (- 10m 52s) (70000 84%) 0.1151
61m 49s (- 6m 45s) (75000 90%) 0.1232
65m 56s (- 2m 37s) (80000 96%) 0.1164
Epoch 7


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


4m 17s (- 67m 1s) (5000 6%) 0.1038
8m 21s (- 61m 12s) (10000 12%) 0.1040
12m 27s (- 56m 37s) (15000 18%) 0.1046
16m 34s (- 52m 21s) (20000 24%) 0.1070
20m 39s (- 48m 4s) (25000 30%) 0.1061
24m 47s (- 43m 57s) (30000 36%) 0.1095
28m 53s (- 39m 47s) (35000 42%) 0.1070
32m 59s (- 35m 38s) (40000 48%) 0.1047
37m 8s (- 31m 31s) (45000 54%) 0.1053


KeyboardInterrupt: 

In [None]:
# Actually NOT a good evaluation function, it just returns a generated translation for the word. TBA: actual
# evaluation (with various metrcis and losses)

def evaluate(encoder, decoder, sentence, max_length=max_wordlen):
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        input_tensor = wordToTensor(sentence).to(device)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)

        sos = torch.zeros(1, 1, n_phones)
        sos[0][0][0] = 1
        sos = sos.to(device)
    
        decoder_input = sos
    
        decoder_hidden = encoder_hidden

        decoded_chars = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)
            
            topv, topi = decoder_output.data.topk(1)
            
            if topi.item() == 1:
                break
            else:
                decoded_chars.append(rev_phonedict[topi.item() - 2])
            
            decoder_input = torch.zeros(1, 1, n_phones)
            decoder_input[0][0][topi.item()] = 1
            decoder_input = decoder_input.to(device)
        
        return "_".join(decoded_chars)

In [None]:
# So this actually works like a sanity check. If everything looks fair enough - we're good to go.

for i in range(20):
    print(data[i])
    print(evaluate(encoder, decoder, data[i].split(' ')[0]))

In [None]:
import pandas as pd

test = pd.read_csv('test.csv')
test_x = test['Word'].tolist()

In [None]:
test_y = [evaluate(encoder, decoder, word) for word in tqdm(test_x)]
print(test_y)

In [None]:
test.rename(columns={'Word':'Transcription'}, inplace=True) 
test['Transcription'] = test_y

In [None]:
# I always write a brief summary of my submission in the filename. It's useful for keeping track of what ideas 
# worked and what didn't. It also may be useful for blending (as you may remember, it's better to use a set of 
# diverse architectures for weighted voting) I usually also provide CV score or validation acc/loss, but now I
# dont have any.

test.to_csv("lstm_one_hot_2_layer_bidirectional_16_epochs.csv",index=False)