In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
import torchvision
from torchvision import transforms
import math
from PIL import Image
from torchsummary import summary 
from tqdm import trange 
import glob
import os
import unicodedata
import string
import time 
import random 

%matplotlib inline

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(device)

cuda:0


In [2]:
data = open('train.txt', encoding='utf-8').read().strip().split('\n')

print(data[0])

chardict =  {'': 0, '.': 1, "'": 2, '-':3, 'A': 4, 'B': 5, 'C': 6, 'D': 7, 'E': 8, 'F': 9, 'G': 10, 'H': 11, 'I': 12, 'J': 13, 'K': 14, 'L': 15, 'M': 16, 'N': 17, 'O': 18, 'P': 19, 'Q': 20, 'R': 21, 'S': 22, 'T': 23, 'U': 24, 'V': 25, 'W': 26, 'X': 27, 'Y': 28, 'Z': 29}
phonedict = {'AA' : 0, 'AE' : 1, 'AH' : 2, 'AO' : 3, 'AW' : 4, 'AY' : 5, 'B' : 6, 'CH' : 7, 'D' : 8, 'DH' : 9, 'EH' : 10, 'ER' : 11, 'EY' : 12, 'F' : 13, 'G' : 14, 'HH' : 15, 'IH' : 16, 'IY' : 17, 'JH' : 18, 'K' : 19, 'L' : 20, 'M' : 21, 'N' : 22, 'NG' : 23, 'OW' : 24, 'OY' : 25, 'P' : 26, 'R' : 27, 'S' : 28, 'SH' : 29, 'T' : 30, 'TH' : 31, 'UH' : 32, 'UW' : 33, 'V' : 34, 'W' : 35, 'Y' : 36, 'Z' : 37, 'ZH' : 38}

rev_chardict = {v:k for k,v in chardict.items()}
rev_phonedict = {v:k for k,v in phonedict.items()}

n_chars = len(chardict) + 2
n_phones = len(phonedict) + 2
n_words = len(data)

def wordToTensor(line):
    tensor = torch.zeros(len(line) + 1, n_chars)
    for li, letter in enumerate(line):
        tensor[li][chardict[letter] + 2] = 1
    tensor[len(line)][1] = 1
    return tensor

def phoneToTensor(line):
    tensor = torch.zeros(len(line), n_phones)
    linelist = line.split('_')
    for li, letter in enumerate(linelist):
        tensor[li][phonedict[letter] + 2] = 1
    tensor[len(linelist)][1] = 1
    return tensor

def pairTensor(i):
    linelist = data[i].split(' ')
    return (wordToTensor(linelist[0]), phoneToTensor(linelist[1]))

max_wordlen = 36
max_phonelen = 20

data = [word for word in data if len(word.split(' ')[0]) <= max_wordlen]

for word in data:
    max_phonelen = max(max_phonelen, len(word.split(' ')[1]))
    
print(max_phonelen)

LEMIEUX L_AH_M_Y_UW
79


In [3]:
n_hidden = 128
batch_size = 4

In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.gru = nn.LSTM(n_chars, hidden_size, bidirectional = False, num_layers = 2, dropout = 0.2)

    def forward(self, inp, hidden):
        output, hidden = self.gru(inp, hidden)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(2, batch_size, self.hidden_size, device=device), torch.zeros(2, batch_size, self.hidden_size, device=device))
    
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.gru = nn.LSTM(output_size, hidden_size, bidirectional = False, num_layers = 2, dropout = 0.2)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, inp, hidden):
        output, hidden = self.gru(inp.view(1, batch_size, -1), hidden)
        output = self.out(output[0])
        return output, hidden
    
encoder = EncoderRNN(n_chars, n_hidden).to(device)
decoder = DecoderRNN(n_hidden, n_phones).to(device)

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001)

In [5]:
def train(input_tensor, target_tensor, max_length=max_wordlen):
    criterion = nn.CrossEntropyLoss(ignore_index = 0)
    encoder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    #input_length = input_tensor.size(0)
    #target_length = target_tensor.size(0)

    loss = 0

    #for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
        
    sos = torch.zeros(1, batch_size, n_phones)
    for i in range (batch_size):
        sos[0][i][0] = 1
    sos = sos.to(device)
    
    decoder_input = sos

    decoder_hidden = encoder_hidden

    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    
    target_tensor, target_lengths = nn.utils.rnn.pad_packed_sequence(target_tensor)
    
    target_length, _ = torch.max(target_lengths, 0)
    target_length = target_length.item()
    
    #print('szz')
    #print(encoder_hidden[0].size())
    #print(target_tensor.size())
    
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, torch.max(target_tensor[di, :, :], 1)[1])
        #print("decoder")
        #print(decoder_output)
        #print("target")
        #print(torch.max(target_tensor[di, :, :], 1)[1])
        decoder_input = target_tensor[di, :, :]
    
    #print(decoder_output.size())
    #print(nn.utils.rnn.pad_packed_sequence(target_tensor, batch_first = True)[0].size())

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [6]:
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [7]:
def trainIters(encoder, decoder, print_every=1000, learning_rate=0.001, batch_size = 256):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  
    
    training_pairs = [pairTensor(i)
                      for i in tqdm(range(len(data)))]
    
    random.shuffle(training_pairs)

    for iter in range(1, ((len(data) + 1) // batch_size - 1)):
        input_tensor = [word[0] for word in training_pairs[(iter - 1) * batch_size : iter * batch_size]]
        input_tensor.sort(key=len, reverse = True)
        input_tensor = nn.utils.rnn.pack_sequence(input_tensor)
        
        target_tensor = [word[1] for word in training_pairs[(iter - 1) * batch_size : iter * batch_size]]
        target_tensor.sort(key=len, reverse = True)
        target_tensor = nn.utils.rnn.pack_sequence(target_tensor)
        
        loss = train(input_tensor.to(device), target_tensor.to(device))
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter * batch_size / len(data)),
                                         iter * batch_size, iter * batch_size / len(data) * 100, print_loss_avg))

In [8]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [9]:
n_epochs = 10

for i in range(n_epochs):
    print("Epoch %d" % (i + 1))
    trainIters(encoder, decoder, print_every = 5000, batch_size = batch_size)

Epoch 1


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


2m 1s (- 6m 25s) (20000 24%) 1.1516
3m 57s (- 4m 16s) (40000 48%) 0.9617
5m 52s (- 2m 16s) (60000 72%) 0.8586
7m 48s (- 0m 18s) (80000 96%) 0.7931
Epoch 2


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


2m 1s (- 6m 24s) (20000 24%) 0.7430
3m 58s (- 4m 17s) (40000 48%) 0.6932
5m 54s (- 2m 17s) (60000 72%) 0.6452
7m 50s (- 0m 18s) (80000 96%) 0.6080
Epoch 3


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


2m 2s (- 6m 26s) (20000 24%) 0.5904
3m 58s (- 4m 17s) (40000 48%) 0.5729
5m 55s (- 2m 17s) (60000 72%) 0.5590
7m 51s (- 0m 18s) (80000 96%) 0.5540
Epoch 4


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


2m 2s (- 6m 27s) (20000 24%) 0.5398
3m 59s (- 4m 18s) (40000 48%) 0.5409
5m 56s (- 2m 17s) (60000 72%) 0.5226
7m 52s (- 0m 18s) (80000 96%) 0.5183
Epoch 5


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


2m 2s (- 6m 28s) (20000 24%) 0.5054
3m 59s (- 4m 18s) (40000 48%) 0.5063
5m 56s (- 2m 17s) (60000 72%) 0.5111
7m 52s (- 0m 18s) (80000 96%) 0.5089
Epoch 6


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))


2m 2s (- 6m 28s) (20000 24%) 0.4978
3m 59s (- 4m 18s) (40000 48%) 0.4949
5m 55s (- 2m 17s) (60000 72%) 0.4920
7m 52s (- 0m 18s) (80000 96%) 0.4944
Epoch 7


HBox(children=(IntProgress(value=0, max=83194), HTML(value='')))




KeyboardInterrupt: 

In [10]:
batch_size = 1

def evaluate(encoder, decoder, sentence, max_length=max_wordlen):
    with torch.no_grad():
        input_tensor = wordToTensor(sentence).to(device)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei].view(1, 1, -1),
                                                     encoder_hidden)

        sos = torch.zeros(1, 1, n_phones)
        sos[0][0][0] = 1
        sos = sos.to(device)
    
        decoder_input = sos
    
        decoder_hidden = encoder_hidden

        decoded_chars = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)
            
            topv, topi = decoder_output.data.topk(1)
            
            if topi.item() == 1:
                break
            else:
                decoded_chars.append(rev_phonedict[topi.item() - 2])
            
            decoder_input = torch.zeros(1, 1, n_phones)
            decoder_input[0][0][topi.item()] = 1
            decoder_input = decoder_input.to(device)
            
        print(sentence)
        print("_".join(decoded_chars))
        return "_".join(decoded_chars)

In [11]:
for i in range(40):
    evaluate(encoder, decoder, data[i].split(' ')[0])

LEMIEUX
B_L_EH_M_AH_N_AH_S
MINDING
M_M_IH_N_D_IH_NG
STRIPED
B_S_T_R_EY_P_T
KEN
S_K_EH_N
CONFERENCE
S_K_AH_N_F_ER_N_AH_S
IMMOLATE
N_IH_M_AH_L_EY_T
TRANSGRESS
B_L_AA_N_D_IH_K_AH_L
RABBLE
F_R_AE_B_AH_L
AIRSHARE
W_EH_R_AH_S_T_AH_N
INTOLERANCE
M_IH_N_T_AA_L_ER_AH_N_S
ILVA
K_IH_L_V_AH
RYGEL
F_R_IY_G_AH_L
MARLETTE
AO_M_AA_R_EH_L_IY
DILDO
G_R_IY_S_T
ORELIA
N_AO_R_EH_L_IY_AH
MCNISH
M_M_AH_N_S_IH_SH
FURBISHED
F_F_ER_B_IH_SH_T
COMFED
S_K_AH_M_F_EH_D
WALKENHORST
AO_S_W_IH_L_AH_N_T_IH_NG
MILLIRONS
M_M_IH_L_IY_AA_R_N_Z
JERE
B_JH_EH_R
LIVAN'S
B_L_IH_V_AH_N_Z
PREVIEW
P_P_R_IY_V_IY
GRAYING
B_G_R_EY_IH_NG
KU
K_W_Y_UW
FREEHOLD
F_F_R_IY_HH_OW_L_D
CONCA
G_K_AA_N_K_AH
TECK'S
B_L_AA_N_D_AH
QUINTER
V_K_W_IH_N_T_ER
CIRCUMSTANTIAL
S_S_ER_K_S_AH_N_T_AH_M_EY_SH_AH_L
RYDELL
F_R_IY_D_AH_L
ROTOTILLER
F_R_OW_T_AH_T_AY_L_ER
HAVINGTON'S
HH_HH_AE_V_IH_N_S_AH_N_T
DECALS
B_D_IH_K_AH_L_Z
DIBATTISTA
IH_D_IH_B_AA_T_IH_S_T_AH
RAVI'S
F_R_EY_V_IY_Z
INTERCEPTING
N_IH_N_T_ER_S_EH_P_T_IH_NG
FROMMELT
F_F_R_AA_M_AH_L_T
ACCOMPANIED
R

In [None]:
import pandas as pd

test = pd.read_csv('test.csv')
test_x = test['Word'].tolist()

In [None]:
test_y = [evaluate(encoder, decoder, word) for word in tqdm(test_x)]
print(test_y)

In [None]:
test.rename(columns={'Word':'Transcription'}, inplace=True) 
test['Transcription'] = test_y

In [None]:
test.to_csv("lstm_one_hot_2_layer_6_epochs.csv",index=False)