In [1]:
import os
import datetime
import torch
import random
import pickle
import numpy as np
import pandas as pd

#os.chdir(os.path.join(os.getcwd(), 'LAS Model'))
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from data import SpeechDataset, AudioDataLoader
from listener import Listener
from attend_and_spell import AttendAndSpell
from seq2seq import Seq2Seq
from utils import  train

### Preprocessing helpers and configuration

In [2]:
def preprocess(s):
    """Normalise a transcript line: lower-case it, drop newlines and punctuation.

    Args:
        s: Raw transcript text.

    Returns:
        The lower-cased text with every '\n' and every character in
        ``string.punctuation`` removed.
    """
    # `string` is not imported at the top of the notebook; import it here so the
    # function works on a fresh kernel instead of raising NameError.
    import string

    s = s.lower().replace('\n', '')
    # NOTE: string.punctuation includes the apostrophe, so "'" is stripped too.
    return s.translate(str.maketrans('', '', string.punctuation))  # remove punctuation

# Builds a (path, sentence) manifest from the transcript .txt files of one dataset folder.
def make_train_df(root_dir, dataset, file_ext, csv_file):
    """Collect transcript lines from `root_dir/dataset` into a DataFrame and CSV.

    Every file whose name contains '.txt' is read line by line. The first
    space-separated token of a line is joined with `dataset` and `file_ext`
    to form the 'path' column; the rest of the line is run through
    `preprocess` to form the 'sent' column.

    Args:
        root_dir: Directory containing the dataset folder; the CSV is written here.
        dataset: Name of the dataset sub-folder to scan.
        file_ext: Extension appended to each path (e.g. an audio extension).
        csv_file: File name of the CSV written under `root_dir` (no header row).

    Returns:
        The DataFrame with columns ['path', 'sent'].
    """
    dataset_dir = os.path.join(root_dir, dataset)
    rows = []
    for fname in os.listdir(dataset_dir):
        if '.txt' not in fname:
            continue
        with open(os.path.join(dataset_dir, fname), 'r') as transcript:
            lines = transcript.readlines()
        for line in lines:
            # first token identifies the utterance, the remainder is the sentence
            utt_id, *words = line.split(' ')
            audio_path = os.path.join(dataset, str(utt_id)) + file_ext
            rows.append((audio_path, preprocess(str(' '.join(words)))))

    data_df = pd.DataFrame(rows, columns=['path', 'sent'])
    data_df.to_csv(os.path.join(root_dir, csv_file), header=None)
    return data_df


# Root folder that the CSV manifests are read from (also passed to SpeechDataset).
root_dir = '../../../Dataset/LibriSpeech'

# Always end up with a torch.device (the CPU fallback used to be the bare string 'cpu').
if torch.cuda.is_available():
    # Keep preferring the second GPU, but fall back to the first one when only a
    # single device is visible, instead of failing later on an invalid ordinal.
    DEVICE = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    DEVICE = torch.device('cpu')
print('DEVICE :', DEVICE)

DEVICE : cuda:1


### Load Training data

In [3]:
# Read the three manifest CSVs (no header row; columns named 'path' and 'sent').
train_100, train_360, train_500 = (
    pd.read_csv(os.path.join(root_dir, csv_name), header=None, names=['path', 'sent'])
    for csv_name in ('train_100.csv', 'train_360.csv', 'train_500.csv')
)

# Stack the three manifests into a single frame
train_df = pd.concat([train_100, train_360, train_500])
print("Number of training examples:", len(train_df))
train_df.head()

Number of training examples: 281241


Unnamed: 0,path,sent
0,dataset_100/103-1240-0000.flac,chapter one missus rachel lynde is surprised m...
1,dataset_100/103-1240-0001.flac,that had its source away back in the woods of ...
2,dataset_100/103-1240-0002.flac,for not even a brook could run past missus rac...
3,dataset_100/103-1240-0003.flac,and that if she noticed anything odd or out of...
4,dataset_100/103-1240-0004.flac,but missus rachel lynde was one of those capab...


In [4]:
# # Removing very large sentences
# def remove_long_sent(train_df, max_len):
#     data = []
#     for idx in range(train_df.shape[0]):
#         path, sent = train_df.iloc[idx]
#         if len(sent) > max_len:
#             continue
#         data.append((path, sent))
#     return pd.DataFrame(data, columns=['path', 'sent'])

# max_len = 225
# train_df = remove_long_sent(train_df, max_len)
# print("Number of training examples:",  train_df.shape[0])

In [4]:
# To (re)create the cached manifest, uncomment:
#train_df.to_csv(os.path.join(root_dir, 'total_train.csv'), header=None)
# Load the cached manifest; this replaces the `train_df` built in the previous cell.
train_df = pd.read_csv(
    os.path.join(root_dir, 'total_train.csv'),
    header=None,
    names=['path', 'sent'],
)
print("Number of training examples:",  len(train_df))
train_df.head(3)

Number of training examples: 219709


Unnamed: 0,path,sent
0,dataset_100/103-1240-0000.flac,chapter one missus rachel lynde is surprised m...
1,dataset_100/103-1240-0006.flac,as avonlea housekeepers were wont to tell in a...
2,dataset_100/103-1240-0009.flac,missus rachel knew that he ought because she h...


### DataLoaders and hyperparams

In [5]:
def get_chars(include_digits=True):
    """Return the character vocabulary as an ordered list.

    The order is: '<sos>', optionally the digits '0'-'9', the letters 'a'-'z',
    space, apostrophe, then '<eos>', '<pad>' and '<unk>'. The vocabulary size
    is printed as a side effect.

    Args:
        include_digits: When truthy, the ten digit characters are included.

    Returns:
        List of vocabulary symbols (42 with digits, 32 without).
    """
    digits = list('0123456789') if include_digits else []
    letters = list('abcdefghijklmnopqrstuvwxyz')
    chars = ['<sos>'] + digits + letters + [' ', "'", '<eos>', '<pad>', '<unk>']
    print('Number of chars', len(chars))
    return chars


# Letter-only vocabulary and the two lookup tables derived from it.
chars = get_chars(include_digits=False)
char_to_token = {symbol: index for index, symbol in enumerate(chars)}
token_to_char = {index: symbol for symbol, index in char_to_token.items()}

# Integer ids of the special symbols
sos_token, eos_token, pad_token = (
    char_to_token[special] for special in ('<sos>', '<eos>', '<pad>')
)

Number of chars 32


In [6]:
# Directory that TensorBoard summaries are written under.
tensorboard_dir = os.path.join('tb_summary')

# Training dataset over the manifest, and the batching loader built on top of it.
train_dataset = SpeechDataset(train_df, root_dir, char_to_token)
loader_options = dict(batch_size=64, shuffle=True, drop_last=True, num_workers=8)
train_loader = AudioDataLoader(pad_token, train_dataset, **loader_options)

### Instantiate model

In [7]:
# Set to True to resume from `saved_file` instead of starting a new run.
load = False

if load:
    saved_file = 'Trained Models/Training_2019-12-25 00:09:23.921978/las_model_6'
    # NOTE(review): `model` is only created in a later cell, so on a fresh kernel
    # this branch needs that cell to have been run first.
    model.load_state_dict(torch.load(saved_file))
    # Parse the whole epoch suffix after the last '_' (the last character alone
    # is wrong for epoch numbers >= 10).
    start_epoch = int(saved_file.rsplit('_', 1)[-1]) + 1
    # os.listdir order is arbitrary; sort so "the last one" is deterministic.
    time = sorted(os.listdir(tensorboard_dir))[-1]  # use the last one
else:
    start_epoch = 0
    time = str(datetime.datetime.now())

# Run name and the directory checkpoints / hyper-parameters are saved into.
name = f'amsgrad_ln_640_{time}'
save_dir = os.path.join('trained_models_librispeech', name)
# makedirs also creates the parent folder and tolerates an existing directory.
os.makedirs(save_dir, exist_ok=True)

In [8]:
# Listener (encoder) hyper-parameters.
input_size = 128    # presumably the number of rows in the input spectrogram — TODO confirm
hidden_dim = 640  # hidden size passed to each Listener LSTM (layers are bidirectional per the repr below)
num_layers = 4
dropout = 0.1
layer_norm = True   
encoder = Listener(input_size, hidden_dim, num_layers, dropout=dropout, layer_norm=layer_norm)

# AttendAndSpell (decoder) hyper-parameters.
hid_sz = 640
embed_dim = 50
vocab_size = len(chars)  # one output class per vocabulary symbol
decoder = AttendAndSpell(embed_dim, hid_sz, encoder.output_size, vocab_size)

hyperparams = {'input_size':input_size, 'hidden_dim':hidden_dim, 
               'num_layers':num_layers,'dropout':dropout, 
               'layer_norm':layer_norm, 'hid_sz':hid_sz, 
               'embed_dim':embed_dim, 'vocab_size':vocab_size}


# Save the hyper-parameters next to the checkpoints in save_dir
with open(os.path.join(save_dir, 'info.pickle'), 'wb') as f:
    pickle.dump(hyperparams, f)


# Full model starts with teacher-forcing ratio 1.0 and is moved to DEVICE.
model = Seq2Seq(encoder, decoder, tf_ratio = 1.0, device=DEVICE).to(DEVICE)
model.train()

Seq2Seq(
  (encoder): Listener(
    (layers): ModuleList(
      (0): piBLSTM(
        (lstm): LSTM(128, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (1): piBLSTM(
        (lstm): LSTM(2560, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (2): piBLSTM(
        (lstm): LSTM(2560, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (3): piBLSTM(
        (lstm): LSTM(2560, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): AttendAndSpell(
    (embedding): Embedding(32, 50)
    (attention_layer): At

### Training

In [10]:
# model.load_state_dict(torch.load(os.path.join(save_dir, 'las_model_1')))
# model.train()

In [None]:
# optimizer = optim.ASGD(model.parameters(), lr=0.05)  # lr = 0.2 used in paper
optimizer = optim.Adam(model.parameters(), amsgrad=True)

# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

log_interval = 5
print_interval = 40

epochs = 20
load = False

# TensorBoard summaries go to tb_summary/<run timestamp>
summary_dir = os.path.join(tensorboard_dir, time)
writer = SummaryWriter(summary_dir)
print('save_dir', save_dir)

try:
    # Start at `start_epoch` (0 for a fresh run) so a resumed run does not
    # overwrite the checkpoints of epochs that were already trained.
    for epoch in range(start_epoch, epochs):
        print("\nTeacher forcing ratio:", model.tf_ratio)
        train(model, DEVICE, train_loader, optimizer, epoch, print_interval, writer, log_interval)
        # scheduler.step()                                 # Decrease learning rate
        # One checkpoint per epoch
        torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}'))
        model.tf_ratio = max(model.tf_ratio - 0.025, 0.8)    # Decrease teacher force ratio
finally:
    # Flush and close the event file even if training is interrupted
    # (e.g. KeyboardInterrupt from the notebook).
    writer.close()

save_dir trained_models_librispeech/amsgrad_ln_640_2019-12-28 14:41:29.019788

Teacher forcing ratio: 1.0
Training, Logging: Mean loss of previous 40 batches 


Teacher forcing ratio: 0.975
Training, Logging: Mean loss of previous 40 batches 




Teacher forcing ratio: 0.95
Training, Logging: Mean loss of previous 40 batches 




Teacher forcing ratio: 0.9249999999999999
Training, Logging: Mean loss of previous 40 batches 




Teacher forcing ratio: 0.8999999999999999
Training, Logging: Mean loss of previous 40 batches 



In [None]:
### DOES DEEPER NETWORK HELP ?

### DOES AMSGRAD HELP ?

### DOES LAYER NORMALIZATION HELP ?

In [18]:
# # optimizer = optim.ASGD(model.parameters(), lr=0.05)  # lr = 0.2 used in paper
# optimizer = optim.Adadelta(model.parameters())

# # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# log_interval = 5
# print_interval = 40

# epochs = 20
# load = False

# summary_dir = os.path.join(tensorboard_dir, name)
# writer = SummaryWriter(summary_dir)
# print('save_dir', save_dir)

# for epoch in range(0, epochs):
#     print("\nTeacher forcing ratio:", model.tf_ratio)
#     train(model, DEVICE, train_loader, optimizer, epoch, print_interval, writer, log_interval)
#     # scheduler.step()                                    # Decrease learning rate
#     torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}'))
#     model.tf_ratio = max(model.tf_ratio - 0.05, 0.8)    # Decrease teacher force ratio

save_dir trained_models_librispeech/amsgrad_layer_norm_2019-12-28 14:37:25.274924

Teacher forcing ratio: 1.0
Training, Logging: Mean loss of previous 40 batches 



KeyboardInterrupt: 

### TEST

In [None]:
def decode_pred_sent(out):
    """Turn model outputs into a string by greedy (arg-max) decoding.

    Args:
        out: Iterable of per-step score tensors; each is reduced with
            `max(dim=1)` and must yield a single index (`.item()`).

    Returns:
        The concatenation of the characters mapped from each step's
        highest-scoring token id via `token_to_char`.
    """
    def best_char(step_scores):
        # max(dim=1) returns (values, indices); only the index is needed
        _, best_index = step_scores.max(dim=1)
        return token_to_char[best_index.item()]

    return ''.join([best_char(step_scores) for step_scores in out])


def decode_true_sent(y):
    """Map a sequence of target token ids back to its text.

    Args:
        y: Iterable of scalar token tensors (each supports `.item()`).

    Returns:
        The characters looked up in `token_to_char`, joined into one string.
    """
    return ''.join([token_to_char[token.item()] for token in y])

In [None]:
# Qualitative check: decode a few random training examples with the current model.
num_sent = 10
model.eval()

# The dataset does not change between samples, so build it once.
trial_dataset = SpeechDataset(train_df, root_dir, char_to_token)

# No gradients are needed for inspection; skip building the autograd graph.
with torch.no_grad():
    for _ in range(num_sent):

        # randrange excludes the upper bound; randint(0, N) could return N,
        # which is one past the last valid row.
        idx = random.randrange(train_df.shape[0])

        x, y = trial_dataset[idx]
        # plt.imshow(x[0,:,:].detach())

        # Model output
        target = y.unsqueeze(dim=0).to(DEVICE)   # add a leading batch dimension
        data = x.permute(0, 2, 1).to(DEVICE)     # swap the last two axes before the forward pass
        loss, output = model(data, target)
        print("True sent : ", decode_true_sent(y), end='\n\n')
        print("Pred sent : ", decode_pred_sent(output))
        print("Loss :", loss.item())    
        print("\n")