In [1]:
import os
import sys
import datetime
import string
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

NAME = 'amsgrad_clean_1h' # helps to differentiate between various training instances

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator, TabularDataset

sys.path.append(os.path.abspath(os.path.join('..')))

from models.las_model.data import SpeechDataset, AudioDataLoader
from models.las_model.listener import Listener
from models.las_model.attend_and_spell import AttendAndSpell
from models.las_model.seq2seq import Seq2Seq
#from models.las_model.utils import  train

In [2]:
# Ask CUDA to number devices by PCI bus id, then print the name of every
# device torch can see (one per line, in index order).
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
for gpu_name in map(torch.cuda.get_device_name, range(torch.cuda.device_count())):
    print(gpu_name)

GeForce GTX 1080 Ti
Tesla P100-PCIE-16GB
Tesla P100-PCIE-16GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-16GB


In [3]:
# Train on the GPU at index GPU_INDEX when this machine exposes one; otherwise
# fall back to the CPU so the notebook still runs on hosts with fewer (or no)
# GPUs instead of failing later at the first `.to(DEVICE)` call.
GPU_INDEX = 4
DEVICE = torch.device(f'cuda:{GPU_INDEX}' if torch.cuda.device_count() > GPU_INDEX else 'cpu')
print('DEVICE :', DEVICE)

DEVICE : cuda:4


### Preprocessing

In [4]:
root_dir = '../../../Dataset/sinhala_clean'
data_dir = os.path.join(root_dir, 'data')


# Read the main transcript. Every line is split on double quotes into exactly
# three pieces: the utterance id (with "(" and surrounding whitespace removed),
# the quoted sentence, and a trailing remainder that is ignored.
transcript_path = os.path.join(root_dir, 'si_lk.lines.txt')
with open(transcript_path, 'r', encoding='utf-8') as transcript_file:
    lines = transcript_file.readlines()

examples = []
for raw_line in lines:
    # The unpacking raises ValueError unless the line holds exactly two quotes.
    utt_id, sentence, _ = raw_line.split('"')
    wav_name = utt_id.replace("(", '').strip() + '.wav'
    examples.append((wav_name, sentence.strip()))


data_df = pd.DataFrame(examples, columns=['path', 'sent'])
data_df.to_csv(os.path.join(root_dir, 'data_df.csv'))  # saved with the index column; re-read in the "Load data" cell
print("Number of Training examples:", data_df.shape[0])
data_df.head(5)

Number of Training examples: 1251


Unnamed: 0,path,sent
0,sin_2241_0329430812.wav,කෝකටත් මං වෙනදා තරම් කාලෙ ගන්නැතිව ඇඳ ගත්තා
1,sin_2241_0598895166.wav,ඇන්ජලීනා ජොලී කියන්නේ පසුගිය දිනවල බොහෝ සෙයින්...
2,sin_2241_0701577369.wav,ආර්ථික චින්තනය හා සාමාජීය දියුණුව ඇති කළ හැකිව...
3,sin_2241_0715400935.wav,ඉන් අදහස් වන්නේ විචාරාත්මක විනිවිද දැකීමෙන් තො...
4,sin_2241_0817100025.wav,අප යුද්ධයේ පළමු පියවරේදීම පරාද වී අවසානය


We tried to remove all unnecessary characters from the dataset. Any others will be replaced by the unknown token during training.

### Load data

In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split is identical on every run. The
# character vocabulary is built from train_df in first-seen order, so an
# unseeded split changes the token ids between kernel restarts and makes
# previously saved checkpoints impossible to decode consistently.
SPLIT_SEED = 42

data_df = pd.read_csv(os.path.join(root_dir, 'data_df.csv'), usecols=['path', 'sent'])
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=SPLIT_SEED)
print("Num training example:", train_df.shape)
print("Num validation example", val_df.shape)
train_df.head()

Num training example: (1125, 2)
Num validation example (126, 2)


Unnamed: 0,path,sent
804,sin_6314_0000039087.wav,රජතුමාට වෙනදා මෙන් කරුණාවෙන් සෙත් පැතුවා
418,sin_3688_7927489278.wav,කෙනෙක් බඩගෝස්තරය නැති වෙයි කියලා බයවෙලා
140,sin_2282_2140134972.wav,හිතාගන්න පුළුවන්නේ ෆිල්ම් එක කොහොමට ඇතිද කියලා
1101,sin_7183_6716425545.wav,ඇත් ගව මුව වඳුරු නාග ආදී සත්ත්ව කුලවල ඉපිද ඇත
567,sin_4191_8578172520.wav,මේකෙත් හිටියෙ හුරුපුරුදු සුපිරි නළුවෙක් තමයි


### Vocabulary

In [6]:
def get_chars(train_df):
    """Collect the character vocabulary of a two-column (path, sentence) frame.

    Returns a list that starts with the four special tokens
    '<pad>', '<unk>', '<sos>', '<eos>' followed by every distinct character of
    the second column, in order of first appearance.
    """
    # A dict keeps insertion order and gives de-duplication for free.
    ordered = dict.fromkeys(['<pad>', '<unk>', '<sos>', '<eos>'])
    for row_idx in range(train_df.shape[0]):
        _, sentence = train_df.iloc[row_idx]  # row must have exactly two values
        for ch in sentence:
            ordered.setdefault(ch)
    return list(ordered)
    

chars = get_chars(train_df)

# A character's token id is its position in `chars`; token_to_char is the inverse map.
char_to_token = {ch: token for token, ch in enumerate(chars)}
token_to_char = {token: ch for ch, token in char_to_token.items()}

# Ids of the four special tokens.
sos_token = char_to_token['<sos>']
eos_token = char_to_token['<eos>']
pad_token = char_to_token['<pad>']
unk_token = char_to_token['<unk>']

print("Number of characters:", len(chars))
print(chars)

Number of characters: 70
['<pad>', '<unk>', '<sos>', '<eos>', 'ර', 'ජ', 'ත', 'ු', 'ම', 'ා', 'ට', ' ', 'ව', 'ෙ', 'න', 'ද', '්', 'ක', 'ණ', 'ස', 'ප', 'ැ', 'බ', 'ඩ', 'ග', 'ෝ', 'ය', 'ි', 'ල', 'හ', 'ළ', 'ේ', 'ෆ', 'එ', 'ො', 'ඇ', 'ඳ', 'ආ', 'ී', 'ඉ', 'අ', 'ඔ', 'ධ', 'ථ', 'ඒ', 'භ', 'ූ', 'ච', 'උ', 'ඊ', 'ං', 'ඬ', 'ශ', 'ෑ', 'ඈ', 'ෂ', 'ඤ', 'ඪ', 'ඹ', 'ඟ', 'ඕ', 'ඝ', 'ෞ', 'ඓ', 'ඨ', 'ඛ', 'ඵ', 'ඡ', 'ඌ', 'ෛ']


### Instantiate model

In [7]:
# --- Listener (encoder) settings ---
input_size = 128     # per-frame input feature size; presumably the number of spectrogram rows (TODO confirm)
hidden_dim = 512     # hidden size of each LSTM in the Listener (see the printed model below)
num_layers = 4
dropout = 0.1
layer_norm = False
encoder = Listener(input_size, hidden_dim, num_layers, dropout=dropout, layer_norm=layer_norm)

# --- AttendAndSpell (decoder) settings ---
hid_sz = 512
vocab_size = len(chars)
decoder = AttendAndSpell(hid_sz, encoder.output_size, vocab_size)

# Collected in one dict so the settings can be pickled next to the checkpoints.
hyperparams = dict(
    input_size=input_size, hidden_dim=hidden_dim,
    num_layers=num_layers, dropout=dropout,
    layer_norm=layer_norm, hid_sz=hid_sz,
    vocab_size=vocab_size,
)

model = Seq2Seq(encoder, decoder, tf_ratio = 1.0, device=DEVICE)
model = model.to(DEVICE)
model.train()

Seq2Seq(
  (encoder): Listener(
    (layers): ModuleList(
      (0): piBLSTM(
        (lstm): LSTM(128, 512, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (1): piBLSTM(
        (lstm): LSTM(2048, 512, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (2): piBLSTM(
        (lstm): LSTM(2048, 512, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (3): piBLSTM(
        (lstm): LSTM(2048, 512, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): AttendAndSpell(
    (attention_layer): Attention(
      (linear1): Linear(in_features=2560, out_features=1280, bias=True)
      (linear2): Linear(in_features=1280, out_features=1, bias=True)
    )
    (pre_lstm_cell): LSTMCell(2118, 512)
    (post_lstm_cell): LSTMCell(2560, 512)
    (mlp): Sequential(
      (0): Linear(in_features=512, out_features=7

### Training

In [8]:
# Every run writes into its own directory: trained_models/<NAME>_<start timestamp>.
time = str(datetime.datetime.now())
save_dir = os.path.join('trained_models', f'{NAME}_{time}')
# makedirs also creates the parent 'trained_models' folder on a fresh checkout
# (os.mkdir raised FileNotFoundError there); exist_ok keeps a re-run harmless.
os.makedirs(save_dir, exist_ok=True)

# Saving hyperparams
with open(os.path.join(save_dir, 'info.pickle'), 'wb') as f:
    pickle.dump(hyperparams, f)

In [9]:
def train(model, device, train_loader, optimizer, epoch,
          print_interval, writer=None, log_interval=-1, scheduler=None):
    """Run one training epoch and print a one-line summary.

    Args:
        model: module whose call ``model(data, target)`` returns a
            ``(loss, output)`` pair; only the loss is used here.
        device: device every batch is moved to before the forward pass.
        train_loader: sized iterable yielding ``(data, target)`` batches.
        optimizer: optimizer that updates the model parameters.
        epoch: epoch number, used for the logging step and the summary line.
        print_interval: the printed mean covers the last ``print_interval``
            batch losses of the epoch.
        writer: optional object exposing ``add_scalar(tag, value, step)``.
        log_interval: every ``log_interval`` batches the mean of the last
            ``log_interval`` losses is written to ``writer``. Values <= 0
            (including the default) disable logging.
        scheduler: accepted for interface compatibility; not used here.
    """
    model.train()
    running_loss = []
    started_at = datetime.datetime.now()

    # `x % -1 == 0` holds for every integer, so a non-positive interval must be
    # rejected explicitly or the default would log on every single batch.
    logging_enabled = bool(writer) and log_interval > 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        loss, _ = model(data, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss.append(loss.item())    # update running loss

        # Writing to tensorboard
        if logging_enabled and (batch_idx + 1) % log_interval == 0:
            global_step = epoch * len(train_loader) + batch_idx
            writer.add_scalar('Loss', np.mean(running_loss[-log_interval:]), global_step)

    # After epoch ends
    finished_at = datetime.datetime.now()
    print('Epoch: {}\tMean Loss : {:.6f}\t lr {}\t time {}:'.format(
        epoch, np.mean(running_loss[-print_interval:]),
        optimizer.param_groups[0]['lr'],  # same value as state_dict(), without copying it
        finished_at - started_at))
    

    
def decode_pred_sent(out):
    """Greedy-decode a model output into a string.

    Dimension 0 of ``out`` is dropped when it has size 1; for every remaining
    row the index of the largest value is looked up in ``token_to_char`` and
    the resulting characters are concatenated.
    """
    step_scores = out.squeeze(0)
    best_tokens = (scores.max(dim=0)[1].item() for scores in step_scores)
    return ''.join(token_to_char[token] for token in best_tokens)


def decode_true_sent(y):
    """Map each element of the token sequence ``y`` through ``token_to_char`` and join the result."""
    return ''.join(token_to_char[token.item()] for token in y)

def validate_personal(model, num_sent, dataset, show=False):
    """Print true vs. predicted text and the loss for randomly drawn examples.

    Args:
        model: called as ``model(data, target)``; must return ``(loss, output)``.
            It is switched to eval mode and left in that mode.
        num_sent: number of examples to draw (with replacement).
        dataset: indexable, sized collection whose items are ``(x, y)`` pairs;
            ``x`` is indexed as ``x[0, :, :]`` and permuted with ``(0, 2, 1)``,
            ``y`` is a token sequence.
        show: when True, also display ``log2`` of ``x[0]`` as an image.
    """
    model.eval()
    # Nothing is back-propagated here, so skip building the autograd graph.
    with torch.no_grad():
        for _ in range(num_sent):
            # randrange excludes the upper bound; random.randint(0, len) could
            # return len(dataset) itself, which is not a valid index.
            idx = random.randrange(len(dataset))

            x, y = dataset[idx]
            if show:
                plt.imshow(x[0, :, :].detach().log2())
                plt.show()

            target = y.unsqueeze(dim=0).to(DEVICE)
            data = x.permute(0, 2, 1).to(DEVICE)
            loss, output = model(data, target)
            print("\n")
            print("True sent : ", decode_true_sent(y))
            print("Pred sent : ", decode_pred_sent(output))
            print("Loss :", loss.item())


In [10]:
# def get_loader(model):
#     train_dataset = SpeechDataset(train_df, data_dir, char_to_token, n_fft=2048, hop_length=512)
    
#     if epoch < 3:
#         train_loader = train_loader = AudioDataLoader(pad_token, train_dataset, 
#                                                       batch_size=64, num_workers=8, 
#                                                       drop_last=True, shuffle=True)
#     elif epoch >= 3 and epoch < 5:
#         train_loader = train_loader = AudioDataLoader(pad_token, train_dataset, 
#                                                       batch_size=32, num_workers=8, 
#                                                       drop_last=True, shuffle=True)
#     elif epoch >= 5:
#         train_loader = train_loader = AudioDataLoader(pad_token, train_dataset, 
#                                                       batch_size=8, num_workers=8, 
#                                                       drop_last=True, shuffle=True)
#     return train_loader

In [11]:
# Alternatives that were tried:
# optimizer = optim.SGD(model.parameters(), lr=0.001)  # lr = 0.2 used in paper
# optimizer = optim.Adadelta(model.parameters())

optimizer = optim.Adam(model.parameters(), amsgrad=True)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

log_interval = 5     # write the mean loss to TensorBoard every 5 batches
print_interval = 50  # the per-epoch summary averages the last 50 batch losses

epochs = 400
load = False  # not read anywhere in this cell

train_dataset = SpeechDataset(train_df, data_dir, char_to_token, n_fft=2048, hop_length=512)
train_loader = AudioDataLoader(pad_token, train_dataset,
                               batch_size=64, num_workers=8,
                               drop_last=True, shuffle=True)

writer = SummaryWriter(save_dir)
print('save_dir', save_dir)


# Epochs are numbered from 1, so the upper bound must be epochs + 1 for the
# loop to actually run `epochs` epochs (range(1, epochs) stopped one short).
for epoch in range(1, epochs + 1):
    # train_loader = get_loader(epoch)

    train(model, DEVICE, train_loader, optimizer, epoch, print_interval, writer, log_interval)

    # Decrease tf_ratio
#     if epoch % 20 == 0:
#         model.tf_ratio = model.tf_ratio - 0.05
#         validate_personal(model, 1, train_dataset)
#         print("tf_ratio", model.tf_ratio)

    # Every 10th epoch from epoch 30 on, decode one example (drawn from the training set).
    if epoch % 10 == 0 and epoch>=30:
        validate_personal(model, 1, train_dataset)

    #scheduler.step()  # update scheduler

    # save model and optimizer state after every epoch
    torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}')) #save the model
    torch.save(optimizer.state_dict(), os.path.join(save_dir, f'optim_{epoch}'))

save_dir trained_models/amsgrad_clean_1h_2019-12-31 07:23:15.666224
Epoch: 1	Mean Loss : 4.245530	 lr 0.001	 time 0:00:29.917670:
Epoch: 2	Mean Loss : 4.247317	 lr 0.001	 time 0:00:29.485088:
Epoch: 3	Mean Loss : 4.246732	 lr 0.001	 time 0:00:30.648765:
Epoch: 4	Mean Loss : 4.245434	 lr 0.001	 time 0:00:28.715245:
Epoch: 5	Mean Loss : 4.242517	 lr 0.001	 time 0:00:30.207478:
Epoch: 6	Mean Loss : 4.234566	 lr 0.001	 time 0:00:31.465014:
Epoch: 7	Mean Loss : 4.225525	 lr 0.001	 time 0:00:29.935297:
Epoch: 8	Mean Loss : 4.219678	 lr 0.001	 time 0:00:31.768671:
Epoch: 9	Mean Loss : 4.212035	 lr 0.001	 time 0:00:30.132690:
Epoch: 10	Mean Loss : 4.205882	 lr 0.001	 time 0:00:32.580395:
Epoch: 11	Mean Loss : 4.202010	 lr 0.001	 time 0:00:32.153438:
Epoch: 12	Mean Loss : 4.197245	 lr 0.001	 time 0:00:30.768699:
Epoch: 13	Mean Loss : 4.196337	 lr 0.001	 time 0:00:30.235401:
Epoch: 14	Mean Loss : 4.192161	 lr 0.001	 time 0:00:31.435306:
Epoch: 15	Mean Loss : 4.189893	 lr 0.001	 time 0:00:30.7732



True sent :  එතනින් පස්සෙ ජෝන්ගෙ ජීවිතේ වෙනස්ම පැත්තකට පෙරලෙනවා<eos>
Pred sent :  ොකකතතතයපෙෙසසසසසසයයේේේෙසෙසේේසෙේේේේේපෙතපතේයයපදතද්ස්ත<eos>
Loss : 4.095759391784668
Epoch: 111	Mean Loss : 3.759388	 lr 0.001	 time 0:00:32.063506:
Epoch: 112	Mean Loss : 3.757245	 lr 0.001	 time 0:00:31.770395:
Epoch: 113	Mean Loss : 3.753969	 lr 0.001	 time 0:00:32.039945:
Epoch: 114	Mean Loss : 3.746250	 lr 0.001	 time 0:00:31.744447:
Epoch: 115	Mean Loss : 3.741589	 lr 0.001	 time 0:00:30.973200:
Epoch: 116	Mean Loss : 3.748674	 lr 0.001	 time 0:00:29.892615:
Epoch: 117	Mean Loss : 3.731640	 lr 0.001	 time 0:00:32.308737:
Epoch: 118	Mean Loss : 3.740300	 lr 0.001	 time 0:00:30.725765:
Epoch: 119	Mean Loss : 3.733225	 lr 0.001	 time 0:00:32.315861:
Epoch: 120	Mean Loss : 3.732693	 lr 0.001	 time 0:00:31.060725:


True sent :  මියගිය පුද්ගලයාට කරන සැබෑ ගෞරවයක් ද නොවේ<eos>
Pred sent :  කකකටතතෙයෙෙසසසයෙසයයේේසසේසැෙසැසයයප්ඩදෙසැඩේ<eos>
Loss : 4.11738395690918
Epoch: 121	Mean Loss : 3.733336	 lr 0.001	 time 0:0

Epoch: 211	Mean Loss : 3.611342	 lr 0.001	 time 0:00:31.280132:
Epoch: 212	Mean Loss : 3.602684	 lr 0.001	 time 0:00:33.048117:
Epoch: 213	Mean Loss : 3.610093	 lr 0.001	 time 0:00:31.439191:
Epoch: 214	Mean Loss : 3.602588	 lr 0.001	 time 0:00:31.749717:
Epoch: 215	Mean Loss : 3.603895	 lr 0.001	 time 0:00:31.214540:
Epoch: 216	Mean Loss : 3.596107	 lr 0.001	 time 0:00:31.180904:
Epoch: 217	Mean Loss : 3.600955	 lr 0.001	 time 0:00:31.755955:
Epoch: 218	Mean Loss : 3.598211	 lr 0.001	 time 0:00:31.171057:
Epoch: 219	Mean Loss : 3.594671	 lr 0.001	 time 0:00:31.552420:
Epoch: 220	Mean Loss : 3.600740	 lr 0.001	 time 0:00:32.818136:


True sent :  නත්තල් කාලෙදී අමුතු අතුරුදහන්වීම් කිහිපයක් සිද්ධ වෙනවා<eos>
Pred sent :  කකකතඩඑඑළළධෙසෙෙමළුතඑළළෙෙෙෙෙෙේඇීම්ළඇළසිපයස්සසිදුධහළෙල්ඇ<eos>
Loss : 3.964757204055786
Epoch: 221	Mean Loss : 3.595829	 lr 0.001	 time 0:00:31.767928:
Epoch: 222	Mean Loss : 3.610573	 lr 0.001	 time 0:00:30.939254:
Epoch: 223	Mean Loss : 3.603961	 lr 0.001	 time 0:00:31.073

KeyboardInterrupt: 

In [None]:
### continuing with reduced lr
# Set the learning rate to an absolute value instead of multiplying the current
# one by 0.1: the result is the same on the first run (0.001 -> 0.0001, as the
# log shows) and re-running this cell no longer shrinks it again.
reduced_lr = 1e-4
for param_group in optimizer.param_groups:
    param_group['lr'] = reduced_lr

resume_epoch = 241  # first epoch number used after the interrupted run

# Upper bound is epochs + 1 so that the epoch numbered `epochs` is included.
for epoch in range(resume_epoch, epochs + 1):
    # train_loader = get_loader(epoch)

    train(model, DEVICE, train_loader, optimizer, epoch, print_interval, writer, log_interval)

    # Decrease tf_ratio
#     if epoch % 20 == 0:
#         model.tf_ratio = model.tf_ratio - 0.05
#         validate_personal(model, 1, train_dataset)
#         print("tf_ratio", model.tf_ratio)

    # Every 10th epoch, decode one example (drawn from the training set).
    if epoch % 10 == 0 and epoch>=30:
        validate_personal(model, 1, train_dataset)

    torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}')) #save the model
    torch.save(optimizer.state_dict(), os.path.join(save_dir, f'optim_{epoch}'))

Epoch: 241	Mean Loss : 3.589837	 lr 0.0001	 time 0:00:30.399288:
Epoch: 242	Mean Loss : 3.587140	 lr 0.0001	 time 0:00:31.358197:
Epoch: 243	Mean Loss : 3.574845	 lr 0.0001	 time 0:00:31.290870:
Epoch: 244	Mean Loss : 3.593953	 lr 0.0001	 time 0:00:29.897299:
Epoch: 245	Mean Loss : 3.586245	 lr 0.0001	 time 0:00:30.142773:
Epoch: 246	Mean Loss : 3.595864	 lr 0.0001	 time 0:00:29.982665:
Epoch: 247	Mean Loss : 3.588057	 lr 0.0001	 time 0:00:31.735062:
Epoch: 248	Mean Loss : 3.592814	 lr 0.0001	 time 0:00:30.335721:
Epoch: 249	Mean Loss : 3.591168	 lr 0.0001	 time 0:00:31.002588:
Epoch: 250	Mean Loss : 3.592557	 lr 0.0001	 time 0:00:30.662296:


True sent :  අප යුද්ධයේ පළමු පියවරේදීම පරාද වී අවසානය<eos>
Pred sent :  ොකකඩඩතළළළළධපෙමෙළෙෙයෙෙෙෙීෙෙපෙපසෙසීපසපසථථයද
Loss : 4.078856468200684
Epoch: 251	Mean Loss : 3.584551	 lr 0.0001	 time 0:00:32.219878:
Epoch: 252	Mean Loss : 3.590463	 lr 0.0001	 time 0:00:31.509418:
Epoch: 253	Mean Loss : 3.579198	 lr 0.0001	 time 0:00:30.926085:
Epoch: 254	Mea

### TEST

In [None]:
# Decode 10 randomly drawn examples. NOTE(review): they come from train_dataset,
# not from a held-out set, so this does not measure generalisation.
validate_personal(model, 10, train_dataset)

### Trying with Torchtext

In [None]:
## Knowing the frequency of words

def process(s):
    """Return the elements of ``s`` as a new list (the characters, when ``s`` is a string)."""
    return [*s]

# Field describing how the sentence column is processed; `process` is applied
# as the preprocessing hook and <sos>/<eos> are the init/end tokens.
si_field = Field(
    preprocessing=process,
    batch_first=True,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    tokenizer_language='si',
)

# Three CSV columns are declared; only 'sent' is bound to a field
# (presumably the two None entries are skipped by torchtext -- TODO confirm).
temp_csv_path = os.path.join(data_dir, 'temp.csv')
dataset = TabularDataset(
    path=temp_csv_path,
    format='CSV',
    fields=[('index', None), ('unnamed', None), ('sent', si_field)],
)

In [None]:
# Build the vocabulary from `dataset` with min_freq=2, then report how many
# entries its string-to-index table holds.
si_field.build_vocab(dataset, min_freq=2)
string_to_index = si_field.vocab.stoi
print(len(string_to_index))