In [1]:
import os
import datetime
import string
import random
import pickle
import numpy as np
import pandas as pd

# Helps to differentiate between various training instances.
# NOTE(review): the value was missing in the source (a bare `NAME =` is a SyntaxError);
# 'first_amsgrad' mirrors the run-name prefix used when the save directory is built — confirm.
NAME = 'first_amsgrad'
#os.chdir(os.path.join(os.getcwd(), 'LAS Model'))
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from torchtext.data import Field, BucketIterator, TabularDataset
from data import SpeechDataset, AudioDataLoader
from listener import Listener
from attend_and_spell import AttendAndSpell
from seq2seq import Seq2Seq
from utils import  train

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Pick the compute device. The second GPU ('cuda:1') is preferred when it exists;
# a single-GPU machine falls back to 'cuda:0' instead of failing later on an invalid
# ordinal, and a machine without CUDA uses the CPU.
# Every branch yields a torch.device so DEVICE always has the same type.
if torch.cuda.device_count() > 1:
    DEVICE = torch.device('cuda:1')
elif torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
print('DEVICE :', DEVICE)

DEVICE : cuda:1


### Preprocessing

In [3]:
# Root folder of the Sinhala speech corpus (relative to the notebook).
data_dir = '../../../Dataset/Sinhala'

# Characters that disqualify a sentence: ASCII digits, lowercase Latin letters,
# and a handful of stray quote / zero-width / mis-encoded symbols.
remove_chars = [
    *string.digits,
    *string.ascii_lowercase,
    '“', '”', '\u200b', '\u200c', '\u200d', 'µ', '\x94', '»', 'ª', '’', '‘',
]


def preprocess(s):
    """Return ``s`` with newline characters and ASCII punctuation stripped out.

    Only the characters listed in ``string.punctuation`` are removed; every other
    character (including non-ASCII punctuation) is left untouched.
    """
    # Mapping each punctuation character to None makes translate() delete it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    without_newlines = ''.join(s.split('\n'))
    return without_newlines.translate(punctuation_table)


# Read the main transcript. Each usable line carries three tab-separated fields:
# utterance id, a middle field that is ignored here, and the sentence text.
lines = []
with open(os.path.join(data_dir, 'utt_spk_text.tsv'), 'r', encoding='utf-8') as f:
    lines = f.readlines()

forbidden = set(remove_chars)  # set lookup instead of scanning the list per character
examples = []
skipped = 0
for line in lines:
    fields = line.split('\t')
    if len(fields) != 3:
        # Blank or malformed line: skip it instead of crashing on the unpacking below.
        skipped += 1
        continue
    id_, _, sent = fields
    sent = preprocess(sent)
    if not sent:
        # Nothing left after cleaning; an empty cell would come back from the CSV as NaN.
        skipped += 1
        continue
    # Keep only sentences that contain none of the forbidden characters.
    if forbidden.isdisjoint(sent):
        examples.append((id_ + '.flac', sent))

if skipped:
    print("Skipped blank/malformed/empty transcript lines:", skipped)

data_df = pd.DataFrame(examples, columns=['path', 'sent'])
# The frame index is written as an extra first column; the loading cell selects
# the 'path' and 'sent' columns by name, so it is unaffected.
data_df.to_csv(os.path.join(data_dir, 'data_df.csv')) # save
print("Number of Training examples:", data_df.shape[0])
data_df.head(5)

Number of Training examples: 149569


Unnamed: 0,path,sent
0,0000f47c22.flac,මහවැලි ගඟට ගොස් ආපසු එන ගමනේදී
1,000101700f.flac,උන්වහන්සේ කපාපු
2,000107b539.flac,එය එතනින් අවසන් නොවී
3,00016825d3.flac,සිතින් අයහපතෙහි හැසිරීම නිසයි
4,0002205a57.flac,ඊට අවසරයද හිමිවූ බව ඇය කියන්නීය


As we can see, there are also some English sentences in the dataset, so we will go ahead and clean the dataset.

### Load data

In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All yields the same split every time.
# The character vocabulary is built by scanning train_df in row order, so without
# a seed the token ids (and therefore compatibility with saved checkpoints) would
# change from run to run.
SPLIT_SEED = 42

data_df = pd.read_csv(os.path.join(data_dir, 'data_df.csv'), usecols=['path', 'sent'])
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=SPLIT_SEED)
print(train_df.shape)
print(val_df.shape)
train_df.head()

(134612, 2)
(14957, 2)


Unnamed: 0,path,sent
8800,0f0339e0fd.flac,අතට අරගෙන බැලුවම ආච්චිගෙ බොරු දත් දෙකක්ලු
72286,7c42c3e476.flac,ස්තූතියි ලංකාදීපයට
18578,1fe1bd26e0.flac,එම සිදුවීම තුළින්
87200,958b0b82c0.flac,බුදුරජාණන් වහන්සේට
89946,9a35c473e4.flac,තීන්දු කර සෝර්බා අමතයි


### Vocabulary

In [5]:
def get_chars(train_df):
    """Collect the character vocabulary of the training sentences.

    Args:
        train_df: two-column DataFrame whose second column holds the sentence
            text (the first column, the audio path, is not used).

    Returns:
        list[str]: the four special symbols ``<pad>``, ``<unk>``, ``<sos>``,
        ``<eos>`` followed by every distinct character, in order of first
        appearance.

    Cells that are not strings (e.g. NaN produced when an empty sentence is read
    back from CSV) are skipped instead of raising ``TypeError``.
    """
    chars = ['<pad>', '<unk>', '<sos>', '<eos>']
    seen = set(chars)  # O(1) membership test; `chars` keeps the insertion order
    # Iterate the sentence column directly rather than materialising a row per index.
    for sent in train_df.iloc[:, 1]:
        if not isinstance(sent, str):
            continue
        for c in sent:
            if c not in seen:
                seen.add(c)
                chars.append(c)
    return chars
    

chars = get_chars(train_df)

# Forward (character -> id) and reverse (id -> character) lookup tables;
# a character's id is its position in `chars`.
char_to_token = dict(zip(chars, range(len(chars))))
token_to_char = {token: char for char, token in char_to_token.items()}

# Ids of the four special symbols.
pad_token, unk_token, sos_token, eos_token = (
    char_to_token[symbol] for symbol in ('<pad>', '<unk>', '<sos>', '<eos>')
)

print("Number of characters:", len(chars))
print(chars)

Number of characters: 82
['<pad>', '<unk>', '<sos>', '<eos>', 'අ', 'ත', 'ට', ' ', 'ර', 'ග', 'ෙ', 'න', 'බ', 'ැ', 'ල', 'ු', 'ව', 'ම', 'ආ', 'ච', '්', 'ි', 'ො', 'ද', 'ක', 'ස', 'ූ', 'ය', 'ං', 'ා', 'ී', 'ප', 'එ', 'ළ', 'ජ', 'ණ', 'හ', 'ේ', 'ෝ', 'ඇ', 'ඒ', 'ඊ', 'ඉ', 'උ', 'ථ', 'ඩ', 'ඳ', 'ෑ', 'ධ', 'ශ', 'ෆ', 'ඔ', 'ඹ', 'ඃ', 'භ', 'ෂ', 'ඥ', 'ඟ', 'ඓ', 'ඕ', 'ෛ', 'ඬ', 'ඌ', 'ෞ', 'ඡ', 'ඵ', 'ඝ', 'ෘ', 'ඤ', 'ඈ', 'ඨ', 'ඛ', 'ඞ', 'ඍ', 'ඣ', 'ඖ', 'ඪ', '–', 'ෲ', '෴', 'ෳ', 'ෟ']


In [6]:
# Directory under which TensorBoard summaries are written.
tensorboard_dir = 'tb_summary'

# Training dataset built from the training frame, the corpus folder and the
# character -> id table.
train_dataset = SpeechDataset(train_df, data_dir, char_to_token)

# Batched loader over the training set; pad_token is handed to the loader
# (presumably for padding sequences within a batch — confirm in data.py).
train_loader = AudioDataLoader(
    pad_token,
    train_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    num_workers=8,
)

### Instantiate model

In [7]:
# Set to True to resume from the checkpoint named below.
load = False

if load:
    saved_file = 'Trained Models/Training_2019-12-25 00:09:23.921978/las_model_6'
    # NOTE(review): `model` is only created in a later cell, so on a fresh kernel this
    # branch raises NameError — the load has to run after the model is instantiated.
    # map_location lets a checkpoint saved on another device load onto DEVICE.
    model.load_state_dict(torch.load(saved_file, map_location=DEVICE))
    # Checkpoints are named 'las_model_<epoch>'; parse the whole number after the last
    # underscore (taking only the final character breaks from epoch 10 onwards).
    start_epoch = int(saved_file.rsplit('_', 1)[-1]) + 1
    # os.listdir returns entries in arbitrary order, so sort before taking the last one.
    # NOTE(review): assumes the lexicographically last entry is the intended run — confirm.
    time = sorted(os.listdir(tensorboard_dir))[-1]  # use the last one
else:
    start_epoch = 0
    time = str(datetime.datetime.now())

name = f'first_amsgrad_{time}'
save_dir = os.path.join('trained_models_librispeech', name)
# makedirs also creates the parent folder when it is missing, and exist_ok makes
# re-running the cell harmless.
os.makedirs(save_dir, exist_ok=True)

In [8]:
# --- Encoder (Listener) settings ---
input_size = 128     # number of rows in the input spectrogram
hidden_dim = 640     # hidden size passed to each LSTM layer of the listener
num_layers = 4
dropout = 0.1
layer_norm = True
encoder = Listener(
    input_size,
    hidden_dim,
    num_layers,
    dropout=dropout,
    layer_norm=layer_norm,
)

# --- Decoder (AttendAndSpell) settings ---
hid_sz = 640
embed_dim = 50
vocab_size = len(chars)
decoder = AttendAndSpell(embed_dim, hid_sz, encoder.output_size, vocab_size)

# Everything needed to rebuild the same architecture later.
hyperparams = dict(
    input_size=input_size,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=dropout,
    layer_norm=layer_norm,
    hid_sz=hid_sz,
    embed_dim=embed_dim,
    vocab_size=vocab_size,
)

# Save the hyperparameters next to the checkpoints.
with open(os.path.join(save_dir, 'info.pickle'), 'wb') as f:
    pickle.dump(hyperparams, f)

# Assemble the full model (teacher forcing ratio starts at 1.0), move it to the
# selected device and switch it to training mode.
model = Seq2Seq(encoder, decoder, tf_ratio = 1.0, device=DEVICE)
model = model.to(DEVICE)
model.train()

Seq2Seq(
  (encoder): Listener(
    (layers): ModuleList(
      (0): piBLSTM(
        (lstm): LSTM(128, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (1): piBLSTM(
        (lstm): LSTM(2560, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (2): piBLSTM(
        (lstm): LSTM(2560, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (3): piBLSTM(
        (lstm): LSTM(2560, 640, batch_first=True, bidirectional=True)
        (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): AttendAndSpell(
    (embedding): Embedding(82, 50)
    (attention_layer): At

### Training

In [10]:
# model.load_state_dict(torch.load(os.path.join(save_dir, 'las_model_1')))
# model.train()

In [9]:
optimizer = optim.SGD(model.parameters(), lr=0.2)  # lr = 0.2 used in paper

# optimizer = optim.Adam(model.parameters(), amsgrad=True)

# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

log_interval = 5      # forwarded to train() together with the summary writer
print_interval = 40   # forwarded to train(); the log line reports a mean over this many batches

epochs = 20
load = False

summary_dir = os.path.join(tensorboard_dir, save_dir)
writer = SummaryWriter(summary_dir)
print('save_dir', save_dir)

try:
    # Start from start_epoch (0 for a fresh run) so that a resumed run continues the
    # checkpoint numbering instead of overwriting 'las_model_0', 'las_model_1', ...
    # NOTE(review): model.tf_ratio is not restored on resume; it restarts from the
    # value the model was constructed with.
    for epoch in range(start_epoch, epochs):
        print("\nTeacher forcing ratio:", model.tf_ratio)
        train(model, DEVICE, train_loader, optimizer, epoch, print_interval, writer, log_interval)
        # scheduler.step()                                 # Decrease learning rate
        torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}'))
        model.tf_ratio = max(model.tf_ratio - 0.025, 0.8)    # Decrease teacher force ratio
finally:
    # Flush and release the event file even when training is interrupted
    # (e.g. by KeyboardInterrupt).
    writer.close()

save_dir trained_models_librispeech/first_amsgrad_2019-12-30 03:41:23.080861

Teacher forcing ratio: 1.0
Training, Logging: Mean loss of previous 40 batches 



KeyboardInterrupt: 

In [None]:
### DOES DEEPER NETWORK HELP ?
YES

### DOES AMSGRAD HELP ?

### DOES LAYER NORMALIZATION HELP ?
YES, WITH SGD

### TEST

In [10]:
def decode_pred_sent(out, idx_to_char=None):
    """Turn model output scores into the predicted string.

    Args:
        out: tensor of per-step scores; a leading dimension of size 1 is squeezed
            away, after which each row is one time step and the last axis indexes
            the vocabulary.
        idx_to_char: optional mapping from token id to character. Defaults to the
            notebook-level ``token_to_char`` table.

    Returns:
        str: the highest-scoring character of every time step, concatenated.
    """
    if idx_to_char is None:
        idx_to_char = token_to_char
    # One argmax over the vocabulary axis and a single transfer to Python, instead
    # of a separate .item() call (and device sync) per time step.
    best_tokens = out.squeeze(0).argmax(dim=-1).tolist()
    return ''.join(idx_to_char[token] for token in best_tokens)


def decode_true_sent(y, idx_to_char=None):
    """Turn a sequence of target token ids into its string.

    Args:
        y: 1-D tensor of token ids (any iterable of elements exposing ``.item()``
            is also accepted).
        idx_to_char: optional mapping from token id to character. Defaults to the
            notebook-level ``token_to_char`` table.

    Returns:
        str: the characters for the ids in ``y``, concatenated in order.
    """
    if idx_to_char is None:
        idx_to_char = token_to_char
    # A tensor converts to plain Python ints in a single call; other iterables fall
    # back to per-element .item().
    tokens = y.tolist() if hasattr(y, 'tolist') else [t.item() for t in y]
    return ''.join(idx_to_char[token] for token in tokens)

In [11]:
num_sent = 10
model.eval()

# Build the dataset once; it does not change between samples.
# NOTE(review): samples are drawn from train_df, not val_df, so this shows fit on
# training data rather than held-out performance.
trial_dataset = SpeechDataset(train_df, data_dir, char_to_token)

# Gradients are not needed for inspection, so skip building the autograd graph.
with torch.no_grad():
    for _ in range(num_sent):

        # randrange excludes the upper bound; randint(0, n) could return n itself,
        # which is one past the last valid row.
        idx = random.randrange(train_df.shape[0])

        x, y = trial_dataset[idx]
        # plt.imshow(x[0,:,:].detach())

        # Model output
        print(y.shape)

        target = y.unsqueeze(dim=0).to(DEVICE)    # add a leading batch dimension
        data = x.permute(0, 2, 1).to(DEVICE)      # swap the last two axes of x
        loss, output = model(data, target)
        print(output.shape)
        print("True sent : ", decode_true_sent(y), end='\n\n')
        print("Pred sent : ", decode_pred_sent(output))
        print("Loss :", loss.item())
        print("\n")

torch.Size([21])
torch.Size([1, 21, 82])
True sent :  අනික රාහු කේතු කියලා<eos>

Pred sent :  ්නනනිවන<eos>නනවි<eos>ිනවින<eos>ා<eos>
Loss : 4.284475803375244


torch.Size([37])
torch.Size([1, 37, 82])
True sent :  ඒක උනත් වටිනවා මෙහෙ හරක් මඩු වලට වඩා<eos>

Pred sent :  ්නිවිනි<eos>වි<eos><eos>ාා<eos>ව<eos>නනනවනනි<eos>වින<eos>විා<eos>වාන<eos>
Loss : 4.271315574645996


torch.Size([26])
torch.Size([1, 26, 82])
True sent :  එවිට ලක්ෂ ගණන් නිරිසත්වයෝ<eos>

Pred sent :  ්නාන<eos>විි<eos><eos>විනා<eos>විනනනිි<eos>ා<eos><eos>
Loss : 4.331305027008057


torch.Size([22])
torch.Size([1, 22, 82])
True sent :  ඒවා කීපයක් සඳහන් කරමි<eos>

Pred sent :  ්නා<eos>වින<eos><eos>ි<eos>විනනා<eos>වින<eos><eos>
Loss : 4.337202072143555


torch.Size([13])
torch.Size([1, 13, 82])
True sent :  කලුරිය කරන්න<eos>

Pred sent :  ්නනනනන<eos>විනා<eos><eos>
Loss : 4.31797981262207


torch.Size([34])
torch.Size([1, 34, 82])
True sent :  එම මගුල් පොකුණ අඩියේ දක්නට ලැබුණි<eos>

Pred sent :  ්නනවනනනා<eos>විනිනනවනනන<eos><eo

### Trying with Torchtext

In [76]:
## Knowing the frequency of words

def process(s):
    """Return the elements of ``s`` as a list (the characters, when ``s`` is a string)."""
    return [element for element in s]

# Field describing how the sentence column is turned into tokens.
# NOTE(review): in legacy torchtext the `preprocessing` hook appears to run after
# tokenization, in which case `process` receives a token list rather than the raw
# string — confirm whether word-level or character-level tokens are intended.
si_field = Field(
    preprocessing=process,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    batch_first=True,
    tokenizer_language='si',
)

# Load the CSV; columns paired with None are presumably ignored, leaving only
# the third column ('sent') to be processed by si_field.
column_spec = [('index', None), ('unnamed', None), ('sent', si_field)]
dataset = TabularDataset(
    path=os.path.join(data_dir, 'temp.csv'),
    format='CSV',
    fields=column_spec,
)

In [77]:
# Build the vocabulary of si_field from the dataset. min_freq=2 is passed so that
# (presumably) tokens seen fewer than twice are left out — confirm against torchtext docs.
si_field.build_vocab(dataset, min_freq=2)
# Report how many entries the string-to-index table holds.
print(len(si_field.vocab.stoi))

7963
