In [1]:
import os
import sys
import datetime
import string
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

NAME = 'checking_gpu' # helps to differentiate between various training instances

import torch


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator, TabularDataset

sys.path.append(os.path.abspath(os.path.join('..')))

from models.las_model.data import SpeechDataset, AudioDataLoader
from models.las_model.listener import Listener
from models.las_model.attend_and_spell import AttendAndSpell
from models.las_model.seq2seq import Seq2Seq
#from models.las_model.utils import  train

In [2]:
# List every CUDA device visible to this process, one name per line.
device_names = [
    torch.cuda.get_device_name(device_idx)
    for device_idx in range(torch.cuda.device_count())
]
for device_name in device_names:
    print(device_name)

Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-16GB
Tesla P100-PCIE-16GB
Tesla P100-PCIE-16GB
GeForce GTX 1080 Ti


In [5]:
# Ask CUDA to number devices by PCI bus id.
# NOTE(review): this runs after torch has already enumerated the devices (the
# listing cell above), and the listing printed by the following cell is
# unchanged, so the setting appears to have no effect here. Presumably it has
# to be set before CUDA is first initialised (e.g. before `import torch`) —
# TODO confirm.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

In [6]:
# Re-list the visible CUDA devices to compare against the earlier listing.
visible_device_count = torch.cuda.device_count()
device_idx = 0
while device_idx < visible_device_count:
    print(torch.cuda.get_device_name(device_idx))
    device_idx += 1

Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-32GB
Tesla V100-PCIE-16GB
Tesla P100-PCIE-16GB
Tesla P100-PCIE-16GB
GeForce GTX 1080 Ti


In [2]:
# Always build a torch.device. The original fell back to the bare string 'cpu',
# so DEVICE had a different type depending on whether CUDA was available.
# NOTE(review): the GPU index 3 is hard-coded.
DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print('DEVICE :', DEVICE)

DEVICE : cuda:3


### Preprocessing

In [3]:
root_dir = '../../../Dataset/sinhala_clean'
data_dir = os.path.join(root_dir, 'data')


# Read the main transcript. Every non-blank line must contain exactly one
# double-quoted sentence, preceded by the utterance id (optionally wrapped in
# an opening parenthesis): splitting on '"' has to give exactly three parts.
with open(os.path.join(root_dir, 'si_lk.lines.txt'), 'r', encoding='utf-8') as f:
    lines = f.readlines()

examples = []
for line_no, line in enumerate(lines, start=1):
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        # instead of crashing on the three-way unpack below.
        continue
    parts = line.split('"')
    if len(parts) != 3:
        raise ValueError(f'Malformed transcript line {line_no}: {line!r}')
    id_, sent, _ = parts
    id_ = id_.replace("(", '').strip()
    sent = sent.strip()
    # The audio file of an utterance is named after its id.
    examples.append((id_ + '.wav', sent))


data_df = pd.DataFrame(examples, columns=['path', 'sent'])
data_df.to_csv(os.path.join(root_dir, 'data_df.csv')) # save
print("Number of Training examples:", data_df.shape[0])
data_df.head(5)

Number of Training examples: 1251


Unnamed: 0,path,sent
0,sin_2241_0329430812.wav,කෝකටත් මං වෙනදා තරම් කාලෙ ගන්නැතිව ඇඳ ගත්තා
1,sin_2241_0598895166.wav,ඇන්ජලීනා ජොලී කියන්නේ පසුගිය දිනවල බොහෝ සෙයින්...
2,sin_2241_0701577369.wav,ආර්ථික චින්තනය හා සාමාජීය දියුණුව ඇති කළ හැකිව...
3,sin_2241_0715400935.wav,ඉන් අදහස් වන්නේ විචාරාත්මක විනිවිද දැකීමෙන් තො...
4,sin_2241_0817100025.wav,අප යුද්ධයේ පළමු පියවරේදීම පරාද වී අවසානය


We have tried to remove all unnecessary characters from the dataset. Any others that remain will be replaced by the unknown token (`<unk>`) during training.

### Load data

In [4]:
from sklearn.model_selection import train_test_split

data_df = pd.read_csv(os.path.join(root_dir, 'data_df.csv'), usecols=['path', 'sent'])
# Seed the split. The character vocabulary (and therefore every token id) is
# derived from train_df, so an unseeded split makes checkpoints saved in one
# session incompatible with the vocabulary rebuilt after a kernel restart.
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=42)
print("Num training example:", train_df.shape)
print("Num validation example", val_df.shape)
train_df.head()

Num training example: (1125, 2)
Num validation example (126, 2)


Unnamed: 0,path,sent
1179,sin_9228_3395907889.wav,මෙන්න කෙනෙක් දැන් මගෙත් එක්ක ඔට්ටු අල්ලන්ට ආවා
1055,sin_7183_1886625053.wav,අනෙක් එවැනි ම රටක් වනුයේ බොලීවියාවයි
161,sin_2282_3763216109.wav,එදාට ඇය නවකතා පොතක් අස්සෙ දාල ලියුමක් එවනවා
1218,sin_9228_7175568893.wav,අලුත්ම චරිතය තමයි මාදුළුවාවේ සෝභිත හාමුදුරුවෝ
72,sin_2241_6534612621.wav,මේ පිළීබඳව ජනයාගේ විරෝධය විවේචනය පැන නැඟූහ


### Vocabulary

In [5]:
# Scratch check: dtype produced by an embedding lookup on random token ids.
embedding = nn.Embedding(4, 20)
token_ids = torch.randint(0, 4, (32,))
embedding(token_ids).dtype

torch.float32

In [6]:
def get_chars(train_df):
    """Collect the character vocabulary of a two-column (path, sentence) frame.

    Returns a list that starts with the four special tokens
    ('<pad>', '<unk>', '<sos>', '<eos>') followed by every distinct character
    of the sentences, in order of first appearance.
    """
    # A dict keeps insertion order and gives de-duplication for free.
    seen = dict.fromkeys(['<pad>', '<unk>', '<sos>', '<eos>'])
    for _path, sentence in train_df.itertuples(index=False, name=None):
        for ch in sentence:
            seen.setdefault(ch)
    return list(seen)
    

chars = get_chars(train_df)

# Lookup tables in both directions; a token id is the character's position in
# `chars` (get_chars returns each character once, so the mapping is one-to-one).
token_to_char = dict(enumerate(chars))
char_to_token = {char: token for token, char in token_to_char.items()}

# Ids of the four special tokens.
pad_token, unk_token, sos_token, eos_token = (
    char_to_token[special] for special in ('<pad>', '<unk>', '<sos>', '<eos>')
)

print("Number of characters:", len(chars))
print(chars)

Number of characters: 70
['<pad>', '<unk>', '<sos>', '<eos>', 'ම', 'ෙ', 'න', '්', ' ', 'ක', 'ද', 'ැ', 'ග', 'ත', 'එ', 'ඔ', 'ට', 'ු', 'අ', 'ල', 'ආ', 'ව', 'ා', 'ි', 'ර', 'ය', 'ේ', 'බ', 'ො', 'ී', 'ඇ', 'ප', 'ස', 'ච', 'ළ', 'ෝ', 'භ', 'හ', 'ඳ', 'ජ', 'ධ', 'ඟ', 'ූ', 'ං', 'ඉ', 'ඬ', 'ණ', 'ඒ', 'ඹ', 'ඝ', 'ෂ', 'ඨ', 'ශ', 'උ', 'ථ', 'ෑ', 'ෞ', 'ඩ', 'ඕ', 'ඈ', 'ඓ', 'ඵ', 'ඊ', 'ඡ', 'ඛ', 'ඤ', 'ෆ', 'ෛ', 'ඌ', 'ඪ']


### Instantiate model

In [12]:
# --- Encoder (Listener) ---
input_size = 128     # size of each input frame fed to the first LSTM (presumably spectrogram rows — TODO confirm)
hidden_dim = 768     # hidden size of every encoder LSTM
num_layers = 4
dropout = 0.1
layer_norm = False
encoder = Listener(input_size, hidden_dim, num_layers,
                   dropout=dropout, layer_norm=layer_norm)

# --- Decoder (AttendAndSpell) ---
hid_sz = 768
vocab_size = len(chars)
decoder = AttendAndSpell(hid_sz, encoder.output_size, vocab_size)

# Collected so they can be stored next to the checkpoints.
hyperparams = dict(
    input_size=input_size,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=dropout,
    layer_norm=layer_norm,
    hid_sz=hid_sz,
    vocab_size=vocab_size,
)

# Start with a teacher-forcing ratio of 1.0.
model = Seq2Seq(encoder, decoder, tf_ratio=1.0, device=DEVICE)
model = model.to(DEVICE)
model.train()

Seq2Seq(
  (encoder): Listener(
    (layers): ModuleList(
      (0): piBLSTM(
        (lstm): LSTM(128, 768, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (1): piBLSTM(
        (lstm): LSTM(3072, 768, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (2): piBLSTM(
        (lstm): LSTM(3072, 768, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
      (3): piBLSTM(
        (lstm): LSTM(3072, 768, batch_first=True, bidirectional=True)
        (dp): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): AttendAndSpell(
    (attention_layer): Attention(
      (linear1): Linear(in_features=3840, out_features=1920, bias=True)
      (linear2): Linear(in_features=1920, out_features=1, bias=True)
    )
    (pre_lstm_cell): LSTMCell(3142, 768)
    (post_lstm_cell): LSTMCell(3840, 768)
    (mlp): Sequential(
      (0): Linear(in_features=768, out_features=7

### Training

In [8]:
# model.load_state_dict(torch.load(os.path.join(save_dir, 'las_model_1')))
# model.train()

# load = False
# if load:
#     saved_file = 'Trained Models/Training_2019-12-25 00:09:23.921978/las_model_6'
#     model.load_state_dict(torch.load(saved_file))
#     start_epoch = int(saved_file[-1]) + 1
#     time = os.listdir(tensorboard_dir)[-1]  # use the last one 

# Every run gets its own directory, named after NAME and the current timestamp.
time = str(datetime.datetime.now())
save_dir = os.path.join('trained_models', f'{NAME}_{time}')
# makedirs also creates the parent 'trained_models' directory when it does not
# exist yet (os.mkdir raised FileNotFoundError in that case) and tolerates an
# already existing directory.
os.makedirs(save_dir, exist_ok=True)

# Saving hyperparams
with open(os.path.join(save_dir, 'info.pickle'), 'wb') as f:
    pickle.dump(hyperparams, f)

In [9]:
def train(model, device, train_loader, optimizer, epoch,
          print_interval, writer=None, log_interval=-1, scheduler=None):
    """Run one training epoch and print a one-line summary.

    Args:
        model: called as ``model(data, target)``; must return ``(loss, output)``.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches; ``len()`` is only
            needed when logging to ``writer``.
        optimizer: optimizer that updates the model parameters.
        epoch: epoch number, used for the logging step and the summary line.
        print_interval: the printed mean loss covers the last
            ``print_interval`` batches of the epoch.
        writer: optional object with ``add_scalar(tag, value, step)``.
        log_interval: every ``log_interval`` batches the mean of the last
            ``log_interval`` losses is written to ``writer``. Values <= 0
            (including the default) disable logging.
        scheduler: accepted for call-site compatibility; not used here.
    """
    model.train()
    running_loss = []
    start_time = datetime.datetime.now()

    # The original tested `(batch_idx + 1) % log_interval == 0` unconditionally.
    # With the default of -1 that is always true (and 0 raises
    # ZeroDivisionError), so "disabled" actually logged every batch.
    logging_enabled = writer is not None and log_interval > 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        loss, _ = model(data, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss.append(loss.item())    # update running loss

        # Writing to tensorboard
        if logging_enabled and (batch_idx + 1) % log_interval == 0:
            global_step = epoch * len(train_loader) + batch_idx
            writer.add_scalar('Loss', np.mean(running_loss[-log_interval:]), global_step)

    # After epoch ends
    elapsed = datetime.datetime.now() - start_time
    recent_losses = running_loss[-print_interval:]
    # An empty loader yields no losses; report nan without numpy's warning.
    mean_loss = np.mean(recent_losses) if recent_losses else float('nan')
    print('Epoch: {}\tMean Loss : {:.6f}\t lr {}\t time {}:'.format(
        epoch, mean_loss,
        optimizer.param_groups[0]['lr'],
        elapsed))
    

    
def decode_pred_sent(out):
    """Turn model output scores into text, one character per step.

    `out` is squeezed on dim 0 and iterated row by row; the index of each row's
    maximum is looked up in the global `token_to_char` table.
    """
    steps = out.squeeze(0)
    return ''.join(
        token_to_char[step.max(dim=0).indices.item()] for step in steps
    )


def decode_true_sent(y):
    """Map a sequence of token-id tensors back to text via the global `token_to_char`."""
    return ''.join(token_to_char[token.item()] for token in y)

def validate_personal(model, num_sent, dataset, show=False):
    """Print true vs. predicted text and the loss for randomly drawn examples.

    Args:
        model: called as ``model(data, target)``; must return ``(loss, output)``.
        num_sent: number of examples to draw (with replacement).
        dataset: indexable collection of ``(x, y)`` pairs supporting ``len()``.
        show: when True, also display ``log2`` of ``x[0]`` as an image.

    Raises:
        ValueError: from ``random.randrange`` when ``dataset`` is empty and
            ``num_sent`` is positive.

    Note:
        The model is switched to eval mode and left in eval mode on return.
    """
    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(num_sent):
            # randrange excludes the upper bound. The original
            # randint(0, len(dataset)) is inclusive and could pick
            # idx == len(dataset), which is out of range.
            idx = random.randrange(len(dataset))

            x, y = dataset[idx]
            if show:
                # Only build the figure when it is actually displayed.
                plt.imshow(x[0, :, :].detach().log2())
                plt.show()
            target = y.unsqueeze(dim=0).to(DEVICE)   # add a batch dimension
            data = x.permute(0, 2, 1).to(DEVICE)     # swap the last two axes for the model
            loss, output = model(data, target)
            print("\n")
            print("True sent : ", decode_true_sent(y))
            print("Pred sent : ", decode_pred_sent(output))
            print("Loss :", loss.item())


In [10]:
# def get_loader(model):
#     train_dataset = SpeechDataset(train_df, data_dir, char_to_token, n_fft=2048, hop_length=512)
    
#     if epoch < 3:
#         train_loader = train_loader = AudioDataLoader(pad_token, train_dataset, 
#                                                       batch_size=64, num_workers=8, 
#                                                       drop_last=True, shuffle=True)
#     elif epoch >= 3 and epoch < 5:
#         train_loader = train_loader = AudioDataLoader(pad_token, train_dataset, 
#                                                       batch_size=32, num_workers=8, 
#                                                       drop_last=True, shuffle=True)
#     elif epoch >= 5:
#         train_loader = train_loader = AudioDataLoader(pad_token, train_dataset, 
#                                                       batch_size=8, num_workers=8, 
#                                                       drop_last=True, shuffle=True)
#     return train_loader

In [None]:
# optimizer = optim.SGD(model.parameters(), lr=0.001)  # lr = 0.2 used in paper
optimizer = optim.Adadelta(model.parameters())

# optimizer = optim.Adam(model.parameters(), amsgrad=True)

# scheduler = optim.lr_scheduler.
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)

log_interval = 5      # write the loss to TensorBoard every 5 batches
print_interval = 50   # the per-epoch summary averages the last 50 batch losses

epochs = 100
load = False

train_dataset = SpeechDataset(train_df, data_dir, char_to_token, n_fft=2048, hop_length=512)
train_loader = AudioDataLoader(pad_token, train_dataset,
                               batch_size=32, num_workers=8,
                               drop_last=True, shuffle=True)

writer = SummaryWriter(save_dir)
print('save_dir', save_dir)


# range(1, epochs) stopped one epoch short; epochs + 1 runs all `epochs` epochs.
for epoch in range(1, epochs + 1):
    # train_loader = get_loader(epoch)

    train(model, DEVICE, train_loader, optimizer, epoch, print_interval, writer, log_interval)

    # Decrease tf_ratio by 0.05 every 10 epochs, never letting it go below 0.
    if epoch % 10 == 0:
        model.tf_ratio = max(0.0, model.tf_ratio - 0.05)
        validate_personal(model, 1, train_dataset)
        print("tf_ratio", model.tf_ratio)
    # scheduler.step()

    # save model (one checkpoint per epoch)
    torch.save(model.state_dict(), os.path.join(save_dir, f'las_model_{epoch}')) #save the model

# Flush pending TensorBoard events and release the event file.
writer.close()

save_dir trained_models/Adadelta_clean_one_hot_2019-12-31 03:21:10.792257
Epoch: 1	Mean Loss : 4.246294	 lr 1.0	 time 0:00:53.842809:
Epoch: 2	Mean Loss : 4.241021	 lr 1.0	 time 0:00:52.663319:


In [None]:
### DOES DEEPER NETWORK HELP ?
# YES

### DOES AMSGRAD HELP ?
# (no result recorded yet)

### DOES LAYER NORMALIZATION HELP ?
# YES, WITH SGD

### TEST

In [None]:
# Spot-check ten randomly drawn utterances.
# NOTE(review): this samples from train_dataset, not from the held-out val_df split.
validate_personal(model, num_sent=10, dataset=train_dataset)

### Trying with Torchtext

In [None]:
## Knowing the frequency of words

def process(s):
    """Return the items of `s` as a new list (for a string: its characters)."""
    return [item for item in s]

# Field describing how the sentence column is turned into tokens.
si_field_options = dict(
    tokenizer_language='si',
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
    preprocessing=process,
)
si_field = Field(**si_field_options)

# Only the third CSV column is loaded (as 'sent'); the first two are mapped to None.
temp_csv_path = os.path.join(data_dir, 'temp.csv')
csv_fields = [('index', None), ('unnamed', None), ('sent', si_field)]
dataset = TabularDataset(path=temp_csv_path, format='CSV', fields=csv_fields)

In [None]:
# Build the field's vocabulary from the dataset (min_freq=2) and report its size.
si_field.build_vocab(dataset, min_freq=2)
si_stoi = si_field.vocab.stoi
print(len(si_stoi))