## Language Model

Language models form an important part of any ASR or MT systems. Language models assist the model to output sentences which make more sense. They are trained with huge corporas of monolingual text, and are coupled with decoder to output sentences. 

There are two types of Language models integration techniques, namely Soft fusion and hard fusion.
We are going to integrate soft fusion into our decoder.

In [1]:
import os
import random
import datetime
import numpy as np
import pandas as pd

import torch
import torch.nn as nn       # neural Networks module of pytorch for extending
import torch.optim as optim     # Optimizers
import torch.nn.functional as F
import matplotlib.pyplot as plt

import spacy    # for English tokenization
import dill      # for saving field of the datasets

from torchtext.data import Field, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

### Creating train and validation datasets

In [2]:
# fields take in text and output tensors
# can add preprocessing pipelines
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else 'cpu'

en_field = Field(
    tokenize='spacy', 
    tokenizer_language='en', 
    lower=True, 
    init_token='<sos>', 
    eos_token='<eos>',
    batch_first=True,
    include_lengths=True,
)

In [3]:
def save_as_csv(max):
    data = []
    with open('librispeech-lm-norm.txt', 'rb') as text_file:
        data = text_file.readlines()

    data_df = pd.DataFrame(data[:max])
    data_df.to_csv('lm_corpus.csv', header=None)
    print(data_df.head())

  
NUM_SENTENCES = 100000

if not os.path.exists('lm_corpus.csv'):
    print('Saving as csv.. may take few minutes')
    save_as_csv(max=NUM_SENTENCES)

In [66]:
dataset = TabularDataset(
    path='lm_corpus.csv',
    format='CSV',
    fields=[('id', None), ('sent', en_field)]
)

val_data, train_data = dataset.split(split_ratio=0.05)

# build vocabularies
en_field.build_vocab(dataset, min_freq=5)
print('vocab size:', len(en_field.vocab.stoi))

BATCH_SIZE = 32

train_iterator, val_iterator = BucketIterator.splits(
        (train_data, val_data),
        batch_size = BATCH_SIZE,
        sort_key = lambda x: len(x.sent),
        shuffle=True,
        device=DEVICE
)

data = next(iter(train_iterator))
print(data.sent[0].shape)
print('Num training examples', len(train_iterator)*train_iterator.batch_size)

vocab size: 22226
torch.Size([32, 58])
Num training examples 95008


In [13]:
# save the datasets for later use
with open('en_field.Field', 'wb') as f:
    dill.dump(en_field, f)

### RNNLM Model

In [72]:
class RNNLM(nn.Module):
    ''' RNN Language Model '''

    def __init__(self, vocab_size, emb_dim, dim, n_layers, pad_token, dropout):
        super().__init__()
        self.dim = dim
        self.vocab_size = vocab_size
        self.n_layers = n_layers
        self.pad_token  = pad_token 
        
        self.dp1 = nn.Dropout(dropout)
        self.dp2 = nn.Dropout(dropout)
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(dim, vocab_size)

    def forward(self, x, lengths, hidden=None):
        
        if self.training:
            emb_x = self.dp1(self.embed(x))
            packed = pack_padded_sequence(emb_x, lengths, batch_first=True, enforce_sorted=False)
            out, hidden = self.rnn(packed, hidden)
            padded, _ = pad_packed_sequence(out, batch_first=True, padding_value=self.pad_token)
            out = self.linear(self.dp2(padded))
            return out, hidden
        else:
            self.rnn.flatten_parameters()
            
            # producing a single example
            h = torch.zeros((1*self.n_layers, 1,  self.dim)).to(DEVICE)
            c = torch.zeros((1*self.n_layers, 1, self.dim)).to(DEVICE)
            hidden = (h, c)
            outputs = []
            
            while len(outputs)<10:
                emb_x = self.dp1(self.embed(x))
                out, hidden = self.rnn(emb_x, hidden)
                out = self.linear(self.dp2(out))
                y_t = out.argmax(dim=2)
                outputs.append(y_t.item())
                x = y_t    # input for next time step
            return outputs
            

### Training Langauage Model

In [88]:
def train(model, train_iterator, optimizer, vocab_size, epoch, pad_token,
          print_interval, writer=None, log_interval=-1):
    
    running_loss = []
    date1 = datetime.datetime.now()
    for iter_n, batch in enumerate(train_iterator):
                
        X, lengths = batch.sent    # sentence
        y_out, _ = model(X, lengths)
        
        # A simple hack
        lengths = lengths - 1  # Since <eos> should not be fed to the model
        Y = F.pad(X[:,1:], (0,1), mode='constant', value=pad_token) 
     
        optimizer.zero_grad()
        loss = F.cross_entropy(y_out.view(-1, vocab_size), Y.reshape(-1))
        loss.backward()        
        optimizer.step()
        
        running_loss.append(loss.detach().item())    # update running loss
        
        # writing to console after print_interval batches
        if (iter_n+1) % print_interval == 0:
            date2 = datetime.datetime.now()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tMean Loss : {:.6f}\t time {}:'.format(
                epoch, iter_n * len(data), len(train_iterator)*train_iterator.batch_size,
                100. * iter_n / len(train_iterator), 
                np.mean(running_loss[-print_interval:]), 
                date2 - date1))
            date1 = date2

        # Logging in tensorboard
        if (iter_n) % log_interval == 0:
            if writer:
                global_step = epoch * len(train_iterator) + iter_n
                writer.add_scalar('Loss', np.mean(running_loss[-log_interval:]), global_step)            

In [90]:
vocab_size = len(en_field.vocab.stoi)
emb_dim = 100
dim = 1024  # lstm cell dimension
num_layers = 3

pad_token = en_field.vocab.stoi['<pad>']

model = RNNLM(vocab_size, emb_dim, dim, num_layers, pad_token, dropout=0.1)

model.to(DEVICE)   # move model to GPU
epochs = 10
optimizer = optim.Adam(model.parameters())
print_interval = 200

for epoch in range(epochs):
    train(model, train_iterator, optimizer, vocab_size, epoch, pad_token, print_interval)
    print('-'*10)

----------
----------


KeyboardInterrupt: 

## Sampling Novel sentences

a. Starting with sos token

In [92]:
x_in = torch.tensor(en_field.vocab.stoi['<sos>']).view((1,1)).to(DEVICE)
model.eval()
out = model(x_in, lengths=-1)
out

[7, 68, 0, 6, 3, 1, 0, 6, 3, 1]

In [93]:
for i in out:
    print(en_field.vocab.itos[i], end=' ')

b'a certain <unk> ' <eos> <pad> <unk> ' <eos> <pad> 

b. Starting with a word

In [95]:
sent_in = en_field.tokenize('What')
tokens = [en_field.vocab.stoi[s] for s in sent_in]
out = model(torch.tensor(tokens).view(1, -1).to(DEVICE), lengths=-1)
for i in out:
    print(en_field.vocab.itos[i], end=' ')

' <eos> <pad> <unk> ' <eos> <pad> <unk> ' <eos> 