In [1]:
import random
import torch
import torch.utils.data
from torch import nn
import numpy as np
from IPython.core.debugger import set_trace
# set device
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [2]:
import csv
jp_sentences = []
en_sentences = []
with open('data/kyoto_lexicon.csv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    # skip the header row
    startLooking = False
    for row in reader:
        if startLooking:
            jp_sentences.append(row[0])
            en_sentences.append(row[1])
        startLooking = True
print(jp_sentences[:5])
print(en_sentences[:5])
print(len(jp_sentences))
print(len(en_sentences))

['102世吉田日厚貫首', '1月15日：成人祭、新年祭', '1月3日：家運隆盛、商売繁盛祈願祭', '1月7日：七種粥神事', '21世紀COEプログラム']
['the 102nd head priest, Nikko TOSHIDA', '15th January: Seijin-sai (Adult Festival), the New Year Festival', '3rd January: Prayer Festival for the prosperity of family fortunes and business', '7th January: Nanakusa-gayu shinji (a divine service for a rice porridge with seven spring herbs to insure health for the new year)', 'The 21st Century Center Of Excellence Program']
51982
51982


# character-by-character prediction

In [3]:
# encoding and decoding characters
class CharacterTable:
    def __init__(self, charset):
        self.charset = charset
        self.charset = frozenset(self.charset)
        self.charlist = ['<null>', '<sos>', '<eos>'] + list(self.charset)
        # it is important that null is at index 0 since padding fills with zeroes
        self.vocab_size = len(self.charlist)
    def encode(self, char):
        '''convert from character to index
        can process (nested) list of characters'''
        if type(char) is type('asdf'):
            # char is a string
            return self.charlist.index(char)
        else:
            # char is a list of strings
            return [self.encode(char) for char in char]
    def decode(self, charInd):
        '''convert from index to character
        can process (nested) list of indices'''
        if type(charInd) is type(22):
            # charInd is an int
            return self.charlist[charInd]
        else:
            # charInd is a list of ints
            return [self.encode(charInd) for charInd in charInd]
jp_chartable = CharacterTable(set(''.join(jp_sentences)))
en_chartable = CharacterTable(set(''.join(en_sentences)))
print(en_chartable.encode([['a', 'b'], ['c', 'd']]))
print(jp_chartable.decode(1234))

[[108, 170], [42, 4]]
笛


In [4]:
# character-by-character prediction model
class CharacterPredictor(nn.Module):
    def __init__(self, chartable, embedding_dimensions=64, hidden_size=100):
        super(CharacterPredictor, self).__init__()
        # model constants
        self.embedding_dimensions = embedding_dimensions
        self.hidden_size = hidden_size
        self.chartable = chartable
        self.vocab_size = self.chartable.vocab_size
        # model layers
        self.embedding = nn.Embedding(self.vocab_size, embedding_dimensions)
        self.RNN = nn.LSTM(
            input_size=self.embedding_dimensions,
            hidden_size=self.hidden_size, 
            batch_first=True
        )
        # linear layer for converting from hidden state to softmax
        self.linear = nn.Sequential(
            nn.Linear(self.hidden_size, self.vocab_size),
            nn.LogSoftmax(dim=-1)
        )
    
    
    def forward(self, padded_seq, lengths):
        '''
        predicts sequence of characters at every step
        seq (batch, seq) padded tensor of character indices
        returns (batch, seq, vocab) softmaxes
        implicit teacher forcing by torch RNN
        '''
        seq_len = padded_seq.shape[1]
        padded_seq_embed = self.embedding(padded_seq) # (batch, seq, embed)
        packed_seq_embed = torch.nn.utils.rnn.pack_padded_sequence(padded_seq_embed, lengths, batch_first=True)
        packed_hidden_states, (h_final, cell_final) = self.RNN(packed_seq_embed)
        padded_hidden_states, input_sizes = torch.nn.utils.rnn.pad_packed_sequence(packed_hidden_states, batch_first=True, total_length=seq_len)
        # hidden_states (batch, seq, hidden) hidden states
        y_hat = self.linear(padded_hidden_states)
        # y_hat (batch, seq, vocab) softmaxes
        return y_hat
    
    
    def predict(self, padded_seq, lengths):
        pred = self.forward(padded_seq, lengths)
        # (batch, seq, vocab)
        maxInds = pred.max(2)[1]
        # (batch, seq)
        return pred, maxInds

In [5]:
# load data
def padded_train_test(sentences, chartable, train_test_split=.2, batch_size=500):
    '''
    small train_test_split means mostly train data
    ['hello world', ...], chartable, train_test_split -> (train data, test data) padded long tensors of character indices
    small train_test_split means mostly train data
    output shapes (train_size, maxlen), (test_size, maxlen)
    '''
    def pad_sequence(sentences):
        '''
        ['hello world', ...] -> train dataloader, test data loader
        outputs 2 dataloaders containing (seqs, lens)
        tensors are padded and sorted 
        '''
        sentence_indices = [chartable.encode(list(sentence)) for sentence in sentences]
        # list of list of indices
        lengths = torch.LongTensor([len(sentence) for sentence in sentence_indices])
        sentence_tensors = [torch.LongTensor(sentence).to(device) for sentence in  sentence_indices]
        padded = torch.nn.utils.rnn.pad_sequence(sentence_tensors, batch_first=True)
        lengths, perm_idx = lengths.sort(0, descending=True)
        # perm_idx is the permutation of sentence indices as sorted by length
        padded = padded[perm_idx]
        return padded, lengths
    
    length = len(sentences)
    # the index to separate train from test
    split = int(length * train_test_split)
    
    # shuffle before splitting so test doesn't just get the alphabetically sooner sentences
    sentences = random.sample(sentences, length)
    
    train_sentences = sentences[split:]
    test_sentences = sentences[:split]
    
    padded_train = pad_sequence(train_sentences)
    padded_test = pad_sequence(test_sentences)
    
    padded_trainset = torch.utils.data.TensorDataset(*padded_train)
    padded_testset = torch.utils.data.TensorDataset(*padded_test)
    
    padded_trainloader = torch.utils.data.DataLoader(padded_trainset, batch_size=batch_size, shuffle=False, num_workers=0)
    padded_testloader = torch.utils.data.DataLoader(padded_testset, batch_size=batch_size, shuffle=False, num_workers=0)
    # shuffle must be false to maintain sorting by length
    
    return padded_trainloader, padded_testloader
padded_en_trainloader, padded_en_testloader = padded_train_test(en_sentences, en_chartable)
padded_jp_trainloader, padded_jp_testloader = padded_train_test(jp_sentences, jp_chartable)

In [8]:
def train_char(jp=True, lr=.1, epochs=1):
    trainloader = padded_en_trainloader
    chartable = en_chartable
    if jp:
        trainloader = padded_jp_trainloader
        chartable = jp_chartable
    model = CharacterPredictor(chartable).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.NLLLoss()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        num_losses = 0
        for index, data in enumerate(trainloader, 0):
            model.zero_grad()
            padded_seq, lengths = data
            pred = model(padded_seq, lengths)
            
            batch_size = padded_seq.shape[0]
            maxlen = padded_seq.shape[1]
            vocab_size = pred.shape[-1]
            padded_seq_flat = padded_seq.view(batch_size*maxlen)
            pred_flat = pred.contiguous().view(batch_size*maxlen, vocab_size)
            
            loss = loss_fn(pred_flat, padded_seq_flat)
            loss.backward()
            optimizer.step()
            total_loss += loss.data
            num_losses += 1
        avg_loss = total_loss / num_losses
        losses.append(avg_loss)
        print('loss at epoch {}: {}'.format(epoch+1, avg_loss))
    print('final loss after {} epochs: {}'.format(epochs, losses[-1]))
    return model, losses
print('jp training')
jp_model, jp_losses = train_char()
print('en training')
en_model, en_losses = train_char(False)

jp training
loss at epoch 1: 4.9767913818359375
final loss after 1 epochs: 4.9767913818359375
en training
loss at epoch 1: 2.3771841526031494
final loss after 1 epochs: 2.3771841526031494


In [None]:
# metrics
def perplexity(pred, actual):
    '''
    pred (batch, seq, vocab) logsoftmax
    actual (batch, seq) longs
    geometric mean of product of p(next word | previous words) for whole sentence
    average (arithmetic mean) by batch
    '''
    batch_size, seq_len, vocab_size = pred.shape
    pred = torch.exp(pred)
    geo_means = [] # probabilities of correct characters
    for i in range(batch_size):
        probs = torch.index_select(pred[i], 1, actual[i])
        product = torch.sum(probs)
        geo_mean = torch.pow(product, 1/seq_len)
        geo_means.append(geo_mean)
    ### left off here debugging zero perplexity
    return sum(geo_means) / len(geo_means)
def print_metrics(model, name, testloader):
    loss_fn = nn.NLLLoss()
    losses = []
    sentence_accuracies = []
    character_accuracies = []
    for index, data in enumerate(testloader, 0):
        padded_seq, lengths = data
        pred, maxInds = model.predict(padded_seq, lengths)
        perplexity(pred, padded_seq)
        
        batch_size = padded_seq.shape[0]
        maxlen = padded_seq.shape[1]
        vocab_size = pred.shape[-1]
        
        padded_seq_flat = padded_seq.view(batch_size*maxlen)
        pred_flat = pred.contiguous().view(batch_size*maxlen, vocab_size)
        loss = loss_fn(pred_flat, padded_seq_flat).data
        
        correct_characters = torch.sum(max_ind == padded_seq)
        total_characters = batch_size*maxlen
        correct_sentences = 0
        total_sentences = batch_size
        
        for i in range(batch_size):
            if torch.all(maxInds[i] == padded_seq[i]):
                correct_sentences += 1
        sentence_accuracy = correct_sentences / total_sentences
        character_accuracy = correct_characters / total_characters
        
        losses.append(loss)
        sentence_accuracies.append(sentence_accuracy)
        character_accuracies.append(character_accuracy)
    loss_avg = sum(losses) / len(losses)
    sentence_accuracy_avg = sum(sentence_accuracies) / len(sentence_accuracies)
    character_accuracy_avg = sum(character_accuracies) / len(character_accuracies)
    print('model: {}, validation loss: {}, sentence accuracy: {}, character accuracy: {}'.format(name, loss_avg, sentence_accuracy_avg, character_accuracy_avg))
print_metrics(jp_model, 'jp character predictor', padded_jp_testloader)
print_metrics(en_model, 'en character predictor', padded_en_testloader)

In [None]:
torch.cuda.empty_cache()

In [None]:
1 + 1