In [1]:
import random
import torch
import torch.utils.data
from torch import nn
import numpy as np
from IPython.core.debugger import set_trace
# set device
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [2]:
import csv
jp_sentences = []
en_sentences = []
with open('data/kyoto_lexicon.csv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    # skip the header row
    startLooking = False
    for row in reader:
        if startLooking:
            jp_sentences.append(row[0])
            en_sentences.append(row[1])
        startLooking = True
print(jp_sentences[:5])
print(en_sentences[:5])
print(len(jp_sentences))
print(len(en_sentences))

['102世吉田日厚貫首', '1月15日：成人祭、新年祭', '1月3日：家運隆盛、商売繁盛祈願祭', '1月7日：七種粥神事', '21世紀COEプログラム']
['the 102nd head priest, Nikko TOSHIDA', '15th January: Seijin-sai (Adult Festival), the New Year Festival', '3rd January: Prayer Festival for the prosperity of family fortunes and business', '7th January: Nanakusa-gayu shinji (a divine service for a rice porridge with seven spring herbs to insure health for the new year)', 'The 21st Century Center Of Excellence Program']
51982
51982


# character-by-character prediction

In [3]:
# encoding and decoding characters
class CharacterTable:
    def __init__(self, charset):
        self.charset = charset
        self.charset = frozenset(self.charset)
        self.charlist = ['<null>'] + list(self.charset)
        # it is important that null is at index 0 since padding fills with zeroes
        self.vocab_size = len(self.charlist)
    def encode(self, char):
        '''convert from character to index
        can process (nested) list of characters'''
        if type(char) is type('asdf'):
            # char is a string
            return self.charlist.index(char)
        else:
            # char is a list of strings
            return [self.encode(char) for char in char]
    def decode(self, charInd):
        '''convert from index to character
        can process (nested) list of indices'''
        if type(charInd) is type(22):
            # charInd is an int
            return self.charlist[charInd]
        else:
            # charInd is a list of ints
            return [self.encode(charInd) for charInd in charInd]
jp_chartable = CharacterTable(set(''.join(jp_sentences)))
en_chartable = CharacterTable(set(''.join(en_sentences)))
print(en_chartable.encode([['a', 'b'], ['c', 'd']]))
print(jp_chartable.decode(1234))
print(jp_chartable.vocab_size, en_chartable.vocab_size)

[[161, 152], [38, 75]]
彦
3910 172


In [4]:
# sequence prediction model
class Predictor(nn.Module):
    def __init__(self, table, embedding_dimensions=64, hidden_size=100):
        super(Predictor, self).__init__()
        # model constants
        self.embedding_dimensions = embedding_dimensions
        self.hidden_size = hidden_size
        self.table = table
        self.vocab_size = self.table.vocab_size
        # model layers
        self.embedding = nn.Embedding(self.vocab_size, embedding_dimensions)
        self.RNN = nn.LSTM(
            input_size=self.embedding_dimensions,
            hidden_size=self.hidden_size, 
            batch_first=True
        )
        # linear layer for converting from hidden state to softmax
        self.linear = nn.Sequential(
            nn.Linear(self.hidden_size, self.vocab_size),
            nn.LogSoftmax(dim=-1)
        )
    
    
    def forward(self, padded_seq, lengths):
        '''
        predicts sequence of characters at every step
        seq (batch, seq) padded tensor of character indices
        returns (batch, seq, vocab) softmaxes
        implicit teacher forcing by torch RNN
        '''
        seq_len = padded_seq.shape[1]
        padded_seq_embed = self.embedding(padded_seq) # (batch, seq, embed)
        packed_seq_embed = torch.nn.utils.rnn.pack_padded_sequence(padded_seq_embed, lengths, batch_first=True)
        packed_hidden_states, (h_final, cell_final) = self.RNN(packed_seq_embed)
        padded_hidden_states, input_sizes = torch.nn.utils.rnn.pad_packed_sequence(packed_hidden_states, batch_first=True, total_length=seq_len)
        # hidden_states (batch, seq, hidden) hidden states
        y_hat = self.linear(padded_hidden_states)
        # y_hat (batch, seq, vocab) softmaxes
        return y_hat
    
    
    def predict(self, padded_seq, lengths):
        pred = self.forward(padded_seq, lengths)
        # (batch, seq, vocab)
        maxInds = pred.max(2)[1]
        # (batch, seq)
        return pred, maxInds

In [5]:
# load data
def padded_train_test(sentences, table, train_test_split=.2, batch_size=500, word=False):
    '''
    small train_test_split means mostly train data
    ['hello world', ...] or [['hello', 'world',...],...], table, train_test_split -> (train data, test data) padded tensor dataloaders
    small train_test_split means mostly train data
    output "shapes" (train_size, maxlen), (test_size, maxlen) with given batch size
    '''
    def pad_sequence(sentences):
        '''
        ['hello world', ...] or [['hello', 'world',...],...] -> (padded long tensor, lengths tensor)
        tensors are padded and sorted 
        '''
        sentence_indices = [table.encode(list(sentence)) for sentence in sentences]
        if word:
            sentence_indices = [table.encode(sentence) for sentence in sentences]
        # list of list of indices
        lengths = torch.LongTensor([len(sentence) for sentence in sentence_indices])
        sentence_tensors = [torch.LongTensor(sentence).to(device) for sentence in  sentence_indices]
        padded = torch.nn.utils.rnn.pad_sequence(sentence_tensors, batch_first=True)
        lengths, perm_idx = lengths.sort(0, descending=True)
        # perm_idx is the permutation of sentence indices as sorted by length
        padded = padded[perm_idx]
        return padded, lengths
    
    length = len(sentences)
    # the index to separate train from test
    split = int(length * train_test_split)
    
    # shuffle before splitting so test doesn't just get the alphabetically sooner sentences
    sentences = random.sample(sentences, length)
    
    train_sentences = sentences[split:]
    test_sentences = sentences[:split]
    
    padded_train = pad_sequence(train_sentences)
    padded_test = pad_sequence(test_sentences)
    
    padded_trainset = torch.utils.data.TensorDataset(*padded_train)
    padded_testset = torch.utils.data.TensorDataset(*padded_test)
    
    padded_trainloader = torch.utils.data.DataLoader(padded_trainset, batch_size=batch_size, shuffle=False, num_workers=0)
    padded_testloader = torch.utils.data.DataLoader(padded_testset, batch_size=batch_size, shuffle=False, num_workers=0)
    # shuffle must be false to maintain sorting by length
    
    return padded_trainloader, padded_testloader
padded_en_trainloader, padded_en_testloader = padded_train_test(en_sentences, en_chartable)
padded_jp_trainloader, padded_jp_testloader = padded_train_test(jp_sentences, jp_chartable)

In [6]:
def train_model(trainloader, table, lr=.1, epochs=100):
    model = Predictor(table).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.NLLLoss()
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        num_losses = 0
        for index, data in enumerate(trainloader, 0):
            model.zero_grad()
            padded_seq, lengths = data
            pred = model(padded_seq, lengths)
            
            batch_size = padded_seq.shape[0]
            maxlen = padded_seq.shape[1]
            vocab_size = pred.shape[-1]
            padded_seq_flat = padded_seq.view(batch_size*maxlen)
            pred_flat = pred.contiguous().view(batch_size*maxlen, vocab_size)
            
            loss = loss_fn(pred_flat, padded_seq_flat)
            loss.backward()
            optimizer.step()
            total_loss += loss.data
            num_losses += 1
        avg_loss = total_loss / num_losses
        losses.append(avg_loss)
        if (epoch + 1) % (epochs // 10) == 0:
            print('loss at epoch {}: {}'.format(epoch+1, avg_loss))
    print('final loss after {} epochs: {}'.format(epochs, losses[-1]))
    return model, losses

In [7]:
# save and load model
def get_state_path(name):
    return 'states/{}.pt'.format(name)
def save_model(model, name):
    torch.save(model, get_state_path(name))
def load_model(model, name):
    '''loads state dict into given model and returns it'''
    model = torch.load(get_state_path(name))
    return model

In [8]:
def initialize_models(should_train=True):
    global jp_model, en_model
    jp_model = Predictor(jp_chartable).to(device)
    jp_losses = None
    en_model = Predictor(en_chartable).to(device)
    en_losses = None
    if should_train:
        print('jp training')
        jp_model, jp_losses = train_model(padded_jp_trainloader, jp_chartable)
        save_model(jp_model, 'jp_char_model')
        print('en training')
        en_model, en_losses = train_model(padded_en_trainloader, en_chartable)
        save_model(en_model, 'en_char_model')
    else:
        jp_model = load_model(jp_model, 'jp_char_model')
        en_model = load_model(en_model, 'en_char_model')
    return jp_model, en_model
initialize_models(True)

jp training
loss at epoch 10: 0.8963164687156677
loss at epoch 20: 0.8341969847679138
loss at epoch 30: 0.803242564201355
loss at epoch 40: 0.7749859094619751
loss at epoch 50: 0.7511147856712341
loss at epoch 60: 0.730737030506134
loss at epoch 70: 0.7125517725944519
loss at epoch 80: 0.6955256462097168
loss at epoch 90: 0.6789075136184692
loss at epoch 100: 0.6623553037643433
final loss after 100 epochs: 0.6623553037643433
en training


  "type " + obj.__name__ + ". It won't be checked "


loss at epoch 10: 0.2676386535167694
loss at epoch 20: 0.18314088881015778
loss at epoch 30: 0.1319272667169571
loss at epoch 40: 0.10008544474840164
loss at epoch 50: 0.07900269329547882
loss at epoch 60: 0.06452611833810806
loss at epoch 70: 0.05410991609096527
loss at epoch 80: 0.04621864855289459
loss at epoch 90: 0.03998982906341553
loss at epoch 100: 0.034939952194690704
final loss after 100 epochs: 0.034939952194690704


(Predictor(
   (embedding): Embedding(3910, 64)
   (RNN): LSTM(64, 100, batch_first=True)
   (linear): Sequential(
     (0): Linear(in_features=100, out_features=3910, bias=True)
     (1): LogSoftmax()
   )
 ), Predictor(
   (embedding): Embedding(172, 64)
   (RNN): LSTM(64, 100, batch_first=True)
   (linear): Sequential(
     (0): Linear(in_features=100, out_features=172, bias=True)
     (1): LogSoftmax()
   )
 ))

In [9]:
# metrics
def perplexity_metric(pred, actual):
    '''
    pred (batch, seq, vocab) logsoftmax
    actual (batch, seq) longs
    geometric mean of product of p(next word | previous words) for whole sentence
    average (arithmetic mean) by batch
    '''
    batch_size, seq_len, vocab_size = pred.shape
    pred = pred.cpu()
    pred = torch.exp(pred)
    geo_means = [] # probabilities of correct characters
    for i in range(batch_size):
        product = 1
        num_factors = 0
        curr_pred = pred[i]
        curr_actual = actual[i]
        for t in range(seq_len):
            trueInd = curr_actual[t].item()
            # the character index at this timestep
            if trueInd != 0:
                # we don't care how well it predicts nulls
                predSoftmax = curr_pred[t]
                confidence = predSoftmax[trueInd].item()
                product *= confidence
                num_factors += 1
        geo_means.append(product ** (1/num_factors))
    return sum(geo_means) / len(geo_means)
def print_metrics(model, name, testloader, word=False):
    loss_fn = nn.NLLLoss()
    losses = []
    sentence_accuracies = []
    character_accuracies = []
    perplexities = []
    for index, data in enumerate(testloader, 0):
        padded_seq, lengths = data
        pred, maxInds = model.predict(padded_seq, lengths)
        
        perplexity = perplexity_metric(pred, padded_seq)
        
        batch_size = padded_seq.shape[0]
        maxlen = padded_seq.shape[1]
        vocab_size = pred.shape[-1]
        
        padded_seq_flat = padded_seq.view(batch_size*maxlen)
        pred_flat = pred.contiguous().view(batch_size*maxlen, vocab_size)
        loss = loss_fn(pred_flat, padded_seq_flat).item()
        
        correct_characters = torch.sum(maxInds == padded_seq).item()
        total_characters = batch_size*maxlen
        correct_sentences = 0
        total_sentences = batch_size
        
        for i in range(batch_size):
            if torch.all(maxInds[i] == padded_seq[i]):
                correct_sentences += 1
        sentence_accuracy = correct_sentences / total_sentences
        character_accuracy = correct_characters / total_characters
        
        losses.append(loss)
        sentence_accuracies.append(sentence_accuracy)
        character_accuracies.append(character_accuracy)
        perplexities.append(perplexity)
    loss_avg = sum(losses) / len(losses)
    sentence_accuracy_avg = sum(sentence_accuracies) / len(sentence_accuracies)
    character_accuracy_avg = sum(character_accuracies) / len(character_accuracies)
    perplexity_avg = sum(perplexities) / len(perplexities)
    if word:
        print('model: {}\n\tvalidation loss: {}\n\tsentence accuracy: {}\n\tword accuracy: {}\n\tperplexity: {}'.format(name, loss_avg, sentence_accuracy_avg, character_accuracy_avg, perplexity_avg))
    else:
        print('model: {}\n\tvalidation loss: {}\n\tsentence accuracy: {}\n\tcharacter accuracy: {}\n\tperplexity: {}'.format(name, loss_avg, sentence_accuracy_avg, character_accuracy_avg, perplexity_avg))

# english word-to-word
since the japanese model had to learn a mixture of character prediction and word prediction at the same time, let's see how the english model predicts words, and compare it to the japanese character predictor

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mthun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# tokenize sentences
tokenized_sentences = []
for sentence in en_sentences:
    tokenized = nltk.word_tokenize(sentence)
    if len(tokenized) > 0:
        tokenized_sentences.append(tokenized)
print(tokenized_sentences[0])

['the', '102nd', 'head', 'priest', ',', 'Nikko', 'TOSHIDA']


In [12]:
wordlist = []
for sentence in tokenized_sentences:
    for word in sentence:
        wordlist.append(word)
wordset = set(wordlist)
len(wordset)

43216

### that's way too many words!
let's limit the vocab size to 4000 to make the complexity theoretically similar to the japanese model

In [13]:
max_vocab_size = 4000
# word -> frequency
counts = {}
for word in wordlist:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1
sorted_wordset = sorted(list(wordset), key=lambda word: counts[word], reverse=True)
for word in sorted_wordset[:10]:
    print(word, counts[word], sep='\t')
vocab = set([])
for word in sorted_wordset:
    if len(vocab) < max_vocab_size:
        vocab.add(word)
len(vocab)

of	6995
(	6793
)	6769
the	5777
,	3457
no	2899
a	2872
Temple	1617
and	1278
in	1175


4000

In [15]:
# word encoding and decoding
class WordTable:
    def __init__(self, wordset):
        self.wordset = frozenset(wordset)
        self.wordlist = ['<null>', '<unk>'] + list(wordset)
        self.vocab_size = len(self.wordlist)
        
        
    def encode(self, word):
        '''
        expects word string or possibly nested list of word strings
        unks out-of-vocab words
        word(s) -> indices
        '''
        if type(word) == type('asdf'):
            if word in self.wordlist:
                return self.wordlist.index(word)
            else:
                # encode out-of-vocab words with unk
                return self.wordlist.index('<unk>')
        else:
            words = word
            return [self.encode(word) for word in words]
        
        
    def decode(self, wordInd):
        '''
        expects wordInd index or possibly nested list of word indices
        '''
        if type(wordInd) == type(123):
            return self.wordlist[wordInd]
        else:
            wordInds = wordInd
            return [self.decode(wordInd) for wordInd in wordInds]
wordtable = WordTable(vocab)
print(wordtable.decode(200))
print(wordtable.decode(wordtable.encode('why relu works')))
print(wordtable.vocab_size)

holding
<unk>
4002


In [16]:
# load data
padded_word_trainloader, padded_word_testloader = padded_train_test(tokenized_sentences, wordtable, word=True)

In [17]:
print('training english word model')
word_model, word_model_losses = train_model(padded_word_trainloader, wordtable)

training english word model
loss at epoch 10: 0.3745356500148773
loss at epoch 20: 0.3207111358642578
loss at epoch 30: 0.285001277923584
loss at epoch 40: 0.26759713888168335
loss at epoch 50: 0.2563556432723999
loss at epoch 60: 0.24777519702911377
loss at epoch 70: 0.24099139869213104
loss at epoch 80: 0.2355344444513321
loss at epoch 90: 0.23100298643112183
loss at epoch 100: 0.22712047398090363
final loss after 100 epochs: 0.22712047398090363


In [18]:
print_metrics(jp_model, 'jp character predictor', padded_jp_testloader)
print_metrics(en_model, 'en character predictor', padded_en_testloader)
print_metrics(word_model, 'english word to word', padded_word_testloader, word=True)

model: jp character predictor
	validation loss: 0.6796544634160542
	sentence accuracy: 0.0008571428571428572
	character accuracy: 0.9120515418737644
	perplexity: 0.002167849192874908
model: en character predictor
	validation loss: 0.037989046247232525
	sentence accuracy: 0.2675401635401635
	character accuracy: 0.9946749398749398
	perplexity: 0.6145452432555603
model: english word to word
	validation loss: 0.3671317316946529
	sentence accuracy: 0.2557873977873978
	word accuracy: 0.9556184802851468
	perplexity: 0.267146103237912


###  results and conclusions
* the english character-to-character predictor vastly outperformed the japanese character-to-character predictor
* This may be due to the higher complexity of the japanese text. There are 3912 characters in the japanese text, but only 174 in the english text.
* Since the model uses a linear layer and softmax with dimensions equal to the number of characters, that means there is a lot more parameters for the japanese model to learn.
* Due to the nature of the Japanese writing system, the model must effectively learn word-to-word and character-to-character prediction at the same time. This could be why it performed more poorly than the english model
* Although the english model had to learn english spelling (which is hard) and also how words fit together, it still performed much better
* character accuracies were pretty similar, but in all other metrics, the english model performed significantly better. Perhaps this means the japanese model only leanred some short-term patterns, but not long-term "meaning" to predict an entire sentence accurately.
* sentence accuracy is near zero with the japanese model, but 25% with the english model.
* High character accuracy despite low sentence accuracy and low perplexity shows that the japanese model is not confident in its correct predictions and does not have the ability to model language on a sentence level like the english model did.
* despite theoretically having similar model complexity, the english word predictor model significantly out-performs the japanese character-to-character model.
* perhaps the mixture of character and word prediction in japanese is what makes it so difficult. Or perhaps there are other factors that make it more difficult, such as japanese grammar being harder to model, or kanji having multiple meanings. Or maybe the translation is bad/inconsistent.
* To investigate whether the grammar is causing difficulty to model japanese, a translation dataset with Japanese text transcribed into the latin alphabet with words separated by spaces would be ideal. With such a dataset, "true" character to character and word to word prediction can be done in japanese.
* the english word prediction model significantly outperformed the japanese character prediction model. This shows that english is easier to model than japanese in both ways. It isn't just easier to model with character prediction, but word prediction as well. I think it is because there are 3 different writing systems in japanese and they are used and structured very differently. It is hard to model 3 writing systems at the same time.
* the english word prediction model had a similar sentence accuracy to the english character prediction model, despite the higher complexity. This is likely due to the fact that words are ordered in a logical way, while a sequence of characters is more arbitrary. When the character prediction model predicts a sentence, it implicitly models the words from the sequence of characters and the current word being built, all in one hidden state vector. This must be difficult. If it weren't for the increased complexity of the word prediction model due to the higher vocabulary space, it might outperform the character prediction model.
* the loss for the english character model went down do 10% of its original loss during training, but the word prediction model approximately halved its loss in the same number of epochs. This could be due to the complexity differences between the models.  
### future research
* use a parallel translated text transcribed to the latin alphabet. this would eliminate some of the effects of the writing system and come closer to comparing the grammars and vocabularies of the languages and their easiness to learn/model.
* investigate different hyperparameters (such as embedding/hidden dimensions, learning rates, etc.) maybe if all of the models had a larger hidden vector size, the japanese model might outperform the english models. I kept all hyperparameters the same across all models to reduce the number of independent variables, but maybe it would be better to increase model complexity according to the complexity of the task. perhaps scale embedding dimensions linearly with vocab size?
* compare english to other european languages to isolate the language from the writing system. Is english easier to model than italian? is spanish easier to model than italian? Here, it's hard to tell how much of the difference in performance is from the writing systems and how much is from the languages and grammars themselves.