## Import

In [1]:
import os
import argparse
import pickle as pkl


import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import tarfile
import pickle
import sys

## utility function

In [2]:
def get_batches(inputs, targets, batch_size, shuffle=True):
    """Divide a dataset (usually the training set) into mini-batches of a given size. This is a
    'generator'"""
    
    if inputs.shape[0] % batch_size != 0:
        raise RuntimeError('The number of data points must be a multiple of the batch size.')
    num_batches = inputs.shape[0] // batch_size

    if shuffle:
        idxs = np.random.permutation(inputs.shape[0])
        inputs = inputs[idxs, :]
        targets = targets[idxs]

    for m in range(num_batches):
        yield inputs[m*batch_size:(m+1)*batch_size, :], \
              targets[m*batch_size:(m+1)*batch_size]       

def split_train_test(inputs, targets, train_size, test_size, shuffle=True):
    if shuffle:
        idxs = np.random.permutation(inputs.shape[0])
    else:
        idxs = range(inputs.shape[0])
    train_idxs = idxs[:train_size]
    test_idxs = idxs[train_size: train_size+test_size]
    print(f"Split train and test, train size: {len(train_idxs)}, test_size: {len(test_idxs)}")
    return train_idxs, test_idxs

In [3]:
def decodeChinese(fpath, savePath = None):
    with open(fpath, "r", encoding="gbk") as f:
        data = f.read()
        if savePath: 
            with open(savePath, "w") as w:
                w.write(data)
    return data

def to_var(tensor, cuda=False):
    """Wraps a Tensor in a Variable, optionally placing it on the GPU.

        Arguments:
            tensor: A Tensor object.
            cuda: A boolean flag indicating whether to use the GPU.

        Returns:
            A Variable object, on the GPU if cuda==True.
    """
    if cuda:
        return Variable(tensor.cuda())
    else:
        return Variable(tensor)

def save_loss_plot(train_losses, val_losses, opts):
    """Saves a plot of the training and validation loss curves.
    """
    plt.figure()
    plt.plot(range(len(train_losses)), train_losses)
    plt.plot(range(len(val_losses)), val_losses)
    plt.title('BS={}, nhid={}'.format(opts.batch_size, opts.hidden_size), fontsize=20)
    plt.xlabel('Epochs', fontsize=16)
    plt.ylabel('Loss', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(opts.checkpoint_path, 'loss_plot.pdf'))
    plt.close()

def checkpoint(encoder, decoder, idx_dict, opts):
    """Saves the current encoder and decoder models, along with idx_dict, which
    contains the char_to_index and index_to_char mappings, and the start_token
    and end_token values.
    """
    with open(os.path.join(opts.checkpoint_path, 'encoder.pt'), 'wb') as f:
        torch.save(encoder, f)

    with open(os.path.join(opts.checkpoint_path, 'decoder.pt'), 'wb') as f:
        torch.save(decoder, f)

    with open(os.path.join(opts.checkpoint_path, 'idx_dict.pkl'), 'wb') as f:
        pkl.dump(idx_dict, f)

## data loader

In [23]:
class Corpus():
    def __init__(self, corpus):
        print("Initialize a corpus object")
        self.corpus = corpus
        
    def save(self, fpath):
        with open(fpath, "wb+") as fd:
            pickle.dump(self, fd)
            
    def preprocess(self, context_size, save=None):
        sentences = self.corpus.split()
        tokens = set()
        num_of_data = 0
        for s in sentences:
            if len(s) + 2 > context_size:
                num_of_data += len(s) + 2 - context_size
                for char in s:
                    tokens.add(char)
        tokens.add("END") #indicator for end of sentence
        tokens.add("STA") #indicator for start of sentence
        self.vocal = sorted(list(tokens))
        token_to_index = {char: index for (index, char) in enumerate(self.vocal)}
        index_to_token = {token_to_index[char]: char for char in token_to_index}
        vocSize = len(tokens)
                
        self.vocSize = vocSize
        print(f"Corpus has vocSize {self.vocSize}, including 'STA' and 'END' indicator")
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.context_size = context_size
        print(f"Corpus has context size {self.context_size}")
        # init tensors
        input_tensor = torch.zeros(num_of_data, context_size, vocSize)
        output_tensor = torch.zeros(num_of_data, vocSize)

        data_idx = 0
        for s in sentences:
            s = list(s)
            s.insert(0, "STA")
            s.append("END")
            if len(s) > context_size:
                for j in range(len(s) - context_size - 1):
                    for c in range(context_size):
                        context_index = token_to_index[s[j+c]]
                        input_tensor[data_idx][c][context_index] = 1
                    output_index = token_to_index[s[j+c + 1]]
                    output_tensor[data_idx][output_index] = 1
                    data_idx += 1
        data = {}
        data["input_tensor"] = input_tensor
        data["output_tensor"] = output_tensor
        
        print(f"input_tensor shape: {input_tensor.shape}")
        print(f"output_tensor shape: {output_tensor.shape}")
        self.data = data
        
        if save:
            self.save(save)
        
    def tensor_to_word(self, tensor):
        if len(tensor.shape) == 1:
            if tensor.shape[0] != self.vocSize:
                raise ValueError(f"Bad tensor input. Should be either in shape(vocSize) or in shape(batch, vocSize), vocSize={self.vocSize}")
            index = int(torch.argmax(tensor))
            return self.index_to_token[index] 
        if len(tensor.shape) == 2:
            if tensor.shape[1] != self.vocSize:
                raise ValueError(f"Bad tensor input. Should be either in shape(vocSize) or in shape(batch, vocSize), vocSize={self.vocSize}")
            indexes = torch.argmax(tensor, dim=1)
            text = ''.join([self.index_to_token[int(i)] for i in indexes])
            return text
        else:
            raise ValueError(f"Bad tensor input. Should be either in shape(vocSize) or in shape(batch, vocSize), vocSize={self.vocSize}")
     
    def words_to_tensor(self, words):
        # words is list
        batch_size = len(words)
        output_tensor = torch.zeros((batch_size, self.vocSize))
        for i in range(batch_size):
            w = words[i]
            index = self.token_to_index[w]
            output_tensor[i][index] = 1
        return output_tensor
        

## Train and evaluation

In [24]:
class LanguageModel(nn.Module):
    def __init__(self, vocSize, embedding_dim, context_size, linear=False):
        super(LanguageModel, self).__init__()
        self.vocSize = vocSize
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.linear = linear
        # layers
        if linear:
            self.embedding_layer = nn.Linear(self.vocSize, self.embedding_dim)
        else:
            self.embedding_layer = nn.Embedding(self.vocSize, self.embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocSize)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, inputs):
        # input batch x 3
        batch_size = inputs.shape[0]
        embeds = self.embedding_layer(inputs).view((batch_size, -1)) # batch x (context_size x embedding_dim)
        o1 = F.sigmoid(self.linear1(embeds)) # batch x 128
        o2 = self.linear2(o1) # batch x vocSize
        o3 = self.softmax(o2) # batch x vocSize
#         o3 = F.log_softmax(o2, dim=1)
        return o3

    

In [25]:
def validation_loss(test_input, test_output, model, loss_func, cuda, batch_size=100):
    with torch.no_grad():
        if cuda:
            model.cuda()
        loss_t = 0.0
        for i, (input_b, out_b) in enumerate(get_batches(test_input, test_output, batch_size)):
            input_b = to_var(input_b, cuda)
            out_b = to_var(out_b, cuda)
            output_predict = model(input_b)
            loss_t += loss_func(output_predict, out_b).item()
    return loss_t/(i+1)

def show_next_word(languageModel, corpus, words, top = 5):
    with torch.no_grad():
        context_size = len(words)
        if context_size != corpus.context_size:
            raise ValueError(f"Context size doesn't match need {corpus.context_size}")
        input_tensor = corpus.words_to_tensor(words).view(1, context_size, corpus.vocSize)
        if not languageModel.linear:
            input_tensor = torch.argmax(to_var(input_tensor), dim=2).type(torch.LongTensor)
        else:
            input_tensor = to_var(input_tensor)
        output = languageModel(input_tensor)
        idxs = torch.argsort(output, descending=True, dim=1)
        print(f"Top {top} candidates for {words}: \n")
        for i in range(top):
            print(f"{corpus.index_to_token[int(idxs[0][i])]},       prob:{output[0][int(idxs[0][i])]}")
        return output

def train(model, training, validation, opts):
    '''
    Return models of last training and of best validation loss 
    '''
    training_input_tensor = training["input_tensor"]
    training_output_tensor = training["output_tensor"]
    training_batch_size = training["batch_size"]
    
    validation_input_tensor = validation["input_tensor"]
    validation_output_tensor = validation["output_tensor"]
    validation_batch_size = validation["batch_size"]
    
    loss_f =  opts.get("loss_function", F.cross_entropy)
    optimizer = opts.get("optimizer", optim.Adam)
    epoches = opts.get("epoches", 10)
    lr = opts.get("learning_rate", 0.01)
    loss_report = opts.get("loss_report", 100)
    cuda = opts.get("cuda", False)
    
    optimizer = optimizer(model.parameters(), lr=lr)
    if cuda:
        model.cuda()
    
    best_loss = None
    best_param = None
    for epoch in range(epoches):
        print(f"Epoch: {epoch + 1}")
        for i, (input_b, out_b) in enumerate(get_batches(training_input_tensor, training_output_tensor, training_batch_size)):
            input_b = to_var(input_b, cuda)
            out_b = to_var(out_b, cuda)
            optimizer.zero_grad()   # zero the gradient buffers
            output = model(input_b)
            loss = loss_f(output, out_b)
            loss.backward()
            optimizer.step()    #
            if (i+1)%loss_report == 0:
                print("Batch:", i+1, "loss:",loss.item())
        # validation error
        with torch.no_grad():
            loss_t = validation_loss(validation_input_tensor, validation_output_tensor, model, loss_f, cuda, validation_batch_size)
            if best_loss is None or loss_t < best_loss:
                best_loss = loss_t
                best_param = model.state_dict()
            print("test loss:",loss_t)
    return (best_param, best_loss), model

In [26]:
corpus = decodeChinese("./renjianshige.txt", "decoded.txt")

In [27]:
my_corpus = Corpus(corpus)

Initialize a corpus object


In [28]:
my_corpus.preprocess(3, "corpus.pk")

Corpus has vocSize 2362, including 'STA' and 'END' indicator
Corpus has context size 3
input_tensor shape: torch.Size([50514, 3, 2362])
output_tensor shape: torch.Size([50514, 2362])


In [29]:
my_corpus.tensor_to_word(my_corpus.data["input_tensor"][102])

'生时代'

In [30]:
languageModel = LanguageModel(my_corpus.vocSize, 48, my_corpus.context_size)
print(languageModel)

train_idxs, validation_idxs = split_train_test(my_corpus.data["input_tensor"], my_corpus.data["output_tensor"], 45500, 5000)
train_input = my_corpus.data["input_tensor"][train_idxs]
train_output = my_corpus.data["output_tensor"][train_idxs]
validation_input = my_corpus.data["input_tensor"][validation_idxs]
validation_output = my_corpus.data["output_tensor"][validation_idxs]
train_input = torch.argmax(train_input, dim=2).type(torch.LongTensor)
train_output = torch.argmax(train_output, dim=1).type(torch.LongTensor)
validation_input = torch.argmax(validation_input, dim=2).type(torch.LongTensor)
validation_output = torch.argmax(validation_output, dim=1).type(torch.LongTensor)

training = {
    "input_tensor":train_input,
    "output_tensor": train_output,
    "batch_size": 100
}

validation = {
    "input_tensor":validation_input,
    "output_tensor": validation_output,
    "batch_size": 100
}

opts = {
    "loss_function": F.cross_entropy,
    "optimizer": optim.Adam,
    "epoches": 50,
    "learning_rate": 0.001,
    "loss_report": 100,
    "cuda": False
}

(best_param, best_loss), languageModel = train(languageModel, training, validation, opts)


LanguageModel(
  (embedding_layer): Embedding(2362, 48)
  (linear1): Linear(in_features=144, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=2362, bias=True)
  (softmax): Softmax(dim=1)
)
Split train and test, train size: 45500, test_size: 5000
Epoch: 1
Batch: 100 loss: 7.720731735229492
Batch: 200 loss: 7.708575248718262
Batch: 300 loss: 7.718446731567383
Batch: 400 loss: 7.662111759185791
test loss: 7.683767595291138
Epoch: 2
Batch: 100 loss: 7.718103408813477
Batch: 200 loss: 7.677324295043945
Batch: 300 loss: 7.679450511932373
Batch: 400 loss: 7.713963031768799
test loss: 7.681263380050659
Epoch: 3
Batch: 100 loss: 7.649297714233398
Batch: 200 loss: 7.659726619720459
Batch: 300 loss: 7.7179670333862305
Batch: 400 loss: 7.698071479797363
test loss: 7.680337448120117
Epoch: 4
Batch: 100 loss: 7.679096221923828
Batch: 200 loss: 7.7183027267456055
Batch: 300 loss: 7.666772365570068
Batch: 400 loss: 7.674604415893555
test loss: 7.678219842910766
Epoch: 5
B

Batch: 200 loss: 7.648004055023193
Batch: 300 loss: 7.632894515991211
Batch: 400 loss: 7.657996654510498
test loss: 7.670269241333008
Epoch: 46
Batch: 100 loss: 7.608029842376709
Batch: 200 loss: 7.588305473327637
Batch: 300 loss: 7.577999114990234
Batch: 400 loss: 7.6479949951171875
test loss: 7.670562448501587
Epoch: 47
Batch: 100 loss: 7.63812255859375
Batch: 200 loss: 7.63809871673584
Batch: 300 loss: 7.657971382141113
Batch: 400 loss: 7.6072258949279785
test loss: 7.670093250274658
Epoch: 48
Batch: 100 loss: 7.65797233581543
Batch: 200 loss: 7.6183037757873535
Batch: 300 loss: 7.648054599761963
Batch: 400 loss: 7.6281657218933105
test loss: 7.670127363204956
Epoch: 49
Batch: 100 loss: 7.647970199584961
Batch: 200 loss: 7.647793769836426
Batch: 300 loss: 7.6279473304748535
Batch: 400 loss: 7.5690717697143555
test loss: 7.669312181472779
Epoch: 50
Batch: 100 loss: 7.648627758026123
Batch: 200 loss: 7.638344764709473
Batch: 300 loss: 7.608561992645264
Batch: 400 loss: 7.6481995582580

## Test

In [32]:
show_next_word(languageModel, my_corpus, '我对你')
show_next_word(languageModel, my_corpus, '起已希')
show_next_word(languageModel, my_corpus, ['STA', '我', '有'])
show_next_word(languageModel, my_corpus, ['STA', '我', 'END'])
show_next_word(languageModel, my_corpus, '你好吗')
show_next_word(languageModel, my_corpus, '就就就')

Top 5 candidates for 我对你: 

的,       prob:0.9999771118164062
不,       prob:1.8050252037937753e-05
。,       prob:3.6775013541046064e-06
是,       prob:1.2362371535346028e-06
？,       prob:5.76659653361844e-09
Top 5 candidates for 起已希: 

。,       prob:0.9261553883552551
？,       prob:0.05110378563404083
我,       prob:0.018049441277980804
的,       prob:0.004071414470672607
不,       prob:0.00038540392415598035
Top 5 candidates for ['STA', '我', '有']: 

，,       prob:0.9998370409011841
不,       prob:0.00014015873603057116
。,       prob:2.229893289040774e-05
的,       prob:4.910542088509828e-07
是,       prob:9.66033741889305e-09
Top 5 candidates for ['STA', '我', 'END']: 

，,       prob:0.9963118433952332
是,       prob:0.003097912995144725
不,       prob:0.0005893931956961751
。,       prob:9.716601425679983e-07
的,       prob:2.5536909392664953e-11
Top 5 candidates for 你好吗: 

？,       prob:1.0
。,       prob:2.8754064018698955e-08
是,       prob:4.575181122845606e-09
不,       prob:1.4009696180927733

tensor([[1.0953e-13, 1.5217e-13, 2.4427e-13,  ..., 1.7105e-13, 1.7842e-13,
         1.3332e-12]])

In [52]:
languageModel = LanguageModel(my_corpus.vocSize, 52, my_corpus.context_size, True)
print(languageModel)
loss_f = F.cross_entropy

epoches = 10
lr = 0.001
optimizer = optim.Adam(languageModel.parameters(), lr=lr)
loss_report = 100
for epoch in range(epoches):
    train_idxs, test_idxs = split_train_test(my_corpus.data["input_tensor"], my_corpus.data["output_tensor"], 39500, 10000)
    train_input = my_corpus.data["input_tensor"][train_idxs]
    train_output = my_corpus.data["output_tensor"][train_idxs]
    test_input = my_corpus.data["input_tensor"][test_idxs]
    test_output = my_corpus.data["output_tensor"][test_idxs]
    for i, (input_b, out_b) in enumerate(get_batches(train_input, train_output, 100)):
        input_b = to_var(input_b)
        out_b = torch.argmax(to_var(out_b), dim=1).type(torch.LongTensor)
        optimizer.zero_grad()   # zero the gradient buffers
        output = languageModel(input_b)
        loss = loss_f(output, out_b)
        loss.backward()
        optimizer.step()    #
        if (i+1)%loss_report == 0:
            print("Batch:", i+1, "loss:",loss.item())
    # validation error
    with torch.no_grad():
        loss_t = validation_loss(test_input, test_output, languageModel, loss_f, 100)
#         if loss_t < lbest:
#             lbest = loss_t
#             bestp = mlp.state_dict()
       
        print("test loss:",loss_t)

Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.688051223754883
Batch: 200 loss: 7.678033828735352
Batch: 300 loss: 7.7079877853393555
test loss: 7.714195432662964
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.7279953956604
Batch: 200 loss: 7.707977294921875
Batch: 300 loss: 7.757962703704834
test loss: 7.709945483207703
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.7180376052856445
Batch: 200 loss: 7.712050914764404
Batch: 300 loss: 7.670931339263916
test loss: 7.69998601436615
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.688957691192627
Batch: 200 loss: 7.6944427490234375
Batch: 300 loss: 7.688025951385498
test loss: 7.675041689872741
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.698200225830078
Batch: 200 loss: 7.639062404632568
Batch: 300 loss: 7.647737979888916
test loss: 7.676521506309509
Split train and test, train size: 39500,

In [47]:
def test_validation_loss(test_input, test_output, model, loss_func, batch_size=100):
    with torch.no_grad():
        loss_t = 0.0
        for i, (input_b, out_b) in enumerate(get_batches(test_input, test_output, batch_size)):
            if model.linear:
                input_b = to_var(input_b)
            output_predict = model(input_b)
            loss_t += loss_func(output_predict, out_b).item()
    return loss_t/(i+1)
languageModel = LanguageModel(250, 16, 3, False)
print(languageModel)
loss_f = F.cross_entropy
epoches = 50
lr = 0.1
optimizer = optim.Adam(languageModel.parameters(), lr=lr)
loss_report = 1000
for epoch in range(epoches):
    print(f"Epoch: {epoch}")
    train_input = torch.tensor(train_inputs).type(torch.LongTensor)
    train_output = torch.tensor(train_targets).type(torch.LongTensor)
    test_input = torch.tensor(valid_inputs).type(torch.LongTensor)
    test_output = torch.tensor(valid_targets).type(torch.LongTensor)
    for i, (input_b, out_b) in enumerate(get_batches(train_input, train_output, 100)):
#         input_b = torch.argmax(to_var(input_b), dim=2).type(torch.LongTensor)
#         out_b = torch.argmax(to_var(out_b), dim=1).type(torch.LongTensor)
        optimizer.zero_grad()   # zero the gradient buffers
        output = languageModel(input_b)
        loss = loss_f(output, out_b)
        loss.backward()
        optimizer.step()    #
        if (i+1)%loss_report == 0:
            print("Batch:", i+1, "loss:",loss.item())
    # validation error
    with torch.no_grad():
        loss_t = test_validation_loss(test_input, test_output, languageModel, loss_f, 100)
#         if loss_t < lbest:
#             lbest = loss_t
#             bestp = mlp.state_dict()
       
        print("test loss:",loss_t)

LanguageModel(
  (embedding_layer): Embedding(250, 16)
  (linear1): Linear(in_features=48, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=250, bias=True)
)
Epoch: 0




Batch: 1000 loss: 4.065740585327148
Batch: 2000 loss: 3.846798896789551
Batch: 3000 loss: 3.850691795349121
test loss: 3.7236439556203864
Epoch: 1
Batch: 1000 loss: 3.903912305831909
Batch: 2000 loss: 3.47682523727417
Batch: 3000 loss: 3.4247121810913086
test loss: 3.6603608218572474
Epoch: 2
Batch: 1000 loss: 3.6736810207366943
Batch: 2000 loss: 3.6680121421813965
Batch: 3000 loss: 4.133667945861816
test loss: 3.6497012225530483
Epoch: 3
Batch: 1000 loss: 3.8373239040374756
Batch: 2000 loss: 3.752830743789673
Batch: 3000 loss: 3.7197813987731934
test loss: 3.657488513249223
Epoch: 4
Batch: 1000 loss: 3.8906915187835693
Batch: 2000 loss: 3.8080813884735107
Batch: 3000 loss: 3.625755548477173
test loss: 3.6180806816265147
Epoch: 5
Batch: 1000 loss: 3.8452494144439697
Batch: 2000 loss: 3.647465705871582
Batch: 3000 loss: 3.6182546615600586
test loss: 3.635104233731506
Epoch: 6
Batch: 1000 loss: 4.3388590812683105
Batch: 2000 loss: 3.405104637145996
Batch: 3000 loss: 3.182595729827881
tes