## Import

In [2]:
import os
import argparse
import pickle as pkl


import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import tarfile
import pickle
import sys

## utility function

In [3]:
def get_batches(inputs, targets, batch_size, shuffle=True):
    """Divide a dataset (usually the training set) into mini-batches of a given size. This is a
    'generator'"""
    
    if inputs.shape[0] % batch_size != 0:
        raise RuntimeError('The number of data points must be a multiple of the batch size.')
    num_batches = inputs.shape[0] // batch_size

    if shuffle:
        idxs = np.random.permutation(inputs.shape[0])
        inputs = inputs[idxs, :]
        targets = targets[idxs]

    for m in range(num_batches):
        yield inputs[m*batch_size:(m+1)*batch_size, :], \
              targets[m*batch_size:(m+1)*batch_size]       

def split_train_test(inputs, targets, train_size, test_size, shuffle=True):
    if shuffle:
        idxs = np.random.permutation(inputs.shape[0])
    else:
        idxs = range(inputs.shape[0])
    train_idxs = idxs[:train_size]
    test_idxs = idxs[train_size: train_size+test_size]
    print(f"Split train and test, train size: {len(train_idxs)}, test_size: {len(test_idxs)}")
    return train_idxs, test_idxs

In [4]:
def decodeChinese(fpath, savePath = None):
    with open(fpath, "r", encoding="gbk") as f:
        data = f.read()
        if savePath: 
            with open(savePath, "w") as w:
                w.write(data)
    return data

def to_var(tensor, cuda=False):
    """Wraps a Tensor in a Variable, optionally placing it on the GPU.

        Arguments:
            tensor: A Tensor object.
            cuda: A boolean flag indicating whether to use the GPU.

        Returns:
            A Variable object, on the GPU if cuda==True.
    """
    if cuda:
        return Variable(tensor.cuda())
    else:
        return Variable(tensor)

def save_loss_plot(train_losses, val_losses, opts):
    """Saves a plot of the training and validation loss curves.
    """
    plt.figure()
    plt.plot(range(len(train_losses)), train_losses)
    plt.plot(range(len(val_losses)), val_losses)
    plt.title('BS={}, nhid={}'.format(opts.batch_size, opts.hidden_size), fontsize=20)
    plt.xlabel('Epochs', fontsize=16)
    plt.ylabel('Loss', fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(opts.checkpoint_path, 'loss_plot.pdf'))
    plt.close()

def checkpoint(encoder, decoder, idx_dict, opts):
    """Saves the current encoder and decoder models, along with idx_dict, which
    contains the char_to_index and index_to_char mappings, and the start_token
    and end_token values.
    """
    with open(os.path.join(opts.checkpoint_path, 'encoder.pt'), 'wb') as f:
        torch.save(encoder, f)

    with open(os.path.join(opts.checkpoint_path, 'decoder.pt'), 'wb') as f:
        torch.save(decoder, f)

    with open(os.path.join(opts.checkpoint_path, 'idx_dict.pkl'), 'wb') as f:
        pkl.dump(idx_dict, f)

## data loader

In [5]:
class Corpus():
    def __init__(self, corpus):
        print("Initialize a corpus object")
        self.corpus = corpus
        
    def save(self, fpath):
        with open(fpath, "wb+") as fd:
            pickle.dump(self, fd)
            
    def preprocess(self, context_size, save=None):
        sentences = self.corpus.split()
        tokens = set()
        num_of_data = 0
        for s in sentences:
            if len(s) + 2 > context_size:
                num_of_data += len(s) + 2 - context_size
                for char in s:
                    tokens.add(char)
        tokens.add("END") #indicator for end of sentence
        tokens.add("STA") #indicator for start of sentence
        self.vocal = sorted(list(tokens))
        token_to_index = {char: index for (index, char) in enumerate(self.vocal)}
        index_to_token = {token_to_index[char]: char for char in token_to_index}
        vocSize = len(tokens)
                
        self.vocSize = vocSize
        print(f"Corpus has vocSize {self.vocSize}, including 'STA' and 'END' indicator")
        self.token_to_index = token_to_index
        self.index_to_token = index_to_token
        self.context_size = context_size
        print(f"Corpus has context size {self.context_size}")
        # init tensors
        input_tensor = torch.zeros(num_of_data, context_size, vocSize)
        output_tensor = torch.zeros(num_of_data, vocSize)

        data_idx = 0
        for s in sentences:
            s = list(s)
            s.insert(0, "STA")
            s.append("END")
            if len(s) > context_size:
                for j in range(len(s) - context_size - 1):
                    for c in range(context_size):
                        context_index = token_to_index[s[j+c]]
                        input_tensor[data_idx][c][context_index] = 1
                    output_index = token_to_index[s[j+c + 1]]
                    output_tensor[data_idx][output_index] = 1
                    data_idx += 1
        data = {}
        data["input_tensor"] = input_tensor
        data["output_tensor"] = output_tensor

        self.data = data

        if save:
            self.save(save)
        
    def tensor_to_word(self, tensor):
        if len(tensor.shape) == 1:
            if tensor.shape[0] != self.vocSize:
                raise ValueError(f"Bad tensor input. Should be either in shape(vocSize) or in shape(batch, vocSize), vocSize={self.vocSize}")
            index = int(torch.argmax(tensor))
            return self.index_to_token[index] 
        if len(tensor.shape) == 2:
            if tensor.shape[1] != self.vocSize:
                raise ValueError(f"Bad tensor input. Should be either in shape(vocSize) or in shape(batch, vocSize), vocSize={self.vocSize}")
            indexes = torch.argmax(tensor, dim=1)
            text = ''.join([self.index_to_token[int(i)] for i in indexes])
            return text
        else:
            raise ValueError(f"Bad tensor input. Should be either in shape(vocSize) or in shape(batch, vocSize), vocSize={self.vocSize}")
     
    def words_to_tensor(self, words):
        # words is list
        batch_size = len(words)
        output_tensor = torch.zeros((batch_size, self.vocSize))
        for i in range(batch_size):
            w = words[i]
            index = self.token_to_index[w]
            output_tensor[i][index] = 1
        return output_tensor
        

## Train and evaluation

In [6]:
def validation_loss(test_input, test_output, model, loss_func, batch_size=100):
    with torch.no_grad():
        loss_t = 0.0
        for i, (input_b, out_b) in enumerate(get_batches(test_input, test_output, batch_size)):
            if model.linear:
                input_b = to_var(input_b)
            else:
                input_b = torch.argmax(to_var(input_b), dim=2).type(torch.LongTensor)
            out_b = torch.argmax(to_var(out_b), dim=1).type(torch.LongTensor)
            output_predict = model(input_b)
            loss_t += loss_func(output_predict, out_b).item()
    return loss_t/(i+1)

def show_next_word(languageModel, corpus, words):
    with torch.no_grad():
        context_size = len(words)
        if context_size != corpus.context_size:
            raise ValueError(f"Context size doesn't match need {corpus.context_size}")
        input_tensor = corpus.words_to_tensor(words).view(1, context_size, corpus.vocSize)
        if not languageModel.linear:
            input_tensor = torch.argmax(to_var(input_tensor), dim=2).type(torch.LongTensor)
        else:
            input_tensor = to_var(input_tensor)
        output = languageModel(input_tensor)
        return output

In [7]:
class LanguageModel(nn.Module):
    def __init__(self, vocSize, embedding_dim, context_size, linear=True):
        super(LanguageModel, self).__init__()
        self.vocSize = vocSize
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.linear = linear
        # layers
        if linear:
            self.embedding_layer = nn.Linear(self.vocSize, self.embedding_dim)
        else:
            self.embedding_layer = nn.Embedding(self.vocSize, self.embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocSize)
        self.softmax = nn.Softmax(dim=2)
        
    def forward(self, inputs):
        batch_size = inputs.shape[0]
        embeds = self.embedding_layer(inputs).view((batch_size,1, -1))
        o1 = F.sigmoid(self.linear1(embeds))
        o2 = self.linear2(o1)
        o3 = self.softmax(o2).view(batch_size, self.vocSize)
        return o3

    

In [8]:
corpus = decodeChinese("./renjianshige.txt", "decoded.txt")

In [9]:
my_corpus = Corpus(corpus)

Initialize a corpus object


In [10]:
my_corpus.preprocess(3, "data.pk")

Corpus has vocSize 2362, including 'STA' and 'END' indicator
Corpus has context size 3


In [11]:
my_corpus.tensor_to_word(my_corpus.data["input_tensor"][102])

'生时代'

## With linear

In [52]:
languageModel = LanguageModel(my_corpus.vocSize, 52, my_corpus.context_size, True)
print(languageModel)
loss_f = F.cross_entropy

epoches = 10
lr = 0.001
optimizer = optim.Adam(languageModel.parameters(), lr=lr)
loss_report = 100
for epoch in range(epoches):
    train_idxs, test_idxs = split_train_test(my_corpus.data["input_tensor"], my_corpus.data["output_tensor"], 39500, 10000)
    train_input = my_corpus.data["input_tensor"][train_idxs]
    train_output = my_corpus.data["output_tensor"][train_idxs]
    test_input = my_corpus.data["input_tensor"][test_idxs]
    test_output = my_corpus.data["output_tensor"][test_idxs]
    for i, (input_b, out_b) in enumerate(get_batches(train_input, train_output, 100)):
        input_b = to_var(input_b)
        out_b = torch.argmax(to_var(out_b), dim=1).type(torch.LongTensor)
        optimizer.zero_grad()   # zero the gradient buffers
        output = languageModel(input_b)
        loss = loss_f(output, out_b)
        loss.backward()
        optimizer.step()    #
        if (i+1)%loss_report == 0:
            print("Batch:", i+1, "loss:",loss.item())
    # validation error
    with torch.no_grad():
        loss_t = validation_loss(test_input, test_output, languageModel, loss_f, 100)
#         if loss_t < lbest:
#             lbest = loss_t
#             bestp = mlp.state_dict()
       
        print("test loss:",loss_t)

Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.688051223754883
Batch: 200 loss: 7.678033828735352
Batch: 300 loss: 7.7079877853393555
test loss: 7.714195432662964
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.7279953956604
Batch: 200 loss: 7.707977294921875
Batch: 300 loss: 7.757962703704834
test loss: 7.709945483207703
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.7180376052856445
Batch: 200 loss: 7.712050914764404
Batch: 300 loss: 7.670931339263916
test loss: 7.69998601436615
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.688957691192627
Batch: 200 loss: 7.6944427490234375
Batch: 300 loss: 7.688025951385498
test loss: 7.675041689872741
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.698200225830078
Batch: 200 loss: 7.639062404632568
Batch: 300 loss: 7.647737979888916
test loss: 7.676521506309509
Split train and test, train size: 39500,

## With Embedding

In [73]:
languageModel = LanguageModel(my_corpus.vocSize, 52, my_corpus.context_size, False)
print(languageModel)
loss_f = F.cross_entropy
epoches = 10
lr = 0.001
optimizer = optim.Adam(languageModel.parameters(), lr=lr)
loss_report = 100
for epoch in range(epoches):
    train_idxs, test_idxs = split_train_test(my_corpus.data["input_tensor"], my_corpus.data["output_tensor"], 39500, 10000)
    train_input = my_corpus.data["input_tensor"][train_idxs]
    train_output = my_corpus.data["output_tensor"][train_idxs]
    test_input = my_corpus.data["input_tensor"][test_idxs]
    test_output = my_corpus.data["output_tensor"][test_idxs]
    for i, (input_b, out_b) in enumerate(get_batches(train_input, train_output, 100)):
        input_b = torch.argmax(to_var(input_b), dim=2).type(torch.LongTensor)
        out_b = torch.argmax(to_var(out_b), dim=1).type(torch.LongTensor)
        optimizer.zero_grad()   # zero the gradient buffers
        output = languageModel(input_b)
        loss = loss_f(output, out_b)
        loss.backward()
        optimizer.step()    #
        if (i+1)%loss_report == 0:
            print("Batch:", i+1, "loss:",loss.item())
    # validation error
    with torch.no_grad():
        loss_t = validation_loss(test_input, test_output, languageModel, loss_f, 100)
#         if loss_t < lbest:
#             lbest = loss_t
#             bestp = mlp.state_dict()
       
        print("test loss:",loss_t)

LanguageModel(
  (embedding_layer): Embedding(2362, 52)
  (linear1): Linear(in_features=156, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=2362, bias=True)
  (softmax): Softmax(dim=2)
)
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.657498359680176
Batch: 200 loss: 7.714248180389404
Batch: 300 loss: 7.688559532165527
test loss: 7.682587690353394
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.6651458740234375
Batch: 200 loss: 7.641382217407227
Batch: 300 loss: 7.72933292388916
test loss: 7.6657306623458865
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.670405864715576
Batch: 200 loss: 7.652737617492676
Batch: 300 loss: 7.701430797576904
test loss: 7.65942198753357
Split train and test, train size: 39500, test_size: 10000
Batch: 100 loss: 7.6443047523498535
Batch: 200 loss: 7.5981950759887695
Batch: 300 loss: 7.640997409820557
test loss: 7.65095675945282
Split train 

In [85]:
v = show_next_word(languageModel, my_corpus, '学生时')

In [86]:
my_corpus.tensor_to_word(v)

'，'

In [13]:
data_obj = pickle.load(open('csc421/a1-release/data.pk', 'rb'))
vocab = data_obj['vocab']
train_inputs, train_targets = data_obj['train_inputs'], data_obj['train_targets']
valid_inputs, valid_targets = data_obj['valid_inputs'], data_obj['valid_targets']
test_inputs, test_targets = data_obj['test_inputs'], data_obj['test_targets']

In [14]:
len(vocab)

250

In [15]:
train_inputs.shape

(372500, 3)

In [16]:
train_targets.shape

(372500,)

In [None]:
def test_validation_loss(test_input, test_output, model, loss_func, batch_size=100):
    with torch.no_grad():
        loss_t = 0.0
        for i, (input_b, out_b) in enumerate(get_batches(test_input, test_output, batch_size)):
            if model.linear:
                input_b = to_var(input_b)
            output_predict = model(input_b)
            loss_t += loss_func(output_predict, out_b).item()
    return loss_t/(i+1)
languageModel = LanguageModel(250, 16, 3, False)
print(languageModel)
loss_f = F.cross_entropy
epoches = 50
lr = 0.1
optimizer = optim.Adam(languageModel.parameters(), lr=lr)
loss_report = 1000
for epoch in range(epoches):
    print(f"Epoch: {epoch}")
    train_input = torch.tensor(train_inputs).type(torch.LongTensor)
    train_output = torch.tensor(train_targets).type(torch.LongTensor)
    test_input = torch.tensor(valid_inputs).type(torch.LongTensor)
    test_output = torch.tensor(valid_targets).type(torch.LongTensor)
    for i, (input_b, out_b) in enumerate(get_batches(train_input, train_output, 100)):
#         input_b = torch.argmax(to_var(input_b), dim=2).type(torch.LongTensor)
#         out_b = torch.argmax(to_var(out_b), dim=1).type(torch.LongTensor)
        optimizer.zero_grad()   # zero the gradient buffers
        output = languageModel(input_b)
        loss = loss_f(output, out_b)
        loss.backward()
        optimizer.step()    #
        if (i+1)%loss_report == 0:
            print("Batch:", i+1, "loss:",loss.item())
    # validation error
    with torch.no_grad():
        loss_t = test_validation_loss(test_input, test_output, languageModel, loss_f, 100)
#         if loss_t < lbest:
#             lbest = loss_t
#             bestp = mlp.state_dict()
       
        print("test loss:",loss_t)

LanguageModel(
  (embedding_layer): Embedding(250, 16)
  (linear1): Linear(in_features=48, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=250, bias=True)
  (softmax): Softmax(dim=2)
)
Epoch: 0




Batch: 1000 loss: 5.308314323425293
Batch: 2000 loss: 5.2883148193359375
Batch: 3000 loss: 5.29075813293457
test loss: 5.337990455217259
Epoch: 1
Batch: 1000 loss: 5.328314781188965
Batch: 2000 loss: 5.3483147621154785
Batch: 3000 loss: 5.378314971923828
test loss: 5.33863225342125
Epoch: 2
Batch: 1000 loss: 5.288314342498779
Batch: 2000 loss: 5.3583149909973145
Batch: 3000 loss: 5.308314323425293
test loss: 5.3385598921006725
Epoch: 3
Batch: 1000 loss: 5.398313522338867
Batch: 2000 loss: 5.358314037322998
Batch: 3000 loss: 5.368308067321777
test loss: 5.338095194293607
Epoch: 4
Batch: 1000 loss: 5.288314342498779
Batch: 2000 loss: 5.268314361572266
Batch: 3000 loss: 5.3483147621154785
test loss: 5.33824655061127
Epoch: 5
Batch: 1000 loss: 5.318315029144287
Batch: 2000 loss: 5.388314723968506
Batch: 3000 loss: 5.328314781188965
test loss: 5.3400213272340835
Epoch: 6
Batch: 1000 loss: 5.3583149909973145


In [132]:
def predict_next_word( word1, word2, word3, languageModel, k=10):
        """List the top k predictions for the next word along with their probabilities.
        Inputs:
            word1: The first word as a string.
            word2: The second word as a string.
            word3: The third word as a string.
            k: The k most probable predictions are shown.
        Example usage:
            model.predict_next_word('john', 'might', 'be', 3)
            model.predict_next_word('life', 'in', 'new', 3)"""
            
        if word1 not in vocab:
            raise RuntimeError('Word "{}" not in vocabulary.'.format(word1))
        if word2 not in vocab:
            raise RuntimeError('Word "{}" not in vocabulary.'.format(word2))
        if word3 not in vocab:
            raise RuntimeError('Word "{}" not in vocabulary.'.format(word3))

        idx1, idx2, idx3 = vocab.index(word1), vocab.index(word2), vocab.index(word3)
        input = torch.tensor(np.array([idx1, idx2, idx3]).reshape((1, -1))).type(torch.LongTensor)
        output = languageModel(input)
        return output

In [133]:
v =predict_next_word("life", "in", "the", languageModel)

In [134]:
j = torch.argmax(v, dim=1)

In [135]:
vocab[j]

'.'