In [2]:
import torch
import random
import zipfile
import time
import math
import numpy as np
from torch import nn, optim
import torch.nn.functional as F

In [10]:
def load_data_jay_lyrics(path='./Dataset/JayLyrics/jaychou_lyrics.txt.zip', max_chars=10000):
    """Load the lyrics corpus from a zip archive and build a character vocabulary.

    Args:
        path: Location of the zip archive; it must contain a member named
            ``jaychou_lyrics.txt`` encoded as UTF-8.
        max_chars: Keep only the first ``max_chars`` characters of the corpus.
            Pass ``None`` to keep the whole text.

    Returns:
        A 5-tuple ``(corpus_chars, corpus_indice, idx_to_char_dict,
        char_to_idx_dict, vocab_size)`` where ``corpus_chars`` is the cleaned
        text, ``corpus_indice`` is the text encoded as a list of vocabulary
        indices, the two dicts map index -> char and char -> index, and
        ``vocab_size`` is the number of distinct characters.
    """
    with zipfile.ZipFile(path) as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    # Line breaks are turned into spaces so the corpus is one flat sequence.
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    if max_chars is not None:
        corpus_chars = corpus_chars[:max_chars]
    # Sort the vocabulary: iterating a set of strings has no stable order
    # across interpreter runs, which would make the char <-> index mapping
    # (and therefore any trained weights) irreproducible.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx_dict = {char: i for i, char in enumerate(idx_to_char)}
    idx_to_char_dict = dict(enumerate(idx_to_char))
    corpus_indice = [char_to_idx_dict[char] for char in corpus_chars]
    return corpus_chars, corpus_indice, idx_to_char_dict, char_to_idx_dict, len(idx_to_char_dict)

# Load the corpus once at notebook level; the index sequence, the two
# vocabulary mappings and vocab_size are consumed by the model-construction
# and training cells further down.
corpus_chars, corpus_indice, idx_to_char_dict, char_to_idx_dict, vocab_size = load_data_jay_lyrics()

In [4]:
def grad_clipping(params, theta):
    """Rescale gradients in place so that their global L2 norm is at most ``theta``.

    Args:
        params: Iterable of parameters (for example ``model.parameters()``).
            Parameters whose ``.grad`` is ``None`` are ignored.
        theta: Maximum allowed L2 norm of all gradients taken together.

    Returns:
        None. The ``.grad`` tensors are modified in place.
    """
    # Materialise the iterable first: ``model.parameters()`` is a one-shot
    # generator, so iterating it for the norm would leave nothing to rescale.
    params = [param for param in params if param.grad is not None]
    if not params:
        return
    # Accumulate on the device the gradients live on rather than guessing
    # from CUDA availability.
    device = params[0].grad.device
    norm = torch.zeros(1, device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum().to(device)
    norm = norm.sqrt().item()
    if norm > theta:
        scale = theta / norm
        for param in params:
            param.grad.data *= scale

In [5]:
def data_iter_random(corpus_indice, batch_size, num_step, device = None):
    """Yield ``(X, Y)`` mini-batches of randomly ordered, non-overlapping windows.

    The corpus is cut into windows of ``num_step`` indices; the window order is
    shuffled with ``random.shuffle`` and ``batch_size`` windows are grouped per
    batch. ``Y`` holds the same windows shifted one position to the right.

    Args:
        corpus_indice: Sequence of vocabulary indices.
        batch_size: Number of windows per batch.
        num_step: Length of every window.
        device: Target device; CUDA when available, otherwise CPU, if omitted.

    Yields:
        Two float tensors, each built from ``batch_size`` windows of
        ``num_step`` indices.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The "- 1" leaves room for the target window, which starts one step later.
    window_count = (len(corpus_indice) - 1) // num_step
    order = list(range(window_count))
    random.shuffle(order)

    used_windows = (window_count // batch_size) * batch_size
    for first in range(0, used_windows, batch_size):
        offsets = [k * num_step for k in order[first:first + batch_size]]
        inputs = [corpus_indice[o:o + num_step] for o in offsets]
        targets = [corpus_indice[o + 1:o + 1 + num_step] for o in offsets]
        yield (torch.tensor(inputs, dtype = torch.float, device = device),
               torch.tensor(targets, dtype = torch.float, device = device))

In [6]:
def data_iter_consecutive(corpus_indice, batch_size, num_step, device = None):
    """Yield ``(X, Y)`` mini-batches whose rows continue from batch to batch.

    The corpus is reshaped to ``(batch_size, batch_len)`` and walked left to
    right in windows of ``num_step`` columns, so row ``r`` of one batch is
    followed in the text by row ``r`` of the next batch. ``Y`` is ``X`` shifted
    one position to the right.

    Args:
        corpus_indice: Sequence of vocabulary indices.
        batch_size: Number of parallel rows the corpus is split into.
        num_step: Number of columns (time steps) per batch.
        device: Target device; CUDA when available, otherwise CPU, if omitted.

    Yields:
        Two float tensors of identical shape ``(batch_size, num_step)``.
        The float dtype is kept because existing callers convert with
        ``.long()`` themselves.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_indice = torch.tensor(corpus_indice, dtype = torch.float, device = device)
    data_len = len(corpus_indice)
    batch_len = data_len // batch_size
    # Drop the tail that does not fill a complete row.
    indice = corpus_indice[0:batch_len * batch_size].view(batch_size, batch_len)
    # "- 1": every input window needs one extra column to its right for the
    # shifted target. Without it, a batch_len that is an exact multiple of
    # num_step produces a final target with one column fewer than its input.
    epoch_size = (batch_len - 1) // num_step

    for i in range(epoch_size):
        i = i * num_step
        X = indice[:, i: i + num_step]
        X1 = indice[:, i + 1: i + num_step + 1]
        yield X, X1

In [7]:
class RnnModel(nn.Module):
    """Character-level language model: one-hot input -> ``nn.RNN`` -> linear layer."""

    def __init__(self, vocab_size, num_hiddens, num_layers):
        """Create the recurrent layer and the projection back to the vocabulary.

        Args:
            vocab_size: Number of distinct characters (one-hot width and
                number of output scores).
            num_hiddens: Hidden size of the recurrent layer.
            num_layers: Number of stacked recurrent layers.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = num_hiddens
        self.state = None
        self.rnn_layer = nn.RNN(input_size=vocab_size, hidden_size=num_hiddens, num_layers=num_layers)
        self.fc_layer = nn.Linear(num_hiddens, vocab_size)

    def forward(self, inputs, state):
        """Run the network on a batch of index sequences.

        Args:
            inputs: 2-D tensor of character indices, one row per sequence.
            state: Initial recurrent state, or ``None``.

        Returns:
            ``(scores, state)`` where ``scores`` has one row per
            (time step, sequence) pair and ``vocab_size`` columns, and
            ``state`` is the recurrent state after the last step (also kept
            in ``self.state``).
        """
        encoded = self.to_onehot(inputs, self.vocab_size)        # steps * batch * vocab
        hidden_seq, last_state = self.rnn_layer(encoded, state)  # steps * batch * hidden
        self.state = last_state
        # Merge the step and batch axes so the linear layer scores every position.
        flattened = hidden_seq.view(-1, hidden_seq.shape[-1])
        return self.fc_layer(flattened), self.state

    @staticmethod
    def to_onehot(X, n_class):
        """One-hot encode ``X`` column by column and stack the results.

        Args:
            X: 2-D tensor of indices; column ``i`` holds step ``i`` of every row.
            n_class: Width of the one-hot encoding.

        Returns:
            Float tensor of shape ``(X.shape[1], X.shape[0], n_class)``.
        """
        per_step = []
        for step in range(X.shape[1]):
            column = X[:, step].long()
            encoded = torch.zeros(column.shape[0], n_class, dtype = torch.float, device = column.device)
            encoded.scatter_(1, column.view(-1, 1), 1)
            per_step.append(encoded)
        return torch.stack(per_step)

In [8]:
class GruRnnModel(nn.Module):
    """Character-level language model: one-hot input -> ``nn.GRU`` -> linear layer."""

    def __init__(self, vocab_size, num_hiddens, num_layers):
        """Create the GRU layer and the projection back to the vocabulary.

        Args:
            vocab_size: Number of distinct characters (one-hot width and
                number of output scores).
            num_hiddens: Hidden size of the GRU.
            num_layers: Number of stacked GRU layers.
        """
        super().__init__()
        self.gru_layer = nn.GRU(input_size=vocab_size, hidden_size=num_hiddens, num_layers=num_layers)
        self.hidden_size = num_hiddens
        self.vocab_size = vocab_size
        self.fc_layer = nn.Linear(num_hiddens, vocab_size)
        self.state = None

    def forward(self, inputs, state):
        """Run the network on a batch of index sequences.

        Args:
            inputs: 2-D tensor of character indices, one row per sequence.
            state: Initial hidden state tensor, or ``None``.

        Returns:
            ``(scores, state)`` where ``scores`` has one row per
            (time step, sequence) pair and ``vocab_size`` columns, and
            ``state`` is the hidden state after the last step (also kept in
            ``self.state``).
        """
        X = self.to_onehot(inputs, self.vocab_size)           # steps * batch * vocab
        H, self.state = self.gru_layer(X, state)              # steps * batch * hidden
        # Flatten using H's own feature size; the output tensor Y does not
        # exist yet at this point, so it cannot be used for the shape.
        Y = self.fc_layer(H.view(-1, H.shape[-1]))
        return Y, self.state

    @staticmethod
    def to_onehot(X, n_class):
        """One-hot encode ``X`` column by column and stack the results.

        Args:
            X: 2-D tensor of indices; column ``i`` holds step ``i`` of every row.
            n_class: Width of the one-hot encoding.

        Returns:
            Float tensor of shape ``(X.shape[1], X.shape[0], n_class)``.
        """
        def one_hot(x, n_class):
            x = x.long()
            res = torch.zeros(x.shape[0], n_class, dtype = torch.float, device = x.device)
            res.scatter_(1, x.view(-1, 1), 1)
            return res
        return torch.stack([one_hot(X[:, i], n_class) for i in range(X.shape[1])])

In [53]:
class LstmRnnModel(nn.Module):
    """Character-level language model: one-hot input -> ``nn.LSTM`` -> linear layer."""

    def __init__(self, vocab_size, num_hiddens, num_layers):
        """Create the LSTM layer and the projection back to the vocabulary.

        Args:
            vocab_size: Number of distinct characters (one-hot width and
                number of output scores).
            num_hiddens: Hidden size of the LSTM.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = num_hiddens
        self.state = None
        self.lstm_layer = nn.LSTM(input_size=vocab_size, hidden_size=num_hiddens, num_layers=num_layers)
        self.fc_layer = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, inputs, state):
        """Run the network on a batch of index sequences.

        Args:
            inputs: 2-D tensor of character indices, one row per sequence.
            state: Initial LSTM state as accepted by ``nn.LSTM``, or ``None``.

        Returns:
            ``(scores, state)`` where ``scores`` has one row per
            (time step, sequence) pair and ``vocab_size`` columns, and
            ``state`` is whatever ``nn.LSTM`` returns as its final state
            (also kept in ``self.state``).
        """
        encoded = self.to_onehot(inputs, self.vocab_size)          # steps * batch * vocab
        hidden_seq, last_state = self.lstm_layer(encoded, state)   # steps * batch * hidden
        self.state = last_state
        # Merge the step and batch axes so the linear layer scores every position.
        scores = self.fc_layer(hidden_seq.view(-1, hidden_seq.shape[-1]))
        return scores, self.state

    @staticmethod
    def to_onehot(X, n_class):
        """One-hot encode ``X`` column by column and stack the results.

        Args:
            X: 2-D tensor of indices; column ``i`` holds step ``i`` of every row.
            n_class: Width of the one-hot encoding.

        Returns:
            Float tensor of shape ``(X.shape[1], X.shape[0], n_class)``.
        """
        per_step = []
        for step in range(X.shape[1]):
            column = X[:, step].long()
            encoded = torch.zeros(column.shape[0], n_class, dtype = torch.float, device = column.device)
            encoded.scatter_(1, column.view(-1, 1), 1)
            per_step.append(encoded)
        return torch.stack(per_step)

In [44]:
def predict_rnn(prefix, num_chars, model,idx_to_char_dict, char_to_ida_dict, state = None):
    """Greedily generate ``num_chars`` characters that continue ``prefix``.

    The prefix is fed to the model one character at a time to warm up the
    recurrent state; after that the highest-scoring character of each step is
    appended and fed back in.

    Args:
        prefix: Non-empty string of characters that exist in the vocabulary.
        num_chars: Number of characters to generate after the prefix.
        model: Callable ``model(indices, state) -> (scores, state)`` taking an
            index tensor of shape ``(1, 1)``.
        idx_to_char_dict: Mapping index -> character.
        char_to_ida_dict: Mapping character -> index. The misspelled name is
            kept so that existing keyword callers keep working.
        state: Optional initial recurrent state.

    Returns:
        The prefix followed by the generated characters, as one string.

    Raises:
        IndexError: If ``prefix`` is empty.
        KeyError: If a prefix character is missing from the vocabulary.
    """
    # Use the mapping that was passed in instead of a notebook-level global.
    char_to_idx = char_to_ida_dict
    # Build inputs on the device the model actually lives on; fall back to the
    # availability check only for models that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    output = [prefix[0]]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for t in range(num_chars + len(prefix) - 1):
            X_indice = torch.tensor([char_to_idx[output[-1]]], device = device).view(-1, 1)
            Y, state = model(X_indice, state)
            if t < len(prefix) - 1:
                # Still inside the prefix: feed the known next character.
                output.append(prefix[t + 1])
            else:
                output.append(idx_to_char_dict[int(Y.argmax(dim = 1).item())])
    return ''.join(output)

In [43]:
def _detach_state(state):
    """Return ``state`` cut off from the autograd graph (tensor, tuple of tensors, or None)."""
    if state is None:
        return None
    if isinstance(state, (tuple, list)):
        # LSTM-style state: a pair (h, c).
        return tuple(part.detach() for part in state)
    # RNN / GRU-style state: a single tensor.
    return state.detach()


def train_and_predict(corpus_indice, data_iter, batch_size, num_steps, 
                      num_epoches, model, lr, clipping_theta,
                      idx_to_char_dict, char_to_idx_dict, 
                      prefixes, num_chars, print_every=50):
    """Train ``model`` as a character-level language model and print samples.

    Args:
        corpus_indice: Corpus encoded as a sequence of vocabulary indices.
        data_iter: Batch generator called as
            ``data_iter(corpus_indice, batch_size, num_steps, device)`` and
            yielding ``(X, Y)`` tensors of shape ``(batch, steps)``.
        batch_size: Batch size handed to ``data_iter``.
        num_steps: Sequence length handed to ``data_iter``.
        num_epoches: Number of passes over the corpus.
        model: Module called as ``model(X, state) -> (scores, state)``; the
            state may be a tensor (RNN/GRU) or a tuple of tensors (LSTM).
        lr: Learning rate for Adam.
        clipping_theta: Maximum global gradient norm.
        idx_to_char_dict: Mapping index -> character (for sampling).
        char_to_idx_dict: Mapping character -> index (for sampling).
        prefixes: Prefix strings to continue whenever progress is printed.
        num_chars: Number of characters to generate per prefix.
        print_every: Report loss/perplexity and samples every this many
            epochs; a falsy value disables reporting.

    Raises:
        ValueError: If the model has no parameters, or ``data_iter`` yields no
            batch (corpus too short for ``batch_size`` / ``num_steps``).

    Note:
        The recurrent state is carried from batch to batch within an epoch,
        which is only meaningful when ``data_iter`` yields batches that
        continue each other.
    """
    # Materialise once: the same list feeds the optimizer and the clipping
    # step, and a generator could only be iterated a single time.
    params = list(model.parameters())
    optimizer = optim.Adam(params, lr=lr)
    # Create batches on the device the model lives on.
    device = params[0].device
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epoches):
        # The end of the corpus is not followed by its beginning, so the
        # state must not leak from one epoch into the next.
        state = None
        train_loss_sum, n = 0.0, 0
        for X_indice_list, Y_indice_list in data_iter(corpus_indice, batch_size, num_steps, device):
            # Truncate back-propagation at the batch boundary.
            state = _detach_state(state)
            Y_hat, state = model(X_indice_list, state)
            # Scores are ordered step-major, so reorder targets the same way.
            Y_indice_list = torch.transpose(Y_indice_list, 0, 1).contiguous().view(-1).long()
            loss = loss_fn(Y_hat, Y_indice_list)
            optimizer.zero_grad()
            loss.backward()
            grad_clipping(params, clipping_theta)
            optimizer.step()
            train_loss_sum += loss.item()
            n += 1
        if n == 0:
            raise ValueError('data_iter yielded no batches; corpus is too short '
                             'for the given batch_size and num_steps')
        if print_every and (epoch + 1) % print_every == 0:
            train_loss = train_loss_sum / n
            try:
                perplexity = math.exp(train_loss)
            except OverflowError:
                perplexity = float('inf')
            print('epoch: %d, train_loss: %.2f, perplexity: %f'%(epoch+1, train_loss, perplexity))

            for prefix in prefixes:
                print(predict_rnn(prefix, num_chars, model, idx_to_char_dict, char_to_idx_dict, state = None))
                

In [55]:
# Training hyper-parameters and sampling settings.
num_epoches, batch_size, lr, clipping_theta = 250, 35, 1e-2, 1e-2
num_steps, num_chars, prefixes = 50, 50, ['分开', '不分开']
# Pick the GPU when one is present and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Alternative architectures; enable exactly one model line.
#model = RnnModel(vocab_size, 256, 1).to(device)
#model = GruRnnModel(vocab_size,256,1).to(device)
model = LstmRnnModel(vocab_size, 256, 1).to(device)

In [56]:
# Train on consecutive batches and print samples for each prefix along the way.
train_and_predict(
    corpus_indice=corpus_indice,
    data_iter=data_iter_consecutive,
    batch_size=batch_size,
    num_steps=num_steps,
    num_epoches=num_epoches,
    model=model,
    lr=lr,
    clipping_theta=clipping_theta,
    idx_to_char_dict=idx_to_char_dict,
    char_to_idx_dict=char_to_idx_dict,
    prefixes=prefixes,
    num_chars=num_chars,
)

epoch: 50, train_loss: 0.02, perplexity: 1.020630
分开始打呼 管家是一只会说法语举止优雅的猪 吸血前会念约翰福音做为弥补 拥有一双蓝色眼睛的凯萨琳公主 专
不分开 我不开 不能 爱情走的太快就像龙卷风 不能承受我已无处可躲 我不要再想 我不要再想 我不 我不 我
epoch: 100, train_loss: 0.01, perplexity: 1.009803
分开始打呼 妈妈 我说的话 甘会听 不要再这样打我妈妈 难道你手不会痛吗 其实我回家就想要阻止一切 让家
不分开 我不 我不能 爱情走的太快就像龙卷风 不能承受我已无处可躲 我不要再想 我不要再想 我不 我不 我
epoch: 150, train_loss: 0.01, perplexity: 1.008224
分开 没担的可以女人 温柔的让我心疼的可爱女人 透明的让我感动的可爱女人 坏坏的让我疯狂的可爱女人 坏坏
不分开 我不能再爱 我不能再想 我不 我不 我不能 爱情走的太快就像龙卷风 不能承受我已无处可躲 我不要再
epoch: 200, train_loss: 0.01, perplexity: 1.007544
分开 我打开任督二脉 干什么 干什么 东亚病夫的招牌 干什么 干什么 已被我一脚踢开 快使用双截棍 哼哼
不分开 我打开任督二脉 干什么 干什么 东亚病夫的招牌 干什么 干什么 已被我一脚踢开 快使用双截棍 哼哼
epoch: 250, train_loss: 0.01, perplexity: 1.006085
分开 别打我妈 这样对吗干嘛这样 何必让酒牵鼻子走 瞎 说都说不听听 痛是我们在痛痛 周杰伦   简单爱
不分开 我打我妈妈 我说的话你甘会听 不要再这样打我妈妈 难道你手不会痛吗 其实我回家就想要阻止一切 让家
