In [1]:
import torch
import random
import zipfile
import time
import math
import numpy as np
from torch import nn, optim
import torch.nn.functional as F

In [26]:
def load_data_jay_lyrics():
    with zipfile.ZipFile('./Dataset/JayLyrics/jaychou_lyrics.txt.zip') as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n',' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx_dict = dict([(char, i) for i, char in enumerate(idx_to_char)])
    idx_to_char_dict = dict([(i, char) for i, char in enumerate(idx_to_char)])
    corpus_indice = [char_to_idx_dict[char] for char in corpus_chars]
    return corpus_chars, corpus_indice, idx_to_char_dict, char_to_idx_dict, len(idx_to_char_dict)

In [27]:
corpus_chars, corpus_indice, idx_to_char_dict, char_to_idx_dict, vocab_size = load_data_jay_lyrics()

In [4]:
def data_iter_random(corpus_indice, batch_size, num_step, device = None):
    num_example = (len(corpus_indice)-1)//num_step
    epoch_size = num_example//batch_size
    example_indice = list(range(num_example))
    random.shuffle(example_indice)
    
    def _data(pos):
        return corpus_indice[pos:pos+num_step]
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    for i in range(epoch_size):
        i = i*batch_size
        X = [_data(j*num_step) for j in example_indice[i:i+batch_size]]
        Y = [_data(j*num_step+1) for j in example_indice[i: i+batch_size]]
        yield torch.tensor(X, dtype = torch.float, device = device), torch.tensor(Y, dtype = torch.float, device = device)

In [5]:
def data_iter_consecutive(corpus_indice, batch_size, num_step, device = None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_indice = torch.tensor(corpus_indice, dtype = torch.float, device = device)
    data_len = len(corpus_indice)
    batch_len = data_len//batch_size
    indice = corpus_indice[0:batch_len*batch_size].view(batch_size, batch_len)
    epoch_size = batch_len//num_step
    
    for i in range(epoch_size):
        i = i*num_step
        X = indice[:, i: i+num_step]
        X1 = indice[:, i+1:i+num_step+1]
        yield X, X1

In [6]:
def one_hot(x,n_class):
    x = x.long()
    res=torch.zeros(x.shape[0],n_class, dtype = torch.float, device = x.device)
    res.scatter_(1,x.view(-1,1),1)
    return res
def to_onehot(X, n_class):
    return [one_hot(X[:,i],n_class) for i in range(X.shape[1])]

In [7]:
X = torch.arange(10).view(2,5)
X_onehot = to_onehot(X,vocab_size)

In [8]:
def get_params(num_inputs, num_hiddens, num_outputs, device = None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    def _one(shape):
        ts = torch.tensor(np.random.normal(0, 0.01,size = shape), device = device, dtype = torch.float)
        return nn.Parameter(ts, requires_grad = True)
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = nn.Parameter(torch.zeros(num_hiddens, device = device, dtype = torch.float, requires_grad =True))
    W_hq = _one((num_hiddens, num_outputs))
    b_q = nn.Parameter(torch.zeros(num_outputs, device = device, dtype = torch.float, requires_grad =True))
    return nn.ParameterList([W_xh, W_hh, b_h, W_hq, b_q])    

In [9]:
def init_rnn_state(batch_size, num_hiddens, device = None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return (torch.zeros((batch_size, num_hiddens), device = device, dtype = torch.float),)

In [10]:
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = torch.tanh(torch.matmul(X, W_xh)+torch.matmul(H, W_hh)+b_h)
        Y = torch.matmul(H, W_hq)+b_q
        outputs.append(Y)
    return outputs, (H,)

In [11]:
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
state = init_rnn_state(X.shape[0], num_hiddens)
inputs = to_onehot(X.to('cuda'), vocab_size)
params = get_params(num_inputs, num_hiddens, num_outputs, device = None)
outputs, state_new = rnn(inputs, state, params)


In [12]:
def predict_rnn(prefix, params, num_chars, num_hiddens, idx_to_char, char_to_idx):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = len(idx_to_char)
    state, = init_rnn_state(1, num_hiddens)
    output_prefix = []
    for i in range(num_chars):
        if i <= len(prefix)-1:
            output_prefix.append(prefix[i])
            idx = char_to_idx[output_prefix[i-1]] 
            idx = torch.tensor([[idx]], dtype = torch.float, device = device)
            idx_onehot = one_hot(idx, vocab_size)
            output,state = rnn(idx_onehot, state, params)             
        else:
            idx = char_to_idx[output_prefix[i-1]] 
            idx = torch.tensor([[idx]], dtype = torch.float, device = device)
            idx_onehot = one_hot(idx, vocab_size)
            output,state = rnn(idx_onehot, state, params)    
            output_char = idx_to_char[output[0].argmax(dim=0).item()]
            output_prefix.append(output_char)
    return ''.join(output_prefix)

In [13]:
predict_rnn('分开', params, 10, 256, idx_to_char_dict, char_to_idx_dict)

'分开怯秒到篇翰凝光缝'

In [14]:
#clip gradient
def grad_clipping(params, theta):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    norm = torch.tensor([0.0], device = device)
    for param in params:
        norm += (param.grad.data**2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta/norm) 

In [15]:
#optimizer function
def sgd(params, lr, batch_size):
    for param in params:
        param.data -= lr*param.grad/batch_size

In [22]:
def train_predict_rnn(corpus_indices, idx_to_char_dict, dict_to_idx_dict, 
                      is_random_iter, num_hiddens, num_epochs, num_steps,
                      lr, clipping_theta, batch_size,                     
                      pred_period, pred_len, prefixes):
    #initialize parameters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = len(idx_to_char_dict)
    if is_random_iter:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecutive
    params = get_params(vocab_size, num_hiddens, vocab_size, device)
    loss_fn = nn.CrossEntropyLoss()
    
    #training loop
    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, device)
        loss_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter: 
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                for s in state:
                    s.detach_()
        inputs = to_onehot(X, vocab_size)
        (outputs, state) = rnn(inputs, state, params)
        outputs = torch.cat(outputs, dim=0)
        y = torch.transpose(Y,0,1).contiguous().view(-1)
        loss = loss_fn(outputs, y.long())
        
        #gradient zero
        if params[0].grad is not None:
            for param in params:
                param.grad.data.zero_()
        
        loss.backward()
        grad_clipping(params, clipping_theta)
        sgd(params, lr, 1)
        loss_sum = loss.item()*y.shape[0]
        n += y.shape[0]
        
        if (epoch+1)% pred_period ==0:
            print('epoch %d, perplexity %f, time %.2f sec'%(epoch+1, math.exp(loss_sum/n), time.time()-start))
            for prefix in prefixes:
                print('-',predict_rnn(prefix, params, pred_len, num_hiddens, idx_to_char_dict, char_to_idx_dict ))

In [23]:
num_epochs, num_steps, batch_size, lr, clipping_theta = 2000, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 100, 50, ['分开', '不分开']
is_random_iter = False

In [None]:
train_predict_rnn(corpus_indice, idx_to_char_dict, char_to_idx_dict, 
                      is_random_iter, num_hiddens, num_epochs, num_steps,
                      lr, clipping_theta, batch_size,                     
                      pred_period, pred_len, prefixes)

epoch 100, perplexity 191.440885, time 0.02 sec
- 分开我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的
- 不分开我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我的我
epoch 200, perplexity 79.224255, time 0.02 sec
- 分开 我不我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果
- 不分开 我不我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果我的 如果我的 如
epoch 300, perplexity 19.089185, time 0.02 sec
- 分开 这够 没不能 爱情走不 快使耿沙 漫使用 分数怎么停快 一直哈我 你你这 分数怎么停留 一直哈
- 不分开  我在能停 我不能 爱情来的 快使定沙 漫使了 分数怎么停快 一直哈我 你你这 分数怎么停留
epoch 400, perplexity 3.643646, time 0.02 sec
- 分开一 三两银够不够 景色了我 说你说 分数怎么停留 一直在停留 谁让它停留 仙人掌事 温暖了空屋藤
- 不分开 你我去吃 如果黄一切过 塞北的客 如果我有轻功 一檐走客 如果我有轻功 一檐走客 如果我有轻
epoch 500, perplexity 1.431454, time 0.00 sec
- 分开一点秀逗 猎物死了它比谁都难过 印地安斑鸠 会学人开口 仙人掌怕羞 我不往事抽离 如果我遇见你是
- 不分开 W我的始婆 我非开始乡相信命运 感谢地心引力 让梭时间日落 一直到我们都睡着 我想就这样牵着
epoch 600, perplexity 1.127959, time 0.02 sec
- 分开一 三u银译 如果这一切 真的可以 我想要将我的寂寞封闭 然后在这胸离睡著 平言儿子重习被纵容手
- 不分开攻 我的伤过去 我将往事抽离 如果我遇见你是一场悲剧 我想我这辈子注定一个人演戏 最后儿潮不个
epoch 700, perplexity 1.059201, time 0.02 sec
- 分开一 三两  爱情来的太快就像龙卷风 离不开暴风圈来不及逃 我不能再千