In [5]:
# 6.2循环神经网络
# 6.4 循环神经网络的从零开始实现
import time
import math
import numpy as np
import torch
from torch import nn,optim
import torch.nn.functional as F

import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Unpack the four values returned by d2l.load_data_jay_lyrics() into globals used by later cells.
# NOTE(review): presumably the corpus as character indices, the char<->index lookups and the
# vocabulary size -- confirm against d2lzh_pytorch.
(corpus_indices,char_to_idx,idx_to_char,vocab_size) = d2l.load_data_jay_lyrics()


# one_hot 向量




def one_hot(x, n_class, dtype=torch.float32):
    """Encode a 1-D tensor of class indices as one-hot rows.

    Args:
        x: tensor of shape (batch,) holding class indices; it is cast to int64.
        n_class: number of columns in the result.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape (batch, n_class) on the same device as ``x``; row ``i``
        is zero everywhere except for a 1 in column ``x[i]``.
    """
    column_index = x.long().view(-1, 1)
    encoded = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    # scatter_ writes in place and returns the same tensor.
    return encoded.scatter_(1, column_index, 1)
    
# Demo: encode the two class indices 0 and 2 against the full vocabulary.
x = torch.tensor([0, 2])
res = one_hot(x, vocab_size)

# The result has one row per index in x and vocab_size columns.
print(res.shape)


torch.Size([2, 1027])


In [6]:
def to_onehot(X,n_class):
    """One-hot encode an index matrix column by column.

    Args:
        X: tensor indexed as ``X[:, i]``; each column is passed to ``one_hot``.
        n_class: number of classes forwarded to ``one_hot``.

    Returns:
        A list with ``X.shape[1]`` entries, the i-th being ``one_hot(X[:, i], n_class)``.
    """
    encoded_columns = []
    for col in range(X.shape[1]):
        encoded_columns.append(one_hot(X[:, col], n_class))
    return encoded_columns

# Demo: a 2 x 5 index matrix produces 5 one-hot matrices (one per column), each with 2 rows.
# X is reused by the RNN smoke test in a later cell.
X = torch.arange(10).view(2,5)
inputs = to_onehot(X,vocab_size)
print(len(inputs),inputs[0].shape)
    
    
    
    

5 torch.Size([2, 1027])


In [7]:
# Model sizes: inputs and outputs both span the vocabulary; the number of hidden
# units num_hiddens is a hyperparameter.
num_inputs,num_hiddens,num_outputs = vocab_size,256,vocab_size
# Report which device was selected.
print('will use',device)

def get_params():
    """Create the trainable parameters of the single-layer RNN.

    Reads the module-level ``num_inputs``, ``num_hiddens``, ``num_outputs`` and
    ``device``. Weight matrices are drawn from a normal distribution with mean 0
    and standard deviation 0.01 (via NumPy); biases start at zero.

    Returns:
        nn.ParameterList ordered as ``[W_xh, W_hh, b_h, W_hq, b_q]``.
    """
    def _gaussian(shape):
        # Sample with NumPy, then move to the target device as float32.
        samples = np.random.normal(0, 0.01, size=shape)
        weights = torch.tensor(samples, device=device, dtype=torch.float32)
        return torch.nn.Parameter(weights, requires_grad=True)

    def _zeros(size):
        return torch.nn.Parameter(torch.zeros(size, device=device, requires_grad=True))

    # Hidden-layer parameters.
    W_xh = _gaussian((num_inputs, num_hiddens))
    W_hh = _gaussian((num_hiddens, num_hiddens))
    b_h = _zeros(num_hiddens)
    # Output-layer parameters.
    W_hq = _gaussian((num_hiddens, num_outputs))
    b_q = _zeros(num_outputs)

    return nn.ParameterList([W_xh, W_hh, b_h, W_hq, b_q])

# 定义模型
# init_rnn_state来返回初始化的隐藏状态(隐层的记忆初始化)
def init_rnn_state(batch_size,num_hiddens,device):
    """Build the initial hidden state for the RNN.

    Returns:
        A one-element tuple holding an all-zero tensor of shape
        (batch_size, num_hiddens) allocated on ``device``.
    """
    initial_hidden = torch.zeros((batch_size, num_hiddens), device=device)
    return (initial_hidden,)

def rnn(inputs,state,params):
    """Run the tanh RNN over a sequence of per-step inputs.

    Args:
        inputs: iterable of per-time-step input matrices, each multiplied by ``W_xh``.
        state: one-element tuple holding the hidden state to start from.
        params: the five parameters ``(W_xh, W_hh, b_h, W_hq, b_q)``.

    Returns:
        ``(outputs, (H,))`` where ``outputs`` is a list with one output matrix
        per time step and ``H`` is the hidden state after the last step.
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    (hidden,) = state
    step_outputs = []
    for step_input in inputs:
        # New hidden state mixes the current input with the previous hidden state.
        hidden = torch.tanh(step_input @ W_xh + b_h + hidden @ W_hh)
        step_outputs.append(hidden @ W_hq + b_q)
    return step_outputs, (hidden,)


# Quick smoke test: check the number of outputs (= number of time steps) and the shapes of
# the first time step's output-layer output and of the hidden state.
# NOTE(review): `num__hiddens` (double underscore) is a separate global from `num_hiddens`
# and is not used in this cell. Before deleting it, check that no other cell still refers
# to this spelling.
num__hiddens = 256
state = init_rnn_state(X.shape[0],num_hiddens,device)
inputs = to_onehot(X.to(device),vocab_size)
params = get_params()
outputs,state_new = rnn(inputs,state,params)
print(len(outputs),outputs[0].shape,state_new[0].shape)



will use cpu
5 torch.Size([2, 1027]) torch.Size([2, 256])


In [9]:
# 定义预测函数

def predict_rnn(prefix,num_chars,rnn,params,init_rnn_state,
                num_hiddens,vocab_size,device,idx_to_char,char_to_idxa):
    """Generate ``num_chars`` characters that continue ``prefix``.

    The model is fed one character at a time with batch size 1. While the
    prefix is being consumed the next input is the next prefix character;
    afterwards it is the highest-scoring character of the previous step.

    Args:
        prefix: non-empty string to start from (``prefix[0]`` is indexed).
        num_chars: number of characters to generate after the prefix.
        rnn: callable ``rnn(inputs, state, params) -> (outputs, state)``.
        params: model parameters forwarded to ``rnn``.
        init_rnn_state: callable ``(batch_size, num_hiddens, device) -> state``.
        num_hiddens: hidden size forwarded to ``init_rnn_state``.
        vocab_size: number of classes used for one-hot encoding.
        device: device on which the input index tensor is created.
        idx_to_char: lookup from index to character.
        char_to_idxa: lookup from character to index. The name carries a typo
            but is kept so that existing callers keep working.

    Returns:
        The prefix followed by the generated characters, as a single string.
    """
    # Alias the misspelled parameter so the body uses the mapping that was passed in,
    # not a global that happens to be called `char_to_idx`.
    char_to_idx = char_to_idxa
    # Use the caller-supplied hidden size rather than a notebook-level global.
    state = init_rnn_state(1, num_hiddens, device)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # The previous step's output becomes the current step's input.
        X = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
        # Compute the output and update the hidden state.
        (Y, state) = rnn(X, state, params)
        # The next input is either the next prefix character or the best prediction.
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(dim=1).item()))
    return ''.join([idx_to_char[i] for i in output])

# Generate 10 characters after a two-character prefix. `params` comes straight from
# get_params() and no training step appears before this cell, so the continuation is
# not expected to be meaningful text.
predict_rnn('分开',10,rnn,params,init_rnn_state,num_hiddens,vocab_size,device,
           idx_to_char,char_to_idx)

    
    
    
    
    
    

'分开抱公我杂滴换圈重主拉'

In [9]:
# 快排算法
import numpy as np

def fast_sort(target):
    """Return the items of ``target`` in ascending order as a new list (quicksort).

    The middle element is used as the pivot; items are split into those
    smaller than, equal to, and larger than the pivot, and the outer two groups
    are sorted recursively. ``target`` itself is not modified.

    Args:
        target: a sequence supporting ``len()`` and indexing (list, tuple, ...)
            whose items can be compared with ``<`` and ``>``.

    Returns:
        A list with the same items in ascending order; ``[]`` for empty input.
    """
    # Base case covers any empty or single-item sequence, not just the literal [].
    if len(target) <= 1:
        return list(target)

    pivot = target[len(target) // 2]
    smaller, equal, larger = [], [], []
    for item in target:
        if item < pivot:
            smaller.append(item)
        elif item > pivot:
            larger.append(item)
        else:
            equal.append(item)
    return fast_sort(smaller) + equal + fast_sort(larger)

# Quick check on a small unsorted list of integers; the cell displays the sorted result.
fast_sort([12,56,89,1,2,6,7])

[1, 2, 6, 7, 12, 56, 89]

In [None]:
# 裁剪梯度
def grad_clipping(params,theta,device):
    """Rescale gradients in place so that their global L2 norm is at most ``theta``.

    The norm is taken over the gradients of all given parameters together.
    If it exceeds ``theta``, every gradient is multiplied by ``theta / norm``;
    otherwise the gradients are left unchanged.

    Args:
        params: iterable of tensors with a ``.grad`` attribute; entries whose
            ``.grad`` is None are skipped.
        theta: maximum allowed global gradient norm.
        device: device on which the norm accumulator is created.
    """
    # Materialise once: the two passes below would exhaust a generator such as
    # model.parameters(), and parameters without a gradient cannot be clipped.
    params = [param for param in params if param.grad is not None]

    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()

    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)
            
# 应对梯度爆炸

In [None]:
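# Perplexity is used to evaluate how good a language model is; it is the value obtained by
# applying the exponential function to the cross-entropy loss.
# Best case: the model always predicts probability 1 for the label class, so the perplexity is 1.
# Worst case: the model always predicts probability 0 for the label class, so the perplexity is
# positive infinity.
# Baseline: the model always predicts the same probability for every class, so the perplexity
# equals the number of classes.
# Clearly, any useful model must have a perplexity below the number of classes; in this example
# the perplexity must be smaller than the vocabulary size vocab_size, otherwise the model
# predicts no better than random guessing.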