参考 https://github.com/apachecn/pytorch-doc-zh/blob/master/docs/1.0/char_rnn_classification_tutorial.md

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
from tensorboardX import SummaryWriter
# TensorBoard event writer; its log directory is ./logs.
writer = SummaryWriter(log_dir='./logs')
# Tag prefix for this run's curves (used as '<graph_name>/accuracy' and
# '<graph_name>/loss' when scalars are logged).
graph_name = 'dropout0.5-3'
def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` gives no ordering guarantee (it depends on the file
    system), so the matches are sorted to make any position-derived value,
    such as a class index taken from the list order, reproducible across
    machines. An unmatched pattern yields an empty list.
    """
    return sorted(glob.glob(path))

print(findFiles('data/names/*.txt'))

import unicodedata
import string

['data/names\\Arabic.txt', 'data/names\\Chinese.txt', 'data/names\\Czech.txt', 'data/names\\Dutch.txt', 'data/names\\English.txt', 'data/names\\French.txt', 'data/names\\German.txt', 'data/names\\Greek.txt', 'data/names\\Irish.txt', 'data/names\\Italian.txt', 'data/names\\Japanese.txt', 'data/names\\Korean.txt', 'data/names\\Polish.txt', 'data/names\\Portuguese.txt', 'data/names\\Russian.txt', 'data/names\\Scottish.txt', 'data/names\\Spanish.txt', 'data/names\\Vietnamese.txt']


In [2]:
# Alphabet the model can represent: the ASCII letters followed by a space
# and four punctuation characters.
_PUNCTUATION = " .,;'"
all_letters = "".join([string.ascii_letters, _PUNCTUATION])
# Number of distinct symbols in the alphabet.
n_letters = len(all_letters)

In [3]:
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"

In [4]:
n_letters

57

In [5]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters found in ``all_letters``.

    The string is NFD-normalised so accented letters split into a base
    character plus combining marks; combining marks (category 'Mn') and any
    character outside ``all_letters`` are then dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent, discard
        if ch not in all_letters:
            continue  # not part of the model alphabet
        kept.append(ch)
    return ''.join(kept)

print(unicodeToAscii('Ślusàrski'))

Slusarski


In [6]:
# Containers filled while the data files are read:
#   all_categories  - language names, in the order they are loaded
#   category_lines  - language name -> list of names for that language
all_categories = list()
category_lines = dict()

In [7]:
def readLines(filename):
    """Read ``filename`` as UTF-8 and return its lines converted to ASCII.

    Leading/trailing whitespace of the whole file is stripped before it is
    split on newlines; every resulting line is passed through
    ``unicodeToAscii``. The file is opened in a ``with`` block so the handle
    is closed as soon as the content has been read, even if reading fails.
    """
    with open(filename, encoding='utf-8') as names_file:
        lines = names_file.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Load every names file. The file's base name without its extension is used
# as the language label for all names read from it.
for names_path in findFiles('data/names/*.txt'):
    language, _extension = os.path.splitext(os.path.basename(names_path))
    all_categories.append(language)
    category_lines[language] = readLines(names_path)

# Number of languages loaded.
n_categories = len(all_categories)

In [8]:
n_categories

18

In [9]:
all_categories

['Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese']

现在我们有了category_lines，一个字典变量存储每一种语言及其对应的每一行文本(名字)列表的映射关系。

变量all_categories是全部语言种类的列表，

变量n_categories 是语言种类的数量，后续会使用

In [10]:
print(category_lines['Italian'][:5])

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


# 单词转化为张量
现在我们已经加载了所有的名字，我们需要将它们转换为张量来使用它们。

我们使用大小为<1 x n_letters>的“one-hot 向量”表示一个字母。

一个one-hot向量除其所表示字母对应的位置为1外，其余位置全部为0，例如"b" = <0 1 0 0 0 ...>。（字母b的索引是1（从0开始计数），所以第二个位置是1，其他位置是0）

我们使用一个<line_length x 1 x n_letters>的张量表示一个单词（去掉大小为1的batch维后，就是一个<line_length x n_letters>的2D矩阵）。

额外的1维是batch的维度，PyTorch默认所有的数据都是成batch处理的。我们这里只设置了batch的大小为1。

In [11]:
import torch

def letterToIndex(letter):
    """Return the position of ``letter`` within ``all_letters``, e.g. "a" -> 0.

    The lookup uses ``str.find``, so a character that is not in the alphabet
    yields -1 rather than raising.
    """
    position = all_letters.find(letter)
    return position

def letterToTensor(letter):
    """Demonstration helper: one-hot encode ``letter`` as a <1 x n_letters> tensor."""
    one_hot = torch.zeros(1, n_letters)
    hot_column = letterToIndex(letter)
    one_hot[0, hot_column] = 1
    return one_hot

def lineToTensor(line):
    """Encode ``line`` as a <line_length x 1 x n_letters> tensor.

    Row ``i`` is the one-hot vector of the i-th character of ``line``.
    """
    encoded = torch.zeros(len(line), 1, n_letters)
    for row, column in enumerate(map(letterToIndex, line)):
        encoded[row, 0, column] = 1
    return encoded

print(letterToTensor('J'))

print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


In [12]:
print(lineToTensor('abcdefg').size())

torch.Size([7, 1, 57])


In [13]:
lineToTensor('abcdefg')

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0

# 构造神经网络
在autograd出现之前，要在Torch中构建循环神经网络，需要在多个时间步上克隆（复制）同一层的参数。

各层原本需要自己保存的隐藏状态和梯度，现在完全交给计算图自己处理。

这意味着你可以像实现的常规的 feed-forward 层一样，以很纯粹的方式实现RNN。

这个RNN组件 (几乎是从这里复制的 the PyTorch for Torch users tutorial) 仅使用两层 linear 层对输入和隐藏层做处理,

在最后添加一层 LogSoftmax 层预测最终输出。

nn.LogSoftmax作为最后一层layer时，nn.NLLLoss作为损失函数是合适的。

也可以直接使用PyTorch自带的循环单元：下面的实现就直接使用了nn.LSTM，网络输出的是未经LogSoftmax的原始得分，因此训练时搭配的损失函数是nn.CrossEntropyLoss。

In [24]:
import torch.nn as nn

class RNN(nn.Module):
    """Sequence classifier: a 2-layer LSTM followed by two linear layers.

    ``forward`` returns raw class scores; no softmax is applied inside the
    module.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        # Recurrent encoder (original author's note: with nn.RNN() it hardly
        # learns). batch_first=True means inputs and outputs are laid out as
        # (batch, time_step, feature); dropout=0.5 is the LSTM's dropout
        # setting for its stacked layers.
        lstm_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=0.5,
        )
        self.rnn = nn.LSTM(**lstm_options)

        # Classification head: hidden_size -> 64 -> output_size.
        self.out = nn.Linear(hidden_size, 64)
        self.out2 = nn.Linear(64, output_size)

    def forward(self, x):
        """Score a batch of sequences.

        ``x`` has shape (batch, time_step, input_size). The LSTM starts from
        its default zero state and produces features of shape
        (batch, time_step, hidden_size); the features of the final time step
        are pushed through both linear layers, giving a
        (batch, output_size) result.
        """
        sequence_features, _final_state = self.rnn(x)
        last_step = sequence_features[:, -1, :]
        hidden = self.out(last_step)
        return self.out2(hidden)



# 训练
## 训练前的准备
进行训练步骤之前我们需要构建一些辅助函数。

第一个是当我们知道输出结果对应每种类别的可能性时，解析神经网络的输出。

我们可以使用 Tensor.topk函数得到最大值在结果中的位置索引

我们还需要一种快速获取训练示例（得到一个名字及其所属的语言类别）的方法：

In [15]:
import numpy as np

# Encode every name as a <name_length x n_letters> one-hot matrix and record
# the position of its language in all_categories as the class label.
labels = list()
train = list()
for category_index, category in enumerate(all_categories):
    for name in category_lines[category]:
        # lineToTensor returns <name_length x 1 x n_letters>; fold away the
        # singleton batch axis, using the alphabet size rather than a
        # hard-coded 57 so the code follows any change to all_letters.
        train.append(lineToTensor(name).reshape(-1, n_letters))
        labels.append(category_index)
# One int64 class index per name, as a 1-D array. Building it from plain
# ints (instead of a list of 1-element tensors) keeps the shape and dtype
# independent of the installed numpy/torch versions.
labels = np.array(labels, dtype=np.int64)


In [16]:
import torch
from torch import nn
import torch.nn.utils.rnn as rnn_utils

In [17]:
# Following https://zhuanlan.zhihu.com/p/59772104: use pad_sequence to
# zero-pad the sequences; batch_first=True requests the batch axis first.
train = rnn_utils.pad_sequence(train, batch_first=True)

In [18]:
# Peek at the shape of the padded tensor (a notebook only echoes the value
# of a cell's last expression, so this line shows nothing here).
train.shape
# Rebind `train` to the NumPy view of the tensor.
train = train.numpy()

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. No random_state is passed, so
# the split is not seeded here.
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.2)

In [20]:
import torch.utils.data as Data
# Turn the four NumPy splits back into tensors, then pair each input split
# with its labels as a TensorDataset.
X_train, y_train, X_test, y_test = (
    torch.from_numpy(split) for split in (X_train, y_train, X_test, y_test)
)
x_y_dataset = Data.TensorDataset(X_train, y_train)
test_x_y_dataset = Data.TensorDataset(X_test, y_test)


In [21]:
BATCH_SIZE = 32
# Settings shared by both loaders; only the training loader shuffles.
_loader_settings = dict(batch_size=BATCH_SIZE, num_workers=4)
train_loader = Data.DataLoader(dataset=x_y_dataset, shuffle=True, **_loader_settings)
test_loader = Data.DataLoader(dataset=test_x_y_dataset, shuffle=False, **_loader_settings)

In [22]:
def train(model, device, train_loader, optimizer, loss_func, epoch):
    """Run one optimisation pass over ``train_loader`` and report metrics.

    Args:
        model: module being trained; switched to training mode here.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer stepped once per batch.
        loss_func: callable ``loss_func(output, target)`` returning the loss.
        epoch: epoch number, used only in the printed summary.

    Returns:
        ``(accuracy, average_loss)``: the fraction of all samples seen in
        this pass that were classified correctly, and the mean of the
        per-batch loss values.

    Raises:
        ZeroDivisionError: if ``train_loader`` yields no batches.
    """
    batch_losses = []
    correct = 0
    seen = 0
    model.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = loss_func(output, target)
        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()
        optimizer.step()

        # Index of the highest score per sample is the predicted class.
        pred_y = output.max(1, keepdim=True)[1]
        # Count hits and samples so the epoch accuracy is correct/total;
        # averaging per-batch ratios would over-weight a short last batch.
        correct += pred_y.eq(target.view_as(pred_y)).sum().item()
        seen += len(target)
        batch_losses.append(loss.item())

    average_accuracy = correct / seen
    average_loss = sum(batch_losses) / len(batch_losses)

    print('epoch: {} train accuarcy: {} train loss: {}'.format(epoch,average_accuracy,average_loss))
    return average_accuracy,average_loss

def model_test(model, device, test_loader,loss_func):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Args:
        model: module to evaluate; switched to evaluation mode here.
        device: device every batch is moved to before the forward pass.
        test_loader: loader yielding ``(data, target)`` batches; its
            ``dataset`` attribute supplies the total sample count.
        loss_func: callable ``loss_func(output, target)`` returning the loss.

    Returns:
        ``(accuracy, average_loss)``: correct predictions divided by
        ``len(test_loader.dataset)``, and the mean of the per-batch loss
        values as a plain Python float.

    Raises:
        ZeroDivisionError: if the loader yields no batches or its dataset
            is empty.
    """
    model.eval()
    correct = 0
    batch_losses = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = loss_func(output, target)
            # Index of the highest score per sample is the predicted class.
            pred_y = output.max(1, keepdim=True)[1]
            correct += pred_y.eq(target.view_as(pred_y)).sum().item()
            # Store a float, not the tensor, so the returned average is a
            # plain number like the one train() returns.
            batch_losses.append(loss.item())
    test_len = len(test_loader.dataset)
    accuracy = correct / test_len
    average_loss = sum(batch_losses) / len(batch_losses)
    print("test accuracy:{} test loss:{}".format(accuracy, average_loss))
    return accuracy,average_loss

基础训练方法：

In [None]:
EPOCH = 10  # number of full passes over the training data

n_hidden = 32
LR = 0.01

# Choose the device before building the model so the model and the batches
# end up on the same one; falls back to the CPU when CUDA is unavailable
# (a GPU is recommended because it is much faster).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

rnn = RNN(n_letters, n_hidden, n_categories)
rnn = rnn.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all model parameters

for epoch in range(1, EPOCH + 1):
    print('epoch:{}'.format(epoch))
    train_accuracy, train_loss = train(model=rnn, device=DEVICE, train_loader=train_loader, optimizer=optimizer, loss_func=criterion, epoch=epoch)
    test_accuracy,test_loss = model_test(model=rnn, device=DEVICE, test_loader=test_loader,loss_func=criterion)

    # Log train and test curves side by side under this run's tag prefix.
    writer.add_scalars(graph_name+'/accuracy', 
                       {'train':train_accuracy,'test':test_accuracy}, epoch)
    writer.add_scalars(graph_name+'/loss', 
                       {'train':train_loss,'test':test_loss}, epoch)

epoch:1
epoch: 1 train accuarcy: 0.5181519293197581 train loss: 1.6288203117382003
test accuracy:0.6114570361145704 test loss:1.3918300867080688
epoch:2
epoch: 2 train accuarcy: 0.6279119632580787 train loss: 1.3107510209083557
test accuracy:0.6510585305105853 test loss:1.2456732988357544
epoch:3
epoch: 3 train accuarcy: 0.6623944038660174 train loss: 1.1844908700165047
test accuracy:0.6961394769613948 test loss:1.1161497831344604
epoch:4
epoch: 4 train accuarcy: 0.7126816806846688 train loss: 1.0443944482808094
test accuracy:0.7427148194271482 test loss:0.9515500664710999
epoch:5
epoch: 5 train accuarcy: 0.7460827984358861 train loss: 0.9171627382358232
test accuracy:0.7549190535491905 test loss:0.8738342523574829
epoch:6
epoch: 6 train accuarcy: 0.7615371845949536 train loss: 0.8259638547303667
test accuracy:0.7599003735990038 test loss:0.8856461048126221
epoch:7
epoch: 7 train accuarcy: 0.7758503024937288 train loss: 0.7721514673050182
test accuracy:0.7738480697384807 test loss:0.78