# 神经网络与文本分析

如果像以上的点评数据，有星级、正负面的信息，我们当然可以使用机器学习的所有方法，结合词袋、TF-IDF以及词嵌入、深度学习等方法。

In [None]:
# Load the stop-word list: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) stripped.
stop_list = []
with open("Chinese/stopword.txt") as stopword_file:
    stop_list.extend(line.strip() for line in stopword_file)

In [None]:
# Count how often each token occurs across all comments, skipping stop words.
# Each comment is converted to str and split on single spaces; every piece is
# stripped before it is looked up and counted.
# A set makes the stop-word test O(1) per token instead of scanning the list.
stop_words = set(stop_list)
ALL_words = {}
for sentence in dianping['cus_comment']:
    for w in str(sentence).split(' '):
        wstrip = w.strip()
        if wstrip not in stop_words:
            ALL_words[wstrip] = ALL_words.get(wstrip, 0) + 1
# Number of distinct tokens that survived stop-word filtering.
print(len(ALL_words))

In [None]:
# Give every token that occurs more than 5 times an integer id, numbered from 1
# in the iteration order of ALL_words. Id 0 is left unassigned here.
frequent_tokens = [token for token, count in ALL_words.items() if count > 5]
word_code = {token: idx for idx, token in enumerate(frequent_tokens, start=1)}
# One past the largest assigned id.
code_len = len(word_code) + 1

print(code_len)

In [None]:
# Encode every comment as the list of ids of its tokens that appear in
# word_code; tokens without an id are dropped.
word_series = []
for sentence in dianping['cus_comment']:
    tokens = (piece.strip() for piece in str(sentence).split(' '))
    word_series.append([word_code[tok] for tok in tokens if tok in word_code])
word_len = [len(ids) for ids in word_series]

dianping['word_series'] = word_series
dianping['word_len'] = word_len
# Keep only comments that still contain at least one encoded token.
dianping = dianping[dianping['word_len'] > 0]

# Keep the rating and the encoded comment, drop rows with missing values, then
# shuffle the rows by sorting on a temporary column of uniform random numbers.
sub_dianping = dianping[['stars', 'word_series']].dropna()
sub_dianping = (
    sub_dianping
    .assign(random=np.random.random(sub_dianping.shape[0]))
    .sort_values('random')
    .drop(columns='random')
)
sub_dianping

一个比较棘手的问题是，由于不同文本的长度是不一样的，所以在输入给神经网络的时候很容易出问题，这里我们可以使用pad的方法将短的句子用0进行填充，从而得到一个等长的序列：

In [None]:
import torch
from torch import nn

# Three example sequences of different lengths.
a = torch.tensor([1, 2, 3])
b = torch.tensor([1])
c = torch.tensor([1, 2, 3, 4, 5, 6])
# Pair every tensor with its length and order the pairs longest-first.
tensors_with_len = sorted(
    ((seq, len(seq)) for seq in (a, b, c)),
    key=lambda pair: pair[1],
    reverse=True,
)
print(tensors_with_len)
tensors, lens = (list(column) for column in zip(*tensors_with_len))
# Pad the shorter sequences with zeros so all of them share one length.
padded_tensor = nn.utils.rnn.pad_sequence(tensors)
padded_tensor

可以看到评价为负和正的之间，平均星级并没有很大差别。

pad以后可以进行embedding操作，比如：

In [None]:
# Embedding table with 7 rows of 2-dimensional vectors; index 0 is declared
# as the padding index.
embedding = nn.Embedding(num_embeddings=7, embedding_dim=2, padding_idx=0)
embedded = embedding(padded_tensor)
print(embedded.shape)
embedded

将序列交给RNN或者LSTM模型时，可以将以上padded tensor进行打包（pack）：

In [None]:
# The lengths must be given in descending order.
packed_tensor1 = nn.utils.rnn.pack_padded_sequence(
    embedded,
    lens,
)
packed_tensor1

如此，在RNN或者LSTM中，就不会对填充的0进行计算。如果需要从packed还原，只需要使用unpack（即pad_packed_sequence）就可以了：

In [None]:
# Unpack: turn the packed sequence back into a padded tensor plus lengths.
nn.utils.rnn.pad_packed_sequence(packed_tensor1)

使用如上特性，我们可以定义数据了：

In [None]:
from torch.utils.data import Dataset, DataLoader

# The first 20000 rows form the training set and the remainder the test set.
# Column 0 holds the star rating (label is True when it is >= 4) and column 1
# holds the encoded comment.
train_part = sub_dianping.iloc[:20000]
test_part = sub_dianping.iloc[20000:]
X_train, Y_train = train_part.iloc[:, 1], train_part.iloc[:, 0] >= 4
X_test, Y_test = test_part.iloc[:, 1], test_part.iloc[:, 0] >= 4


class dianping_data(Dataset):
    """Dataset pairing each encoded comment with its length and label.

    ``X`` and ``Y`` are accessed positionally through ``.iloc`` and ``X`` must
    expose ``.shape``, so both are expected to be pandas-like objects.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples, taken from the first axis of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return ``(sequence, len(sequence), label)`` for position ``i``."""
        sequence = self.X.iloc[i]
        label = self.Y.iloc[i]
        return (sequence, len(sequence), label)


# Show the training labels (the result of the `>= 4` comparison on the ratings).
print(Y_train)

不过值得注意的是，如果使用默认的DataLoader，会在最终将每一条数据合并为一个Tensor，而由于我们这里的数据是变长的，还需要进行padding等操作，所以我们先不用DataLoader默认的collate_fn，而是定义一个新的collate_fn，在其中完成排序、padding的过程：

In [None]:
# Small batch size used for inspecting a single collated batch.
batch_size = 5
# Wrap the training sequences and labels in the Dataset defined earlier.
train_data = dianping_data(X_train, Y_train)


def collate(x):
    """Collate variable-length samples into one padded batch.

    Args:
        x: List of ``(token_ids, length, label)`` tuples, one per sample.
            The list itself is left unmodified.

    Returns:
        A tuple ``(X, L, Y)``:
        X -- long tensor of shape ``(max_length, batch)``, zero-padded, with
             samples ordered from longest to shortest;
        L -- list of the sample lengths in that same order;
        Y -- long tensor with the labels in that same order.
    """
    # Sort a copy, longest first, so the caller's list is not reordered as a
    # side effect. The sort is stable, so ties keep their original order.
    ordered = sorted(x, key=lambda sample: sample[1], reverse=True)
    sequences = [torch.tensor(ids, dtype=torch.long) for ids, _, _ in ordered]
    L = [length for _, length, _ in ordered]
    X = nn.utils.rnn.pad_sequence(sequences)
    Y = torch.tensor([label for _, _, label in ordered]).long()
    return X, L, Y


# Build a loader that uses the custom collate function and print the padded
# inputs and the lengths of its first batch only.
dl = DataLoader(dataset=train_data, batch_size=batch_size, collate_fn=collate)
first_batch = next(iter(dl), None)
if first_batch is not None:
    x, l, y = first_batch
    print(x)
    print(l)

In [None]:
# Batch size shared by the training and the evaluation loader.
batch_size = 20

train_data = dianping_data(X_train, Y_train)
test_data = dianping_data(X_test, Y_test)

# Training loader: shuffled, pinned memory, 15 worker processes.
dl = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=15,
    pin_memory=True,
    collate_fn=collate,
)

# Evaluation loader: original order, final incomplete batch dropped.
tdl = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
    collate_fn=collate,
)


接下来就可以定义模型了：

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device("cpu")


# device=torch.device("cpu")
class classifier(nn.Module):
    """LSTM classifier over zero-padded sequences of word ids.

    Pipeline: embedding -> packed multi-layer LSTM -> final hidden state of
    the top LSTM layer -> feed-forward head producing two scores per sequence.

    Args:
        word_count: Number of rows in the embedding table; index 0 is the
            padding index.
        batch_size: Default batch size used by ``init_hidden`` when no
            explicit size is given.
        embedding_size: Dimension of each word embedding.
        lstm_hidden_size: Size of the LSTM hidden and cell states.
        num_nurons: Width of the hidden linear layer in the output head.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,
                 word_count,
                 batch_size,
                 embedding_size=30,
                 lstm_hidden_size=56,
                 num_nurons=128,
                 num_layers=3):
        super().__init__()
        self.batch_size = batch_size  # default batch size
        self.embedding_size = embedding_size  # word-vector dimension
        self.lstm_hidden_size = lstm_hidden_size  # hidden-state size
        self.num_layers = num_layers  # stacked LSTM layers

        self.embedding = nn.Embedding(word_count,
                                      embedding_size,
                                      padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=lstm_hidden_size,
                            num_layers=num_layers)
        self.layer3 = nn.Sequential(nn.Linear(lstm_hidden_size, num_nurons),
                                    nn.LayerNorm(num_nurons), nn.Sigmoid(),
                                    nn.LayerNorm(num_nurons), nn.Dropout(0.5),
                                    nn.Linear(num_nurons, 2), nn.Tanh())
        # Kept for backward compatibility only. forward() builds a fresh
        # state per batch, so this tensor pair is not used by the model and
        # is not moved by ``.to(...)``.
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None, device=None):
        """Return an all-zero ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: Number of sequences; defaults to ``self.batch_size``.
            device: Target device; defaults to the device that currently
                holds the embedding weights.

        Returns:
            Two tensors of shape ``(num_layers, batch_size, lstm_hidden_size)``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        weight = self.embedding.weight
        if device is None:
            device = weight.device
        shape = (self.num_layers, batch_size, self.lstm_hidden_size)
        h = torch.zeros(shape, dtype=weight.dtype, device=device)
        c = torch.zeros(shape, dtype=weight.dtype, device=device)
        return (h, c)

    def forward(self, x, l):
        """Score a batch of padded sequences.

        Args:
            x: Long tensor of word ids laid out as ``(seq_len, batch)``,
                ordered from longest to shortest sequence.
            l: Sequence lengths matching the columns of ``x``, in descending
                order.

        Returns:
            Tensor of shape ``(batch, 2)`` with values in ``[-8, 8]``.
        """
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths=l)
        # Size the initial state from the actual batch so that batches of any
        # size (for example a final, smaller one) work, on whatever device
        # the input lives on.
        state = self.init_hidden(x.size(1), embedded.device)
        _, (h, _) = self.lstm(packed, state)
        # Tanh bounds the head to [-1, 1]; scaling by 8 keeps the scores in
        # [-8, 8], which avoids NaNs in the loss.
        y = self.layer3(h[-1]) * 8
        return y

接下来，定义学习率和损失函数，进行求解：

In [None]:
# Build the model; code_len is the embedding-table size.
model = classifier(code_len, batch_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# The learning rate is multiplied by 0.995 after every epoch.
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.995)
losses = []  # per-batch training loss, in order
# Exponentially smoothed training loss; seeded with the very first batch loss.
Exp_Smoothed_Loss = None
for i in range(1000):
    model.train()
    for x, l, y in dl:
        # Forward pass
        y_pred = model(x.to(device), l)
        # Loss
        loss = criterion(y_pred, y.to(device))
        batch_loss = loss.item()
        losses.append(batch_loss)
        if Exp_Smoothed_Loss is None:
            Exp_Smoothed_Loss = batch_loss
        else:
            Exp_Smoothed_Loss = 0.01 * batch_loss + 0.99 * Exp_Smoothed_Loss
        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    lr_scheduler.step()
    if i % 20 == 0:
        # Every 20 epochs, evaluate on the held-out loader.
        model.eval()
        batch_scores = []
        batch_labels = []
        # No gradients are needed for evaluation; this also avoids keeping
        # the autograd graph of every test batch alive.
        with torch.no_grad():
            for x, l, y in tdl:
                batch_scores.append(model(x.to(device), l))
                batch_labels.append(y)
        true_value = torch.cat(batch_labels).float()
        # Turn the two scores per sample into class probabilities.
        pred = torch.softmax(torch.cat(batch_scores), dim=1)
        result = pd.DataFrame({
            'true_value': true_value.numpy(),
            'Predicted': pred[:, 1].cpu().numpy(),
        })
        # Binary cross-entropy: -mean(t * log(p) + (1 - t) * log(1 - p)).
        oos_los = -np.mean(result['true_value'] * np.log(result['Predicted']) +
                           (1 - result['true_value']) *
                           np.log(1 - result['Predicted']))
        print("第%s次epoch，Smoothed Loss=%s，LR=%s，out-of-sample Loss=%s" %
              (i, Exp_Smoothed_Loss, lr_scheduler.get_last_lr(), oos_los))

import matplotlib.pyplot as plt

# Plot the per-batch training loss against the 1-based batch index.
plt.rcParams['figure.figsize'] = (15.0, 5.0)

i = np.arange(1, len(losses) + 1)
loss_values = np.array(losses)
plt.plot(i, loss_values)
plt.show()