In [1]:
import collections
import os
import random
import time
from tqdm import tqdm
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
device

device(type='cpu')

In [4]:
import pandas as pd

train = pd.read_csv('./data/Comments/train.csv')
test = pd.read_csv('./data/Comments/test.csv')
sample_submission = pd.read_csv('./data/Comments/sample_submission.csv')

In [5]:
train.head()

Unnamed: 0,label,text
0,0,酸菜鱼不错
1,0,轻食素食都是友善的饮食方式
2,0,完爆中午吃的农家乐
3,1,烤鱼很入味
4,0,有种入口即化的感觉


In [6]:
test.head()

Unnamed: 0,text
0,理由很简单
1,蘸着花生酱吃非常美味
2,味道奶香味恰到好处
3,面包片烤的恰到好处
4,属于简单经济型


In [7]:
sample_submission.head()

Unnamed: 0,ID,Prediction
0,0,0.480232
1,1,0.362658
2,2,0.148902
3,3,0.105988
4,4,0.285303


In [8]:
import jieba
# NN模型不用去除stop_words

corpus_train = []
for one in train['text']:
    mid = []
    for ele in list(jieba.cut(one, cut_all=False, HMM=True)):
        mid.append(ele)
    corpus_train.append(' '.join(mid))
    
corpus_test = []
for one in test['text']:
    mid = []
    for ele in list(jieba.cut(one, cut_all=False, HMM=True)):
        mid.append(ele)
    corpus_test.append(' '.join(mid))

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/ml/gyc9l97n0cq3pfrr93x8xmy80000gn/T/jieba.cache
Loading model cost 0.718 seconds.
Prefix dict has been built successfully.


In [9]:
corpus_train[:10]

['酸菜鱼 不错',
 '轻食 素食 都 是 友善 的 饮食 方式',
 '完爆 中午 吃 的 农家乐',
 '烤鱼 很 入味',
 '有种 入口 即化 的 感觉',
 '菜品 一如既往 的 好',
 '味道 非常 好',
 '团购 很 优惠',
 '咖喱 牛腩 不错',
 '部分 菜 偏 酸辣 口']

In [10]:
corpus_test[:10]

['理由 很 简单',
 '蘸 着 花生酱 吃 非常 美味',
 '味道 奶 香味 恰到好处',
 '面包片 烤 的 恰到好处',
 '属于 简单 经济型',
 '芝士 鸡蛋 卷 一如既往 的 好吃 香香 嫩嫩的',
 '去 宁波 科技馆 非常 方便',
 '味道 一如既往 的 好',
 '兔头 真 好吃',
 '装修 风格 独特']

In [11]:
def get_vocab_comments(data):
    '''
    @params:
        data: corpus_train or corpus_test
    @return: 数据集上的词典，Vocab 的实例（freqs, stoi, itos）
    '''
    counter = collections.Counter([word for sentence in data for word in sentence.split(' ')])
    return Vocab.Vocab(counter, min_freq=1)

vocab = get_vocab_comments(corpus_train)  # 训练数据
print('# words in vocab:', len(vocab))

# words in vocab: 9511


In [12]:
def preprocess_comments(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_l = 32  # 统一文本长度为500

    def pad(x):  # 将每条评论通过截断或者补1 (<pad>)，使得长度变成500
        return x[:max_l] if len(x) > max_l else x + [1] * (max_l - len(x))

    # vocab.stoi(word) 将word转成index
    features = torch.tensor([pad([vocab.stoi[word] for word in words.split(' ')]) for words in data])
    labels = torch.tensor(train['label'])
    return features, labels

In [13]:
train_set = Data.TensorDataset(*preprocess_comments(corpus_train, vocab))

batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    print(X[0])
    break
print('#batches:', len(train_iter))

X torch.Size([64, 32]) y torch.Size([64])
tensor([  47,   35, 2386, 1686, 2117,    7,    4,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1])
#batches: 250


In [14]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        '''
        @params:
            x: 输入，形状为 (batch_size, n_channels, seq_len) 的张量
        @return: 时序最大池化后的结果，形状为 (batch_size, n_channels, 1) 的张量
        '''
        return F.max_pool1d(x, kernel_size=x.shape[2]) # kenerl_size=seq_len

In [15]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        '''
        @params:
            vocab: 在数据集上创建的词典，用于获取词典大小
            embed_size: 嵌入维度大小
            kernel_sizes: 卷积核大小列表
            num_channels: 卷积通道数列表
        '''
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size) # 参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size) # 不参与训练的嵌入层
        
        self.pool = GlobalMaxPool1d() # 时序最大池化层没有权重，所以可以共用一个实例
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                                        out_channels = c, 
                                        kernel_size = k))
            
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.dropout = nn.Dropout(0.5) # 丢弃层用于防止过拟合

    def forward(self, inputs):
        '''
        @params:
            inputs: 词语下标序列，形状为 (batch_size, seq_len) 的整数张量
        @return:
            outputs: 对文本情感的预测，形状为 (batch_size, 2) 的张量
        '''
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2) # (batch_size, seq_len, 2*embed_size)
        # 根据一维卷积层要求的输入格式，需要将张量进行转置
        embeddings = embeddings.permute(0, 2, 1) # (batch_size, 2*embed_size, seq_len)
        
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs

embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

In [16]:
net

TextCNN(
  (embedding): Embedding(9511, 100)
  (constant_embedding): Embedding(9511, 100)
  (pool): GlobalMaxPool1d()
  (convs): ModuleList(
    (0): Conv1d(200, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(200, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(200, 100, kernel_size=(5,), stride=(1,))
  )
  (decoder): Linear(in_features=300, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [17]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            # forward
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            
            # backward 反向传播
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [18]:
### 训练并评价模型

lr, num_epochs = 0.001, 6
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, train_iter, net, loss, optimizer, device, num_epochs)

training on  cpu
epoch 1, loss 0.4135, train acc 0.812, test acc 0.888, time 26.6 sec
epoch 2, loss 0.1529, train acc 0.873, test acc 0.926, time 26.3 sec
epoch 3, loss 0.0771, train acc 0.907, test acc 0.948, time 25.8 sec
epoch 4, loss 0.0468, train acc 0.926, test acc 0.964, time 26.3 sec
epoch 5, loss 0.0296, train acc 0.945, test acc 0.967, time 24.5 sec
epoch 6, loss 0.0200, train acc 0.954, test acc 0.977, time 26.5 sec


In [20]:
# 调整测试数据
def preprocess_test_comments(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_l = 32  # 统一文本长度为500

    def pad(x):  # 将每条评论通过截断或者补1 (<pad>)，使得长度变成500
        return x[:max_l] if len(x) > max_l else x + [1] * (max_l - len(x))

    # vocab.stoi(word) 将word转成index
    features = torch.tensor([pad([vocab.stoi[word] for word in words.split(' ')]) for words in data])
    # labels = torch.tensor(train['label'])
    return features

In [21]:
test_set = preprocess_test_comments(corpus_test, vocab)
test_set.shape

torch.Size([4189, 32])

In [22]:
# 评估
net.eval()

TextCNN(
  (embedding): Embedding(9511, 100)
  (constant_embedding): Embedding(9511, 100)
  (pool): GlobalMaxPool1d()
  (convs): ModuleList(
    (0): Conv1d(200, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(200, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(200, 100, kernel_size=(5,), stride=(1,))
  )
  (decoder): Linear(in_features=300, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [23]:
res = net(test_set.to(device))
res.shape

torch.Size([4189, 2])

In [None]:
sample_submission['Prediction'] = F.softmax(res.cpu(), dim=1).detach().numpy()[:, 1]

res_file = './data/Comments/TextCNN_submission_softmax_lre_3_epoch6.csv'
sample_submission.to_csv(res_file, encoding='utf-8', index=False)