In [23]:
import math
import numpy as np
import pandas as pd
import os
import math
import random
import codecs
from pathlib import Path

# 导入MindSpore库
import mindspore
# 导入MindSpore的数据集处理模块
import mindspore.dataset as ds
# 导入MindSpore的神经网络模块
import mindspore.nn as nn
# 从MindSpore中导入Tensor类，用于处理张量计算
from mindspore import Tensor
# 导入MindSpore的context模块，用于设定运行环境
from mindspore import context
# 导入MindSpore的Model类，用于构建和训练模型
from mindspore.train.model import Model
# 导入MindSpore的准确率计算类
from mindspore.nn.metrics import Accuracy
# 导入MindSpore的模型加载和参数加载函数
from mindspore.train.serialization import load_checkpoint, load_param_into_net
# 导入MindSpore的训练过程回调类，包括模型检查点保存、训练配置、损失监听和时间监听
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
# 导入MindSpore的运算操作模块
from mindspore.ops import operations as ops

# 导入easydict库，用于创建类字典对象，方便通过属性而非键值对的方式访问字典元素
from easydict import EasyDict as edict

# 创建一个名为cfg的配置对象，并设定相关参数
cfg = edict({
    'name': 'movie review',                # 配置名称
    'pre_trained': False,                  # 是否使用预训练模型
    'num_classes': 2,                      # 分类任务的类别数量
    'batch_size': 64,                      # 每个批次处理的数据数量
    'epoch_size': 4,                       # 总训练轮数
    'weight_decay': 3e-5,                  # 权重衰减系数
    'data_path': './data/TextCNN/data/',   # 数据集路径
    'device_target': 'CPU',                # 设备目标，此处为CPU
    'device_id': 0,                        # 设备ID，此处为0
    'keep_checkpoint_max': 1,              # 保留的最大检查点数量
    'checkpoint_path': './ckpt/train_textcnn-4_149.ckpt',  # 检查点路径
    'word_len': 51,                        # 单词长度
    'vec_length': 40                       # 向量长度
})

# 设定MindSpore的运行环境，包括运行模式、设备目标和设备ID
context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id)


In [24]:
with open("./data/TextCNN/data/rt-polarity.neg", 'r', encoding='utf-8') as f:
        print("Negative reivews:")
        for i in range(5):
            print("[{0}]:{1}".format(i,f.readline()))
with open("./data/TextCNN/data/rt-polarity.pos", 'r', encoding='utf-8') as f:
        print("Positive reivews:")
        for i in range(5):
            print("[{0}]:{1}".format(i,f.readline()))

Negative reivews:
[0]:simplistic , silly and tedious . 

[1]:it's so laddish and juvenile , only teenage boys could possibly find it funny . 

[2]:exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

[3]:[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

[4]:a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 

Positive reivews:
[0]:the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

[1]:the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

[2]:effective but too-tepid biopic

In [25]:
#查看所有的评论数

def count_lines(file_path):
    # 打开文件
    with open(file_path, 'r', encoding='utf-8') as f:
        # 使用readlines方法读取所有行，返回的是一个包含所有行的列表，通过len函数获取其长度即行数
        num_lines = len(f.readlines())
    return num_lines

# 计算负面评论数量
neg_reviews = count_lines("./data/TextCNN/data/rt-polarity.neg")
print("Negative reviews: ", neg_reviews)

# 计算正面评论数量
pos_reviews = count_lines("./data/TextCNN/data/rt-polarity.pos")
print("Positive reviews: ", pos_reviews)

# 计算总评论数量
total_reviews = neg_reviews + pos_reviews
print("Total reviews: ", total_reviews)


Negative reviews:  5331
Positive reviews:  5330
Total reviews:  10661


In [26]:
#定义数据生成类
class Generator():
    def __init__(self, input_list):
        self.input_list=input_list
    def __getitem__(self,item):
        return (np.array(self.input_list[item][0],dtype=np.int32),
                np.array(self.input_list[item][1],dtype=np.int32))
    def __len__(self):
        return len(self.input_list)


class MovieReview:
    '''
    影评数据集
    '''
    # 初始化函数，用于创建一个MovieReview对象
    def __init__(self, root_dir, maxlen, split):
        '''
        输入：
            root_dir: 影评数据目录
            maxlen: 设置句子最大长度
            split: 设置数据集中训练/评估的比例
        '''

        # 设置影评数据的路径
        self.path = root_dir

        # 设置感情标签到数字的映射，其中"neg"表示消极情感，"pos"表示积极情感
        self.feelMap = {
            'neg':0,
            'pos':1
        }

        # 初始化一个空的文件列表
        self.files = []

        # 设置一个标志位，表示是否已经将文本转化为向量，默认为False
        self.doConvert = False

        # 使用Path库来处理路径
        mypath = Path(self.path)

        # 检查指定的路径是否存在，是否为目录，如果不是，则抛出错误
        if not mypath.exists() or not mypath.is_dir():
            print("please check the root_dir!")
            raise ValueError

        # 遍历数据目录中的文件，将找到的文件名添加到文件列表中
        for root,_,filename in os.walk(self.path):
            for each in filename:
                self.files.append(os.path.join(root,each))
            break

        # 检查文件列表中是否有两个文件，即.neg文件和.pos文件，如果不是，则抛出错误
        if len(self.files) != 2:
            print("There are {} files in the root_dir".format(len(self.files)))
            raise ValueError

        # 初始化一些参数，用于存储数据的一些统计信息
        self.word_num = 0
        self.maxlen = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")

        # 初始化存储积极和消极影评的列表
        self.Pos = []
        self.Neg = []

        # 对文件列表中的每一个文件，调用read_data函数来读取数据
        for filename in self.files:
            self.read_data(filename)

        # 调用text2vec函数，将读取的文本数据转化为向量
        self.text2vec(maxlen=maxlen)

        # 调用split_dataset函数，将数据集按照指定的比例分割为训练集和测试集
        self.split_dataset(split=split)


    def read_data(self, filePath):
        """ 这里改为utf8编码来读，否则会报错"""
        with open(filePath,'r', encoding='utf-8') as f:
            for sentence in f.readlines():
                sentence = sentence.replace('\n','')\
                                    .replace('"','')\
                                    .replace('\'','')\
                                    .replace('.','')\
                                    .replace(',','')\
                                    .replace('[','')\
                                    .replace(']','')\
                                    .replace('(','')\
                                    .replace(')','')\
                                    .replace(':','')\
                                    .replace('--','')\
                                    .replace('-',' ')\
                                    .replace('\\','')\
                                    .replace('0','')\
                                    .replace('1','')\
                                    .replace('2','')\
                                    .replace('3','')\
                                    .replace('4','')\
                                    .replace('5','')\
                                    .replace('6','')\
                                    .replace('7','')\
                                    .replace('8','')\
                                    .replace('9','')\
                                    .replace('`','')\
                                    .replace('=','')\
                                    .replace('$','')\
                                    .replace('/','')\
                                    .replace('*','')\
                                    .replace(';','')\
                                    .replace('<b>','')\
                                    .replace('%','')

                # 使用空格分割句子，得到单词列表
                sentence = sentence.split(' ')

                # 使用filter函数和lambda表达式，去除单词列表中的空元素
                sentence = list(filter(lambda x: x, sentence))

                # 如果单词列表非空
                if sentence:
                    # 累加单词总数
                    self.word_num += len(sentence)

                    # 更新句子的最大长度，如果当前句子长度大于maxlen，则maxlen为当前句子长度
                    self.maxlen = self.maxlen if self.maxlen >= len(sentence) else len(sentence)

                    # 更新句子的最小长度，如果当前句子长度小于minlen，则minlen为当前句子长度
                    self.minlen = self.minlen if self.minlen <= len(sentence) else len(sentence)

                    # 如果文件路径中包含'pos'，则认为当前句子的情感标签为积极，将其加入到Pos列表中
                    if 'pos' in filePath:
                        self.Pos.append([sentence,self.feelMap['pos']])

                    # 否则，认为当前句子的情感标签为消极，将其加入到Neg列表中
                    else:
                        self.Neg.append([sentence,self.feelMap['neg']])

    # 定义将文本转为向量的方法text2vec
    def text2vec(self, maxlen):
        '''
        将句子转化为向量
        '''
        # 定义词典Vocab，用于存储单词到索引的映射
        self.Vocab = dict()

        # 遍历正向评价和负向评价的列表
        for SentenceLabel in self.Pos+self.Neg:
            # 初始化一个长度为maxlen的向量，所有元素都为0
            vector = [0]*maxlen

            # 遍历句子中的单词以及对应的索引
            for index, word in enumerate(SentenceLabel[0]):
                # 如果索引超过了设定的最大长度，则停止转换
                if index >= maxlen:
                    break
                # 如果词典中没有当前的单词，则在词典中加入该单词，并更新向量的对应位置为词典的长度减1
                if word not in self.Vocab.keys():
                    self.Vocab[word] = len(self.Vocab)
                    vector[index] = len(self.Vocab) - 1
                # 如果词典中已经有当前的单词，则更新向量的对应位置为词典中单词的索引
                else:
                    vector[index] = self.Vocab[word]

            # 更新句子的标签为对应的向量
            SentenceLabel[0] = vector
        # 标记已完成转换
        self.doConvert = True

    # 定义分割数据集的方法split_dataset
    def split_dataset(self, split):
        '''
        将数据分割为训练集和测试集
        '''
        # 计算正向评价和负向评价应该在训练集中的数量
        trunk_pos_size = math.ceil((1-split)*len(self.Pos))
        trunk_neg_size = math.ceil((1-split)*len(self.Neg))

        # 计算需要切分的块数
        trunk_num = int(1/(1-split))

        # 初始化存储正向评价和负向评价的列表
        pos_temp=list()
        neg_temp=list()

        # 将正向评价和负向评价切分为指定数量的块，分别加入到pos_temp和neg_temp中
        for index in range(trunk_num):
            pos_temp.append(self.Pos[index*trunk_pos_size:(index+1)*trunk_pos_size])
            neg_temp.append(self.Neg[index*trunk_neg_size:(index+1)*trunk_neg_size])

        # 取出第三块作为测试集
        self.test = pos_temp.pop(2)+neg_temp.pop(2)

        # 剩下的块作为训练集
        self.train = [i for item in pos_temp+neg_temp for i in item]

        # 打乱训练集的顺序
        random.shuffle(self.train)

        
    # 定义获取词典长度的方法get_dict_len
    def get_dict_len(self):
        '''
        获取数据集中的词汇数量，即词典的长度
        '''
        # 如果已经完成文本到向量的转换，返回词典的长度
        if self.doConvert:
            return len(self.Vocab)
        # 如果还没有完成文本到向量的转换，打印警告信息并返回-1
        else:
            print("尚未完成文本到向量的转换")
            return -1

    # 定义创建训练数据集的方法create_train_dataset
    def create_train_dataset(self, epoch_size, batch_size):
        # 使用MindSpore的数据集类GeneratorDataset，将训练数据转换为数据集对象
        dataset = ds.GeneratorDataset(
                                        # Generator是一个数据生成器，传入的参数input_list为训练数据
                                        source=Generator(input_list=self.train), 
                                        # 定义数据集的列名
                                        column_names=["data","label"], 
                                        # 不对数据进行打乱
                                        shuffle=False
                                        )
        # 对数据集进行分批，每批的大小为batch_size，drop_remainder=True表示如果最后一批数据不足batch_size，将舍弃
        dataset=dataset.batch(batch_size=batch_size,drop_remainder=True)
        # 对数据集进行重复，重复次数为epoch_size，这样在训练模型时可以多次使用同一数据集
        dataset=dataset.repeat(epoch_size)
        # 返回生成的数据集
        return dataset

    # 定义创建测试数据集的方法create_test_dataset
    def create_test_dataset(self, batch_size):
        # 使用MindSpore的数据集类GeneratorDataset，将测试数据转换为数据集对象
        dataset = ds.GeneratorDataset(
                                        # Generator是一个数据生成器，传入的参数input_list为测试数据
                                        source=Generator(input_list=self.test), 
                                        # 定义数据集的列名
                                        column_names=["data","label"], 
                                        # 不对数据进行打乱
                                        shuffle=False
                                        )
        # 对数据集进行分批，每批的大小为batch_size，drop_remainder=True表示如果最后一批数据不足batch_size，将舍弃
        dataset=dataset.batch(batch_size=batch_size,drop_remainder=True)
        # 返回生成的数据集
        return dataset


In [27]:
# 实例化MovieReview类，输入参数包括数据目录(cfg.data_path)，句子最大长度(cfg.word_len)和训练/评估的比例(0.9)
instance = MovieReview(root_dir=cfg.data_path, maxlen=cfg.word_len, split=0.9)
# 调用实例的create_train_dataset方法，创建训练数据集，输入参数包括每批数据的大小(cfg.batch_size)和数据集重复的次数(cfg.epoch_size)
dataset = instance.create_train_dataset(batch_size=cfg.batch_size,epoch_size=cfg.epoch_size)
# 调用数据集的get_dataset_size方法，获取数据集的大小，即批次数量
batch_num = dataset.get_dataset_size()

In [28]:
# 调用实例的get_dict_len方法，获取数据集中所有单词构成的词典的大小
vocab_size=instance.get_dict_len()
# 打印词汇表的大小
print("vocab_size:{0}".format(vocab_size))
# 使用create_dict_iterator创建数据集的迭代器
item =dataset.create_dict_iterator()
# 遍历迭代器，并打印第一个数据项的详细信息，以及第二个数据项的数据部分
for i,data in enumerate(item):
    if i<1:
        print(data)
        print(data['data'][1])
    else:
        break

vocab_size:18848
{'data': Tensor(shape=[64, 51], dtype=Int32, value=
[[    0,  6129,    10 ...     0,     0,     0],
 [   72, 16157, 12491 ...     0,     0,     0],
 [   59,   617,   726 ...     0,     0,     0],
 ...
 [ 1176,   802,   253 ...     0,     0,     0],
 [ 5649,    97,  2233 ...     0,     0,     0],
 [  317,    97,   735 ...     0,     0,     0]]), 'label': Tensor(shape=[64], dtype=Int32, value= [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 
 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 
 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0])}
[   72 16157 12491   155 16158   305 16159    10 16160   305 13333    60
 16161   247    15   258 16162  5320  6175    26 16163 16164   415     0
  3873     4 16165  5142   928     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]


In [29]:
# 初始化学习率列表
learning_rate = []
# 根据预设的参数计算预热阶段的学习率列表
warm_up = [1e-3 / math.floor(cfg.epoch_size / 5) * (i + 1) for _ in range(batch_num) 
           for i in range(math.floor(cfg.epoch_size / 5))]
# 根据预设的参数计算收缩阶段的学习率列表
shrink = [1e-3 / (16 * (i + 1)) for _ in range(batch_num) 
          for i in range(math.floor(cfg.epoch_size * 3 / 5))]
# 根据预设的参数计算正常运行阶段的学习率列表
normal_run = [1e-3 for _ in range(batch_num) for i in 
              range(cfg.epoch_size - math.floor(cfg.epoch_size / 5) 
                    - math.floor(cfg.epoch_size * 2 / 5))]
# 将三个阶段的学习率列表合并为一个列表
learning_rate = learning_rate + warm_up + normal_run + shrink

In [30]:
# 定义一个函数，用于创建模型权重，输入参数为权重的形状和一个缩放因子
def _weight_variable(shape, factor=0.01):
    init_value = np.random.randn(*shape).astype(np.float32) * factor  # 使用随机正态分布生成初始权重，然后乘以缩放因子
    return Tensor(init_value)  # 将numpy数组转为Tensor并返回

# 定义一个函数，用于创建卷积层，输入参数为卷积核的大小
def make_conv_layer(kernel_size):
    weight_shape = (96, 1, *kernel_size)  # 定义权重的形状
    weight = _weight_variable(weight_shape)  # 调用前面定义的函数，创建权重
    # 创建卷积层，输入通道为1，输出通道为96，卷积核大小为输入参数，边界填充大小为1，权重初始化为前面创建的权重，有偏置
    return nn.Conv2d(in_channels=1, out_channels=96, kernel_size=kernel_size, padding=1,
                     pad_mode="pad", weight_init=weight, has_bias=True)

# 定义TextCNN模型，继承自nn.Cell
class TextCNN(nn.Cell):
    # 构造函数，输入参数为词汇量、词长、类别数和向量长度
    def __init__(self, vocab_len, word_len, num_classes, vec_length):
        super(TextCNN, self).__init__()  # 调用父类的构造函数
        self.vec_length = vec_length  # 设置向量长度
        self.word_len = word_len  # 设置词长
        self.num_classes = num_classes  # 设置类别数

        # 初始化所需的各种操作和层
        self.unsqueeze = ops.ExpandDims()  # 初始化一个用于扩展维度的操作
        self.embedding = nn.Embedding(vocab_len, self.vec_length, embedding_table='normal')  # 初始化一个嵌入层

        self.slice = ops.Slice()  # 初始化一个用于切片的操作
        # 初始化三个卷积层，分别使用3、4、5的高度的卷积核
        self.layer1 = self.make_layer(kernel_height=3)
        self.layer2 = self.make_layer(kernel_height=4)
        self.layer3 = self.make_layer(kernel_height=5)

        self.concat = ops.Concat(1)  # 初始化一个用于在第一维度上连接张量的操作

        self.fc = nn.Dense(96*3, self.num_classes)  # 初始化一个全连接层，输入尺寸为96*3，输出尺寸为类别数
        self.drop = nn.Dropout(keep_prob=0.5)  # 初始化一个Dropout层，保持概率为0.5
        self.print = ops.Print()  # 初始化一个用于打印的操作
        self.reducemean = ops.ReduceMax(keep_dims=False)  # 初始化一个用于在不保持维度的情况下进行最大值降维的操作
    
    # 定义一个函数，用于创建一个卷积层、ReLU激活层和最大池化层的序列模型
    def make_layer(self, kernel_height):
        return nn.SequentialCell(
            [
                make_conv_layer((kernel_height,self.vec_length)),  # 调用前面定义的函数，创建卷积层
                nn.ReLU(),  # 创建ReLU激活层
                nn.MaxPool2d(kernel_size=(self.word_len-kernel_height+1,1)),  # 创建最大池化层，核的大小为(词长-卷积核高度+1,1)
            ]
        )

    # 定义模型的前向传播
    def construct(self,x):
        x = self.unsqueeze(x, 1)  # 扩展输入x的维度
        x = self.embedding(x)  # 将输入x传入嵌入层得到输出
        x1 = self.layer1(x)  # 将x传入第一个卷积层得到输出x1
        x2 = self.layer2(x)  # 将x传入第二个卷积层得到输出x2
        x3 = self.layer3(x)  # 将x传入第三个卷积层得到输出x3

        # 对x1, x2, x3进行降维操作
        x1 = self.reducemean(x1, (2, 3))
        x2 = self.reducemean(x2, (2, 3))
        x3 = self.reducemean(x3, (2, 3))

        x = self.concat((x1, x2, x3))  # 将x1, x2, x3在第一维度上连接
        x = self.drop(x)  # 对连接后的x执行Dropout操作
        x = self.fc(x)  # 将Dropout后的x传入全连接层得到最终的输出
        return x  # 返回最终的输出

# 实例化一个TextCNN模型，词汇量、词长、类别数和向量长度根据先前设定的参数进行设置
net = TextCNN(vocab_len=instance.get_dict_len(), word_len=cfg.word_len, 
              num_classes=cfg.num_classes, vec_length=cfg.vec_length)
print(net)  # 打印模型的结构




TextCNN<
  (embedding): Embedding<vocab_size=18848, embedding_size=40, use_one_hot=False, embedding_table=Parameter (name=embedding.embedding_table, shape=(18848, 40), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
  (layer1): SequentialCell<
    (0): Conv2d<input_channels=1, output_channels=96, kernel_size=(3, 40), stride=(1, 1), pad_mode=pad, padding=1, dilation=(1, 1), group=1, has_bias=True, weight_init=[[[[ 4.80047334e-03  2.59035304e-02 -1.54866586e-02 ...  3.91998654e-03
         6.25947537e-03 -4.53591533e-03]
       [ 7.83338118e-03  2.35214434e-03  8.11113277e-04 ... -1.58306938e-02
        -1.72751117e-02  2.24960013e-03]
       [-1.45516722e-02 -4.30728355e-03  1.25398869e-02 ...  2.10527536e-02
         3.87292297e-04 -1.05130754e-03]]]
    
    
     [[[-1.23882908e-02 -1.24273738e-02 -1.14165852e-02 ... -1.58078149e-02
        -1.23453392e-02 -8.79397895e-03]
       [ 3.75452242e-03 -3.78002552e-03 -1.73468534e-02 ...  1.27675552e-02
        -1.2682

In [31]:
# 优化器、损失函数、保存检查点、时间监视器等设置
# 使用Adam优化器，过滤出网络中需要梯度的参数，设置学习率和权重衰减
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), 
              learning_rate=learning_rate, weight_decay=cfg.weight_decay)
# 设置损失函数为带有对数似然的softmax交叉熵，设定标签为稀疏
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
# 创建模型，输入网络、损失函数、优化器和度量标准
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})

# 配置模型保存的配置，设定保存模型的步骤和最大保存的模型数量
config_ck = CheckpointConfig(save_checkpoint_steps=int(cfg.epoch_size*batch_num/2), 
                             keep_checkpoint_max=cfg.keep_checkpoint_max)

# 创建时间监控回调函数，设定数据大小
time_cb = TimeMonitor(data_size=batch_num)

# 设定模型保存路径
ckpt_save_dir = "./ckpt"
# 创建模型保存的回调函数，设置保存的前缀、路径和配置
ckpoint_cb = ModelCheckpoint(prefix="train_textcnn", directory=ckpt_save_dir, config=config_ck)

# 创建损失监控的回调函数
loss_cb = LossMonitor()

In [32]:
# 训练模型，设定训练的轮数、数据集和回调函数
model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])

print("train success")



epoch: 1 step: 1, loss is 0.6931294202804565
epoch: 1 step: 2, loss is 0.6930627822875977
epoch: 1 step: 3, loss is 0.6930456757545471
epoch: 1 step: 4, loss is 0.6927811503410339
epoch: 1 step: 5, loss is 0.6926908493041992
epoch: 1 step: 6, loss is 0.6931170225143433
epoch: 1 step: 7, loss is 0.6937335729598999
epoch: 1 step: 8, loss is 0.693136990070343
epoch: 1 step: 9, loss is 0.6941984295845032
epoch: 1 step: 10, loss is 0.6942269802093506
epoch: 1 step: 11, loss is 0.6923094391822815
epoch: 1 step: 12, loss is 0.6960406303405762
epoch: 1 step: 13, loss is 0.6962171792984009
epoch: 1 step: 14, loss is 0.6948647499084473
epoch: 1 step: 15, loss is 0.6928861737251282
epoch: 1 step: 16, loss is 0.6932189464569092
epoch: 1 step: 17, loss is 0.6926476955413818
epoch: 1 step: 18, loss is 0.693326473236084
epoch: 1 step: 19, loss is 0.6929855942726135
epoch: 1 step: 20, loss is 0.6933774352073669
epoch: 1 step: 21, loss is 0.6929882764816284
epoch: 1 step: 22, loss is 0.6932856440544128

epoch: 1 step: 178, loss is 0.48210620880126953
epoch: 1 step: 179, loss is 0.6528003811836243
epoch: 1 step: 180, loss is 0.46307671070098877
epoch: 1 step: 181, loss is 0.4863908290863037
epoch: 1 step: 182, loss is 0.535204291343689
epoch: 1 step: 183, loss is 0.422556608915329
epoch: 1 step: 184, loss is 0.5165353417396545
epoch: 1 step: 185, loss is 0.4886051118373871
epoch: 1 step: 186, loss is 0.4545761048793793
epoch: 1 step: 187, loss is 0.5248510837554932
epoch: 1 step: 188, loss is 0.4569717347621918
epoch: 1 step: 189, loss is 0.5152373313903809
epoch: 1 step: 190, loss is 0.4210676848888397
epoch: 1 step: 191, loss is 0.6214286088943481
epoch: 1 step: 192, loss is 0.47772079706192017
epoch: 1 step: 193, loss is 0.44909271597862244
epoch: 1 step: 194, loss is 0.4589487910270691
epoch: 1 step: 195, loss is 0.5437012910842896
epoch: 1 step: 196, loss is 0.48615312576293945
epoch: 1 step: 197, loss is 0.3743930757045746
epoch: 1 step: 198, loss is 0.4763217270374298
epoch: 1 s

epoch: 1 step: 351, loss is 0.24092860519886017
epoch: 1 step: 352, loss is 0.2609979212284088
epoch: 1 step: 353, loss is 0.22612127661705017
epoch: 1 step: 354, loss is 0.15764354169368744
epoch: 1 step: 355, loss is 0.2490338385105133
epoch: 1 step: 356, loss is 0.1737622320652008
epoch: 1 step: 357, loss is 0.19827839732170105
epoch: 1 step: 358, loss is 0.2095729410648346
epoch: 1 step: 359, loss is 0.16658316552639008
epoch: 1 step: 360, loss is 0.22187964618206024
epoch: 1 step: 361, loss is 0.1519053429365158
epoch: 1 step: 362, loss is 0.3275908827781677
epoch: 1 step: 363, loss is 0.2237405627965927
epoch: 1 step: 364, loss is 0.24688521027565002
epoch: 1 step: 365, loss is 0.20651814341545105
epoch: 1 step: 366, loss is 0.2604478895664215
epoch: 1 step: 367, loss is 0.22888705134391785
epoch: 1 step: 368, loss is 0.28472900390625
epoch: 1 step: 369, loss is 0.19438999891281128
epoch: 1 step: 370, loss is 0.3789155185222626
epoch: 1 step: 371, loss is 0.17261230945587158
epoc

epoch: 1 step: 523, loss is 0.036893170326948166
epoch: 1 step: 524, loss is 0.1346534639596939
epoch: 1 step: 525, loss is 0.07671492546796799
epoch: 1 step: 526, loss is 0.07687612622976303
epoch: 1 step: 527, loss is 0.12714266777038574
epoch: 1 step: 528, loss is 0.11988267302513123
epoch: 1 step: 529, loss is 0.05317528545856476
epoch: 1 step: 530, loss is 0.07822000980377197
epoch: 1 step: 531, loss is 0.07220830768346786
epoch: 1 step: 532, loss is 0.09454888105392456
epoch: 1 step: 533, loss is 0.07116413116455078
epoch: 1 step: 534, loss is 0.09042485058307648
epoch: 1 step: 535, loss is 0.05846816673874855
epoch: 1 step: 536, loss is 0.08718211203813553
epoch: 1 step: 537, loss is 0.0961541011929512
epoch: 1 step: 538, loss is 0.0866866260766983
epoch: 1 step: 539, loss is 0.031959906220436096
epoch: 1 step: 540, loss is 0.049598097801208496
epoch: 1 step: 541, loss is 0.0938214659690857
epoch: 1 step: 542, loss is 0.13128870725631714
epoch: 1 step: 543, loss is 0.10612668842

epoch: 2 step: 98, loss is 0.026237521320581436
epoch: 2 step: 99, loss is 0.04399152472615242
epoch: 2 step: 100, loss is 0.04403179511427879
epoch: 2 step: 101, loss is 0.07010473310947418
epoch: 2 step: 102, loss is 0.02898583747446537
epoch: 2 step: 103, loss is 0.026590269058942795
epoch: 2 step: 104, loss is 0.02109355479478836
epoch: 2 step: 105, loss is 0.07162576168775558
epoch: 2 step: 106, loss is 0.03113730251789093
epoch: 2 step: 107, loss is 0.029665354639291763
epoch: 2 step: 108, loss is 0.025386393070220947
epoch: 2 step: 109, loss is 0.04268551617860794
epoch: 2 step: 110, loss is 0.038048576563596725
epoch: 2 step: 111, loss is 0.08531243354082108
epoch: 2 step: 112, loss is 0.011992910876870155
epoch: 2 step: 113, loss is 0.023704057559370995
epoch: 2 step: 114, loss is 0.024038149043917656
epoch: 2 step: 115, loss is 0.006830915808677673
epoch: 2 step: 116, loss is 0.014070697128772736
epoch: 2 step: 117, loss is 0.020277049392461777
epoch: 2 step: 118, loss is 0.0

epoch: 2 step: 267, loss is 0.02413078397512436
epoch: 2 step: 268, loss is 0.008266668766736984
epoch: 2 step: 269, loss is 0.01968947798013687
epoch: 2 step: 270, loss is 0.004359885584563017
epoch: 2 step: 271, loss is 0.011067138984799385
epoch: 2 step: 272, loss is 0.003340118331834674
epoch: 2 step: 273, loss is 0.009768541902303696
epoch: 2 step: 274, loss is 0.01201885286718607
epoch: 2 step: 275, loss is 0.017018459737300873
epoch: 2 step: 276, loss is 0.013010584749281406
epoch: 2 step: 277, loss is 0.007726775482296944
epoch: 2 step: 278, loss is 0.011000252328813076
epoch: 2 step: 279, loss is 0.005872378125786781
epoch: 2 step: 280, loss is 0.0070030055940151215
epoch: 2 step: 281, loss is 0.016099099069833755
epoch: 2 step: 282, loss is 0.0054572937078773975
epoch: 2 step: 283, loss is 0.06694652140140533
epoch: 2 step: 284, loss is 0.01008275430649519
epoch: 2 step: 285, loss is 0.015508952550590038
epoch: 2 step: 286, loss is 0.054250217974185944
epoch: 2 step: 287, los

epoch: 2 step: 434, loss is 0.007965873926877975
epoch: 2 step: 435, loss is 0.04077021777629852
epoch: 2 step: 436, loss is 0.008733784779906273
epoch: 2 step: 437, loss is 0.00455778744071722
epoch: 2 step: 438, loss is 0.008610434830188751
epoch: 2 step: 439, loss is 0.01322388369590044
epoch: 2 step: 440, loss is 0.007224894128739834
epoch: 2 step: 441, loss is 0.006335665471851826
epoch: 2 step: 442, loss is 0.00314535666257143
epoch: 2 step: 443, loss is 0.0038643288426101208
epoch: 2 step: 444, loss is 0.012292989529669285
epoch: 2 step: 445, loss is 0.0024204477667808533
epoch: 2 step: 446, loss is 0.006690592039376497
epoch: 2 step: 447, loss is 0.004598419182002544
epoch: 2 step: 448, loss is 0.004156787879765034
epoch: 2 step: 449, loss is 0.011935759335756302
epoch: 2 step: 450, loss is 0.002435788046568632
epoch: 2 step: 451, loss is 0.0037483624182641506
epoch: 2 step: 452, loss is 0.007442277390509844
epoch: 2 step: 453, loss is 0.006265407428145409
epoch: 2 step: 454, l

epoch: 3 step: 4, loss is 0.006485814228653908
epoch: 3 step: 5, loss is 0.005310133099555969
epoch: 3 step: 6, loss is 0.002869605552405119
epoch: 3 step: 7, loss is 0.0015485782641917467
epoch: 3 step: 8, loss is 0.0019053402356803417
epoch: 3 step: 9, loss is 0.0021884250454604626
epoch: 3 step: 10, loss is 0.0056383078917860985
epoch: 3 step: 11, loss is 0.005306471139192581
epoch: 3 step: 12, loss is 0.006432119756937027
epoch: 3 step: 13, loss is 0.006602785550057888
epoch: 3 step: 14, loss is 0.004483181983232498
epoch: 3 step: 15, loss is 0.0036771465092897415
epoch: 3 step: 16, loss is 0.0021197593305259943
epoch: 3 step: 17, loss is 0.0035662224981933832
epoch: 3 step: 18, loss is 0.0023990324698388577
epoch: 3 step: 19, loss is 0.0025927869137376547
epoch: 3 step: 20, loss is 0.005218863021582365
epoch: 3 step: 21, loss is 0.0009511513635516167
epoch: 3 step: 22, loss is 0.001463944325223565
epoch: 3 step: 23, loss is 0.002514293882995844
epoch: 3 step: 24, loss is 0.0008530

epoch: 3 step: 171, loss is 0.0005666576325893402
epoch: 3 step: 172, loss is 0.001007279846817255
epoch: 3 step: 173, loss is 0.00046625779941678047
epoch: 3 step: 174, loss is 0.001139873987995088
epoch: 3 step: 175, loss is 0.0010595516068860888
epoch: 3 step: 176, loss is 0.0038915914483368397
epoch: 3 step: 177, loss is 0.0017088211607187986
epoch: 3 step: 178, loss is 0.002161039039492607
epoch: 3 step: 179, loss is 0.009596004150807858
epoch: 3 step: 180, loss is 0.0008852881146594882
epoch: 3 step: 181, loss is 0.0008965842425823212
epoch: 3 step: 182, loss is 0.003184873377904296
epoch: 3 step: 183, loss is 0.0010544852120801806
epoch: 3 step: 184, loss is 0.0023334925062954426
epoch: 3 step: 185, loss is 0.000971881439909339
epoch: 3 step: 186, loss is 0.001969818025827408
epoch: 3 step: 187, loss is 0.00208727503195405
epoch: 3 step: 188, loss is 0.0008612776873633265
epoch: 3 step: 189, loss is 0.0012381526175886393
epoch: 3 step: 190, loss is 0.0038757063448429108
epoch: 3

epoch: 3 step: 336, loss is 0.0010563535615801811
epoch: 3 step: 337, loss is 0.0015026233159005642
epoch: 3 step: 338, loss is 0.0011576383840292692
epoch: 3 step: 339, loss is 0.0011942198034375906
epoch: 3 step: 340, loss is 0.006865449249744415
epoch: 3 step: 341, loss is 0.0005099551635794342
epoch: 3 step: 342, loss is 0.002866553608328104
epoch: 3 step: 343, loss is 0.0013312563532963395
epoch: 3 step: 344, loss is 0.0013960963115096092
epoch: 3 step: 345, loss is 0.0031881940085440874
epoch: 3 step: 346, loss is 0.0005423207185231149
epoch: 3 step: 347, loss is 0.0005054677021689713
epoch: 3 step: 348, loss is 0.0011283046333119273
epoch: 3 step: 349, loss is 0.0007520875660702586
epoch: 3 step: 350, loss is 0.0003019528812728822
epoch: 3 step: 351, loss is 0.0027607937809079885
epoch: 3 step: 352, loss is 0.0035061510279774666
epoch: 3 step: 353, loss is 0.004004169255495071
epoch: 3 step: 354, loss is 0.00048367702402174473
epoch: 3 step: 355, loss is 0.0004469208943191916
ep

epoch: 3 step: 500, loss is 0.0021120519377291203
epoch: 3 step: 501, loss is 0.0013812528923153877
epoch: 3 step: 502, loss is 0.0025696721859276295
epoch: 3 step: 503, loss is 0.0003185533278156072
epoch: 3 step: 504, loss is 0.00035687710624188185
epoch: 3 step: 505, loss is 0.0004920833162032068
epoch: 3 step: 506, loss is 0.0007473533623851836
epoch: 3 step: 507, loss is 0.0014447747962549329
epoch: 3 step: 508, loss is 0.000186846504220739
epoch: 3 step: 509, loss is 0.0005006248247809708
epoch: 3 step: 510, loss is 0.001926638768054545
epoch: 3 step: 511, loss is 0.0011603221064433455
epoch: 3 step: 512, loss is 0.0016511155990883708
epoch: 3 step: 513, loss is 0.000932565308175981
epoch: 3 step: 514, loss is 0.0012362822890281677
epoch: 3 step: 515, loss is 0.004725092556327581
epoch: 3 step: 516, loss is 0.001706699375063181
epoch: 3 step: 517, loss is 0.004799210466444492
epoch: 3 step: 518, loss is 0.0006164931692183018
epoch: 3 step: 519, loss is 0.0012729999143630266
epoch

epoch: 4 step: 69, loss is 0.0023381528444588184
epoch: 4 step: 70, loss is 0.00037186089321039617
epoch: 4 step: 71, loss is 0.0005707233212888241
epoch: 4 step: 72, loss is 0.0016999756917357445
epoch: 4 step: 73, loss is 0.00017220544395968318
epoch: 4 step: 74, loss is 0.0008891057223081589
epoch: 4 step: 75, loss is 0.0007738709100522101
epoch: 4 step: 76, loss is 0.0002724655787460506
epoch: 4 step: 77, loss is 0.0007066407706588507
epoch: 4 step: 78, loss is 0.0005547570763155818
epoch: 4 step: 79, loss is 0.0007810319075360894
epoch: 4 step: 80, loss is 0.0005122203147038817
epoch: 4 step: 81, loss is 0.0011805221438407898
epoch: 4 step: 82, loss is 0.00045754003804177046
epoch: 4 step: 83, loss is 0.0013748318888247013
epoch: 4 step: 84, loss is 0.000452680338639766
epoch: 4 step: 85, loss is 0.0005928942118771374
epoch: 4 step: 86, loss is 0.0007122340612113476
epoch: 4 step: 87, loss is 0.00021820818074047565
epoch: 4 step: 88, loss is 0.000401204772060737
epoch: 4 step: 89,

epoch: 4 step: 234, loss is 0.0010058479383587837
epoch: 4 step: 235, loss is 0.000656931078992784
epoch: 4 step: 236, loss is 0.0025092207361012697
epoch: 4 step: 237, loss is 0.0006589965196326375
epoch: 4 step: 238, loss is 0.0005778519553132355
epoch: 4 step: 239, loss is 0.0010367401409894228
epoch: 4 step: 240, loss is 0.0007606750587001443
epoch: 4 step: 241, loss is 0.000400731572881341
epoch: 4 step: 242, loss is 0.00647029047831893
epoch: 4 step: 243, loss is 0.0005277217132970691
epoch: 4 step: 244, loss is 0.000397468451410532
epoch: 4 step: 245, loss is 0.0002913043717853725
epoch: 4 step: 246, loss is 0.0006049717194400728
epoch: 4 step: 247, loss is 0.00020741153275594115
epoch: 4 step: 248, loss is 0.0004051247669849545
epoch: 4 step: 249, loss is 0.0008120929123833776
epoch: 4 step: 250, loss is 0.001063306350260973
epoch: 4 step: 251, loss is 0.0008193391840904951
epoch: 4 step: 252, loss is 0.0017520993715152144
epoch: 4 step: 253, loss is 0.0015182264614850283
epoch

epoch: 4 step: 398, loss is 0.0007388044614344835
epoch: 4 step: 399, loss is 0.0011332560097798705
epoch: 4 step: 400, loss is 0.0007275743409991264
epoch: 4 step: 401, loss is 0.0011585548054426908
epoch: 4 step: 402, loss is 0.0003401902795303613
epoch: 4 step: 403, loss is 0.00033393417834304273
epoch: 4 step: 404, loss is 0.004895361606031656
epoch: 4 step: 405, loss is 0.00027685397071763873
epoch: 4 step: 406, loss is 0.0005003921687602997
epoch: 4 step: 407, loss is 0.00031854771077632904
epoch: 4 step: 408, loss is 0.0005527150351554155
epoch: 4 step: 409, loss is 0.0007160268723964691
epoch: 4 step: 410, loss is 0.0003971375699620694
epoch: 4 step: 411, loss is 0.0004770146042574197
epoch: 4 step: 412, loss is 0.0005674067069776356
epoch: 4 step: 413, loss is 0.0010431731352582574
epoch: 4 step: 414, loss is 0.0003809751069638878
epoch: 4 step: 415, loss is 0.0002487364108674228
epoch: 4 step: 416, loss is 0.0008513011271134019
epoch: 4 step: 417, loss is 0.000744528137147426

epoch: 4 step: 562, loss is 0.00028150773141533136
epoch: 4 step: 563, loss is 0.0002577918639872223
epoch: 4 step: 564, loss is 0.0002096847165375948
epoch: 4 step: 565, loss is 0.000577963306568563
epoch: 4 step: 566, loss is 0.0005261434707790613
epoch: 4 step: 567, loss is 0.00018621466006152332
epoch: 4 step: 568, loss is 0.0006171207060106099
epoch: 4 step: 569, loss is 0.0003194824676029384
epoch: 4 step: 570, loss is 0.00014941765402909368
epoch: 4 step: 571, loss is 0.00024215105804614723
epoch: 4 step: 572, loss is 0.00029553190688602626
epoch: 4 step: 573, loss is 0.000577410391997546
epoch: 4 step: 574, loss is 0.0002791588776744902
epoch: 4 step: 575, loss is 0.0008016073843464255
epoch: 4 step: 576, loss is 0.0005238247686065733
epoch: 4 step: 577, loss is 0.00024396096705459058
epoch: 4 step: 578, loss is 0.0003717077779583633
epoch: 4 step: 579, loss is 0.0006567423115484416
epoch: 4 step: 580, loss is 0.00036913849180564284
epoch: 4 step: 581, loss is 0.000886503024958

In [33]:
def preprocess(sentence):
    # 将输入的句子转化为小写并去掉头尾的空格
    sentence = sentence.lower().strip()
    sentence = sentence.replace('\n','')\
                                    .replace('"','')\
                                    .replace('\'','')\
                                    .replace('.','')\
                                    .replace(',','')\
                                    .replace('[','')\
                                    .replace(']','')\
                                    .replace('(','')\
                                    .replace(')','')\
                                    .replace(':','')\
                                    .replace('--','')\
                                    .replace('-',' ')\
                                    .replace('\\','')\
                                    .replace('0','')\
                                    .replace('1','')\
                                    .replace('2','')\
                                    .replace('3','')\
                                    .replace('4','')\
                                    .replace('5','')\
                                    .replace('6','')\
                                    .replace('7','')\
                                    .replace('8','')\
                                    .replace('9','')\
                                    .replace('`','')\
                                    .replace('=','')\
                                    .replace('$','')\
                                    .replace('/','')\
                                    .replace('*','')\
                                    .replace(';','')\
                                    .replace('<b>','')\
                                    .replace('%','')\
                                    .replace("  "," ")
    # 将句子按空格分词
    sentence = sentence.split(' ')
    # 获取配置的句子最大长度
    maxlen = cfg.word_len
    # 初始化句子向量，长度为最大长度，所有元素都为0
    vector = [0]*maxlen
    # 遍历分词后的句子，如果词在词典中，则将词向量的对应位置设为该词在词典中的位置
    for index, word in enumerate(sentence):
        # 如果词的位置超过了最大长度，则退出循环
        if index >= maxlen:
            break
        # 如果词不在词典中，则打印警告信息
        if word not in instance.Vocab.keys():
            print(word,"单词未出现在字典中")
        # 如果词在词典中，则将词向量的对应位置设为该词在词典中的位置
        else:
            vector[index] = instance.Vocab[word]
    # 将处理好的句子向量返回
    sentence = vector

    return sentence

# 使用训练好的模型进行推理的函数
def inference(review_en):
    # 首先对输入的评论进行预处理，转化为词向量
    review_en = preprocess(review_en)
    # 将词向量转化为张量，并添加一个维度作为批处理的维度
    input_en = Tensor(np.array([review_en]).astype(np.int32))
    # 将输入的张量通过模型，得到输出结果
    output = net(input_en)
    # 对输出结果进行处理，如果输出结果的最大值所在的位置为1，则认为是正面评论，否则是负面评论
    if np.argmax(np.array(output[0])) == 1:
        print("Positive comments")
    else:
        print("Negative comments")


In [35]:
review_en = "the movie is so wonderful"
inference(review_en)

Positive comments
