In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
#import Variable
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy as copy
import torch.optim as optim
import random
import re
import os
import jieba
from tqdm import tqdm
from tqdm import trange
from sklearn.metrics import f1_score, classification_report
def seed_torch(seed=1122):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)  # 为了禁止hash随机化，使得实验可复现
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
BERT_PATH = 'bert-base-chinese/'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [9]:
class GetBERT(nn.Module):

    def __init__(self):
        super(GetBERT, self).__init__()
        self.bert_tokenizer = BertTokenizer.from_pretrained("chinese-bert-wwm-ext")
        self.bert = BertModel.from_pretrained("chinese-bert-wwm-ext")
        for param in self.bert.parameters():
            param.requires_grad = False

    def forward(self, sentence_lists):
        """
        输入句子列表(去掉了停用词的)
        """
        sentence_lists = [' '.join(x) for x in sentence_lists]
        # print('sentence_lists:'+str(sentence_lists))
        ids = self.bert_tokenizer(sentence_lists, padding=True, return_tensors="pt")
        # print('ids:'+str(ids))
        inputs = ids['input_ids']
        # print('inputs:'+str(inputs))

        embeddings = self.bert(inputs)
        # print(str(embeddings[0].shape))
        x = embeddings[0]  # 1 * 768
        # print(x.shape)
        return x


class Pre:
    def __init__(self, text):
        """
        输入一个文本
        """
        self.puncs_coarse = ['。', '!', '；', '？', '……', '\n', ' ']
        self.text = text
        self.stopwords = self.deal_wrap('dict/stop1205.txt')

    def segment(self, sentence):
        sentence_seged = jieba.cut(sentence.strip())
        outstr = ''
        for word in sentence_seged:
            if word not in self.stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        word_list = outstr.split(' ')
        pattern = '[A-Za-z]*[0-9]*[\'\"\%.\s\@\!\#\$\^\&\*\(\)\-\<\>\?\/\,\~\`\:\;]*[：；”“ ‘’+-——！，。？、~@#￥%……&*（）【】]*'
        t = [re.sub(pattern, "", x.strip()) for x in word_list]
        t = [x for x in t if x != '']
        return ''.join(t)

    def deal_wrap(self, filedict):
        temp = []
        for x in open(filedict, 'r', encoding='utf-8').readlines():
            temp.append(x.strip())
        return temp

    def split_sentence_coarse(self):
        """
        按照。！？“”等中文完整句子语义来分句
        1. 去除换行符、多余的空格、百分号
        2. 分句，存入列表
        :return:装着每个句子的列表（包括标点符号）
        """

        text = self.text
        sentences = []
        start = 0
        for i in range(len(text)):
            if text[i] in self.puncs_coarse:
                sentences.append(text[start:i + 1])
                start = i + 1
        if start == 0:
            sentences.append(text)
        return sentences

    def get_keywords(self, data):
        """
        如果句子太长，就进行关键词提取
        """
        from jieba import analyse
        textrank = analyse.textrank
        keywords = textrank(data, topK=8)
        return ''.join(keywords)

    def preprocess(self):
        # 分句
        sentences = self.split_sentence_coarse()
        # 对每个句子，去除里面的停用词，再连起来
        # 对每个句子，如果句子太长，长度大于20（我随便定的），就抽取八个关键词连起来
        new_sent = []
        for i in sentences:
            if len(i) < 5:
                new_sent.append(i)
                continue
            i = self.segment(i)
            if len(i) > 25:
                i = self.get_keywords(i)
            if i != '':
                new_sent.append(i)
        return new_sent


class GetData():
    def __init__(self, pos=4000, neg=3600):
        data = pd.read_csv('sentiment_classify_data/raw_comment_v2.csv')
        data = data[data['score'] != 3].reset_index()
        data['label'] = data['score'].map(lambda a: 1 if a in [4, 5] else 0)
        data.drop(['post_time','score','shop_url'],inplace=True,axis=1)
        
#         data = pd.read_excel('sentiment_classify_data/comments_raw_v1.xls')
#         data = data[data['score'] != 3].reset_index()
#         data['label'] = data['score'].map(lambda a: 1 if a in [4, 5] else 0)
#         data.drop(['id', 'score'], inplace=True, axis=1)
        
        data['content'] = [str(i) for i in list(data['content'])]
        # 原数据标签为0（负向情感）的数据有3632条，正向情感的有57262条
        data1 = data[data['label'] == 1].sample(pos)
        data0 = data[data['label'] == 0].sample(neg)
        data = pd.concat([data1, data0], axis=0, ignore_index=True)
        self.data = data

    def split_sen(self):
        x = []
        y = []
        for i in trange(len(self.data)):
            p = Pre(self.data['content'][i])
            sen_lst = p.preprocess()
            if sen_lst == []:
                continue
            x.append(sen_lst)
            y.append(self.data['label'][i])
        print(len(x))
        print(y.count(1))
        print(y.count(0))
        return x, y


class LSTM(nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm_layer = nn.LSTM(input_size=768, hidden_size=128, batch_first=True)
        self.linear_layer = nn.Linear(in_features=128, out_features=2, bias=True)

    def forward(self, x):
        out1, (h_n, h_c) = self.lstm_layer(x)
        a, b, c = h_n.shape
        out = self.linear_layer(h_n.reshape(a * b, c))
        out = F.log_softmax(out, dim=1)
        return out


def train_model(epoch, train_dataLoader, test_dataLoader):
    # 训练模型
    best_model = None
    train_loss = 0
    test_loss = 0
    best_loss = 100
    epoch_cnt = 0
    for _ in range(epoch):
        total_train_loss = 0
        total_train_num = 0
        total_test_loss = 0
        total_test_num = 0
        for x, y in tqdm(train_dataLoader,
                         desc='Epoch: {}| Train Loss: {}| Test Loss: {}'.format(_, train_loss, test_loss)):
            # for x, y in train_dataLoader:
            x_num = len(x)
            p = model(x)
            loss = loss_func(p, y.long())
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            total_train_loss += float(loss.item())
            total_train_num += x_num
        train_loss = total_train_loss / total_train_num
        train_loss_list.append(train_loss)
        for x, y in test_dataLoader:
            x_num = len(x)
            p = model(x)
            loss = loss_func(p, y.long())
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            total_test_loss += float(loss.item())
            total_test_num += x_num
        test_loss = total_test_loss / total_test_num
        test_loss_list.append(test_loss)

        # early stop
        if best_loss > test_loss:
            best_loss = test_loss
            best_model = copy(model)
            torch.save(best_model.state_dict(), 'textcnn.pth')
            epoch_cnt = 0
        else:
            epoch_cnt += 1

        if epoch_cnt > early_stop:
            torch.save(best_model.state_dict(), 'textcnn.pth')
            print("保存模型")
            # print(best_model.state_dict())
            break
def test_model(test_dataLoader_):
    pred = []
    label = []
    model_.load_state_dict(torch.load("textcnn.pth"))
    model_.eval()
    total_test_loss = 0
    total_test_num = 0
    for x, y in test_dataLoader_:
        x_num = len(x)
        p = model_(x)
#         print('##', len(p), len(y))
        loss = loss_func(p, y.long())
        total_test_loss += loss.item()
        total_test_num += x_num
        pred.extend(p.data.squeeze(1).tolist())
        label.extend(y.tolist())
    test_loss = total_test_loss / total_test_num
    # print('##', len(pred), len(label))
    return pred, label, test_loss, test_loss_list

In [10]:
use_cuda = torch.cuda.is_available()

def matrix_mul(inputs, weight, bias=None):
#     print('inputs:'+str(inputs.shape))#torch.Size([20, 12, 32])
#     print('weight'+str(weight.shape))# 32 * 1500
    feature_list = []
    for input in inputs:
#         print('input'+str(input.shape)) # torch.Size([12, 32])
        feature = torch.mm(input, weight)  # (T, C)*(C, A) = (T, A)
        if isinstance(bias, torch.nn.parameter.Parameter):
            feature = feature + bias.expand(feature.size()[0], bias.size()[1])
        feature = torch.tanh(feature)
        feature_list.append(feature)
    return torch.stack(feature_list, 0).squeeze()  # (B, T)


def wise_mul(inputs, alphas):
    feature_list = []
    for sequence, alpha in zip(inputs, alphas):
        alpha = alpha.unsqueeze(1)
        feature = sequence * alpha
        feature_list.append(feature)
    output = torch.stack(feature_list, 0)
    return torch.sum(output, 1)


def attention(inputs, attention_size):
    """
    :param inputs: (batch_size, time_steps, hidden_size)
    """
#     print('attention inputs:'+str(inputs.shape)) #torch.Size([20, 12, 32])
    hidden_size = inputs.shape[2]
    w = nn.Parameter(torch.randn(hidden_size, attention_size)) # 32 1500
    b = nn.Parameter(torch.randn(1, attention_size)) # 1 1500 bias
    u = nn.Parameter(torch.randn(attention_size, 1)) # 1500 1

    if use_cuda:
        w = w.cuda()
        b = b.cuda()
        u = u.cuda()

    v = matrix_mul(inputs, w, b)       # (B, T, A) 32 12 1500
    u_v = matrix_mul(v,u)              # (B, T) 32 12 1
    alphas = F.softmax(u_v)            # (B, T)
    output = wise_mul(inputs, alphas)  # (B, H)
    return output

In [11]:
class TextCNN(nn.Module):
    def __init__(self,word_embedding_dimension=768,filters=32,kernel_size=[1,2,3,4]):
        super(TextCNN, self).__init__()

        self.conv0 = nn.Conv1d(in_channels=word_embedding_dimension,
                               out_channels=filters,
                               kernel_size=kernel_size[0])

        self.conv1 = nn.Conv1d(in_channels=word_embedding_dimension,
                               out_channels=filters,
                               kernel_size=kernel_size[1])

        self.conv2 = nn.Conv1d(in_channels=word_embedding_dimension,
                               out_channels=filters,
                               kernel_size=kernel_size[2])

        self.conv3 = nn.Conv1d(in_channels=word_embedding_dimension,
                               out_channels=filters,
                               kernel_size=kernel_size[3])

        self.linear = nn.Linear(in_features=32, out_features=2)

        self.batch_0 = nn.BatchNorm1d(num_features=12)
        self.batch_1 = nn.BatchNorm1d(num_features=32)

    def k_max_pooling(self, x, dim=2, k=3):
        index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
        return x.gather(dim, index)

    def forward(self, x):
#         print('-----------forward----------------')
#         print(x.shape)#torch.Size([20, 122, 768])
        batch_size = x.size(0)
        embeddings = x.permute(0, 2, 1)
#         print(embeddings.shape)#torch.Size([20, 768, 122])
        x0 = self.conv0(embeddings)
        x0 = F.relu(x0)
        x0 = self.k_max_pooling(x0)
#         print(x0.shape)#torch.Size([20, 32, 3])
        x1 = self.conv1(embeddings)
        x1 = F.relu(x1)
        x1 = self.k_max_pooling(x1)
#         print(x1.shape)#torch.Size([20, 32, 3])
        x2 = self.conv2(embeddings)
        x2 = F.relu(x2)
        x2 = self.k_max_pooling(x2)
#         print(x2.shape)#torch.Size([20, 32, 3])
        x3 = self.conv3(embeddings)
        x3 = F.relu(x3)
        x3 = self.k_max_pooling(x3)
#         print(x3.shape)#torch.Size([20, 32, 3])
        x = torch.cat((x0, x1, x2, x3), dim=2).permute(0, 2, 1)
#         print(x.shape)#torch.Size([20, 12, 32])
        x = self.batch_0(x)
#         print(x.shape)#torch.Size([20, 12, 32])
        x = attention(x, 1500)
        x = x.view(batch_size, -1)
        x = self.batch_1(x)
        x = F.dropout(x, p=0.5, training=self.training)
        output = self.linear(x)
        return output

In [12]:
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
seed_torch(22)
epoch = 5
batch_size = 1
early_stop = 3
test_loss_list = []
train_loss_list = []

model = TextCNN()
model_ = TextCNN()

if use_cuda:
    model = model.cuda()
    model_ = model_.cuda()


# 数据处理部分
gd = GetData(pos=5000, neg=5000)
x, y = gd.split_sen()
pos = y.count(1)
neg = y.count(0)
pos_train = int(pos*0.9)
neg_train = int(neg*0.9)
x1 = x[:pos]  # 3988   --- 3589train 399test
y1 = y[:pos]
x0 = x[pos:]  # 3589   ---- 3230train 359test
y0 = y[pos:]

train_x = x0[:neg_train] + x1[:pos_train]
train_y = y0[:neg_train] + y1[:pos_train]
print("训练集有"+str(len(train_x))+"个数据")

# c = list(zip(train_x, train_y))
# random.shuffle(c)
# c = random.sample(c, 50)
# train_x[:], train_y[:] = zip(*c)

test_x = x0[neg_train:] + x1[pos_train:]
test_y = y0[neg_train:] + y1[pos_train:]
print("测试集有"+str(len(test_x))+"个数据")

bert = GetBERT()
x_train = bert(train_x)
x_test = bert(test_x)
y_train = torch.tensor(train_y).float()
y_test = torch.tensor(test_y).float()

if use_cuda:
    x_train = x_train.cuda()
    x_test = x_test.cuda()
    y_train = y_train.cuda()
    y_test = y_test.cuda()


train_data = TensorDataset(x_train, y_train)
train_dataLoader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(x_test, y_test)
test_dataLoader = DataLoader(test_data, batch_size=batch_size)

# 损失函数和优化器
loss_func = nn.CrossEntropyLoss()
if use_cuda:
    loss_func = loss_func.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.0001)
train_model(epoch, train_dataLoader, test_dataLoader)
p, y, test_loss, test_loss_list = test_model(test_dataLoader)
ans = []
for t in p:
    if t[0]<t[1]:
        ans.append(0)
    else:
        ans.append(1)
print(accuracy_score(ans,y))

100%|██████████| 10000/10000 [00:15<00:00, 654.20it/s]


9971
4988
4983
训练集有8973个数据
测试集有998个数据


Epoch: 0| Train Loss: 0| Test Loss: 0:   0%|          | 0/8973 [00:00<?, ?it/s]


RuntimeError: self must be a matrix

In [None]:
plt.xlabel('Epoch', fontsize=18)
plt.ylabel('Loss', fontsize=18)
plt.plot(test_loss_list)
plt.plot(train_loss_list)
plt.legend(["test","train"])