In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from copy import deepcopy as copy
from tqdm import tqdm
import torch.optim as optim
import random
import re
import os
import jieba
def seed_torch(seed=1122):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

AttributeError: 'Version' object has no attribute 'major'

In [3]:
BERT_PATH = 'bert-base-chinese/'

In [4]:
class GetBERT(nn.Module):

    def __init__(self):
        super(GetBERT, self).__init__()
        self.bert_tokenizer = BertTokenizer.from_pretrained("C:/Users/12968/Desktop/chinese-bert-wwm-ext")
        self.bert = BertModel.from_pretrained("C:/Users/12968/Desktop/chinese-bert-wwm-ext")
        for param in self.bert.parameters():
            param.requires_grad = True
        
    def forward(self, sentence_lists):
        """
        输入句子列表(去掉了停用词的)
        """
        sentence_lists = [' '.join(x) for x in sentence_lists]
        #print('sentence_lists:'+str(sentence_lists))
        ids = self.bert_tokenizer(sentence_lists, padding=True, return_tensors="pt")
        #print('ids:'+str(ids))
        inputs = ids['input_ids']
        #print('inputs:'+str(inputs))

        embeddings = self.bert(inputs)
        #print(str(embeddings[0].shape))
        x = embeddings[0] #1 * 768
        #print(x.shape)
        return x
    

In [5]:
class Pre:
    def __init__(self, text):
        """
        输入一个文本
        """
        self.puncs_coarse = ['。', '!', '；', '？', '……', '\n',' ']
        self.text = text
        self.stopwords = self.deal_wrap('dict/stop1205.txt')
    
    def segment(self, sentence):
        sentence_seged = jieba.cut(sentence.strip())
        outstr = ''
        for word in sentence_seged:
            if word not in self.stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        word_list = outstr.split(' ')
        pattern = '[A-Za-z]*[0-9]*[\'\"\%.\s\@\!\#\$\^\&\*\(\)\-\<\>\?\/\,\~\`\:\;]*[：；”“ ‘’+-——！，。？、~@#￥%……&*（）【】]*'
        t = [re.sub(pattern, "", x.strip()) for x in word_list]
        t = [x for x in t if x != '']
        return ''.join(t)
    
    def deal_wrap(self, filedict):
        temp = []
        for x in open(filedict, 'r', encoding='utf-8').readlines():
            temp.append(x.strip())
        return temp
        
    def split_sentence_coarse(self):
        """
        按照。！？“”等中文完整句子语义来分句
        1. 去除换行符、多余的空格、百分号
        2. 分句，存入列表
        :return:装着每个句子的列表（包括标点符号）
        """
        
        text = self.text
        sentences = []
        start = 0
        for i in range(len(text)):
            if text[i] in self.puncs_coarse:
                sentences.append(text[start:i + 1])
                start = i + 1
        if start == 0:
            sentences.append(text)
        return sentences
    
    def get_keywords(self, data):
        """
        如果句子太长，就进行关键词提取
        """
        from jieba import analyse
        textrank = analyse.textrank
        keywords = textrank(data, topK=8)
        return ''.join(keywords)

    def preprocess(self):
        # 分句
        sentences = self.split_sentence_coarse()
        # 对每个句子，去除里面的停用词，再连起来
        # 对每个句子，如果句子太长，长度大于20（我随便定的），就抽取八个关键词连起来
        new_sent = []
        for i in sentences:
            if len(i) < 5:
                new_sent.append(i)
                continue
            i = self.segment(i)
            if len(i) > 25:
                i = self.get_keywords(i)
            if i != '':
                new_sent.append(i)
        return new_sent

In [6]:
class GetData():
    def __init__(self,pos=4000, neg=3600):
        data = pd.read_excel('E:/FinancialIntelligence/sentiment_classify_data/comments_raw_v1.xls')
        data = data[data['score']!=3].reset_index()
        data['label'] = data['score'].map(lambda a : 1 if a in [4,5] else 0) 
        data.drop(['id','score'],inplace=True,axis=1)
        data['content'] = [str(i) for i in list(data['content'])]
        # 原数据标签为0（负向情感）的数据有3632条，正向情感的有57262条
        data1 = data[data['label']==1].sample(pos)
        data0 = data[data['label']==0].sample(neg)
        data = pd.concat([data1,data0],axis=0,ignore_index=True)
        self.data = data
    
    def split_sen(self):
        x = []
        y = []
        for i in trange(len(self.data)):
            p = Pre(self.data['content'][i])
            sen_lst = p.preprocess()
            if sen_lst == []:
                continue
            x.append(sen_lst)
            y.append(self.data['label'][i])
        print(len(x))
        print(y.count(1))
        print(y.count(0))
        return x, y

In [7]:
class LSTM(nn.Module):
    
    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm_layer = nn.LSTM(input_size=768, hidden_size=128, batch_first=True)
        self.linear_layer = nn.Linear(in_features=128, out_features=2, bias=True)
        
    def forward(self, x):
        out1, (h_n, h_c) = self.lstm_layer(x)
        a, b, c = h_n.shape
        out = self.linear_layer(h_n.reshape(a*b, c))
        out = F.log_softmax(out,dim=1)
        return out

In [8]:
def train_model(epoch, train_dataLoader, test_dataLoader):
    # 训练模型
    best_model = None
    train_loss = 0
    test_loss = 0
    best_loss = 100
    epoch_cnt = 0
    for _ in range(epoch):
        total_train_loss = 0
        total_train_num = 0
        total_test_loss = 0
        total_test_num = 0
        for x, y in tqdm(train_dataLoader,
                         desc='Epoch: {}| Train Loss: {}| Test Loss: {}'.format(_, train_loss, test_loss)):
        #for x, y in train_dataLoader:
            x_num = len(x)
            p = model(x)
            loss = loss_func(p, y.long())
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            total_train_loss += loss.item()
            total_train_num += x_num
        train_loss = total_train_loss / total_train_num
        train_loss_list.append(train_loss)
        for x, y in test_dataLoader:
            x_num = len(x)
            p = model(x)
            loss = loss_func(p, y.long())
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
            total_test_loss += loss.item()
            total_test_num += x_num
        test_loss = total_test_loss / total_test_num
        test_loss_list.append(test_loss)
        
        # early stop
        if best_loss > test_loss:
            best_loss = test_loss
            best_model = copy(model)
            torch.save(best_model.state_dict(), 'lstm_.pth')
            epoch_cnt = 0
        else:
            epoch_cnt += 1
            
        if epoch_cnt > early_stop:
            torch.save(best_model.state_dict(), 'lstm_.pth')
            print("保存模型")
            #print(best_model.state_dict())
            break

In [9]:
def test_model(test_dataLoader_):
    pred = []
    label = []
    model_.load_state_dict(torch.load("lstm_.pth"))
    model_.eval()
    total_test_loss = 0
    total_test_num = 0
    for x, y in test_dataLoader_:
        x_num = len(x)
        p = model_(x)
#         print('##', len(p), len(y))
        loss = loss_func(p, y.long())
        total_test_loss += loss.item()
        total_test_num += x_num
        pred.extend(p.data.squeeze(1).tolist())
        label.extend(y.tolist())
    test_loss = total_test_loss / total_test_num
    # print('##', len(pred), len(label))
    return pred, label, test_loss, test_loss_list

In [12]:
seed_torch(22)
epoch = 10
batch_size = 20
early_stop = 5
test_loss_list = []
train_loss_list = []

# 初始化模型
model = LSTM()
model_ = LSTM()

# 数据处理部分
gd = GetData()
x, y = gd.split_sen()
pos = y.count(1)
neg = y.count(0)
pos_train = int(pos*0.9)
neg_train = int(neg*0.9)
x1 = x[:pos]  # 3988   --- 3589train 399test
y1 = y[:pos]
x0 = x[pos:]  # 3589   ---- 3230train 359test
y0 = y[pos:]

train_x = x0[:neg_train] + x1[:pos_train]
train_y = y0[:neg_train] + y1[:pos_train]
print("训练集有"+str(len(train_x))+"个数据")

# c = list(zip(train_x, train_y))
# random.shuffle(c)
# c = random.sample(c, 50)
# train_x[:], train_y[:] = zip(*c)

test_x = x0[neg_train:] + x1[pos_train:]
test_y = y0[neg_train:] + y1[pos_train:]
print("测试集有"+str(len(test_x))+"个数据")

bert = GetBERT()
x_train = bert(train_x)
x_test = bert(test_x)
y_train = torch.tensor(train_y).float()
y_test = torch.tensor(test_y).float()
train_data = TensorDataset(x_train, y_train)
train_dataLoader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(x_test, y_test)
test_dataLoader = DataLoader(test_data, batch_size=batch_size)

# 损失函数和优化器
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_model(epoch, train_dataLoader, test_dataLoader)
p, y, test_loss, test_loss_list = test_model(test_dataLoader)
ans = []
for t in p:
    if t[0]>t[1]:
        ans.append(0)
    else:
        ans.append(1)
print(accuracy_score(ans,y))


  0%|                                                      | 0/7600 [00:00<?, ?it/s]
  2%|▊                                         | 157/7600 [00:00<00:04, 1558.71it/s]
  4%|█▋                                        | 304/7600 [00:00<00:04, 1527.33it/s]
  6%|██▌                                       | 455/7600 [00:00<00:04, 1518.74it/s]
  8%|███▎                                      | 608/7600 [00:00<00:04, 1518.80it/s]
 10%|████▎                                     | 770/7600 [00:00<00:04, 1544.03it/s]
 12%|████▉                                     | 899/7600 [00:00<00:05, 1332.92it/s]
 13%|█████▌                                   | 1020/7600 [00:00<00:05, 1242.88it/s]
 15%|██████▏                                  | 1137/7600 [00:00<00:05, 1082.50it/s]
 16%|██████▋                                  | 1244/7600 [00:01<00:06, 1032.83it/s]
 18%|███████▎                                 | 1352/7600 [00:01<00:05, 1044.33it/s]
 19%|███████▊                                 | 1456/7600 [00:01

7577
3988
3589
训练集有400个数据
训练集有100个数据



Epoch: 0| Train Loss: 0| Test Loss: 0:   0%|                 | 0/20 [00:00<?, ?it/s]
Epoch: 0| Train Loss: 0| Test Loss: 0:   5%|▍        | 1/20 [00:20<06:31, 20.60s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  10%|▉        | 2/20 [00:39<06:02, 20.12s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  15%|█▎       | 3/20 [00:55<05:18, 18.74s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  20%|█▊       | 4/20 [01:10<04:42, 17.63s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  25%|██▎      | 5/20 [01:27<04:21, 17.45s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  30%|██▋      | 6/20 [01:42<03:55, 16.80s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  35%|███▏     | 7/20 [01:57<03:30, 16.15s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  40%|███▌     | 8/20 [02:12<03:10, 15.84s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  45%|████     | 9/20 [02:28<02:56, 16.07s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  50%|████    | 10/20 [02:43<02:35, 15.54s/it]
Epoch: 0| Train Loss: 0| Test Loss: 0:  55%|████▍   | 11/20 [02:

Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:   5%| | 1/20 [00:14<04:40, 14.76s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  10%| | 2/20 [00:29<04:23, 14.65s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  15%|▏| 3/20 [00:44<04:10, 14.74s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  20%|▏| 4/20 [00:58<03:55, 14.71s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  25%|▎| 5/20 [01:13<03:39, 14.66s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  30%|▎| 6/20 [01:27<03:24, 14.63s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  35%|▎| 7/20 [01:42<03:10, 14.66s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  40%|▍| 8/20 [01:57<02:57, 14.77s/it]
Epoch: 7| Train Loss: 0.030436553582549094| Test Loss: 0.021654999256134032:  45

0.9


In [44]:
model = GetBERT()
x = model([['味道好饭量足'],['喜欢烤羊肉']])
lstm = LSTM()
lstm(x)

tensor([[ 0.0785],
        [-0.0127]], grad_fn=<AddmmBackward>)

In [14]:
bert = GetBERT()
model = LSTM()
model.load_state_dict(torch.load("lstm_.pth"))
model.eval()
for t in model(bert(['实在是太难吃了'])):
    if t[0]>t[1]:
        print("消极")
    else:
        print("积极")

消极
