In [152]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd 
import numpy as np
from gensim.models import Word2Vec
import jieba

In [153]:
class LSTM_model( nn.Module ):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the final linear layer.
    """

    def __init__( self, input_size, hidden_size, num_layers, output_size ):
        super().__init__()
        # batch_first = True: inputs are laid out as ( batch, seq_len, input_size )
        self.lstm = nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
        )
        self.fc = nn.Linear( in_features = hidden_size, out_features = output_size )

    def forward( self, x ):
        """Run the LSTM over ``x`` and project the final time step through ``fc``."""
        sequence_out, _state = self.lstm( x )
        # Keep only the output of the last position in the sequence
        last_step = sequence_out[ :, -1 ]
        return self.fc( last_step )

In [154]:
def load_tsv( path ):
    """Read a tab-separated file and return two of its columns.

    Args:
        path: Location of the TSV file; its first row is used as the header.

    Returns:
        A ``( data_x, data_y )`` tuple of pandas Series: ``data_x`` is the last
        column and ``data_y`` is the column at position 1.
        NOTE(review): presumably these are the text and the label - confirm
        against the data files.
    """
    frame = pd.read_csv( path, sep = '\t' )
    last_column = frame.iloc[ :, -1 ]
    second_column = frame.iloc[ :, 1 ]
    return last_column, second_column

In [155]:
def load_txt( path ):
    """Read a UTF-8 text file into a list of single-element lists.

    Args:
        path: Location of the text file.

    Returns:
        One ``[ stripped_line ]`` entry per line of the file, in file order.
    """
    rows = []
    with open( path, 'r', encoding = 'utf-8' ) as handle:
        for raw_line in handle:
            # Each line is wrapped in its own list after trimming whitespace
            rows.append( [ raw_line.strip() ] )
    return rows

In [156]:
# Load the raw text of both splits; every entry is a one-element list [ line ]
train_x = load_txt( '../data/train.txt' )
test_x = load_txt( '../data/test.txt' )

# Train and test lines are pooled into one corpus
train = train_x + test_x

# Flatten the one-element lists so x_all holds the line strings themselves
x_all = [ i for x in train for i in x ]

文本转向量

In [157]:
# Only the second value returned by load_tsv (the column at position 1) is kept
train_y = load_tsv( '../data/train.tsv' )[ 1 ]
test_y = load_tsv( '../data/test.tsv' )[ 1 ]

# NOTE(review): x_all holds whole-line strings rather than token lists, so
# Word2Vec presumably treats every character as a token - confirm this is intended.
word2vec_model = Word2Vec(
    sentences = x_all,
    vector_size = 100,
    window = 5,
    min_count = 1,
    workers = 4,
)

In [158]:
def text_to_vector( text, model = None ):
    """Average the embedding vectors of the tokens found in ``text``.

    Args:
        text: Iterable of tokens. A plain ``str`` is iterated one character
            at a time.
        model: Object exposing ``.wv`` (supporting ``in`` and ``[]`` lookups)
            and ``.vector_size``. Defaults to the module-level
            ``word2vec_model``.

    Returns:
        A 1-D ``float32`` array of length ``model.vector_size``: the mean of the
        vectors of all tokens present in ``model.wv``, or all zeros when no
        token is present (including empty input).
    """
    if model is None:
        model = word2vec_model

    # Tokens missing from the vocabulary are skipped rather than raising
    vectors = [ model.wv[ token ] for token in text if token in model.wv ]

    if not vectors:
        # Same dtype as the averaged branch so callers always get float32
        return np.zeros( model.vector_size, dtype = np.float32 )
    return np.asarray( np.mean( vectors, axis = 0 ), dtype = np.float32 )

In [159]:
# Turn every line of the training and test sets into one averaged vector.
# Each vector is wrapped in a list, so the stacked arrays get a length-1 middle axis.
x_train_w2v = [ [ text_to_vector( text ) ] for line in train_x for text in line ]
x_test_w2v = [ [ text_to_vector( text ) ] for line in test_x for text in line ]

# Stack the vectors into float32 arrays and convert them to PyTorch tensors
train_array = np.array( x_train_w2v, dtype = np.float32 )
test_array = np.array( x_test_w2v, dtype = np.float32 )
x_train_tensor = torch.Tensor( train_array )
x_test_tensor = torch.Tensor( test_array )

# Pair the features with their integer labels
train_dataset = TensorDataset( x_train_tensor, torch.LongTensor( train_y ) )
test_dataset = TensorDataset( x_test_tensor, torch.LongTensor( test_y ) )

# Batch both datasets for the training and evaluation loops
train_dataloader = DataLoader( train_dataset, batch_size = 32, shuffle = True )
test_dataloader = DataLoader( test_dataset, batch_size = 32, shuffle = True )

In [160]:
# Model hyper-parameters and instantiation
input_size = word2vec_model.vector_size
hidden_size = 50
num_layers = 2
output_size = 2                                         # binary sentiment classification
num_epochs = 10

model = LSTM_model( input_size, hidden_size, num_layers, output_size )
loss_f = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam( model.parameters(), lr = 0.0002 )

# Lowest mean epoch loss seen so far; the checkpoint is rewritten only when it improves.
loss_min = float( 'inf' )
for epoch in range( num_epochs ):
    model.train()
    epoch_loss = 0.0
    for i, ( data, target ) in enumerate( train_dataloader ):
        outputs = model( data )
        loss = loss_f( outputs, target )
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        if i % 100 == 0:
            print( 'Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format( epoch + 1, num_epochs, i, len( train_dataloader ), batch_loss ) )

    # Checkpoint on the mean of the per-batch losses: a single mini-batch loss is
    # too noisy to pick a "best" model and would trigger disk writes inside the
    # inner loop.
    mean_loss = epoch_loss / max( len( train_dataloader ), 1 )
    if mean_loss < loss_min:
        loss_min = mean_loss
        # The whole module is pickled (not just a state_dict) because the
        # inference cell restores it with a bare torch.load.
        torch.save( model, '../data/LSTM_model.pth' )

Epoch [1/10], Batch [0/1772], Loss: 0.6947
Epoch [1/10], Batch [100/1772], Loss: 0.6585
Epoch [1/10], Batch [200/1772], Loss: 0.4965
Epoch [1/10], Batch [300/1772], Loss: 0.3257
Epoch [1/10], Batch [400/1772], Loss: 0.2842
Epoch [1/10], Batch [500/1772], Loss: 0.1308
Epoch [1/10], Batch [600/1772], Loss: 0.3537
Epoch [1/10], Batch [700/1772], Loss: 0.2484
Epoch [1/10], Batch [800/1772], Loss: 0.3437
Epoch [1/10], Batch [900/1772], Loss: 0.2028
Epoch [1/10], Batch [1000/1772], Loss: 0.1780
Epoch [1/10], Batch [1100/1772], Loss: 0.2822
Epoch [1/10], Batch [1200/1772], Loss: 0.3636
Epoch [1/10], Batch [1300/1772], Loss: 0.3043
Epoch [1/10], Batch [1400/1772], Loss: 0.1185
Epoch [1/10], Batch [1500/1772], Loss: 0.0941
Epoch [1/10], Batch [1600/1772], Loss: 0.2111
Epoch [1/10], Batch [1700/1772], Loss: 0.2013
Epoch [2/10], Batch [0/1772], Loss: 0.2447
Epoch [2/10], Batch [100/1772], Loss: 0.3135
Epoch [2/10], Batch [200/1772], Loss: 0.2884
Epoch [2/10], Batch [300/1772], Loss: 0.4718
Epoch 

In [161]:
# Measure classification accuracy on the test batches without tracking gradients
model.eval()
r = 0                                                   # correctly classified samples
total = 0                                               # samples seen
with torch.no_grad():
    for data, target in test_dataloader:
        outputs = model( data )
        # Index of the largest output per row is the predicted class
        predicted = torch.max( outputs, dim = 1 ).indices
        total += target.shape[ 0 ]
        r += ( predicted == target ).sum().item()

acc = r / total
print( 'Test Accuracy: {:.2%}'.format( acc ) )

Test Accuracy: 90.10%


测试

In [165]:
def drop_stopword( datas, stopword_path = '../data/cn_stopwords.txt' ):
    """Remove stop words and meaningless symbols from a token sequence.

    Args:
        datas: Iterable of tokens. A plain ``str`` is iterated one character
            at a time.
        stopword_path: UTF-8 text file with one stop word per line; each line
            is stripped of surrounding whitespace. The file is re-read on
            every call.

    Returns:
        A list with the tokens of ``datas`` that are not stop words, in their
        original order.
    """
    with open( stopword_path, 'r', encoding = 'UTF8' ) as f:
        # A set gives constant-time membership tests instead of a list scan per token
        stop_words = { line.strip() for line in f }
    return [ token for token in datas if token not in stop_words ]



In [163]:
def input_text_process( text ):
    """Segment ``text`` with ``jieba.cut`` and filter the result with ``drop_stopword``.

    Args:
        text: The raw string to segment.

    Returns:
        The list returned by ``drop_stopword`` for the segmented tokens.
    """
    tokens = list( jieba.cut( text ) )
    return drop_stopword( tokens )

In [172]:
input_text = '非常好'
# input_text = '糟糕透了'
# NOTE(review): a raw str is passed on, so drop_stopword and text_to_vector see
# one character at a time and input_text_process (jieba) is not used here -
# confirm this matches how word2vec_model was trained.
input_text = drop_stopword( input_text )
# Wrap twice so the array has the same 3 axes as the training tensors
input_text = [ [ text_to_vector( input_text ) ] ]


# The checkpoint is a fully pickled module, so weights_only must be False
# (PyTorch >= 2.6 defaults to True and refuses such files). Unpickling can
# execute arbitrary code: only load checkpoints you created yourself.
model = torch.load( '../data/LSTM_model.pth', weights_only = False )
model.eval()                                            # inference mode for the restored module
# NOTE(review): assumes class 0 is negative and class 1 is positive - confirm against the label column.
label = [ 'negative', 'positive' ]
input_tensor = torch.Tensor( np.array( input_text, dtype = np.float32 ) )

with torch.no_grad():
    output = model( input_tensor )
# Single sample, so the flat argmax is the predicted class index
pclass = label[ torch.argmax( output ).item() ]
print( f'预测结果：{pclass}' )
    

预测结果：positive
