In [1]:
import pandas as pd
import numpy as np
import torchvision as tv
import jieba
from gensim import models
from gensim.models import Word2Vec
import torchvision.transforms as transforms
from torchvision.transforms import ToPILImage
import torch as t
show = ToPILImage() # Converts a Tensor into a PIL Image, which is convenient for visualisation
from sklearn.model_selection import train_test_split

In [None]:
# Read the training table; missing cells are replaced by empty strings so the
# text columns can be handled uniformly below.
data = pd.read_csv('../../data/df_sen_sub/train.csv').fillna('')

# Distinct values of the label column (not referenced again in the visible cells).
senti = data['sentiment_value'].unique()

# Plain Python lists of the review text and of the sentiment-word column.
content = list(data['content'])
senti_word = list(data['sentiment_word'])

# `str * int` repeats the string: every sentiment-word entry is concatenated
# with itself len(content) times (an empty entry stays empty).
# NOTE(review): repeating by the number of rows produces very long strings;
# presumably this is meant to fill the fixed-length sequence built later with
# the sentiment word - confirm that this repeat count is intended.
senti_words = [word * len(content) for word in senti_word]

In [None]:
# Dump the corpus that Word2Vec is trained on: all review texts first, then all
# sentiment-word strings, exactly one entry per line. Later cells rely on line
# k of this file belonging to entry k, so:
#   * line breaks inside an entry are replaced by spaces (they would otherwise
#     split one entry into several lines and shift every following row);
#   * every entry, including the last one, is terminated by '\n' (without the
#     final newline an empty last entry would not be produced on read-back).
# Entries are written one by one instead of joining everything into a single
# huge string first.
# NOTE(review): no encoding is passed, so the platform default is used; the
# reader must open the file the same way.
with open('../../data/df_sen_sub/add_text.txt','w') as f:
    for entry in list(content) + list(senti_words):
        f.write(str(entry).replace('\r', ' ').replace('\n', ' ') + '\n')
# Iterable corpus: splits every line of a text file into words with jieba.
class Sentences(object):
    """Re-iterable stream of tokenised sentences read from a text file.

    The file is expected to hold one sentence per line. Each iteration
    re-opens the file, so the object can be consumed several times.

    Args:
        dirname: Path of the text file (the name is historical; it is a file
            path, not a directory).
        encoding: Text encoding passed to ``open``. ``None`` (default) keeps
            the platform default, which is what the original code used.
        tokenizer: Optional callable mapping a string to an iterable of
            tokens. ``None`` (default) uses ``jieba.cut``.

    Yields:
        list: the tokens of one line, with ``'\\n'`` characters removed first.
        An empty line yields whatever the tokenizer returns for ``''``.
    """

    def __init__(self, dirname, encoding=None, tokenizer=None):
        self.dirname = dirname
        self.encoding = encoding
        self.tokenizer = tokenizer
        # A custom jieba dictionary, if needed, can be loaded with
        # jieba.load_userdict(...) before iterating.

    def __iter__(self):
        # Resolve the tokenizer lazily so jieba is only touched when it is
        # actually the one being used.
        cut = self.tokenizer if self.tokenizer is not None else jieba.cut
        # The context manager closes the handle once the generator is
        # exhausted or discarded; the original left the file open.
        with open(self.dirname, encoding=self.encoding) as corpus:
            for line in corpus:
                line = line.replace('\n', '')
                yield list(cut(line))

sentences = []
def train_word2vec(folder_path, size=100,
                   save_path="../../data/df_sen_sub/add_word2vec_model"):
    """Train a Word2Vec model on a one-sentence-per-line text file and save it.

    Side effect: rebinds the module-level ``sentences`` to the ``Sentences``
    iterable built for ``folder_path``; later cells read that global.

    Args:
        folder_path: Path of the corpus text file handed to ``Sentences``.
        size: Length of each word vector.
        save_path: Where the trained model is written. Defaults to the
            location the notebook has always used.

    Returns:
        The trained ``Word2Vec`` model (previously nothing was returned, so
        existing callers that ignore the result are unaffected).
    """
    global sentences
    # Lazy, re-iterable stream of token lists (not an in-memory 2-D array).
    sentences = Sentences(folder_path)

    # size      - word-vector length
    # workers   - number of worker threads
    # min_count - words seen fewer times than this are dropped; 0 keeps all
    # NOTE(review): gensim 4.0 renamed the `size` keyword to `vector_size`;
    # this call targets the pre-4.0 API.
    model = Word2Vec(sentences, size=size, workers=8, min_count=0)

    # Persist the model as soon as training finishes.
    model.save(save_path)
    return model

# Train a 50-dimensional word-vector model on the corpus file.
train_word2vec(folder_path="../../data/df_sen_sub/add_text.txt", size=50)

# Load the model that training just saved. The original note says that
# model[keyWord] gives the vector of the word `keyWord`.
model = Word2Vec.load("../../data/df_sen_sub/add_word2vec_model")

# Run the lazy sentence stream once more and keep the token lists in memory,
# so that they can be indexed by row number afterwards.
sentences = [tokens for tokens in sentences]


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.017 seconds.
Prefix dict has been built succesfully.


In [None]:
# `sentences` must hold one row per review followed by one row per
# sentiment-word entry, i.e. 2 * len(content) rows. When the corpus file does
# not end with a newline and its last entry is empty, reading it back yields
# one row fewer; an empty token list stands in for that row.
# The length guard makes the cell safe to re-run: it no longer appends another
# row each time it is executed.
expected_rows = 2 * len(content)
if len(sentences) < expected_rows:
    sentences.append([])

In [None]:
# Turn every sample into a fixed-size matrix of word vectors.
# Row i of `sentences` holds the tokens of review i, and row i + len(content)
# holds the tokens of its sentiment-word entry; both token lists are
# concatenated, cut to `seq_len` tokens and zero-padded at the end.
seq_len = 50     # tokens kept per sample
embed_dim = 50   # must equal the vector size the Word2Vec model was trained with
n_samples = len(content)

# Pre-allocating zeros gives the padding for free and keeps a single dtype
# (the original mixed ndarray rows with Python-list padding rows).
x_vecs = np.zeros((n_samples, seq_len, embed_dim), dtype=np.float32)
for i in range(n_samples):
    tokens = (list(sentences[i]) + list(sentences[i + n_samples]))[:seq_len]
    for j, token in enumerate(tokens):
        # Look vectors up through `model.wv`; indexing the model object
        # directly is deprecated in gensim 3.x and removed in 4.0.
        x_vecs[i, j] = model.wv[token]
print(x_vecs.shape)

# Encode the labels as class indices: position in `map_` is the class id,
# so 0 -> 0, -1 -> 1, 1 -> 2. A value outside `map_` raises ValueError.
map_ = [0, -1, 1]
y = list(data['sentiment_value'])
# int64 explicitly: the default integer width is platform-dependent, and the
# loss used later needs 64-bit class indices.
y_map = np.array([map_.index(label) for label in y], dtype=np.int64)
y_map.shape


In [None]:
# Hold out 33 % of the samples for testing; the fixed random_state makes the
# split repeatable.
X, y = x_vecs, y_map
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Sanity check: number of training inputs, training labels and test inputs.
len(train_x), len(train_y), len(test_x)

In [None]:
# Pair every feature matrix with its class index; a plain list of
# (features, label) tuples is enough for DataLoader to batch from.
trainset = list(zip(train_x, train_y))
testset = list(zip(test_x, test_y))

In [None]:
# Both loaders share the batch size and worker count; only the training
# loader shuffles its samples.
loader_opts = dict(batch_size=4, num_workers=2)
trainloader = t.utils.data.DataLoader(trainset, shuffle=True, **loader_opts)
testloader = t.utils.data.DataLoader(testset, shuffle=False, **loader_opts)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """RNN sentence classifier: RNN -> ReLU -> 2x2 max-pool -> three linear layers.

    Args:
        embed_dim: Size of each input token vector.
        seq_len: Number of tokens per sample.
        hidden_size: Size of the RNN hidden state.
        num_classes: Number of output scores per sample.

    The defaults reproduce the original fixed architecture (50/50/50/3), so
    ``Net()`` keeps the same parameter names and shapes.

    Input to ``forward``: a tensor shaped ``(batch, seq_len, embed_dim)``; it
    is cast to float32. Output: ``(batch, num_classes)`` unnormalised scores.
    """

    def __init__(self, embed_dim=50, seq_len=50, hidden_size=50, num_classes=3):
        super(Net, self).__init__()
        # batch_first=True: batches arrive as (batch, seq, feature). With the
        # default (False) the RNN would treat the batch axis as time and run
        # the recurrence across the samples of a mini-batch instead of across
        # the tokens of a sentence.
        self.rnn1 = nn.RNN(embed_dim, hidden_size, batch_first=True)
        # 2x2 pooling halves (floor) both the time and the hidden axis.
        self.fc1   = nn.Linear((seq_len // 2) * (hidden_size // 2), 120)
        self.fc2   = nn.Linear(120, 50)
        self.fc3   = nn.Linear(50, num_classes)

    def forward(self, x):
        output, _ = self.rnn1(x.float())
        # Add an explicit channel axis so pooling clearly acts on each
        # sample's (seq, hidden) map: (B, 1, S, H) -> (B, 1, S//2, H//2).
        pooled = F.max_pool2d(F.relu(output).unsqueeze(1), (2, 2))
        flat = pooled.view(pooled.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
    

# Build the network and print its layer summary.
net = Net()
print(net)
from torch import optim
criterion = nn.CrossEntropyLoss() # cross-entropy loss
# Plain SGD over all network parameters, learning rate 0.001, momentum 0.9.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# Train for 10 passes over the training loader, reporting the mean loss of
# every 200 mini-batches.
t.set_num_threads(8)
for epoch in range(10):

    running_loss = 0.0
    # Unpack each batch directly in the loop header. The original bound the
    # batch to `data`, which overwrote the DataFrame of the same name and made
    # earlier cells fail when re-run after training.
    for i, (inputs, labels) in enumerate(trainloader, 0):

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass.
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        # Update the parameters.
        optimizer.step()

        # `loss` is a scalar tensor; loss.item() extracts the Python number
        # (loss[0] cannot be used for that).
        running_loss += loss.item()
        if i % 200 == 199: # report once every 200 batches
            print('[%d, %5d] loss: %.3f' \
                  % (epoch+1, i+1, running_loss / 200))
            running_loss = 0.0
print('Finished Training')


In [None]:
# Show the true and the predicted labels for the first test batch.
dataiter = iter(testloader)
# Use the built-in next(): the `.next()` method is not part of the Python 3
# iterator protocol and is missing from recent DataLoader iterators.
images, labels = next(dataiter) # one batch of samples with their class indices
# Iterate over the actual batch length rather than a fixed 4, so a shorter
# batch does not raise IndexError.
print('实际的label: ', ' '.join(\
            '%08s'%map_[int(labels[j])] for j in range(len(labels))))

# Score every class for each sample; no gradients are needed for inference.
with t.no_grad():
    outputs = net(images)
# The predicted class is the one with the highest score.
_, predicted = t.max(outputs, 1)

print('预测结果: ', ' '.join('%5s'\
            % map_[int(predicted[j])] for j in range(len(predicted))))

In [None]:
# Accuracy over the whole test loader.
correct = 0 # number of correctly predicted samples
total = 0 # number of samples seen


# Gradients are not needed for evaluation; disabling autograd saves time
# and memory.
with t.no_grad():
    # Unpack in the loop header instead of binding the batch to `data`, which
    # would overwrite the DataFrame of the same name.
    for images, labels in testloader:
        outputs = net(images)
        _, predicted = t.max(outputs, 1)
        print([map_[int(p)] for p in predicted])
        total += labels.size(0)
        # .item() keeps `correct` a plain Python int instead of a tensor.
        correct += (predicted == labels).sum().item()

# Guard against an empty loader, and report the real sample count instead of
# a hard-coded number.
accuracy = 100 * correct / total if total else 0.0
print('%d张测试集中的准确率为: %d %%' % (total, accuracy))