In [1]:
import codecs
import re
import random
import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score

In [44]:
#将人工标注的数据转化成crf模型格式
def get_crf_data(line, words):
    indexs = []
    #for m in re.finditer(r'\[(.*?)\]',string):
    #    indexs += [m.start(0),m.end(0)]
    pre,flag,res = '',False,[]
    for i in range(len(line)):
        word = line[i]
        if word != '[' or word != ']':
            words.add(word)
        if word == '[':
            pre = word
            continue
        elif pre == '[':
            res.append([word,'B'])
            flag = True
        elif word == ']':
            res[-1][-1] = 'E'
            flag = False
        elif flag:
            res.append([word, 'M'])
        else:
            res.append([word, 'O'])
        pre = word
    return res

def process_file(path):
    res, max_len, words = [], 0, set()
    with codecs.open(path) as fin:
        for line in fin:
            line = line.strip()
            count = 0
            for word, label in get_crf_data(line, words):
                res.append(word + ' ' + label + '\n')
                count += 1
            if count > max_len: max_len = count
            res.append('\n')
    return max_len, words, res

def write_data(lines,out_path):
    with open(out_path,'w') as fout:
        for line in lines:
            fout.write(line)
def write_words(words, out_path):
    with codecs.open(out_path, 'w', encoding='gbk') as fout:
        for word in words:
            fout.write(word + '\n')

In [81]:
max_len, words, res = process_file('data/bendibao.txt')
write_data(res, 'data/bendibao.crf')
print(max_len)
write_words(words, 'data/words.txt')
#print(word2id)
#print(id2word)
#get_crf_data('杭州[去哪吃夜宵]？[杭州吃夜宵好去处]')

37


In [18]:
string = 'abc[bde]ioe[oeds]in'

In [19]:
re.findall(r'\[(.*?)\]',string)

['bde', 'oeds']

In [20]:
res = re.finditer(r'\[(.*?)\]',string)

In [21]:
for m in res:
    print(m.group(0),m.start(0),m.end(0))

[bde] 3 8
[oeds] 11 17


In [7]:
#tools，工具类
#将CRf数据格式进行处理，完成数据的填充，以及batch_size的数据集获取
class Tools:
    def __init__(self, seq_length):
        self.pad_word = '<PAD>'
        self.pad_tag = 'O'
        self.seq_length = seq_length
        self.words = []
        self.word2id = {}
        self.id2word = {}
        self.tag2id = {}
        
        self.get_tag2id()
        self.load_words('data/words.txt')
        self.get_word_dict()
    
    def get_tag2id(self):
        self.tag2id = {'O':[1.,.0,.0,.0], 'B':[.0,1.,.0,.0], 'M':[.0,.0,1.,.0], 'E':[.0,.0,.0,1.]}
        
    def load_words(self, path):
        
        with codecs.open(path, 'r', encoding='gbk') as fin:
            for line in fin:
                word = line.strip()
                if len(word) == 0: continue
                self.words.append(word)
                
    def get_word_dict(self):
        for index, word in enumerate(self.words, 1):
            self.word2id[word] = index
            self.id2word[index] = word
        self.word2id[self.pad_word] = 0
        self.id2word[0] = self.pad_word
    
    def padding_sequence(self, sequence, seq_length):
        train_list, tag_list = [], []
        #print(sequence)
        #print(seq_length)
        for i in range(seq_length):
            if i >= len(sequence):
                train_list.append(self.word2id[self.pad_word])
                tag_list.append(self.tag2id[self.pad_tag])
            else:
                try:
                    train_list.append(self.word2id[sequence[i][0]])
                    tag_list.append(self.tag2id[sequence[i][1]])
                except Exception as e:
                    print(str(e))
                    print(sequence)
        return train_list, tag_list
    
    def process_file(self, path):
        X, Y, sequence = [], [], []
        with codecs.open(path, 'r', encoding='gbk') as fin:
            for line in fin:
                if line == '\n':
                    sub_x, sub_y = self.padding_sequence(sequence, self.seq_length)
                    X.append(sub_x)
                    Y.append(sub_y)
                    sequence = []
                else:
                    items = line.strip().split(' ')
                    if len(items) > 1:
                        sequence.append(items)
            return np.array(X), np.array(Y)
    
    def next_batch(self,X, Y, batch_size=64):
        length = len(X)
        num_batch = length // batch_size
        indexs = list(range(length))
        random.shuffle(indexs)
        x_shuffle = X[indexs]
        y_shuffle = Y[indexs]
        
        for i in range(num_batch):
            start = i * batch_size
            end = min((i+1) * batch_size, length)
            yield x_shuffle[start:end], y_shuffle[start:end]

In [8]:
tools = Tools(37)
X,Y = tools.process_file('data/bendibao.train')
X_val, Y_val = tools.process_file('data/bendibao.test')
print(len(tools.word2id))

1745


In [9]:
print(len(X), len(X_val))

4667 1444


In [27]:
#构建lstm 网络
class Lstm_model(object):
    def __init__(self):
        self.batch_size = 64
        self.seq_length = 37
        self.num_classes = 4
        self.lr = 0.02
        self.lr_decay = 0.9
        self.dropout_keep_pro = 0.8
        self.num_epoch = 20
        self.hidden_dim = 128
        self.num_layer = 2
        self.embedding_dim = 64
        self.vocab_size = 1745
        
        self.run()
        
    def run(self):
        #inpux
        tf.reset_default_graph()
        self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.seq_length], name='input_x')
        self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, self.seq_length, self.num_classes], name='input_y')
        self.dropout_keep_pro = tf.placeholder(dtype=tf.float32, name='keep_pro')
        
        #input_y 变形
        self.labels = tf.reshape(self.input_y, [-1, self.num_classes])
        
        #lstm
        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=True)
        
        def dropout():
            cell = lstm_cell()
            return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.dropout_keep_pro)
        
        def get_weight(shape):
            return tf.Variable(tf.random_normal(shape=shape, stddev=0.1))
        
        #embedding
        with tf.name_scope('embedding'):
            embedding = tf.get_variable('embedding', shape=[self.vocab_size, self.embedding_dim], dtype=tf.float32)
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
        
        with tf.name_scope('lstm'):
            cells = [dropout() for i in range(self.num_layer)]
            rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
            outputs,_ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
            print(embedding_inputs.shape)
            print(outputs.shape)
            print(outputs[:,-1,:].shape)
            
            #改变形状
            outputs = tf.reshape(outputs, [-1, self.hidden_dim])
        
        with tf.name_scope('full_layer'):
            weight1 = get_weight(shape=[self.hidden_dim, self.hidden_dim])
            biases = tf.Variable(tf.constant(0.1,dtype=tf.float32, shape = [self.hidden_dim]))
            fc1 = tf.matmul(outputs, weight1) + biases
            fc1 = tf.nn.dropout(fc1, keep_prob=self.dropout_keep_pro)
            fc1 = tf.nn.relu(fc1)
            
            weigth2 = get_weight(shape=[self.hidden_dim, self.num_classes])
            biases2 = tf.Variable(tf.constant(0.1,dtype=tf.float32, shape = [self.num_classes]))
            self.logits = tf.matmul(fc1, weigth2) + biases2
            
        with tf.name_scope('train_step'):
            print('labels',self.labels.shape)
            print('logits',self.logits.shape)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits)
            #tf.nn.sparse_softmax_cross_entropy_with_logits(
            loss = tf.reduce_mean(cross_entropy)
            self.train_step = tf.train.GradientDescentOptimizer(self.lr).minimize(loss)
        
        with tf.name_scope('accuracy'):
            self.y_true = tf.arg_max(self.labels, 1)
            self.y_pre = tf.arg_max(self.logits, 1)
            correct_pre = tf.equal(self.y_pre, self.y_true)
            self.accuracy = tf.reduce_mean(tf.cast(correct_pre,dtype=tf.float32))
        

In [44]:
class Main(object):
    def __init__(self, X, Y, X_val, Y_val):
        self.model = Lstm_model()
        self.save_path = 'model/lstm_model/model.ckpt'
        self.X = X
        self.Y = Y
        self.X_val = X_val
        self.Y_val = Y_val
        
    def evaluate(self, sess, x, y):
        y_true = sess.run(self.model.y_true, feed_dict=\
                          {self.model.input_x:x, self.model.input_y:y, self.model.dropout_keep_pro:1.0})
        y_pre = sess.run(self.model.y_pre, feed_dict=\
                         {self.model.input_x:x, self.model.input_y:y, self.model.dropout_keep_pro:1.0})
        score = f1_score(y_true, y_pre, [0.,1.,2.,3.], average=None)
        return 'c0 f1_score:%.3f, c1 f1_score:%.3f, c2 f1_score:%.3f, c3 f1_score:%0.3f' % tuple(score)

    #训练模型
    def train(self):
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            print(X.shape)
            print(Y.shape)
            for epoch in range(300):
                for xs,ys in tools.next_batch(self.X, self.Y):
                    #print(xs.shape)
                    #print(ys)
                    sess.run(self.model.train_step, feed_dict = \
                             {self.model.input_x:xs, self.model.input_y:ys, self.model.dropout_keep_pro:0.8})
                    #labels = sess.run(model.labels,feed_dict = {model.input_x:xs, model.input_y:ys, model.dropout_keep_pro:0.8})
                    #logits = sess.run(model.logits,feed_dict = {model.input_x:xs, model.input_y:ys, model.dropout_keep_pro:0.8})
                    #acc = sess.run(model.accuracy, feed_dict = {model.input_x:xs, model.input_y:ys, model.dropout_keep_pro:0.8})

                    #print('Iter: ' + str(epoch) + ', Training Accuracy:' + str(acc))
                self.model.lr *= self.model.lr_decay
                train_acc = sess.run(self.model.accuracy, feed_dict = \
                    {self.model.input_x:self.X, self.model.input_y:self.Y, self.model.dropout_keep_pro:1.0})
                test_acc = sess.run(self.model.accuracy, feed_dict = \
                    {self.model.input_x:self.X_val, self.model.input_y:self.Y_val, self.model.dropout_keep_pro:1.0})
                f1_score_str = self.evaluate(sess, self.X_val, self.Y_val)
                print('Iter: ' + str(epoch) + ', Training Accuracy:' + str(train_acc) + ', Testing Accuracy:'\
                      + str(test_acc) + ' | ' + f1_score_str)
            saver.save(sess=sess, save_path=self.save_path)
            

In [None]:
main = Main(X, Y, X_val, Y_val)
main.train()

In [182]:
### 测试embedding_lookup
'''
tf.reset_default_graph()
x = tf.placeholder(dtype=tf.int32, shape=[None, 37])
embedding = tf.get_variable('embedding', shape=[1745, 64], dtype=tf.float32)
embedding_inputs = tf.nn.embedding_lookup(embedding, x)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for xs,ys in tools.next_batch(X, Y):
        print(xs, xs.shape)
        res_em = sess.run(embedding_inputs, feed_dict = {x:xs})
        print(res_em)
        print(res_em.shape)
        break
'''

"\ntf.reset_default_graph()\nx = tf.placeholder(dtype=tf.int32, shape=[None, 37])\nembedding = tf.get_variable('embedding', shape=[1745, 64], dtype=tf.float32)\nembedding_inputs = tf.nn.embedding_lookup(embedding, x)\ninit = tf.global_variables_initializer()\nwith tf.Session() as sess:\n    sess.run(init)\n    for xs,ys in tools.next_batch(X, Y):\n        print(xs, xs.shape)\n        res_em = sess.run(embedding_inputs, feed_dict = {x:xs})\n        print(res_em)\n        print(res_em.shape)\n        break\n"