基於Tensorflow的CNN訓練文本分類模型

In [None]:
import re,random,jieba
import datetime,os
import numpy as np 
import pandas as pd 
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
from tensorflow.contrib import learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [None]:
# Load every news category from CSV and drop rows containing missing values
car_news = pd.read_csv('class_data/car_news.csv',encoding='utf-8').dropna()

entertainment_news = pd.read_csv('class_data/entertainment_news.csv',encoding='utf-8').dropna()

international_news = pd.read_csv('class_data/international_news.csv',encoding='utf-8').dropna()

technology_news = pd.read_csv('class_data/technology_news.csv',encoding='utf-8').dropna()

society_news = pd.read_csv('class_data/society_news.csv',encoding='utf-8').dropna()

sports_news = pd.read_csv('class_data/sports_news.csv',encoding='utf-8').dropna()

finance_news = pd.read_csv('class_data/finance_news.csv',encoding='utf-8').dropna()

# Report how many usable rows each category has before truncation
print('Car News:{}\nEntertainment News:{}\nInternational News:{}\nTechnology News:{}\nSociety News:{}\nSports News:{}\nFinance News:{}\n'.format(len(car_news),len(entertainment_news),len(international_news),len(technology_news),len(society_news),len(sports_news),len(finance_news)))

# Keep at most the first 11000 rows of every category (all seven, including
# international_news)
car_news = car_news[:11000]
entertainment_news = entertainment_news[:11000]
international_news = international_news[:11000]
technology_news = technology_news[:11000]
society_news = society_news[:11000]
sports_news = sports_news[:11000]
finance_news = finance_news[:11000]

In [None]:
# Load the stop words: one entry per line, surrounding whitespace removed
stop_list = []
with open('data/stopwords.txt','r',encoding='utf-8') as stopword_file:
    stop_list.extend(raw_line.strip() for raw_line in stopword_file)

In [None]:
def preprocess(data,all_data,category):
    """
    Clean and tokenise raw documents and append them, labelled, to ``all_data``.

    For every string in ``data`` the function removes non-word characters,
    ASCII letters/digits and the full-width range U+FF01-U+FF5A, segments the
    remainder with ``jieba.lcut``, keeps tokens longer than two characters
    that are not in the module-level ``stop_list``, and appends
    ``(space_joined_tokens, category)`` to ``all_data``.

    :param data:     iterable of raw document strings
    :param all_data: list that is extended in place with (text, label) tuples
    :param category: label stored with every document of this call
    :return: None; the result is delivered through ``all_data``
    """
    # Build the lookup set once per call: set membership is O(1), whereas
    # testing against the list would scan every stop word for every token.
    stop_words = set(stop_list)
    for line in data:
        line = re.sub(r'[^\w]','',line)              # drop punctuation / whitespace
        line = re.sub(r'[A-Za-z0-9]','',line)        # drop ASCII letters and digits
        line = re.sub(u'[\uFF01-\uFF5A]','',line)    # drop full-width forms
        # NOTE(review): len(token) > 2 discards two-character words as well -
        # confirm this threshold is intended.
        tokens = [token for token in jieba.lcut(line)
                  if len(token)>2 and token not in stop_words]
        all_data.append( (" ".join(tokens),category) )

# Build the labelled corpus. Each category gets its own one-hot label, in the
# order: car, entertainment, international, technology, society, sports, finance.
all_data = []
preprocess(car_news.content.values.tolist(),all_data,[1,0,0,0,0,0,0])
preprocess(entertainment_news.content.values.tolist(),all_data,[0,1,0,0,0,0,0])
preprocess(international_news.content.values.tolist(),all_data,[0,0,1,0,0,0,0])
preprocess(technology_news.content.values.tolist(),all_data,[0,0,0,1,0,0,0])
preprocess(society_news.content.values.tolist(),all_data,[0,0,0,0,1,0,0])
preprocess(sports_news.content.values.tolist(),all_data,[0,0,0,0,0,1,0])
preprocess(finance_news.content.values.tolist(),all_data,[0,0,0,0,0,0,1])

In [None]:
# CNN參數(用CPU) Tensorflow
class TextCNN(object):
    """
    CNN text classifier: an embedding layer followed by convolution,
    max-pooling and a softmax output layer.
    """

    def __init__(self, sequence_length, num_classes, vocab_size,
                 embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
        """
        Build the TensorFlow graph of the classifier.

        :param sequence_length: number of token ids in every input sentence
        :param num_classes:     number of target classes
        :param vocab_size:      number of rows of the embedding matrix
        :param embedding_size:  width of every embedding vector
        :param filter_sizes:    how many words each kind of filter spans
        :param num_filters:     number of filters created per filter size
        :param l2_reg_lambda:   weight of the L2 penalty in the loss (optional)
        """

        # Placeholders hold the values fed in from outside the graph; the
        # leading None leaves the number of rows per feed open.
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # Embedding layer, pinned to the CPU
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            # Embedding matrix initialised uniformly in [-1, 1)
            initial_embeddings = tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)
            self.W = tf.Variable(initial_embeddings, name='weight')
            # Look up the embedding of every token id:
            # result is [None, sequence_length, embedding_size]
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            # conv2d works on 4-D tensors, so append a trailing channel axis
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # One convolution + max-pooling branch per filter size
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                # Convolution kernel and bias of this branch
                kernel_shape = [filter_size, embedding_size, 1, num_filters]
                kernel_init = tf.truncated_normal(kernel_shape, stddev=0.1)
                kernel = tf.Variable(kernel_init, name='W')
                bias_init = tf.constant(0.1, shape=[num_filters])
                bias = tf.Variable(bias_init, name='b')

                # Stride 1 in every dimension; padding='VALID' adds no zero
                # padding (padding='SAME' would pad with zeros).
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    kernel,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='conv')

                # Non-linearity
                activated = tf.nn.relu(tf.nn.bias_add(conv, bias), name='relu')

                # Max-pool over all sequence_length - filter_size + 1 positions
                # that the VALID convolution leaves along the sequence axis
                pool_window = [1, sequence_length - filter_size + 1, 1, 1]
                pooled_outputs.append(
                    tf.nn.max_pool(
                        activated,
                        ksize=pool_window,
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name='pool'))

        # Concatenate the pooled features of all branches and flatten them
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Dropout against over-fitting
        with tf.name_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Output layer: unnormalised class scores and the predicted class index
        l2_penalty = tf.constant(0.0)
        with tf.name_scope("output"):
            out_weights = tf.get_variable(
                'W',
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            out_bias = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')
            l2_penalty += tf.nn.l2_loss(out_weights)
            l2_penalty += tf.nn.l2_loss(out_bias)
            self.score = tf.nn.xw_plus_b(self.h_drop, out_weights, out_bias, name='scores')
            self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss to minimise: mean cross-entropy plus the weighted L2 penalty
        with tf.name_scope('loss'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.score, labels=self.input_y)
            mean_cross_entropy = tf.reduce_mean(cross_entropy)
            self.loss = mean_cross_entropy + l2_reg_lambda * l2_penalty

        # Accuracy: share of rows whose predicted class equals the labelled class
        with tf.name_scope('accuracy'):
            true_classes = tf.argmax(self.input_y, 1)
            hits = tf.equal(self.prediction, true_classes)
            self.accuracy = tf.reduce_mean(tf.cast(hits, 'float'), name='accuracy')
            

In [None]:
# Model hyper-parameters: embedding dimension, filter sizes (words covered by
# each filter), number of filters per size, dropout keep probability, L2 weight
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters: batch size, number of epochs, evaluation interval (steps),
# checkpoint interval (steps), number of checkpoints to keep
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

# Session parameters: soft placement lets TensorFlow pick another device when
# the requested one is not found; log_device_placement logs op placement
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# NOTE(review): dummy 'f' flag - presumably absorbs the `-f <connection file>`
# argument a Jupyter kernel is started with; confirm before removing.
tf.app.flags.DEFINE_string('f', '', 'kernel')
FLAGS = tf.flags.FLAGS

In [None]:
# Data preparation: turn the (text, label) pairs into id matrices and split them
x,y = zip(*all_data) # unzip into the texts (x) and their one-hot labels (y)
max_document_length = 30 # length passed to VocabularyProcessor for every document
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) # builds the vocabulary
x = np.array(list(vocab_processor.fit_transform(x))) 

# Split into train / test / validation sets: 20% is held out as test data, then
# 25% of the remaining 80% becomes validation data (60/20/20 overall)
X_train, x_test, Y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=666) 
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=666) 

In [None]:
#產生批次檔
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Yield mini-batches of ``data`` for ``num_epochs`` passes over it.

    :param data:       sequence of samples; it is converted with ``np.array``
    :param batch_size: maximum number of samples per batch
    :param num_epochs: number of passes over ``data``
    :param shuffle:    when True, a new random order is drawn for every epoch
    :return: generator of array slices of at most ``batch_size`` samples; the
             last batch of an epoch may be shorter. Empty ``data`` yields
             nothing.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division: 0 batches for empty data, so no empty batch
    # is ever produced.
    num_batches_per_epoch = -(-data_size // batch_size)
    for epoch in range(num_epochs):
        if shuffle:
            # Draw a random permutation of the sample indices for this epoch
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        # Cut the (possibly shuffled) data into consecutive batches
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [None]:
# 開始訓練
with tf.Graph().as_default():
    # Session configured from the placement flags
    session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    
    with sess.as_default():
        # Build the model graph from the prepared data and the flags
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=7,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)
        
        # Adam optimiser; global_step is incremented by every apply_gradients call
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Histogram and sparsity summaries for every gradient that exists
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)
        
        # Seconds since the epoch, used to name the output directory of this run.
        # Taken from datetime because the `time` module is not imported in this file.
        timestamp = str(int(datetime.datetime.now().timestamp()))
        # Directory that receives the summaries, checkpoints and vocabulary
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "CNN_Result", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        
        # Training step (dropout enabled)
        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)
            
        
        # Evaluation step: keep probability 1.0 disables dropout
        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)
                
        # Generate the batches of (x, y) pairs
        batches = batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Train batch by batch
        for batch in batches:
            # Separate the shuffled, batched pairs back into inputs and labels
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_val, y_val, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))