In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import config
import logging
import os
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from sklearn.externals import joblib
from util import *
from TextCNN import *

Using TensorFlow backend.


In [2]:
# Each log line carries: timestamp, level, process name, thread name, message.
LOG_FORMAT = '%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# ---- hyperparameters / run settings (tune here, used by the cells below) ----
end_col = None              # slice end for the label columns (columns[2:end_col]); None = up to the last column
epochs = 1                  # passed to fit() for every per-column model
batch_size = 256            # passed to fit()
embedding_dim = 350         # passed to TextCNN(embedding_dim=...)
max_len = 300               # pad_sequences(maxlen=...) and TextCNN(maxlen=...)
model_name = 'model.pkl'    # file-name prefix of saved models; '_<column>' is appended on save

In [3]:
#----------------------------load train data---------------------------------
# Loads the train/validation CSVs, segments the training text, fits a Keras
# Tokenizer on it, persists the tokenizer, and produces the padded integer
# matrix `train_data_format` consumed by the training cell.
logger.info("start load data")
train_data_df = load_data_from_csv(config.train_data_path)
validate_data_df = load_data_from_csv(config.validate_data_path)

# The text to classify is taken from the second column (position 1).
content_train = train_data_df.iloc[:, 1]

logger.info("start seg train data")
content_train = seg_words(content_train)
logger.info("complete seg train data")

logger.info("prepare train format")
# Flatten every element of every segmented document to count distinct tokens.
# NOTE(review): assumes seg_words returns one token sequence per document;
# if it returns plain strings this iterates characters instead -- confirm.
words = [w for x in content_train for w in x]
max_words = len(set(words))

# Keras' Tokenizer only emits indices strictly below `num_words`, and word
# indices start at 1, so `num_words=max_words` would silently drop the rarest
# word. `max_words + 1` keeps all of them and matches the embedding size
# (`max_words + 1`) that the training cell passes to TextCNN.
tokenizer = Tokenizer(num_words=max_words + 1)
tokenizer.fit_on_texts(content_train)
joblib.dump(tokenizer, 'words_tokenizer')      # persist the fitted tokenizer (word -> index dictionary)
data_w = tokenizer.texts_to_sequences(content_train)
train_data_format = pad_sequences(data_w, maxlen=max_len)
logger.info("complete formate train data")

# Column names of the training frame; the training/validation cells iterate
# over columns[2:end_col] as the label columns.
columns = train_data_df.columns.values.tolist()

2018-11-02 11:20:02,315 [INFO] <MainProcess> (MainThread) start load data
2018-11-02 11:20:03,411 [INFO] <MainProcess> (MainThread) start seg train data
Building prefix dict from the default dictionary ...
2018-11-02 11:20:03,414 [DEBUG] <MainProcess> (MainThread) Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2018-11-02 11:20:03,416 [DEBUG] <MainProcess> (MainThread) Loading model from cache /tmp/jieba.cache
Loading model cost 0.512 seconds.
2018-11-02 11:20:03,926 [DEBUG] <MainProcess> (MainThread) Loading model cost 0.512 seconds.
Prefix dict has been built succesfully.
2018-11-02 11:20:03,927 [DEBUG] <MainProcess> (MainThread) Prefix dict has been built succesfully.
2018-11-02 11:22:21,347 [INFO] <MainProcess> (MainThread) complete seg train data
2018-11-02 11:22:21,348 [INFO] <MainProcess> (MainThread) prepare train format
2018-11-02 11:22:44,692 [INFO] <MainProcess> (MainThread) complete formate train data


In [None]:
# ----------------------------model train---------------------------------
# Trains one TextCNN classifier per label column (columns[2:end_col]) on the
# shared padded input `train_data_format` and saves each model under
# config.model_path as '<model_name>_<column>'.
logger.info("start train model")

# Resolve and create the output directory once, up front: the check does not
# depend on the column, and a bad path now fails before any training time is
# spent rather than after the first model has finished fitting.
model_path = config.model_path
if not os.path.exists(model_path):
    os.makedirs(model_path)

for column in columns[2:end_col]:
    # New TF session per model; allow_growth allocates GPU memory on demand
    # instead of reserving all of it.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    KTF.set_session(sess)

    train_label = label2onehot(train_data_df[column])   # scalar labels -> one-hot
    logger.info("start train %s model" % column)
    textCNN_model = TextCNN(max_words+1, embedding_dim=embedding_dim, maxlen=max_len)
    textCNN_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    textCNN_model.fit(train_data_format, train_label, epochs=epochs, batch_size=batch_size, validation_split=0.2)
    logger.info("complete train %s model" % column)

    logger.info("start save %s model" % column)
    # Plain concatenation is kept on purpose: the validation cell rebuilds the
    # path the same way, so both must stay in step.
    textCNN_model.save(model_path+model_name+'_%s' % column)    # save the trained model
    logger.info("complete save %s model" % column)

    # Drop the current Keras/TF graph state before building the next model.
    KTF.clear_session()


logger.info("complete train model")

2018-11-02 11:29:53,322 [INFO] <MainProcess> (MainThread) start train model
2018-11-02 11:29:53,378 [INFO] <MainProcess> (MainThread) start train location_traffic_convenience model


Train on 84000 samples, validate on 21000 samples
Epoch 1/1


2018-11-02 11:30:10,452 [INFO] <MainProcess> (MainThread) complete train location_traffic_convenience model
2018-11-02 11:30:10,453 [INFO] <MainProcess> (MainThread) start save location_traffic_convenience model
2018-11-02 11:30:12,804 [INFO] <MainProcess> (MainThread) complete save location_traffic_convenience model
2018-11-02 11:30:12,883 [INFO] <MainProcess> (MainThread) start train location_distance_from_business_district model


In [None]:
# ----------------------------validation---------------------------------
# Segments and vectorises the validation text with the tokenizer fitted on
# the training data, then reloads each saved per-column model, predicts in
# batches, and reports the per-column and mean F1 score.
content_validata = validate_data_df.iloc[:, 1]

logger.info("start seg validata data")
content_validata = seg_words(content_validata)
logger.info("complete seg validata data")

logger.info("prepare valid format")
data_w = tokenizer.texts_to_sequences(content_validata)
validata_data_format = pad_sequences(data_w, maxlen=max_len)
logger.info("complete formate valid data")

logger.info("start compute f1 score for validata model")
# Read the model directory from config here instead of relying on the
# `model_path` variable that the training loop happens to leave behind, so
# this cell also works on a kernel where training was not (re)run.
model_path = config.model_path
valid_batch = 1000          # cap on the number of samples per predict() call
f1_score_dict = dict()
for column in columns[2:end_col]:
    true_label = np.asarray(validate_data_df[column])
    classifier = load_model(model_path+model_name+'_%s'%column)
    pred_label = []
    # Step through the data in chunks of `valid_batch`; slicing past the end
    # simply yields the shorter final chunk, and empty data yields no chunks.
    for start in range(0, len(validata_data_format), valid_batch):
        label = classifier.predict(validata_data_format[start:start+valid_batch])
        pred_label += onehot2label(label).tolist()
    pred_label = np.asarray(pred_label)
    f1_score_dict[column] = get_f1_score(true_label, pred_label)
    # Release the loaded model's graph before loading the next one, mirroring
    # the training loop; otherwise every model stays resident at once.
    KTF.clear_session()

# Unweighted mean of the per-column scores.
f1_score = np.mean(list(f1_score_dict.values()))
str_score = "\n"
for column in columns[2:end_col]:
    str_score += column + ":" + str(f1_score_dict[column]) + "\n"

logger.info("f1_scores: %s\n" % str_score)
logger.info("f1_score: %s" % f1_score)
logger.info("complete compute f1 score for validate model")