## 本篇代码用于尝试和比较不同Textual representations的效果

In [1]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle
import json
tqdm.pandas()
pd.options.mode.chained_assignment = None
import keras
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import tensorflow as tf 
from keras.models import Sequential, Model, load_model 
from keras.layers import Dense, Dropout, GRU,Input, LSTM, Embedding, Bidirectional,SimpleRNN
from keras.layers import Flatten, Conv1D, MaxPooling1D, GlobalMaxPooling1D, TimeDistributed, BatchNormalization
from keras.layers import concatenate as lconcat
from keras.optimizers import SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K


#sess_config.gpu_options.allow_growth = True
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))
from sklearn.metrics import roc_auc_score
from keras.utils import np_utils,plot_model, multi_gpu_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from sklearn.model_selection import StratifiedShuffleSplit 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler


In [3]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors



### 用文本数据来做不同的embedding

In [4]:
df = pd.read_pickle("Pickles/processed_data.pkl")
#df = pd.read_csv("Data/lemmatized_text.csv")

In [7]:
docs = df['processed_text']

In [13]:

vocab_size = 119398
#embedding_matrix = np.load("Pickles/embedding_matrix.npy")
max_words = 23070
embed_dim = 300 
aux_shape = 38

In [9]:
def tokenize_and_pad(docs,max_words=max_words):
    global t
    t = Tokenizer()
    t.fit_on_texts(docs)
    docs = pad_sequences(sequences = t.texts_to_sequences(docs),maxlen = max_words, padding = 'post')
    global vocab_size
    vocab_size = len(t.word_index) + 1
    
    return docs

In [10]:
docs = tokenize_and_pad(docs)

#### 使用word2vec做词嵌入

In [12]:
word_vectors = KeyedVectors.load_word2vec_format('word2vec\\GoogleNews-vectors-negative300.bin', binary=True)

In [15]:

words_not_found = []

embedding_matrix_w2v = np.zeros((vocab_size, embed_dim))
for word, i in t.word_index.items():
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix_w2v[i] = embedding_vector

    except:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix_w2v, axis=1) == 0))

number of null word embeddings: 98770


In [19]:
np.save("Pickles/embedding_matrix_w2v.npy",embedding_matrix_w2v)
del word_vectors

#### 使用fasttext做词嵌入

In [22]:
import os, re, csv, math, codecs

In [23]:
embeddings_index = {}
f = codecs.open('fasttext//wiki300.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))

999995it [04:52, 3424.47it/s]


found 999995 word vectors


In [24]:
#embedding matrix
#print('preparing embedding matrix...')
words_not_found = []
#nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix_fasttext = np.zeros((vocab_size, embed_dim))
for word, i in t.word_index.items():

    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix_fasttext[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix_fasttext, axis=1) == 0))

number of null word embeddings: 96060


In [25]:
np.save("Pickles/embedding_matrix_fasttext.npy",embedding_matrix_fasttext)
del embeddings_index

与前面的deep learning代码类似，定义一个custom metric

In [4]:
from sklearn.metrics import roc_auc_score

# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.metrics.auc(y_true,y_pred)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

#### 定义可以生成不同模型的函数

In [26]:
def build_model(output_classes,architecture,aux_shape=aux_shape,vocab_size=vocab_size,embed_dim=embed_dim,embedding_matrix=embedding_matrix,max_seq_len=max_words):
    
    with tf.device('/cpu:0'): #在当前的cpu上运行
        main_input= Input(shape=(max_seq_len,),name='doc_input') #主要输入为文本，Input的维度是最长的文件max words
        main = Embedding(input_dim = vocab_size,
                            output_dim = embed_dim,
                            weights=[embedding_matrix], 
                            input_length=max_seq_len, 
                            trainable=False)(main_input)
#设置不同的模型供选择
    if architecture == 'mlp': 
        # Densely Connected Neural Network (Multi-Layer Perceptron)
        main = Dense(32, activation='relu')(main)
        main = Dropout(0.2)(main)
        main = Flatten()(main)
    elif architecture == 'cnn':
        # 1-D Convolutional Neural Network
        main = Conv1D(64, 3, strides=1, padding='same', activation='relu')(main)
        #Cuts the size of the output in half, maxing over every 2 inputs
        main = MaxPooling1D(pool_size=3)(main)
        main = Dropout(0.2)(main)
        main = Conv1D(32, 3, strides=1, padding='same', activation='relu')(main)
        main = GlobalMaxPooling1D()(main)
        #model.add(Dense(output_classes, activation='softmax'))
    elif architecture == 'rnn':
        # LSTM network
        main = SimpleRNN(32)(main)
        #main = GRU(64,activation='tanh')(main)
        #main = LSTM(32, return_sequences=False, dropout=0.2, recurrent_dropout=0.1)(main)
        #main = Bidirectional(LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.1))(main)
        main = BatchNormalization()(main)
    elif architecture =="rnn_cnn":
        main = Conv1D(64, 5, padding='same', activation='relu')(main)
        main = MaxPooling1D()(main)
        main = Dropout(0.2)(main)
        #main = Bidirectional(CuDNNGRU(32,return_sequences=False),merge_mode='concat')(main)
        main = SimpleRNN(32,return_sequences=False)(main)
        main = BatchNormalization()(main)
   
    else:
        print('Error: Model type not found.')
      
    #辅助输入为其他控制变量，input维度是X的变量数
    auxiliary_input = Input(shape=(aux_shape,), name='aux_input')
    x = lconcat([main, auxiliary_input])#把两个输入合并
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    main_output = Dense(output_classes, activation='sigmoid', name='main_output')(x)#最终的输出是3个维度
    model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output],name=architecture)#该函数返回一个完整的模型，输入，输出，模型名
      
        #没有GPU用不了
    #sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    #model = multi_gpu_model(model)
    model.compile(optimizer='adam',loss= 'categorical_crossentropy',metrics=['accuracy',auc_roc])
    
    return model



In [18]:
X_train = pd.read_pickle("Pickles/X_train.pkl")
y_train = pd.read_pickle("Pickles/y_train.pkl")
docs_train = np.load("Pickles/docs_train.npy")


#### 分别训练两种不同embedding的模型

In [None]:
cnn = build_model(3,"cnn"，embedding_matrix=embedding_matrix_w2v)
#cnn = build_model(3,"cnn"，embedding_matrix=embedding_matrix_fasttext)

In [None]:
cnn.fit([docs_train,X_train],y_train,batch_size=128,epochs=10,verbose=1)
#10 epochs
cnn.save("Data/models/cnn13_w2v.hdf5")


#### 讲训练好的两个新模型拿来测试

In [5]:
def test(model):
    score =[]
    for i in range(10):
        score.append(model.evaluate([docs_test,X_test],y_test,batch_size=128))
    score = pd.DataFrame(score)
    return score

In [6]:
X_test = pd.read_pickle("Pickles/X_test.pkl")
y_test = pd.read_pickle("Pickles/y_test.pkl")
docs_test = np.load("Pickles/docs_test.npy")


测试word2vec

In [8]:
cnn_w2v = load_model("Data/models/cnn13_w2v.hdf5",custom_objects={"auc_roc":auc_roc})
w2v_test_score = test(cnn_w2v)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [9]:
w2v_test_score.describe()

Unnamed: 0,0,1,2
count,10.0,10.0,10.0
mean,1.111785,0.623907,0.75403
std,2.340556e-16,0.0,0.096065
min,1.111785,0.623907,0.480639
25%,1.111785,0.623907,0.783766
50%,1.111785,0.623907,0.78468
75%,1.111785,0.623907,0.785021
max,1.111785,0.623907,0.785199


In [10]:
w2v_test_score.to_pickle("Data//model_performance//w2v_test_score.pkl")

测试fasttext

In [8]:
cnn_fasttext = load_model("Data/models/cnn13_fasttext.hdf5",custom_objects={"auc_roc":auc_roc})
fasttext_test_score = test(cnn_fasttext)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [9]:
fasttext_test_score.describe()

Unnamed: 0,0,1,2
count,10.0,10.0,10.0
mean,0.931383,0.641399,0.770336
std,0.0,0.0,0.097516
min,0.931383,0.641399,0.492809
25%,0.931383,0.641399,0.800652
50%,0.931383,0.641399,0.801398
75%,0.931383,0.641399,0.801674
max,0.931383,0.641399,0.801817


In [10]:
fasttext_test_score.to_pickle("Data//model_performance//fasttext_test_score.pkl")

In [11]:
import gc

In [12]:
del cnn_fasttext
gc.collect()

9553