In [1]:
import numpy as np
import pandas as pd
import jieba as jb
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the custom dictionary file into jieba before any segmentation happens;
# the tokenizers defined below all segment text through jb.lcut.
jb.load_userdict('./userdict.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.391 seconds.
Prefix dict has been built succesfully.


In [3]:
# define constants
MAX_LEN = 200  # tokens per sentence: model Input shape and pad_sequences maxlen
EMD_DIM = 200  # word-vector width: Word2Vec size and Embedding output dimension

In [4]:
# Read both training files. They are tab-separated; the first column becomes the
# index and the remaining three columns are named sen1 / sen2 / label.
data1 = pd.read_csv('atec_nlp_sim_train.csv', sep='\t', index_col=0, names=['sen1', 'sen2', 'label'])
data2 = pd.read_csv('atec_nlp_sim_train_add.csv', sep='\t', index_col=0, names=['sen1', 'sen2', 'label'])
data = pd.concat([data1, data2])

# Extract the columns as NumPy arrays so they can be fancy-indexed when splitting.
# `.values` is used instead of `.as_matrix()`, which is deprecated and no longer
# exists in pandas >= 1.0; `.values` returns the same ndarray on old and new versions.
sens1, sens2, labels = data['sen1'].values, data['sen2'].values, data['label'].values

In [5]:
def train_test_split(X1, X2, y, test_size=0.2, shuffle=True):
    """Split two aligned feature arrays and their labels into train and test parts.

    Args:
        X1, X2, y: Equal-length arrays that support integer-array indexing
            (e.g. NumPy arrays). Row i of each belongs to the same example.
        test_size: Fraction of the examples placed in the test part. The test
            count is ``int(len(y) * test_size)`` (rounded down).
        shuffle: When true, examples are permuted with ``np.random.shuffle``
            (global NumPy RNG) before splitting; otherwise the last rows form
            the test part.

    Returns:
        ``(X1_train, X2_train, y_train, X1_test, X2_test, y_test)``.
    """
    leng = len(y)
    inds = np.arange(leng)
    if shuffle:
        np.random.shuffle(inds)
    hook = int(leng * test_size)
    # Slice at an explicit cut position. Negative slicing (inds[:-hook]) breaks
    # when hook == 0: inds[:-0] is empty and inds[-0:] is the whole array, which
    # would silently put every example in the test part.
    cut = leng - hook
    train_inds, test_inds = inds[:cut], inds[cut:]
    X1_train, X2_train, y_train = X1[train_inds], X2[train_inds], y[train_inds]
    X1_test, X2_test, y_test = X1[test_inds], X2[test_inds], y[test_inds]
    return X1_train, X2_train, y_train, X1_test, X2_test, y_test

In [6]:
# discarded
class MyTokenizer():
    """Segment text with jieba and map the resulting words to integer ids.

    Ids 0 and 1 are pre-assigned to '<PAD>' and '<UNK>'. Every other word gets
    the next unused id the first time `fit` encounters it.
    """

    def __init__(self, filters=u'，？。！的了和是就都而及与着或'):
        # Each character of `filters` is an individual token to discard.
        self.filters = set(filters)
        self.word_dict = {'<PAD>':0, '<UNK>':1}

    def _segment(self, text):
        """Cut `text` with jieba, dropping whitespace-only and filtered tokens."""
        return [tok for tok in jb.lcut(text) if tok.strip() and tok not in self.filters]

    def fit(self, texts):
        """Add every unseen word of `texts` to the vocabulary and return it."""
        for text in texts:
            for tok in self._segment(text):
                # setdefault only inserts when `tok` is new, so existing ids are kept.
                self.word_dict.setdefault(tok, len(self.word_dict))
        return self.word_dict

    def texts_to_seqs(self, texts):
        """Convert each text to a list of word ids; unknown words map to 1 ('<UNK>')."""
        return [
            [self.word_dict.get(tok, 1) for tok in self._segment(text)]
            for text in texts
        ]

In [7]:
def tokenize(texts, filters=u'，？。！的了和是就都而及与着或'):
    """Segment each text with jieba and return one list of words per text.

    Whitespace-only tokens are dropped, as is any token equal to one of the
    characters in `filters`.
    """
    stop_tokens = set(filters)
    return [
        [tok for tok in jb.lcut(text) if tok.strip() and tok not in stop_tokens]
        for text in texts
    ]

In [8]:
def to_int_seqs(seqs, vocab):
    """Replace every token in every sequence with its id from `vocab`.

    Tokens that are not in `vocab` are encoded as 0.
    """
    return [
        [vocab[tok] if tok in vocab else 0 for tok in seq]
        for seq in seqs
    ]

In [9]:
def gen_emb_matrix(word_vecs, vocab):
    """Build a (len(vocab), EMD_DIM) matrix whose row i is the vector of word i.

    `vocab` maps word -> row index and `word_vecs` is indexed by word. The
    entry with index 0 is never looked up, so row 0 stays all zeros.
    """
    matrix = np.zeros((len(vocab), EMD_DIM))
    for word, idx in vocab.items():
        if idx == 0:
            continue
        matrix[idx] = word_vecs[word]
    return matrix

In [10]:
def fscore(y_true, y_pred):
    """F1 score of the positive class, for use as a Keras metric.

    `y_pred` is thresholded at 0.5 to obtain hard 0/1 predictions. Precision
    and recall are computed over the tensors passed in, then combined as
    ``2 * P * R / (P + R)``.

    Args:
        y_true: Tensor of ground-truth labels (expected to be 0/1 floats).
        y_pred: Tensor of predicted probabilities, same shape as `y_true`.

    Returns:
        Scalar tensor holding the F1 score. It is 0 when there are no true
        positives.
    """
    eps = 1e-7  # keeps every division finite when a count is zero
    y_pred = K.cast(K.greater(y_pred, 0.5), dtype='float32')
    # A position is a true positive when label and hard prediction are both 1,
    # i.e. their sum equals 2.
    tp = K.sum(K.cast(K.equal(y_true + y_pred, 2.), dtype='float32'))
    precision = tp / (K.sum(y_pred) + eps)
    recall = tp / (K.sum(y_true) + eps)
    # The previous version returned `precision` here although the metric is
    # reported under the name "fscore"; return the actual harmonic mean.
    return 2. * precision * recall / (precision + recall + eps)

In [51]:
def my_model(vocab_size, emb_matrix):
    """Build and compile the sentence-pair classifier.

    Both sentences go through the same frozen Embedding layer and the same
    LSTM (weights shared between the two branches). The two LSTM output
    sequences are multiplied element-wise, flattened, and passed through
    Dense(100) and a single sigmoid unit.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_matrix: Initial embedding weights, passed to the Embedding layer
            as-is; the layer is not trainable.

    Returns:
        A compiled Keras `Model` taking `[input1, input2]` (each of length
        MAX_LEN) and producing one sigmoid output.
    """
    input1 = Input(shape=(MAX_LEN,))
    input2 = Input(shape=(MAX_LEN,))
    pair_inputs = [input1, input2]

    # Layers shared by both branches.
    shared_embedding = Embedding(
        vocab_size,
        EMD_DIM,
        weights=[emb_matrix],
        input_length=MAX_LEN,
        trainable=False,
    )
    shared_lstm = LSTM(256, return_sequences=True)

    # Embed both inputs first, then run the LSTM over both embeddings.
    embedded = [shared_embedding(tensor) for tensor in pair_inputs]
    encoded = [shared_lstm(tensor) for tensor in embedded]

    # Element-wise interaction of the two per-timestep encodings.
    merged = Multiply()(encoded)
    merged = Flatten()(merged)
    # NOTE(review): Dense(100) has no activation argument, so it is a linear
    # projection feeding the sigmoid unit - confirm this is intended.
    hidden = Dense(100)(merged)

    pred = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=pair_inputs, outputs=pred)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', fscore],
    )
    return model

In [18]:
# Split the raw sentence pairs and labels into train and test parts using the
# train_test_split defaults (test_size=0.2, shuffle=True).
# NOTE(review): no random seed is set in the cells shown, so the split
# presumably differs between runs - confirm whether that is acceptable.
X1_train, X2_train, y_train, X1_test, X2_test, y_test = train_test_split(sens1, sens2, labels)

In [19]:
# Segment every sentence into a list of words.
X1_train, X2_train = tokenize(X1_train), tokenize(X2_train)
X1_test, X2_test = tokenize(X1_test), tokenize(X2_test)

# Corpus for Word2Vec: the token lists of both training columns (test data is
# left out). tokenize returns plain Python lists, so `+` joins them directly.
# np.concatenate is avoided because the token lists have different lengths;
# building an array from such ragged input raises ValueError on current NumPy.
sentences = X1_train + X2_train

In [20]:
# Train EMD_DIM-dimensional word vectors on the tokenized training sentences.
# min_count=1 keeps every word, including those seen only once.
# NOTE(review): the `size` keyword is the gensim < 4.0 name (renamed to
# `vector_size` in 4.0) - confirm against the installed gensim version.
wv_model = Word2Vec(size=EMD_DIM, min_count=1)
wv_model.build_vocab(sentences)
wv_model.train(sentences, total_examples=wv_model.corpus_count, epochs=5)
# Persist the trained model next to the notebook.
wv_model.save('./wv_model')

In [21]:
# Keep only the trained vectors and drop the reference to the full model.
word_vectors = wv_model.wv
del wv_model

# Index-to-word list: position 0 is '<UNK>', followed by the Word2Vec
# vocabulary in its own order.
i2w = [u'<UNK>']
i2w.extend(word_vectors.index2entity)

# Inverse mapping, word -> index.
vocab = {word: idx for idx, word in enumerate(i2w)}

In [22]:
def to_paded_seqs(x):
    """Encode token lists with `vocab` and pad/truncate them to MAX_LEN.

    Padding and truncation are both applied at the end of each sequence.
    """
    int_seqs = to_int_seqs(x, vocab)
    return pad_sequences(int_seqs, maxlen=MAX_LEN, padding='post', truncating='post')


# Replace the token lists with fixed-length integer sequences.
X1_train, X2_train = to_paded_seqs(X1_train), to_paded_seqs(X2_train)
X1_test, X2_test = to_paded_seqs(X1_test), to_paded_seqs(X2_test)

In [34]:
# Embedding weights aligned with the vocab indices; row 0 ('<UNK>') is left
# as zeros by gen_emb_matrix.
emb_matrix = gen_emb_matrix(word_vectors, vocab)

In [53]:
# Build the model with one embedding row per vocab entry, then print its layer summary.
model = my_model(len(vocab), emb_matrix)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 200, 200)     2401800     input_27[0][0]                   
                                                                 input_28[0][0]                   
__________________________________________________________________________________________________
lstm_14 (LSTM)                  (None, 200, 256)     467968      embedding_14[0][0]               
          

In [None]:
# Train for 5 epochs with batches of 64; the held-out split is passed as
# validation data so its loss and metrics are reported alongside training.
model.fit([X1_train, X2_train], y_train, batch_size=64, epochs=5, validation_data=([X1_test, X2_test], y_test))

Train on 81982 samples, validate on 20495 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [61]:
# For each split, print 1 - (sum of labels / number of labels).
# Assuming the labels are 0/1, this is the share of label-0 examples.
for split_labels in (y_train, y_test):
    print(1 - split_labels.sum() * 1.0 / len(split_labels))

0.8162157546778561
0.8234691388143449
