In [1]:
import numpy as np
import pandas as pd
import jieba as jb
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [39]:
# Scratch check: static size of the first axis of an element-wise K.equal result.
print(
    K.equal(
        K.constant([1, 2, 3]),
        K.constant([0, 2, 2]),
    ).shape[0]
)

3


In [2]:
# Load the custom user dictionary at ./userdict.txt into jieba.
jb.load_userdict('./userdict.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.315 seconds.
Prefix dict has been built succesfully.


In [62]:
# define constant
# MAX_LEN: length every token-id sequence is padded/truncated to
# (used as pad_sequences maxlen and as the model's Input shape).
MAX_LEN = 150
# EMD_DIM: output dimension of the Embedding layer built in my_model.
EMD_DIM = 200

In [4]:
# Read both tab-separated training files with the same column layout
# (first column as index, then sen1 / sen2 / label) and stack them.
data1, data2 = (
    pd.read_csv(csv_path, sep='\t', index_col=0, names=['sen1', 'sen2', 'label'])
    for csv_path in ('atec_nlp_sim_train.csv', 'atec_nlp_sim_train_add.csv')
)
data = pd.concat([data1, data2])

In [5]:
# Pull the two sentence columns and the label column out as NumPy arrays.
# `.values` is used instead of `Series.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
sens1, sens2, labels = (data[col].values for col in ('sen1', 'sen2', 'label'))

In [6]:
def train_test_split(X1, X2, y, test_size=0.2, shuffle=True, random_state=None):
    """Split two aligned feature arrays and their labels into train/test parts.

    The last ``int(len(y) * test_size)`` positions of the (optionally
    shuffled) index order form the test set; everything before them is the
    training set. The same indices are applied to ``X1``, ``X2`` and ``y`` so
    rows stay aligned.

    Args:
        X1, X2, y: equally long arrays that support integer-array indexing
            (e.g. NumPy arrays).
        test_size: fraction of samples placed in the test set, in [0, 1].
        shuffle: whether to permute the samples before splitting.
        random_state: optional seed for a private ``RandomState``; when None
            the global NumPy RNG is used, as before.

    Returns:
        Tuple ``(X1_train, X2_train, y_train, X1_test, X2_test, y_test)``.

    Raises:
        ValueError: if ``test_size`` is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError('test_size must be in [0, 1], got %r' % (test_size,))

    n_samples = len(y)
    n_test = int(n_samples * test_size)

    inds = np.arange(n_samples)
    if shuffle:
        if random_state is None:
            np.random.shuffle(inds)
        else:
            np.random.RandomState(random_state).shuffle(inds)

    # Split on an explicit boundary. Slicing with `-n_test` breaks when
    # n_test == 0: `inds[:-0]` is empty and `inds[-0:]` is the whole array,
    # which would silently swap the train and test sets.
    boundary = n_samples - n_test
    train_inds, test_inds = inds[:boundary], inds[boundary:]

    return (X1[train_inds], X2[train_inds], y[train_inds],
            X1[test_inds], X2[test_inds], y[test_inds])

In [7]:
class MyTokenizer():
    """Word-level tokenizer that maps segmented text to integer ids.

    Id 0 is reserved for ``'<PAD>'`` and id 1 for ``'<UNK>'``; every new word
    seen by ``fit`` receives the next free id.

    Args:
        filters: iterable of tokens (by default four full-width punctuation
            marks) that are dropped after segmentation.
        cut: optional callable ``text -> list of tokens`` used to segment
            text. When None (the default) ``jb.lcut`` is used, as before.
    """

    def __init__(self, filters=u'，？。！', cut=None):
        self.filters = set(filters)
        self.word_dict = {'<PAD>':0, '<UNK>':1}
        self.cut = cut

    def _tokenize(self, text):
        """Segment one text and drop whitespace-only and filtered tokens."""
        # Resolve jieba lazily so a custom segmenter never touches `jb`.
        cut = self.cut if self.cut is not None else jb.lcut
        return [w for w in cut(text) if w.strip() and w not in self.filters]

    def fit(self, texts):
        """Add every unseen word in ``texts`` to the vocabulary.

        Returns:
            The (mutated) word -> id dictionary.
        """
        for text in texts:
            for w in self._tokenize(text):
                # len() is evaluated before insertion, so ids stay contiguous.
                self.word_dict.setdefault(w, len(self.word_dict))
        return self.word_dict

    def texts_to_seqs(self, texts):
        """Convert each text to a list of word ids.

        Words missing from the vocabulary map to the ``'<UNK>'`` id.
        """
        unk = self.word_dict['<UNK>']
        return [[self.word_dict.get(w, unk) for w in self._tokenize(text)]
                for text in texts]

In [91]:
def fscore(y_true, y_pred):
    """F1 score of the positive class, with predictions thresholded at 0.5.

    The previous version returned precision only; the harmonic mean of
    precision and recall is now returned, as the name implies.

    Args:
        y_true: tensor of ground-truth labels (0 or 1).
        y_pred: tensor of predicted probabilities.

    Returns:
        Scalar tensor ``2 * P * R / (P + R)`` for the tensors it is given.
        NOTE(review): when used as a Keras metric this is presumably computed
        per batch and averaged, so it may differ from the F1 over the whole
        dataset -- confirm before reporting it.
    """
    eps = 1e-7  # keeps every division finite when a count is zero
    y_pred = K.cast(K.greater(y_pred, 0.5), dtype='float32')
    # A true positive is a position where label and thresholded prediction are both 1.
    tp = K.sum(K.cast(K.equal(y_true + y_pred, 2.), dtype='float32'))
    precision = tp / (K.sum(y_pred) + eps)
    recall = tp / (K.sum(y_true) + eps)
    # Without eps the score would be NaN whenever there are no true positives.
    return 2. * precision * recall / (precision + recall + eps)

In [94]:
def my_model(vocab_size):
    """Build and compile the sentence-pair similarity classifier.

    Both inputs are integer token-id sequences of length ``MAX_LEN``. They go
    through one shared ``Embedding`` layer and one shared ``LSTM(128)``
    encoder; the two encodings are concatenated and passed to a single
    sigmoid unit.

    Args:
        vocab_size: number of rows of the embedding table, i.e. the size of
            the word -> id dictionary including id 0 (padding).

    Returns:
        A ``Model`` taking ``[input1, input2]``, compiled with RMSprop,
        binary cross-entropy, and the ``accuracy`` and ``fscore`` metrics.
    """
    input1 = Input(shape=(MAX_LEN,))
    input2 = Input(shape=(MAX_LEN,))

    # Sequences are post-padded with id 0 up to MAX_LEN. Masking that id lets
    # the LSTM skip the padded steps, so its final state reflects the last
    # real token rather than a long run of padding.
    embedding = Embedding(vocab_size, EMD_DIM, input_length=MAX_LEN, mask_zero=True)
    lstm = LSTM(128)

    # The same layer objects encode both sentences, so weights are shared.
    x1 = lstm(embedding(input1))
    x2 = lstm(embedding(input2))

    x = Concatenate(axis=-1)([x1, x2])
    pred = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input1, input2], outputs=pred)
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy', fscore])

    return model

In [64]:
# Split the raw sentence pairs and labels into train and test parts
# (default arguments: 20% test, shuffled).
(X1_train, X2_train, y_train,
 X1_test, X2_test, y_test) = train_test_split(sens1, sens2, labels)

In [65]:
# Build the vocabulary from the training sentences only (both sides of each pair).
tokenizer = MyTokenizer()
word_dict = tokenizer.fit(np.concatenate([X1_train, X2_train]))

In [66]:
def f_pad(x):
    """Pad at the end / truncate from the front so every sequence has MAX_LEN ids."""
    return pad_sequences(x, maxlen=MAX_LEN, padding='post', truncating='pre')


# Text -> lists of word ids. NOTE: this rebinds the X*_train / X*_test names,
# so the cell cannot be re-run without re-running the split cell first.
X1_train = tokenizer.texts_to_seqs(X1_train)
X2_train = tokenizer.texts_to_seqs(X2_train)
X1_test = tokenizer.texts_to_seqs(X1_test)
X2_test = tokenizer.texts_to_seqs(X2_test)

# Lists of ids -> fixed-width integer arrays.
X1_train = f_pad(X1_train)
X2_train = f_pad(X2_train)
X1_test = f_pad(X1_test)
X2_test = f_pad(X2_test)

In [98]:
# The embedding table needs one row per entry of the fitted vocabulary.
model = my_model(vocab_size=len(word_dict))
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_45 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
input_46 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, 150, 200)     2406000     input_45[0][0]                   
                                                                 input_46[0][0]                   
__________________________________________________________________________________________________
lstm_23 (LSTM)                  (None, 128)          168448      embedding_23[0][0]               
          

In [None]:
# Train on the first 50000 training pairs; validate on the first 5000 test pairs.
model.fit(
    [X1_train[:50000], X2_train[:50000]],
    y_train[:50000],
    batch_size=64,
    epochs=5,
    validation_data=(
        [X1_test[:5000], X2_test[:5000]],
        y_test[:5000],
    ),
)

Train on 50000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [61]:
# Share of negative (label 0) examples in the train and test labels, one per line.
print(*[1 - split.sum()*1.0 / len(split) for split in (y_train, y_test)], sep='\n')

0.8162157546778561
0.8234691388143449
