In [1]:
import io
import re

import numpy as np
import pandas as pd
import jieba as jb
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import Model
from keras.optimizers import RMSprop
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Register the project-specific word list with jieba before any sentence is
# segmented. './dict.txt' is the file written by the dictionary-mining cells at
# the end of this notebook, so it has to exist on disk before this cell runs.
jb.load_userdict('./dict.txt')

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 2.111 seconds.
Prefix dict has been built succesfully.


In [3]:
# define constants
MAX_LEN = 280  # every token sequence is padded/truncated to this length
EMD_DIM = 200  # dimensionality of the word2vec vectors and the embedding layer

# Stop words are dropped during tokenisation. io.open decodes the file as UTF-8
# on both Python 2 and Python 3 (the built-in open() only returns text on 3),
# and split() without an argument ignores newlines and repeated blanks so no
# entry ends up with stray whitespace attached.
with io.open('stopwords.txt', encoding='utf-8') as f:
    STOP_WORDS = f.read().split()

In [4]:
# read data
# Both files are tab-separated; index_col=0 turns the first column into the
# row index and the remaining columns are named explicitly.
_PAIR_COLUMNS = ['sen1', 'sen2', 'label']
data1 = pd.read_csv('atec_nlp_sim_train.csv', sep='\t', index_col=0, names=_PAIR_COLUMNS)
data2 = pd.read_csv('atec_nlp_sim_train_add.csv', sep='\t', index_col=0, names=_PAIR_COLUMNS)
data = pd.concat([data1, data2])

# .values returns the same ndarray that the deprecated .as_matrix() did and is
# available on every pandas version.
sens1, sens2, labels = data['sen1'].values, data['sen2'].values, data['label'].values

In [5]:
def train_test_split(X1, X2, y, test_size=0.2, shuffle=True, seed=None):
    """Split two parallel feature arrays and their labels into train/test parts.

    Args:
        X1, X2, y: arrays of equal length that support integer-array indexing.
        test_size: fraction of the samples placed in the test part.
        shuffle: when True the sample order is permuted before splitting.
        seed: optional seed for a private RandomState, giving a reproducible
            shuffle. When None the global numpy random state is used.

    Returns:
        (X1_train, X2_train, y_train, X1_test, X2_test, y_test)
    """
    leng = len(y)
    inds = np.arange(leng)
    if shuffle:
        if seed is None:
            np.random.shuffle(inds)
        else:
            np.random.RandomState(seed).shuffle(inds)

    # Clamp so that an out-of-range test_size cannot produce a bogus slice.
    n_test = min(max(int(leng * test_size), 0), leng)
    # Slice from an explicit boundary: negative slicing breaks when n_test is
    # 0, because inds[:-0] is empty and inds[-0:] is the whole array.
    boundary = leng - n_test
    train_idx, test_idx = inds[:boundary], inds[boundary:]

    return (X1[train_idx], X2[train_idx], y[train_idx],
            X1[test_idx], X2[test_idx], y[test_idx])

In [6]:
# discarded
class MyTokenizer():
    """Word-level tokenizer that builds its own vocabulary on top of jieba.

    Index 0 is reserved for '<PAD>' and index 1 for '<UNK>'; new words get the
    next free index in the order they are first seen.
    """

    def __init__(self, filters=u'，？。！的了和是就都而及与着或'):
        # Each character of `filters` is treated as one token to discard.
        self.filters = set(filters)
        self.word_dict = {'<PAD>':0, '<UNK>':1}

    def _segment(self, text):
        """Cut `text` with jieba, dropping blank tokens and filtered tokens."""
        return [tok for tok in jb.lcut(text)
                if tok.strip() and tok not in self.filters]

    def fit(self, texts):
        """Add every unseen word of `texts` to the vocabulary and return it."""
        for text in texts:
            for tok in self._segment(text):
                # len() is evaluated before insertion, so it is the next free index.
                self.word_dict.setdefault(tok, len(self.word_dict))
        return self.word_dict

    def texts_to_seqs(self, texts):
        """Convert each text to a list of word indices; unknown words map to 1."""
        unk_index = 1
        return [[self.word_dict.get(tok, unk_index) for tok in self._segment(text)]
                for text in texts]

In [7]:
def tokenize(texts, filters=u'，？。！'):
    """Segment each text with jieba and drop unwanted tokens.

    Args:
        texts: iterable of sentences.
        filters: iterable of tokens to discard in addition to STOP_WORDS; for
            a string, each character is one token.

    Returns:
        A list with one list of kept tokens per input sentence. Whitespace-only
        tokens are always removed.
    """
    # Build the blocklist once as a set: membership is checked for every token
    # of every sentence, and a list would make each check a linear scan.
    blocked = set(filters)
    blocked.update(STOP_WORDS)
    return [[w for w in jb.lcut(sen) if w.strip() and w not in blocked]
            for sen in texts]

In [8]:
def to_int_seqs(seqs, vocab):
    """Translate token sequences into index sequences.

    Every token is looked up in `vocab`; tokens that are not present are
    encoded as 0.
    """
    return [[vocab.get(token, 0) for token in seq] for seq in seqs]

In [9]:
def gen_emb_matrix(word_vecs, vocab):
    """Assemble the embedding table used to initialise the Embedding layer.

    Row `i` holds `word_vecs[w]` for each `(w, i)` pair in `vocab`. Row 0 is
    never filled and therefore stays all zeros.
    """
    matrix = np.zeros((len(vocab), EMD_DIM))
    for word, row in vocab.items():
        if row == 0:
            # Index 0 has no trained vector; keep the zero row.
            continue
        matrix[row] = word_vecs[word]
    return matrix

In [39]:
def fscore(y_true, y_pred):
    """F1 score of the positive class, usable as a Keras metric.

    Predictions are binarised at 0.5. A sample counts as a true positive when
    label plus binarised prediction equals 2. K.epsilon() is added to each
    denominator to avoid division by zero.
    """
    eps = K.epsilon()
    hard_pred = K.cast(K.greater(y_pred, 0.5), dtype='float32')
    hits = K.cast(K.equal(y_true + hard_pred, 2.), dtype='float32')
    true_pos = K.sum(hits)

    precision = true_pos / (K.sum(hard_pred) + eps)
    recall = true_pos / (K.sum(y_true) + eps)
    return (2 * precision * recall) / (precision + recall + eps)

In [42]:
def my_model(vocab_size, emb_matrix, lr=0.001):
    """Build and compile the sentence-pair similarity classifier.

    Both inputs pass through the same layer objects (shared weights):
    frozen Embedding -> Conv1D(128, 3) -> MaxPooling1D(2) -> bidirectional
    LSTM(128) returning sequences. The two encoded sequences are compared with
    an element-wise product and an absolute difference, which are concatenated
    on the last axis, flattened and fed to Dense(10) and a sigmoid output.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_matrix: initial embedding weights, passed to the Embedding layer
            declared as (vocab_size, EMD_DIM); the layer is not trainable.
        lr: learning rate given to RMSprop.

    Returns:
        A compiled Model taking [input1, input2], trained with binary
        cross-entropy and reporting accuracy and `fscore`.
    """
    input1 = Input(shape=(MAX_LEN,))
    input2 = Input(shape=(MAX_LEN,))

    # Layers are created once and applied to both inputs so the two branches
    # share their weights.
    embedding = Embedding(vocab_size,
                          EMD_DIM,
                          weights=[emb_matrix],
                          input_length=MAX_LEN,
                          trainable=False)
    conv1d = Conv1D(128, 3)
    lstm1 = Bidirectional(LSTM(128, return_sequences=True))
    pooling1d = MaxPooling1D(pool_size=2)

    x1 = embedding(input1)
    x2 = embedding(input2)

    x1 = conv1d(x1)
    x2 = conv1d(x2)

    x1 = pooling1d(x1)
    x2 = pooling1d(x2)

    x1 = lstm1(x1)
    x2 = lstm1(x2)

    # Interaction features between the two encoded sentences.
    x3 = Multiply()([x1, x2])

    x4 = Subtract()([x1, x2])
    x4 = Lambda(lambda x: K.abs(x))(x4)

    x = Concatenate(axis=-1)([x3, x4])
    x = Flatten()(x)
    # NOTE(review): no activation is given, so this layer is linear - confirm
    # that this is intended.
    x = Dense(10)(x)

    pred = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input1, input2], outputs=pred)

    model.compile(optimizer=RMSprop(lr=lr),
              loss='binary_crossentropy',
              metrics=['accuracy', fscore])

    return model

In [12]:
# Split the raw sentence pairs and labels into train and test parts using the
# function's defaults (test_size=0.2, shuffle=True). No random seed is set in
# the visible cells, so the split presumably differs between kernel runs.
X1_train, X2_train, y_train, X1_test, X2_test, y_test = train_test_split(sens1, sens2, labels)

In [13]:
# Replace the raw sentences with their token lists.
# NOTE: this rebinds the same names, so the cell must not be re-run without
# re-running the split cell first.
X1_train, X2_train = tokenize(X1_train), tokenize(X2_train)
X1_test, X2_test = tokenize(X1_test), tokenize(X2_test)

# Corpus for word2vec: training sentences only. tokenize() returns plain lists
# of variable-length token lists, so concatenate them as lists; pushing ragged
# lists through np.concatenate relies on implicit object-array creation, which
# recent NumPy versions reject.
sentences = X1_train + X2_train

In [17]:
# Train EMD_DIM-dimensional word2vec vectors on the tokenised training
# sentences. min_count=1 so that no word is dropped for being rare.
wv_model = Word2Vec(size=EMD_DIM, min_count=1)
# The vocabulary has to be built before train() is called; corpus_count is
# read from the model after build_vocab.
wv_model.build_vocab(sentences)
wv_model.train(sentences, total_examples=wv_model.corpus_count, epochs=8)
# Persist the trained model next to the notebook.
wv_model.save('./wv_model')

In [18]:
# Keep only the trained vectors and drop the reference to the full model.
word_vectors = wv_model.wv
del wv_model

# Index-to-word list with '<UNK>' prepended, so index 0 means "unknown" and
# the word2vec entries start at index 1.
i2w = [u'<UNK>'] + word_vectors.index2entity
# Inverse mapping: word -> index.
vocab = {word: idx for idx, word in enumerate(i2w)}

In [19]:
def to_paded_seqs(x):
    """Encode token lists as vocabulary indices and pad/truncate to MAX_LEN.

    Padding and truncation both happen at the end of the sequence ('post').
    """
    int_seqs = to_int_seqs(x, vocab)
    return pad_sequences(int_seqs, maxlen=MAX_LEN, padding='post', truncating='post')


# Replace the token lists with fixed-length index arrays.
X1_train, X2_train = to_paded_seqs(X1_train), to_paded_seqs(X2_train)
X1_test, X2_test = to_paded_seqs(X1_test), to_paded_seqs(X2_test)

In [20]:
# Embedding table aligned with `vocab`: row i holds the word2vec vector of the
# word whose index is i; row 0 ('<UNK>') stays all zeros.
emb_matrix = gen_emb_matrix(word_vectors, vocab)

In [43]:
# Build the network with one embedding row per vocabulary entry. lr=0.01
# overrides the function's default of 0.001.
model = my_model(len(vocab), emb_matrix, lr=0.01)
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 280)          0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 280)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 280, 200)     2375200     input_17[0][0]                   
                                                                 input_18[0][0]                   
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 278, 128)     76928       embedding_9[0][0]                
          

In [None]:
# Train for 5 epochs with batches of 64; the held-out test split is passed as
# validation data, so validation metrics are reported after every epoch.
model.fit([X1_train, X2_train], y_train, batch_size=64, epochs=5, validation_data=([X1_test, X2_test], y_test))

Train on 81982 samples, validate on 20495 samples
Epoch 1/5
 1664/81982 [..............................] - ETA: 58:01 - loss: 3.1135 - acc: 0.7686 - fscore: 0.0208

In [61]:
# Print 1 minus the mean label for the train and test splits.
# Presumably labels are 0/1, in which case this is the share of negative pairs,
# i.e. the accuracy of always predicting 0 - confirm against the data.
print(1 - y_train.sum()*1.0 / len(y_train))
print(1 - y_test.sum()*1.0 / len(y_test))

0.8162157546778561
0.8234691388143449


#### 以下代码只是辅助找出数据里的一些常用词以补充jieba词典

In [22]:
def clean(sens):
    """Replace sentence punctuation with spaces and trim each sentence.

    Args:
        sens: iterable of sentences, either UTF-8 encoded bytes or text.

    Returns:
        A list of text strings in which each of '，', '。', '？', '！' has been
        replaced by a single space and surrounding whitespace removed.
    """
    cleaned = []
    for s in sens:
        # Decode only raw bytes; text input has no usable .decode (Python 3)
        # or would be implicitly ASCII-encoded first (Python 2).
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        cleaned.append(re.sub(u'[，。？！]', ' ', s).strip())
    return cleaned

In [45]:
def n_counts(sens, n, min_count=5, inverse=False):
    """Count the n-grams of consecutive items in every sentence.

    Unigrams (n == 1) are keyed by the stripped item itself; longer n-grams
    are keyed by a tuple of items. Windows containing a whitespace-only item
    are skipped. With `inverse=True` each sentence is reversed before
    counting. Entries seen fewer than `min_count` times are dropped.
    """
    tally = {}
    for sentence in sens:
        if len(sentence) < n:
            continue
        text = sentence[::-1] if inverse else sentence
        for start in range(len(text) - n + 1):
            if n == 1:
                gram = text[start].strip()
            else:
                gram = tuple(ch for ch in text[start:start + n] if ch.strip())
            # A shorter key means a blank item was filtered out of the window.
            if len(gram) != n:
                continue
            tally[gram] = tally.get(gram, 0) + 1
    return {gram: seen for gram, seen in tally.items() if seen >= min_count}

In [53]:
def _cohesive_words(ngram_counts, prefix_counts, min_freq, reverse=False):
    """Return the n-grams that follow their (n-1)-item prefix often enough.

    An n-gram is kept when count(n-gram) / count(prefix) >= min_freq. With
    `reverse=True` the items are joined back to front, which restores the
    original reading order of n-grams counted on reversed sentences.
    """
    words = set()
    for gram, count in ngram_counts.items():
        # n_counts keys unigrams by the bare item and longer grams by tuple.
        prefix = gram[0] if len(gram) == 2 else gram[:-1]
        freq = count * 1.0 / prefix_counts[prefix] if prefix in prefix_counts else 0
        if freq >= min_freq:
            words.add(u''.join(gram[::-1] if reverse else gram))
    return words


def extract_words(sens, min_freq=0.8, min_count=5):
    """Propose 2-, 3- and 4-item strings that behave like fixed words.

    A candidate is accepted only if it is cohesive in both reading directions:
    its count relative to its leading (n-1)-gram and, on the reversed
    sentences, relative to its trailing (n-1)-gram must both reach `min_freq`.

    Args:
        sens: iterable of sentences.
        min_freq: minimum conditional frequency in each direction.
        min_count: n-grams seen fewer times than this are ignored.

    Returns:
        A list with the accepted 2-grams, then 3-grams, then 4-grams; each
        group is sorted so the result is the same on every run.
    """
    forward = {n: n_counts(sens, n, min_count) for n in (1, 2, 3, 4)}
    backward = {n: n_counts(sens, n, min_count, inverse=True) for n in (2, 3, 4)}
    # Unigram counts do not depend on the reading direction.
    backward[1] = forward[1]

    result = []
    for n in (2, 3, 4):
        left_to_right = _cohesive_words(forward[n], forward[n - 1], min_freq)
        right_to_left = _cohesive_words(backward[n], backward[n - 1], min_freq,
                                        reverse=True)
        result.extend(sorted(left_to_right & right_to_left))
    return result

In [57]:
# Mine candidate dictionary words from every sentence of both columns, with
# min_freq=0.8 and min_count=50.
# NOTE: clean() calls re.sub, so the `re` module must be imported before this
# cell runs.
newsens = clean(np.concatenate((sens1, sens2)))
words = extract_words(newsens, 0.8, 50)

In [60]:
# Write the mined words to the jieba user dictionary, one word per line.
# io.open with an explicit encoding accepts text on both Python 2 and 3;
# writing pre-encoded bytes to a text-mode file raises TypeError on Python 3.
with io.open('dict.txt', 'w', encoding='utf-8') as f:
    f.write(u'\n'.join(words))