### Read sentences

In [1]:
import os
import random
import time
from time import strftime

from keras import backend as K

import numpy as np
from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Input, LSTM
from keras.layers import Embedding, Dropout, Bidirectional
from keras.models import Model, Sequential
from keras.layers import Activation, Conv1D, MaxPooling1D, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.regularizers import l2
from keras.utils import to_categorical
from keras.layers.normalization import BatchNormalization
import keras

Using TensorFlow backend.


In [2]:
ROOT_PATH  = "data/"

In [3]:
es_e_l = []
es_s_l = []
es_e_r = []
es_s_r = []
es_labels = []
## english-spanish text 
with open(os.path.join(ROOT_PATH, "cikm_english_train_20180516.txt"), 'r', encoding='utf-8') as esf:
    for line in esf:
        segs = line.strip().replace('?','').split('\t')
        es_e_l.append(segs[0].lower())
        es_e_r.append(segs[2].lower())
        es_s_l.append(segs[1].lower())
        es_s_r.append(segs[3].lower())
        es_labels.append(int(segs[4]))
        
se_e_l = []
se_s_l = []
se_e_r = []
se_s_r = []
se_labels = []
## spanish-english text
with open(os.path.join(ROOT_PATH, "cikm_spanish_train_20180516.txt"), 'r', encoding='utf-8') as ssf:
    for line in ssf:
        segs = line.strip().replace('?','').split('\t')
        se_e_l.append(segs[1].lower())
        se_e_r.append(segs[3].lower())
        se_s_l.append(segs[0].lower())
        se_s_r.append(segs[2].lower())
        se_labels.append(int(segs[4]))

test_s_1 = []
test_s_2 = []
## spanish test file
with open(os.path.join(ROOT_PATH, "cikm_test_a_20180516.txt"), 'r', encoding='utf-8') as tef:
    for line in tef:
        segs = line.strip().replace('?','').replace('¿', '').split('\t')
        test_s_1.append(segs[0].lower())
        test_s_2.append(segs[1].lower())

print("es data size:", len(es_s_l))
print("se data size:", len(se_e_l))
print("test data size:", len(test_s_1))

es data size: 20000
se data size: 1400
test data size: 5000


### Load word_vec

In [4]:
## es.vec
es_vec = {}
with open(os.path.join(ROOT_PATH, "wiki.es.vec"), 'r', encoding='utf-8') as vecf:
    i = 0
    for line in vecf:
        if i == 0:
            continue
        i = 1
        segs = line.strip().split(' ')
        es_vec[segs[0]] = map(eval, segs[1:])

In [5]:
L_MAX_SEQUENCE_LENGTH = 60  #左边最大句子长度
R_MAX_SEQUENCE_LENGTH = 50  #右边最大句子长度
MAX_NB_WORDS = 20000      #词典大小，词的个数
EMBEDDING_DIM = 300       #词向量维度
VALIDATION_SPLIT = 0.2    # 测试集比例

In [6]:
##add data sets (s0, s1, y) + (s1, s0, y)
left_texts = se_s_l + es_s_l
right_texts = se_s_r + es_s_r
y = se_labels + es_labels
print("left data size:", len(left_texts))
print("right data size:", len(right_texts))
print("label size:", len(y))

left data size: 21400
right data size: 21400
label size: 21400


In [7]:
# prepare left embedding matrix
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(left_texts)
sequences = tokenizer.texts_to_sequences(left_texts)
MAX_LENGTH = 0
for s in sequences:
    if len(s) > MAX_LENGTH:
        MAX_LENGTH = len(s)
print(MAX_LENGTH)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
left_data = pad_sequences(sequences, maxlen=L_MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', left_data.shape)

### Prepare left embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
left_embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
print('Preparing embedding matrix. :', left_embedding_matrix.shape)
for word, i in word_index.items():
    embedding_vector = es_vec.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        left_embedding_matrix[i] = embedding_vector

        
left_embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[left_embedding_matrix],
                            input_length=L_MAX_SEQUENCE_LENGTH,
                            trainable=False)



53
Found 3286 unique tokens.
Shape of data tensor: (21400, 60)
Preparing embedding matrix. : (3287, 300)


In [8]:
# prepare right embedding matrix
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(right_texts)
sequences = tokenizer.texts_to_sequences(right_texts)
MAX_LENGTH = 0
for s in sequences:
    if len(s) > MAX_LENGTH:
        MAX_LENGTH = len(s)
print(MAX_LENGTH)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
right_data = pad_sequences(sequences, maxlen=R_MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', right_data.shape)

### Prepare right embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
right_embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
print('Preparing embedding matrix. :', right_embedding_matrix.shape)
for word, i in word_index.items():
    embedding_vector = es_vec.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        right_embedding_matrix[i] = embedding_vector

        
right_embedding_layer = Embedding(num_words + 1,
                            EMBEDDING_DIM,
                            weights=[right_embedding_matrix],
                            input_length=R_MAX_SEQUENCE_LENGTH,
                            trainable=False)



26
Found 2787 unique tokens.
Shape of data tensor: (21400, 50)
Preparing embedding matrix. : (2788, 300)


In [9]:
## split train and val sets
indices = np.arange(left_data.shape[0])
np.random.shuffle(indices)
data = left_data[indices]
y = np.array(y)
labels = y[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
print(num_validation_samples)

left_x_train = data[:-num_validation_samples]
left_x_val = data[-num_validation_samples:]

data = right_data[indices]
right_x_train = data[:-num_validation_samples]
right_x_val = data[-num_validation_samples:]

y_train = labels[:-num_validation_samples]
y_val = labels[-num_validation_samples:]

4280


In [10]:
## prepare test data
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(test_s_1)
sequences = tokenizer.texts_to_sequences(test_s_1)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
test_1 = pad_sequences(sequences, maxlen=L_MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', test_1.shape)

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(test_s_2)
sequences = tokenizer.texts_to_sequences(test_s_2)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
test_2 = pad_sequences(sequences, maxlen=R_MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', test_2.shape)



Found 3587 unique tokens.
Shape of data tensor: (5000, 60)
Found 1666 unique tokens.
Shape of data tensor: (5000, 50)


### Build model

In [11]:
input1 = keras.layers.Input(shape=(L_MAX_SEQUENCE_LENGTH,), dtype='float32')
print(input1.shape)
left_embedded_sequences = left_embedding_layer(input1)
x1 = LSTM(128, dropout_W=0.2, dropout_U=0.2)(left_embedded_sequences)
x1 = Dropout(0.5)(x1)
#x1 = BatchNormalization()(x1)
x1 = Dense(64, activation='relu')(x1) ## acitivation = tanh, relu, sigmoid

(?, 60)


  after removing the cwd from sys.path.


In [12]:
input2 = keras.layers.Input(shape=(R_MAX_SEQUENCE_LENGTH,), dtype='float32')
print(input2.shape)
right_embedded_sequences = right_embedding_layer(input2)
x2 = LSTM(128, dropout_W=0.2, dropout_U=0.2)(right_embedded_sequences)
x2 = Dropout(0.5)(x2)
#x1 = BatchNormalization()(x1)
x2 = Dense(64, activation='relu')(x2) ## acitivation = tanh, relu, sigmoid

(?, 50)


  after removing the cwd from sys.path.


In [13]:
# new metric
def logloss(y_true, y_pred):
    return -K.mean(y_true*K.log(y_pred) + (1-y_true)*K.log(1-y_pred))

In [14]:
merged = keras.layers.add([x1, x2])  # add, concatenate, maximum
# We stack a deep densely-connected network on top
#merged = Conv1D(filters=50, kernel_size=5, activation='relu')(merged)
#merged = MaxPooling1D(pool_size=5)(merged)
#merged = Flatten()(merged)
merged = Dropout(0.5)(merged)
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.5)(merged)
merged = BatchNormalization()(merged)
merged = Dense(32, activation='tanh')(merged)
merged = BatchNormalization()(merged)

model_file = "Model_" + strftime("%Y-%m-%d %H-%M", time.localtime()) + ".mdl"
model_checkpoint = ModelCheckpoint(model_file, monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='auto')

output = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[input1, input2], outputs=output)
model.compile(loss='binary_crossentropy',  optimizer='adam', metrics=[logloss])  ## optimizer= sgd, adam, rmsprop
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
model.fit([left_x_train, right_x_train], y_train, batch_size=100, nb_epoch=20, validation_data=([left_x_val, right_x_val], y_val), callbacks=[early_stopping, model_checkpoint])
predicts = model.predict([test_1, test_2], batch_size=100, verbose=1)



Train on 17120 samples, validate on 4280 samples
Epoch 1/20
Epoch 2/20




 1900/17120 [==>...........................] - ETA: 41s - loss: 0.5543 - logloss: 0.5543

KeyboardInterrupt: 

In [None]:
[left_x_train, right_x_train]