In [1]:
import pandas as pd
import numpy as np
import re
from keras.layers import Dense, Dropout, GRU, Embedding 
from keras.layers import Input, Activation, concatenate, GlobalAveragePooling1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.optimizers import Adam, RMSprop
from keras import backend as K
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau
from keras.utils import np_utils, get_custom_objects
from keras.preprocessing import text, sequence
from string import ascii_letters, punctuation, digits

def swish(x):
    """Swish activation: the input scaled element-wise by its own sigmoid.

    Evaluated with the Keras backend ``K``, so ``x`` is expected to be a
    backend tensor; the result has the same shape as ``x``.
    """
    gate = K.sigmoid(x)
    return gate * x

# Register the activation *function* under the name 'swish' so that layers can
# request it with activation='swish'. The function is registered directly
# rather than wrapped in an `Activation` layer: a layer instance would be
# shared by every layer that asks for 'swish', and it has no `__name__` for
# Keras to use when it serializes an activation by name.
get_custom_objects().update({'swish': swish})

# Number of token ids per example: the model's input width and the length
# that sequences are padded / truncated to.
SEQ_LENGTH = 100
# Width of one word vector; matches the 300-d GloVe file loaded below.
EMBED_SIZE = 300
# Number of rows in the embedding table: the tokenizer's word count reported
# in the run output (238589) plus one for index 0.
VOCAB = 238590
# Seeds NumPy's global RNG only.
# NOTE(review): the TensorFlow backend is not seeded here, so training runs
# are presumably not bit-for-bit reproducible -- confirm if that matters.
np.random.seed(2017)

Using TensorFlow backend.
  return f(*args, **kwds)


### model

In [2]:
def define_model(matrix, status, learn):
    """Build and compile the bidirectional-GRU classifier.

    Architecture: embedding -> bidirectional GRU (all timesteps kept) ->
    batch norm -> concatenated global max / average pooling -> batch norm ->
    dropout -> Dense(200, swish) -> dropout -> Dense(6, sigmoid).

    Parameters
    ----------
    matrix : 2-D array, shape (vocabulary size, embedding width)
        Initial weights of the embedding layer. The layer's dimensions are
        read from this array.
    status : bool
        Whether the embedding weights are trainable.
    learn : float
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    A compiled Keras ``Model`` taking ``SEQ_LENGTH`` token ids per example and
    producing 6 independent sigmoid outputs, trained with binary
    cross-entropy and reporting accuracy.

    Notes
    -----
    ``activation='swish'`` is looked up by name, so 'swish' must already be
    registered in Keras' custom objects when this function is called.
    """
    # Size the embedding from the weights themselves so the layer can never
    # disagree with the matrix it is initialised from.
    vocab_size, embed_size = matrix.shape

    rnn = {
        'units': 75,
        # Keep the output at every timestep; the pooling layers reduce over time.
        'return_sequences': True,
        'recurrent_dropout': 0.2,
        'dropout': 0.1,
        'activation': 'tanh',
    }

    inputs = Input(shape=(SEQ_LENGTH,), name='sequence')
    embed = Embedding(vocab_size, embed_size, weights=[matrix], trainable=status)(inputs)

    encoded = Bidirectional(GRU(**rnn))(embed)
    encoded = BatchNormalization()(encoded)

    # Summarise the sequence two ways and let the dense head use both.
    max_pool = GlobalMaxPooling1D()(encoded)
    avg_pool = GlobalAveragePooling1D()(encoded)
    pool = concatenate([max_pool, avg_pool])
    pool = BatchNormalization()(pool)

    hidden = Dropout(0.2)(pool)
    hidden = Dense(200, activation='swish')(hidden)
    hidden = Dropout(0.2)(hidden)
    predict = Dense(6, activation='sigmoid')(hidden)

    # `outputs=` is the Keras 2 keyword; the singular `output=` is only
    # accepted through the legacy-argument shim.
    model = Model(inputs=[inputs], outputs=predict)
    optimizer = Adam(lr=learn)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

### embeddings, data preparation and callbacks

In [3]:
# Parse the GloVe text file into a {token: float32 vector} lookup. Each line
# is a token followed by its whitespace-separated vector components.
embeddings_index = {}

# `with` closes the file even if parsing raises. The encoding is given
# explicitly so the result does not depend on the platform's default locale
# (GloVe vectors are distributed as UTF-8 text).
with open('../data/download/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse and no token to report.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Components that are not valid floats: report the token and skip it.
            print(word)
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [4]:
def dataflow(train_text, valid_text):
    """Turn raw comment frames into padded id sequences plus an embedding matrix.

    Parameters
    ----------
    train_text, valid_text : pandas.DataFrame
        Frames with a ``comment_text`` column. They are not modified.

    Returns
    -------
    train_seq, valid_seq : 2-D integer arrays
        Token-id sequences padded / truncated to ``SEQ_LENGTH``.
    embedding_matrix : 2-D float array, shape (len(word_index) + 1, EMBED_SIZE)
        Row ``i`` holds the pretrained vector from ``embeddings_index`` for the
        word with tokenizer index ``i``; rows for unknown words (and row 0)
        stay all-zero.

    Prints the tokenizer's vocabulary size and how many of its words have a
    pretrained vector.
    """
    # Build plain lists from the frames instead of assigning the filled column
    # back into them, so the caller's DataFrames are left untouched.
    # Missing comments become the literal string 'nan'.
    train_docs = train_text['comment_text'].fillna('nan').tolist()
    valid_docs = valid_text['comment_text'].fillna('nan').tolist()

    # The vocabulary is fitted on train and validation text together.
    tokenizer = text.Tokenizer(lower=True, char_level=False, num_words=20000)
    tokenizer.fit_on_texts(train_docs + valid_docs)
    word_index = tokenizer.word_index
    print('total words:', len(word_index))

    # NOTE(review): `word_index` covers every token seen, so the matrix has a
    # row for each of them, while `num_words=20000` presumably limits which
    # ids actually appear in the sequences -- most rows may never be looked
    # up. Shrinking the matrix would also require changing the model's
    # vocabulary size, so it is left as is.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
    intersect = 0
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            intersect += 1
    print('common words:', intersect)

    train_seq = sequence.pad_sequences(
        tokenizer.texts_to_sequences(train_docs), maxlen=SEQ_LENGTH)
    valid_seq = sequence.pad_sequences(
        tokenizer.texts_to_sequences(valid_docs), maxlen=SEQ_LENGTH)
    return train_seq, valid_seq, embedding_matrix

def callbacks(suffix):
    """Assemble the Keras callbacks for the training run tagged ``suffix``.

    ``suffix`` is formatted into the checkpoint and log file names. The
    returned list is, in order: early stopping on ``val_loss`` (patience 3),
    a best-only weights checkpoint, learning-rate reduction on a ``val_loss``
    plateau (factor 0.25, patience 2), and a CSV logger.
    """
    weights_file = '../data/data/source_1/model_2/model_{}.hdf5'.format(suffix)
    log_file = '../data/data/source_1/model_2/logger_{}.log'.format(suffix)
    return [
        EarlyStopping('val_loss', patience=3, mode="min"),
        ModelCheckpoint(weights_file, save_best_only=True, save_weights_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=2, verbose=0, mode='min'),
        CSVLogger(log_file),
    ]

### data : fold - 1

In [5]:
# Fold 1 inputs: comment text and label tables for the train / validation split.
train_text = pd.read_csv('../data/data/source_1/train/train_data_1.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_1.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_1.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_1.csv')
# From here on train_text / valid_text hold padded id sequences, not frames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments shared by both fit() calls. The first column of each label
# table is dropped; the remaining columns are the targets.
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 15,
    'verbose': 1,
    'callbacks': callbacks(1),
}

# round-1: embeddings frozen, learning rate 1e-3
model = define_model(embedding_matrix, False, 1e-3)
model.fit(**params)

# round-2: rebuild with trainable embeddings and learning rate 1e-4, start from
# the weights checkpointed in round 1, and train for at most 3 more epochs
path = '../data/data/source_1/model_2/model_{}.hdf5'.format(1)
model = define_model(embedding_matrix, True, 1e-4)
model.load_weights(path)
params['epochs'] = 3
model.fit(**params)

total words: 238589
common words: 94353




Train on 127656 samples, validate on 31915 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Train on 127656 samples, validate on 31915 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f006f6e2828>

### data : fold - 2 

In [7]:
# Fold 2 inputs: comment text and label tables for the train / validation split.
train_text = pd.read_csv('../data/data/source_1/train/train_data_2.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_2.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_2.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_2.csv')
# From here on train_text / valid_text hold padded id sequences, not frames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments shared by both fit() calls. The first column of each label
# table is dropped; the remaining columns are the targets.
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 15,
    'verbose': 1,
    'callbacks': callbacks(2),
}

# round-1: embeddings frozen, learning rate 1e-3
model = define_model(embedding_matrix, False, 1e-3)
model.fit(**params)

# round-2: rebuild with trainable embeddings and learning rate 1e-4, start from
# the weights checkpointed in round 1, and train for at most 3 more epochs
path = '../data/data/source_1/model_2/model_{}.hdf5'.format(2)
model = define_model(embedding_matrix, True, 1e-4)
model.load_weights(path)
params['epochs'] = 3
model.fit(**params)

total words: 238589
common words: 94353




Train on 127657 samples, validate on 31914 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Train on 127657 samples, validate on 31914 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f003ebe2f60>

### data : fold - 3 

In [8]:
# Fold 3 inputs: comment text and label tables for the train / validation split.
train_text = pd.read_csv('../data/data/source_1/train/train_data_3.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_3.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_3.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_3.csv')
# From here on train_text / valid_text hold padded id sequences, not frames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments shared by both fit() calls (progress output disabled).
# The first column of each label table is dropped; the remaining columns are
# the targets.
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 15,
    'verbose': 0,
    'callbacks': callbacks(3),
}

# round-1: embeddings frozen, learning rate 1e-3
model = define_model(embedding_matrix, False, 1e-3)
model.fit(**params)

# round-2: rebuild with trainable embeddings and learning rate 1e-4, start from
# the weights checkpointed in round 1, and train for at most 3 more epochs
path = '../data/data/source_1/model_2/model_{}.hdf5'.format(3)
model = define_model(embedding_matrix, True, 1e-4)
model.load_weights(path)
params['epochs'] = 3
model.fit(**params)

total words: 238589
common words: 94353




<keras.callbacks.History at 0x7f00042c45f8>

### data : fold - 4

In [9]:
# Fold 4 inputs: comment text and label tables for the train / validation split.
train_text = pd.read_csv('../data/data/source_1/train/train_data_4.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_4.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_4.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_4.csv')
# From here on train_text / valid_text hold padded id sequences, not frames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments shared by both fit() calls (progress output disabled).
# The first column of each label table is dropped; the remaining columns are
# the targets.
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 15,
    'verbose': 0,
    'callbacks': callbacks(4),
}

# round-1: embeddings frozen, learning rate 1e-3
model = define_model(embedding_matrix, False, 1e-3)
model.fit(**params)

# round-2: rebuild with trainable embeddings and learning rate 1e-4, start from
# the weights checkpointed in round 1, and train for at most 3 more epochs
path = '../data/data/source_1/model_2/model_{}.hdf5'.format(4)
model = define_model(embedding_matrix, True, 1e-4)
model.load_weights(path)
params['epochs'] = 3
model.fit(**params)

total words: 238589
common words: 94353




<keras.callbacks.History at 0x7efff6a9be48>

### data : fold - 5

In [10]:
# Fold 5 inputs: comment text and label tables for the train / validation split.
train_text = pd.read_csv('../data/data/source_1/train/train_data_5.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_5.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_5.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_5.csv')
# From here on train_text / valid_text hold padded id sequences, not frames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments shared by both fit() calls (progress output disabled).
# The first column of each label table is dropped; the remaining columns are
# the targets.
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 15,
    'verbose': 0,
    'callbacks': callbacks(5),
}

# round-1: embeddings frozen, learning rate 1e-3
model = define_model(embedding_matrix, False, 1e-3)
model.fit(**params)

# round-2: rebuild with trainable embeddings and learning rate 1e-4, start from
# the weights checkpointed in round 1, and train for at most 3 more epochs
path = '../data/data/source_1/model_2/model_{}.hdf5'.format(5)
model = define_model(embedding_matrix, True, 1e-4)
model.load_weights(path)
params['epochs'] = 3
model.fit(**params)

total words: 238589
common words: 94353




<keras.callbacks.History at 0x7effee804438>