In [1]:
import pandas as pd
import numpy as np
import re
from keras.layers import Dense, Dropout, GRU, Embedding, TimeDistributed, Flatten
from keras.layers import Input, Activation, concatenate, GlobalAveragePooling1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from keras.layers import RepeatVector, Permute, merge, Lambda
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau
from keras.utils import np_utils, get_custom_objects
from keras.preprocessing import text, sequence
from string import ascii_letters, punctuation, digits
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

def swish(x):
    """Swish activation: the input multiplied element-wise by its own sigmoid."""
    gate = K.sigmoid(x)
    return gate * x

# Register the activation so layers can refer to it by the string name 'swish'.
get_custom_objects()['swish'] = Activation(swish)

# Global settings shared by the cells below.
SEQ_LENGTH = 100  # length every token sequence is padded/truncated to
EMBED_SIZE = 100  # width of one word vector
VOCAB = 238590    # Embedding input dimension ('total words' printed below, plus one)
np.random.seed(2017)

Using TensorFlow backend.
  return f(*args, **kwds)


### model

In [2]:
def define_model(matrix):
    """Build and compile the GRU-with-attention classifier.

    Architecture: frozen pre-trained embedding -> GRU returning every
    timestep -> one tanh score per timestep, soft-maxed across timesteps ->
    attention-weighted sum of the GRU states -> batch normalisation ->
    swish dense layer -> dropout -> six sigmoid outputs.

    Parameters
    ----------
    matrix : array-like, 2-D
        Pre-trained embedding weights, one row per token id. The Embedding
        layer is sized from this array so the two can never disagree.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy, Adam (lr=1e-3) and the
        accuracy metric.
    """
    # `merge(..., mode='mul')` is the legacy Keras 1 functional API (deprecated
    # in Keras 2); `multiply` is its Keras 2 equivalent.
    from keras.layers import multiply

    rnn = {
        'units': 50,
        'return_sequences': True,  # attention needs the state at every timestep
        'recurrent_dropout': 0.2,
        'dropout': 0.1,
        'activation': 'tanh',
    }
    # Derive the Embedding dimensions from the weights instead of module constants.
    vocab_size, embed_size = np.shape(matrix)

    inputs = Input(shape=(SEQ_LENGTH,), name='sequence')
    embed = Embedding(vocab_size, embed_size, weights=[matrix], trainable=False)(inputs)
    states = GRU(**rnn)(embed)

    # Attention weights: one score per timestep, normalised over the sequence.
    atten = TimeDistributed(Dense(1, activation='tanh'))(states)
    atten = Flatten()(atten)
    atten = Activation('softmax')(atten)
    # Repeat each weight across the GRU units so it lines up with `states`.
    atten = RepeatVector(rnn['units'])(atten)
    atten = Permute([2, 1])(atten)

    dense = multiply([states, atten])
    # Collapse the time axis: weighted sum of the GRU states.
    dense = Lambda(lambda xin: K.sum(xin, axis=1))(dense)
    dense = BatchNormalization()(dense)
    dense = Dense(200, activation='swish')(dense)
    dense = Dropout(0.2)(dense)
    predict = Dense(6, activation='sigmoid')(dense)

    # `outputs=` is the Keras 2 keyword; `output=` is the deprecated Keras 1 spelling.
    model = Model(inputs=[inputs], outputs=predict)
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

### embeddings, data pipeline and callbacks

In [3]:
# Load the pre-trained word vectors into a {token: float32 vector} lookup.
embeddings_index = {}

# `with` guarantees the file is closed even if parsing raises; the encoding is
# given explicitly so the result does not depend on the platform default.
with open('../data/data/fasttext/vector.vec', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip anything that is not "<token> <EMBED_SIZE floats>": blank lines,
        # a possible "<count> <dim>" header line, or tokens containing
        # whitespace. A short vector stored here could later be broadcast
        # across a whole row of the embedding matrix.
        if len(values) != EMBED_SIZE + 1:
            continue
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Report the token whose coefficients are not numeric and move on.
            print(values[0])
            continue
        embeddings_index[values[0]] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 93651 word vectors.


In [4]:
def dataflow(train_text, valid_text):
    """Tokenize both splits and build the matching embedding matrix.

    Parameters
    ----------
    train_text, valid_text : pandas.DataFrame
        Frames with a 'comment_text' column. They are not modified.

    Returns
    -------
    train_seq, valid_seq : numpy.ndarray
        Token-id sequences padded/truncated to SEQ_LENGTH.
    embedding_matrix : numpy.ndarray
        Array of shape (len(word_index) + 1, EMBED_SIZE); rows of words
        missing from `embeddings_index` stay zero.
    """
    # Fill missing comments on a derived Series rather than assigning back into
    # the caller's DataFrames.
    train_docs = list(train_text['comment_text'].fillna('nan').values)
    valid_docs = list(valid_text['comment_text'].fillna('nan').values)

    # The vocabulary is fitted on both splits together.
    tokenizer = text.Tokenizer(lower=True, char_level=False, num_words=100000)
    tokenizer.fit_on_texts(train_docs + valid_docs)
    word_index = tokenizer.word_index
    print('total words:', len(word_index))

    # Row 0 is left for the padding id, hence the +1.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
    intersect = 0
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        # Ignore unknown words and vectors of the wrong width; a 1-element
        # vector would otherwise be silently broadcast across the whole row.
        if embedding_vector is not None and len(embedding_vector) == EMBED_SIZE:
            embedding_matrix[i] = embedding_vector
            intersect += 1
    print('common words:', intersect)

    train_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(train_docs), maxlen=SEQ_LENGTH)
    valid_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(valid_docs), maxlen=SEQ_LENGTH)
    return train_seq, valid_seq, embedding_matrix

def callbacks(suffix):
    """Create the Keras callbacks used for one training run.

    `suffix` is formatted into the checkpoint and CSV-log file names.
    Returns the list [early stopping, checkpoint, LR reduction, CSV logger].
    """
    checkpoint = ModelCheckpoint(
        '../data/data/source_1/model_3/model_{}.hdf5'.format(suffix),
        save_best_only=True,
        save_weights_only=True,
    )
    csv_log = CSVLogger('../data/data/source_1/model_3/logger_{}.log'.format(suffix))
    # Both schedulers watch val_loss: the learning rate is cut after 2 epochs
    # without improvement, training stops after 5.
    lr_decay = ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=2, verbose=0, mode='min')
    early_stop = EarlyStopping(monitor='val_loss', patience=5, mode="min")
    return [early_stop, checkpoint, lr_decay, csv_log]

### data : fold - 1

In [5]:
# Fold 1: read the pre-split train/validation texts and their label tables.
train_text = pd.read_csv('../data/data/source_1/train/train_data_1.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_1.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_1.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_1.csv')

# NOTE: from here on train_text / valid_text are the arrays returned by
# dataflow, no longer DataFrames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments for model.fit. The first label column is dropped
# (presumably an id column -- confirm against the CSVs).
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 30,
    'verbose': 1,
    'callbacks': callbacks(1),
}
model = define_model(embedding_matrix)
model.fit(**params)

total words: 238589
common words: 88118


  name=name)


Train on 127656 samples, validate on 31915 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fa15b090e48>

### data : fold - 2 

In [6]:
# Fold 2: read the pre-split train/validation texts and their label tables.
train_text = pd.read_csv('../data/data/source_1/train/train_data_2.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_2.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_2.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_2.csv')

# NOTE: from here on train_text / valid_text are the arrays returned by
# dataflow, no longer DataFrames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments for model.fit. The first label column is dropped
# (presumably an id column -- confirm against the CSVs).
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 30,
    'verbose': 1,
    'callbacks': callbacks(2),
}
model = define_model(embedding_matrix)
model.fit(**params)

total words: 238589
common words: 88118


  name=name)


Train on 127657 samples, validate on 31914 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


<keras.callbacks.History at 0x7fa150830cf8>

### data : fold - 3 

In [7]:
# Fold 3: read the pre-split train/validation texts and their label tables.
train_text = pd.read_csv('../data/data/source_1/train/train_data_3.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_3.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_3.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_3.csv')

# NOTE: from here on train_text / valid_text are the arrays returned by
# dataflow, no longer DataFrames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments for model.fit. The first label column is dropped
# (presumably an id column -- confirm against the CSVs).
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 30,
    'verbose': 1,
    'callbacks': callbacks(3),
}
model = define_model(embedding_matrix)
model.fit(**params)

total words: 238589
common words: 88118


  name=name)


Train on 127657 samples, validate on 31914 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30


<keras.callbacks.History at 0x7fa155635ba8>

### data : fold - 4

In [8]:
# Fold 4: read the pre-split train/validation texts and their label tables.
train_text = pd.read_csv('../data/data/source_1/train/train_data_4.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_4.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_4.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_4.csv')

# NOTE: from here on train_text / valid_text are the arrays returned by
# dataflow, no longer DataFrames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments for model.fit. The first label column is dropped
# (presumably an id column -- confirm against the CSVs).
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 30,
    'verbose': 1,
    'callbacks': callbacks(4),
}
model = define_model(embedding_matrix)
model.fit(**params)

total words: 238589
common words: 88118


  name=name)


Train on 127657 samples, validate on 31914 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30


<keras.callbacks.History at 0x7fa144d6eb70>

### data : fold - 5

In [9]:
# Fold 5: read the pre-split train/validation texts and their label tables.
train_text = pd.read_csv('../data/data/source_1/train/train_data_5.csv')
train_label = pd.read_csv('../data/data/source_1/train/train_labels_5.csv')
valid_text = pd.read_csv('../data/data/source_1/train/test_data_5.csv')
valid_label = pd.read_csv('../data/data/source_1/train/test_labels_5.csv')

# NOTE: from here on train_text / valid_text are the arrays returned by
# dataflow, no longer DataFrames.
train_text, valid_text, embedding_matrix = dataflow(train_text, valid_text)

# Keyword arguments for model.fit. The first label column is dropped
# (presumably an id column -- confirm against the CSVs).
params = {
    'x': train_text,
    'y': np.array(train_label.iloc[:, 1:]),
    'validation_data': (valid_text, np.array(valid_label.iloc[:, 1:])),
    'batch_size': 256,
    'epochs': 30,
    'verbose': 1,
    'callbacks': callbacks(5),
}
model = define_model(embedding_matrix)
model.fit(**params)

total words: 238589
common words: 88118


  name=name)


Train on 127657 samples, validate on 31914 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


<keras.callbacks.History at 0x7fa13df5bb00>