In [1]:
import pandas as pd
import numpy as np
import re
from keras.layers import Dense, Dropout, GRU, Embedding, TimeDistributed, Flatten
from keras.layers import Input, Activation, concatenate, GlobalAveragePooling1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from keras.layers import RepeatVector, Permute, merge, Lambda
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau
from keras.utils import np_utils, get_custom_objects
from keras.preprocessing import text, sequence
from string import ascii_letters, punctuation, digits
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from sklearn.metrics import roc_curve, auc 

def swish(x):
    """Swish activation: the input multiplied element-wise by its own sigmoid."""
    gate = K.sigmoid(x)
    return gate * x

# Sizes shared by the padded inputs and the embedding matrix.
SEQ_LENGTH = 100  # token positions per comment (Input shape and pad_sequences maxlen)
EMBED_SIZE = 100  # number of columns in the embedding matrix
VOCAB = 238590    # the 238589 words the tokenizer reports, plus one extra row (len(word_index) + 1)

# Seed numpy's global random generator.
np.random.seed(2017)

# Register swish under a string key so layers can ask for activation='swish'.
get_custom_objects()['swish'] = Activation(swish)

Using TensorFlow backend.
  return f(*args, **kwds)


### model

In [2]:
def define_model(matrix):
    """Build and compile the GRU classifier with attention pooling.

    Args:
        matrix: 2-D array of pretrained word vectors, one row per token index.
            Its shape determines the embedding layer's input and output
            dimensions, so the layer always agrees with the weights it is
            initialised from.

    Returns:
        A compiled Keras ``Model`` that maps a ``(SEQ_LENGTH,)`` sequence of
        token indices to six sigmoid outputs, using binary cross-entropy loss
        and Adam with a learning rate of 1e-3.
    """
    # `multiply` replaces the deprecated functional `merge(..., mode='mul')`.
    # Imported locally so this block does not depend on the import cell.
    from keras.layers import multiply

    rnn = {
        'units': 50,
        'return_sequences': True,  # attention needs the state at every timestep
        'recurrent_dropout': 0.2,
        'dropout': 0.1,
        'activation': 'tanh',
    }

    # Take the embedding dimensions from the matrix instead of module constants.
    vocab_size, embed_size = np.shape(matrix)

    inputs = Input(shape=(SEQ_LENGTH,), name='sequence')
    embed = Embedding(vocab_size, embed_size, weights=[matrix], trainable=False)(inputs)
    states = GRU(**rnn)(embed)                               # (batch, SEQ_LENGTH, units)

    # One score per timestep, normalised with softmax across the sequence.
    atten = TimeDistributed(Dense(1, activation='tanh'))(states)
    atten = Flatten()(atten)                                 # (batch, SEQ_LENGTH)
    atten = Activation('softmax')(atten)
    # Tile the weights over the GRU units so they can scale the states element-wise.
    atten = RepeatVector(rnn['units'])(atten)                # (batch, units, SEQ_LENGTH)
    atten = Permute([2, 1])(atten)                           # (batch, SEQ_LENGTH, units)

    # Weighted sum of the GRU states over the time axis.
    dense = multiply([states, atten])
    dense = Lambda(lambda xin: K.sum(xin, axis=1))(dense)    # (batch, units)
    dense = BatchNormalization()(dense)
    dense = Dense(200, activation='swish')(dense)
    dense = Dropout(0.2)(dense)
    predict = Dense(6, activation='sigmoid')(dense)

    # `outputs=` is the Keras 2 keyword; `output=` is the legacy spelling.
    model = Model(inputs=[inputs], outputs=predict)
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

### embeddings and data preparation

In [3]:
# Load the pretrained word vectors into a {token: float32 vector} lookup.
embeddings_index = {}

# `with` closes the file even if parsing raises part-way through.
# The encoding is explicit so decoding does not depend on the machine's locale
# (assumes the .vec file is UTF-8 text, as fastText writes it).
with open('../data/data/fasttext/vector.vec', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no token to report or store.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; report the token and skip it.
            print(word)
            continue
        if coefs.shape[0] != EMBED_SIZE:
            # Wrong vector length (e.g. a "<count> <dim>" header line). Storing it
            # would later fail or silently broadcast into an embedding row.
            print(word)
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 93651 word vectors.


In [4]:
def _comment_list(frame):
    """Return the 'comment_text' column as a list, with missing values replaced by 'nan'."""
    # fillna returns a new Series, so the caller's DataFrame is left untouched.
    return list(frame['comment_text'].fillna('nan').values)


def dataflow(train_text, valid_text, score_text):
    """Tokenise the comments and build the matching embedding matrix.

    The tokenizer is fitted on the training and validation comments together.
    Only the scoring comments are converted to padded index sequences.

    Args:
        train_text: DataFrame with a 'comment_text' column (used to fit the tokenizer).
        valid_text: DataFrame with a 'comment_text' column (used to fit the tokenizer).
        score_text: DataFrame with a 'comment_text' column (converted to sequences).

    Returns:
        A tuple ``(score_seq, embedding_matrix)``:
        ``score_seq`` holds the scoring comments padded to ``SEQ_LENGTH``, and
        ``embedding_matrix`` has shape ``(len(word_index) + 1, EMBED_SIZE)``.
        Rows for words missing from ``embeddings_index`` stay at zero.
    """
    # Work on extracted lists; the input frames are not modified.
    train_text = _comment_list(train_text)
    valid_text = _comment_list(valid_text)
    score_text = _comment_list(score_text)

    tokenizer = text.Tokenizer(lower=True, char_level=False, num_words=100000)
    tokenizer.fit_on_texts(train_text + valid_text)
    # word_index is not truncated by num_words (the notebook reports 238589 entries),
    # so the matrix below has one row per known word plus one extra row.
    word_index = tokenizer.word_index
    print('total words:', len(word_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
    intersect = 0
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            intersect += 1
    print('common words:', intersect)

    score_token = tokenizer.texts_to_sequences(score_text)
    score_seq = sequence.pad_sequences(score_token, maxlen=SEQ_LENGTH)
    return score_seq, embedding_matrix

### execute

In [5]:
def score_model(mode):
    """Score one fold's held-out comments with that fold's saved weights.

    Args:
        mode: Fold identifier substituted into the data and weight file names.

    Returns:
        A tuple ``(scores, labels)``:
        ``scores`` holds the 'id' column joined with one predicted-probability
        column per label, and ``labels`` is the frame read from
        ``test_labels_{mode}.csv``.
    """
    train_text = pd.read_csv('../data/data/source_1/train/train_data_{}.csv'.format(mode))
    score_text = pd.read_csv('../data/data/source_1/train/test_data_{}.csv'.format(mode))
    # The validation frame comes from the same file as the scoring frame, so copy
    # the parsed frame instead of reading the CSV a second time.
    # NOTE(review): presumably this mirrors how the tokenizer was fitted at
    # training time - confirm before changing the file used here.
    valid_text = score_text.copy()
    labels = pd.read_csv('../data/data/source_1/train/test_labels_{}.csv'.format(mode))

    # Keep the ids aside; dataflow returns only the padded sequences.
    score_data = score_text[['id']]
    score_seq, embedding_matrix = dataflow(train_text, valid_text, score_text)

    model = define_model(embedding_matrix)
    path = '../data/data/source_1/model_3/model_{}.hdf5'.format(mode)
    model.load_weights(path)

    scores = pd.DataFrame(
        model.predict(score_seq, batch_size=512),
        columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'],
    )
    # Both frames carry the default positional index, so join pairs them row by row.
    scores = score_data.join(scores)
    return scores, labels

In [6]:
# Score folds 1 to 5 in order, then unpack into the names the later cells use.
fold_results = [score_model(fold) for fold in range(1, 6)]
(
    (score_1, labels_1),
    (score_2, labels_2),
    (score_3, labels_3),
    (score_4, labels_4),
    (score_5, labels_5),
) = fold_results

total words: 238589
common words: 88118


  name=name)


total words: 238589
common words: 88118
total words: 238589
common words: 88118
total words: 238589
common words: 88118
total words: 238589
common words: 88118


In [7]:
# Stack the per-fold predictions in a single pass. DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every call.
# Like append, concat keeps each fold's original row index.
submit = pd.concat([score_1, score_2, score_3, score_4, score_5])
submit = submit.sort_values(by='id')
submit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0.000128,7.191598e-08,9e-06,1.805343e-07,3.187131e-06,1.722753e-07
0,000103f0d9cfb60f,0.002646,1.325167e-05,0.000602,2.220314e-05,0.0002932923,4.405921e-05
0,000113f07ec002fd,0.01374,1.138331e-05,0.001181,5.583086e-05,0.0006639684,1.669362e-05
19159,00013fa6fb6ef643,0.001132,5.115979e-07,0.000214,4.896253e-07,1.939144e-05,2.502788e-06
0,0001b41b1c6bb37e,1.1e-05,1.41706e-06,1.8e-05,9.555013e-09,4.74519e-07,2.599188e-07


In [8]:
# Stack the per-fold labels in a single pass. DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every call.
# Like append, concat keeps each fold's original row index.
labels = pd.concat([labels_1, labels_2, labels_3, labels_4, labels_5])
labels = labels.sort_values(by='id')
labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0,0,0,0,0,0
0,000103f0d9cfb60f,0,0,0,0,0,0
0,000113f07ec002fd,0,0,0,0,0,0
19159,00013fa6fb6ef643,0,0,0,0,0,0
0,0001b41b1c6bb37e,0,0,0,0,0,0


### evaluate

In [9]:
# Label columns to evaluate; each is scored as a separate binary problem.
models = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
evaluate = 0.

# roc_curve pairs labels and predictions by position, so both frames must list
# the same ids in the same order before any column is compared.
assert submit['id'].tolist() == labels['id'].tolist(), 'submit and labels are not aligned by id'

for subset in models:
    predict = submit[subset]
    actual = labels[subset]
    fpr, tpr, threshold = roc_curve(actual, predict)
    # 2*AUC - 1 rescales the ROC AUC so that 0.5 maps to 0 and 1.0 maps to 1.
    metric = 2*auc(fpr, tpr) - 1
    print('subset:', subset, ':', round(metric, 4))
    # Accumulate the unrounded value; rounding happens only for display.
    evaluate += metric

print('overall:', round(evaluate/len(models), 4))

subset: toxic : 0.9596
subset: severe_toxic : 0.9784
subset: obscene : 0.98
subset: threat : 0.9729
subset: insult : 0.9707
subset: identity_hate : 0.9673
overall: 0.9715


In [10]:
# Write the combined predictions to CSV without the row index.
output_path = '../data/model/baseline_3.csv'
submit.to_csv(output_path, index=False)