This version is not for competition purpose but for self-learning purpose. I followed exactly what this guy did
https://www.kaggle.com/thousandvoices/simple-lstm

In [1]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

Using TensorFlow backend.


In [2]:
>>> # Define some parameters for deep learning models
EMBEDDING_FILES = [
    '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
    '../input/glove840b300dtxt/glove.840B.300d.txt'
]
NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

**Create build_matrix function to import embedding from text files:**

*- Build_matrix: *

input:  array of tuple (word, index)

output: embedding_matrix is an array with shape of (number of words, dimension of embedding)

*- load_embeddings:*

input: path to embedding files

output: dictionary with each word has a proper numpy array having shape of (number of words, dimension of embedding)

*- get_coefs:*

input: list or array of embedding numbers

output: proper numpy array of embedding numbers

In [3]:
>>> #Define functions to load embedding
def get_coefs(word, *arr): #return proper numpy array to input into embedding matrix
    return word, np.asarray(arr, dtype='float32') 

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

Build Deep Learning model:


In [4]:
>>> #Build model
def build_model(embedding_matrix, num_aux_targets):
    words = Input(shape=(None,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

__PREPROCESSING__

In [5]:
>>> #Create data for model
train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values
x_test = test_df[TEXT_COLUMN].astype(str)

In [6]:
print(IDENTITY_COLUMNS)
print(AUX_COLUMNS)
print(TEXT_COLUMN)

['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
comment_text


In [7]:
#train.columns
train_df[train_df.severe_toxicity > 0][['id','target','comment_text','severe_toxicity','obscene','identity_attack','insult','threat']].head()
#View full comments without truncation comment_list = train.comment_text.tolist() comment_list[0:5]

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0
5,59859,0.666667,ur a sh*tty comment.,0.047619,0.638095,0.0,0.333333,0.0
6,59861,0.457627,hahahahahahahahhha suck it.,0.050847,0.305085,0.0,0.254237,0.0
31,239607,0.9125,Yet call out all Muslims for the acts of a few...,0.05,0.2375,0.6125,0.8875,0.1125
34,239612,0.830769,This bitch is nuts. Who would read a book by a...,0.107692,0.661538,0.338462,0.830769,0.0


In [8]:
>>> #tokenize and create weights for each record
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= 0.5, True, False)

tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

#replace each word/token by a number and put the record in a list/sequence of numbers
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
#Add zero or truncated left part of the record to make all records with equal length
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

#preset all weight of 1
sample_weights = np.ones(len(x_train), dtype=np.float32)
#add number of identity col
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
#only keep weight of identity columns when target is 1
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
#add 5 times weight of identity columns when target is 0
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
#normalize weight by dividing to its mean
sample_weights /= sample_weights.mean()

#concat 2 embedding matrix of num_word,300 to create a matrix of num_word,600 shape
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

In [9]:
print('last word: '+str([(word, i) for (word, i) in tokenizer.word_index.items() if i == len(tokenizer.word_index.items())]))
print('all words: '+str(len(tokenizer.word_index.items())))
a = np.zeros((600,))
print('unknown words: '+str(len([x for x in embedding_matrix if x.all() == a.all()])))
print('known words: '+str(len([x for x in embedding_matrix if x.all() != a.all()])))

last word: [('uv4j7sid3pk', 327821)]
all words: 327821
unknown words: 187282
known words: 140540


In [10]:
y_aux_train.shape

(1804874, 6)

In [11]:
model = build_model(embedding_matrix, y_aux_train.shape[-1])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 600)    196693200   input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, None, 600)    0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidi

In [12]:
>>> #
checkpoint_predictions = []
weights = []
for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, y_aux_train.shape[-1])
    for global_epoch in range(EPOCHS):
        model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            sample_weight=[sample_weights.values, np.ones_like(sample_weights)],
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch))
            ]
        )
        checkpoint_predictions.append(model.predict(x_test, batch_size=2048)[0].flatten())
        weights.append(2 ** global_epoch)


Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1
 - 765s - loss: 0.5260 - dense_7_loss: 0.4194 - dense_8_loss: 0.1066
Epoch 1/1
 - 762s - loss: 0.5079 - dense_7_loss: 0.4058 - dense_8_loss: 0.1021
Epoch 1/1
 - 764s - loss: 0.5022 - dense_7_loss: 0.4010 - dense_8_loss: 0.1012
Epoch 1/1
 - 768s - loss: 0.4984 - dense_7_loss: 0.3978 - dense_8_loss: 0.1006
Epoch 1/1
 - 767s - loss: 0.5258 - dense_11_loss: 0.4193 - dense_12_loss: 0.1065
Epoch 1/1
 - 766s - loss: 0.5077 - dense_11_loss: 0.4056 - dense_12_loss: 0.1021
Epoch 1/1
 - 767s - loss: 0.5020 - dense_11_loss: 0.4008 - dense_12_loss: 0.1012
Epoch 1/1
 - 766s - loss: 0.4982 - dense_11_loss: 0.3976 - dense_12_loss: 0.1006


In [13]:
>>> #Create submission file
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': predictions
})
submission.to_csv('submission.csv', index=False)