In [21]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Conv1D, LSTM
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.layers import merge
from keras.layers.core import *
from keras.models import *
from keras.utils import plot_model
from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from tqdm import tqdm
pd.options.display.max_rows=500
pd.options.display.max_colwidth=2000

import warnings
warnings.filterwarnings('ignore')



EMBEDDING_FILE = '../data/crawl-300d-2M.vec'

train = pd.read_csv('../data/train.csv.zip')
test = pd.read_csv('../data/test.csv.zip')
submission = pd.read_csv('../data/sample_submission.csv.zip')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

In [22]:
import re
def clean(string):
    return re.sub(r'^\d+\s|\s\d+\s|\s\d+$|\u200e|\n|\(diff \| hist\)|\(\+[0-9]*\)|\(\-[0-9]*\)|[0-9]+:[0-9]+|User talk:*|[0-9]*.[0-9]*.[0-9]*.[0-9]', '', string)

In [23]:
%%time
X_train = Parallel(n_jobs=16)(delayed(clean)(x) for x in tqdm(X_train))
X_test = Parallel(n_jobs=16)(delayed(clean)(x) for x in tqdm(X_test))

100%|██████████| 159571/159571 [00:05<00:00, 27483.96it/s]
100%|██████████| 153164/153164 [00:02<00:00, 59002.28it/s]


CPU times: user 3.31 s, sys: 2.44 s, total: 5.75 s
Wall time: 9.36 s


In [24]:
X_train

["ExplanationWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired n7",
 "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (tal, Janua (UTC)",
 "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
 '"MoreI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself ple

In [12]:
max_features = 100000
maxlen = 150
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index_crawl = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [13]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index_crawl.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [14]:
def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, 150))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(150, activation='softmax')(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul

In [15]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    sequence_input = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Conv1D(128, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool]) 
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
    return model

model = get_model()

In [17]:
%%time
batch_size = 600
epochs = 40

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    #model.save(f'../models/Polled_gru_with_conv_and_regex/{i}epoch.h5')

    y_pred = model.predict(X_val, batch_size=1024)
    #submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    #submission.to_csv(f'../submits/Polled_gru_with_conv_and_regex/{i}epoch.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.982578 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.986942 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.987534 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.987977 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988077 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988005 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988057 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988557 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.987973 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 

KeyboardInterrupt: 

In [8]:
%%time
batch_size = 600
epochs = 40

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/Polled_gru_with_conv_and_regex/{i}epoch_noregex.h5')

    y_pred = model.predict(x_test, batch_size=1024)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/Polled_gru_with_conv_and_regex/{i}epoch_noregex.csv', index=False)

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.978711 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.983871 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988078 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988342 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988780 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988908 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988928 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989091 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989563 

Train on 143613 samples, validate on 15958 samples
Epoch 1/1

 ROC-AUC - epoch: 1 

KeyboardInterrupt: 

In [25]:
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=233)

In [33]:
a= np.array([0, 1, 8, 2])

In [34]:
np.argsort(a)

array([0, 1, 3, 2])

In [36]:
X_tra[np.argsort(np.abs(y_pred[:,0] - y_val[:,0]))]


TypeError: only integer scalar arrays can be converted to a scalar index

In [37]:
X_tra

['" You have no shame, you should admit that you are biased. Aft organizing methods with the same result your lame excuses are a joke. BTW you wrote ""nationalistic"" not me.  "',
 'Please refrain from adding nonsense to Wikipedia, as you did to Septemb. It is considered vandalism. If you would like to experiment, use the sandbox.  A link to the edit I have reverted can be found here: link. If you believe this edit should not have been reverted, please contact me.',
 'i consider available to myself and wherever it is available to me..',
 'The problem is that you slam books without cracking the cover. The choice of articles was reviewed by Paul Levine, M.D., National Institute of Health, a key player in defining Chronic Fatigue Syndrome, who provided a foreword. You are painting the book with tar and feathers to advocate your own POV.  Single author (thus single POV) books should not be deemed textbooks for this illness  We need a decider and I am deeming what the CDC deems reliable or 

In [35]:
y_pred[]

array([[2.46101903e-04, 1.01015321e-05, 7.32734334e-05, 6.45780074e-06,
        1.15576418e-04, 2.47273101e-06],
       [5.18125424e-04, 8.52071025e-06, 9.29128873e-05, 8.13772112e-06,
        2.13281837e-05, 6.33231002e-06],
       [7.49366445e-05, 3.01231103e-06, 3.05413487e-05, 6.49340564e-07,
        6.26978799e-05, 2.37333506e-06],
       ...,
       [2.08531870e-04, 2.12254727e-05, 1.50735606e-04, 9.14726934e-06,
        1.07563006e-04, 1.91476429e-05],
       [3.82177852e-04, 2.15665259e-05, 1.80539922e-04, 2.61033245e-04,
        2.74883612e-04, 3.43580869e-05],
       [1.69829764e-05, 8.77004766e-07, 1.37079060e-05, 7.98705514e-07,
        1.36571753e-05, 2.21317964e-06]], dtype=float32)

In [10]:
%%time
gru_for_stack = pd.DataFrame.from_dict({'id': train['id'],
                                        "toxic":0,
                                        "severe_toxic":0,
                                        "obscene":0,
                                        "threat":0,
                                        "insult":0,
                                        "identity_hate":0})

batch_size = 800
epochs = 20
cv = KFold(n_splits=5, random_state=0, shuffle=True)
i = 0

for train_index, test_index in cv.split(x_train, y_train):
    X_tra, X_test = x_train[train_index], x_train[test_index]
    y_tra, y_test = y_train[train_index], y_train[test_index]
    i += 1
    
    model = get_model()
    file_path=f"../models/gru_with_attention_cv10/fold_{i}1.best.hdf5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True, mode='min')
    X_tra, X_val, y_tra, y_val = train_test_split(X_tra, y_tra, train_size=0.9, random_state=233)
    RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
    early = EarlyStopping(monitor="val_loss", mode="min")
    callbacks_list = [checkpoint, early, RocAuc] #early
    #exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
    #steps = int(len(X_tra)/batch_size) * epochs
    #lr_init, lr_fin = 0.001, 0.0005
    #lr_decay = exp_decay(lr_init, lr_fin, steps)
    #K.set_value(model.optimizer.lr, lr_init)
    #K.set_value(model.optimizer.decay, lr_decay)

    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list, validation_data=(X_val, y_val))

    model.load_weights(file_path)

    y_pred = model.predict(X_test, batch_size=1024)
    gru_for_stack.loc[test_index,["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
gru_for_stack.to_csv(f'../submits/gru_with_attention_cv10/training_data1', index=False)

Train on 114890 samples, validate on 12766 samples
Epoch 1/20

 ROC-AUC - epoch: 1 - score: 0.965449 

Epoch 2/20

 ROC-AUC - epoch: 2 - score: 0.975566 

Epoch 3/20

 ROC-AUC - epoch: 3 - score: 0.976301 

Epoch 4/20

 ROC-AUC - epoch: 4 - score: 0.978507 

Epoch 5/20

 ROC-AUC - epoch: 5 - score: 0.980265 

Epoch 6/20

 ROC-AUC - epoch: 6 - score: 0.981835 

Epoch 7/20

 ROC-AUC - epoch: 7 - score: 0.982663 

Epoch 8/20

 ROC-AUC - epoch: 8 - score: 0.983443 

Epoch 9/20

 ROC-AUC - epoch: 9 - score: 0.983871 

Epoch 10/20

 ROC-AUC - epoch: 10 - score: 0.984343 

Epoch 11/20

 ROC-AUC - epoch: 11 - score: 0.985220 

Epoch 12/20

 ROC-AUC - epoch: 12 - score: 0.985208 

Train on 114891 samples, validate on 12766 samples
Epoch 1/20

 ROC-AUC - epoch: 1 - score: 0.912869 

Epoch 2/20

 ROC-AUC - epoch: 2 - score: 0.969813 

Epoch 3/20

KeyboardInterrupt: 

In [9]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 150, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 150, 256)     329472      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
permute_2 

In [None]:
%%time
batch_size = 300
epochs = 10

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/Polled_gru_with_conv_and_dense/{i}epoch.h5')

    y_pred = model.predict(x_test, batch_size=2048)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/Polled_gru_with_conv_and_dense/{i}epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.977701 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.983930 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.984834 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.986827 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.986717 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988229 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989507 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989258 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

In [9]:
batch_size = 128
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

model.save('../models/Polled_gru_double_GRU_0.4/6epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/Polled_gru_double_GRU_0.4/6epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.987003 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986401 



In [11]:
batch_size = 256
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

model.save('../models/Polled_gru_double_GRU_0.4/8epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/Polled_gru_double_GRU_0.4/8epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986093 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.985398 



KeyboardInterrupt: 

In [None]:
batch_size = 512
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

model.save('../models/Polled_gru_double_GRU_0.4/10epoch.h5')

y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('../submits/Polled_gru_double_GRU_0.4/10epoch.csv', index=False)