In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam

import nltk
from numba import jit
from gensim.models import FastText
from joblib import Parallel, delayed
from tqdm import tqdm
import string
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Path to the pre-trained fastText "crawl" vectors in text (.vec) format.
EMBEDDING_FILE = '../data/crawl-300d-2M.vec'

train = pd.read_csv('../data/train.csv.zip')
test = pd.read_csv('../data/test.csv.zip')
submission = pd.read_csv('../data/sample_submission.csv.zip')

# Fill missing comments BEFORE truncating: pandas stores a missing value as a
# float NaN, and slicing a float (x[:800]) raises TypeError. Comments are then
# capped at 800 characters.
train["comment_text"] = train["comment_text"].fillna("fillna").str[:800]
test["comment_text"] = test["comment_text"].fillna("fillna").str[:800]

# Raw comment strings and the six binary label columns as numpy arrays.
X_train = train["comment_text"].values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].values


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def lawer(sen):
    """Tag a token list as "shouty" when many of its tokens are upper-case.

    If more than 20% of the tokens satisfy ``str.isupper()``, the marker
    token ``"gronker"`` is appended to ``sen`` IN PLACE. The same list
    object is always returned; an empty list is returned untouched.
    """
    if len(sen) == 0:
        return sen
    upper_tokens = sum(1 for tok in sen if tok.isupper())
    if upper_tokens / len(sen) > 0.2:
        sen.append("gronker")
    return sen

In [3]:
# Shared Porter stemmer instance used by stamer().
st = nltk.stem.PorterStemmer()


def stamer(sen):
    """Return a new list with every token of ``sen`` passed through ``st.stem``."""
    return [st.stem(word) for word in sen]

In [4]:
def token_to_sen(sen):
    """Join a list of tokens back into a single sentence string.

    Tokens are separated by one space, except tokens that start with an
    apostrophe or that are found inside ``string.punctuation`` (a substring
    check), which are glued directly to the preceding text. Leading and
    trailing whitespace of the result is stripped.
    """
    pieces = []
    for tok in sen:
        glue_to_previous = tok.startswith("'") or tok in string.punctuation
        pieces.append(tok if glue_to_previous else " " + tok)
    return "".join(pieces).strip()

In [5]:
def token(sen):
    """Split one raw comment into a list of word tokens.

    Thin wrapper around Keras' ``text.text_to_word_sequence`` called with
    its default settings.
    """
    words = text.text_to_word_sequence(sen)
    return words

In [6]:
%%time
tr = Parallel(n_jobs=16)(delayed(token)(x) for x in tqdm(X_train))
tes = Parallel(n_jobs=16)(delayed(token)(x) for x in tqdm(X_test))
#gro_tr = Parallel(n_jobs=16)(delayed(lawer)(x) for x in tqdm(tr))
#gro_tes = Parallel(n_jobs=16)(delayed(lawer)(x) for x in tqdm(tes))
#lem_tr = Parallel(n_jobs=16)(delayed(stamer)(x) for x in tqdm(gro_tr))
#lem_tes = Parallel(n_jobs=16)(delayed(stamer)(x) for x in tqdm(gro_tes))
keras_ready_tr = Parallel(n_jobs=16)(delayed(token_to_sen)(x) for x in tqdm(tr))
keras_ready_tes = Parallel(n_jobs=16)(delayed(token_to_sen)(x) for x in tqdm(tes))

100%|██████████| 159571/159571 [00:02<00:00, 58584.11it/s]
100%|██████████| 153164/153164 [00:02<00:00, 51740.88it/s]
100%|██████████| 159571/159571 [00:03<00:00, 45957.26it/s]
100%|██████████| 153164/153164 [00:03<00:00, 44508.24it/s]


CPU times: user 11.2 s, sys: 1.86 s, total: 13 s
Wall time: 13.9 s


In [7]:
# Load a previously saved gensim FastText model from disk.
# NOTE: the name `model` is reused further down for the Keras network.
model = FastText.load("../models/FastText/Fasttest_alpha_0.05_5iter.gen")

In [8]:
# Word -> vector lookup built from the loaded FastText model's vocabulary
# list and its parallel array of vectors.
fast_vec = {word: vector for word, vector in zip(model.wv.index2word, model.wv.syn0)}

In [9]:
max_features = 60000  # vocabulary size kept by the tokenizer / embedding rows
maxlen = 800          # padded sequence length
embed_size = 300      # embedding vector dimension

# lower=False: the Tokenizer is told not to lower-case its input itself.
tokenizer = text.Tokenizer(num_words=max_features, lower=False)
# The vocabulary is fitted on train and test text together.
tokenizer.fit_on_texts(keras_ready_tr + keras_ready_tes)

# The integer sequences get their own names instead of overwriting
# X_train / X_test (the raw comment strings). Overwriting them made the
# tokenisation cell fail when re-run after this one.
train_sequences = tokenizer.texts_to_sequences(keras_ready_tr)
test_sequences = tokenizer.texts_to_sequences(keras_ready_tes)

# Fixed-length integer matrices fed to the network.
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [10]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# Always allocate max_features rows. The Embedding layer in get_model() is
# created with max_features rows, and Tokenizer indices start at 1, so a
# matrix with only len(word_index) rows would be one row short (IndexError
# or weight-shape mismatch) whenever the vocabulary is smaller than
# max_features. Rows without a known vector stay all-zero.
embeddings_matrix_self = np.zeros((max_features, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = fast_vec.get(word)
    if embedding_vector is not None:
        embeddings_matrix_self[i] = embedding_vector

# Drop the FastText model and its lookup dict; they are no longer referenced below.
del model
del fast_vec

In [11]:
%%time
def get_coefs(word, *arr):
    """Turn one split embedding-file line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; the remaining positional values are
    converted to a float32 numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the whole .vec file into a word -> vector dict. The `with` block
# closes the file handle once the dict is built, and the explicit UTF-8
# encoding avoids depending on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index_crawl = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )
print("ok")

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# Always allocate max_features rows so the matrix matches the Embedding layer
# built in get_model() and never runs out of rows for the 1-based tokenizer
# indices, even when the vocabulary is smaller than max_features.
# Rows without a pre-trained vector stay all-zero.
embedding_matrix_crawl = np.zeros((max_features, embed_size))

for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index_crawl.get(word)
    if embedding_vector is not None:
        embedding_matrix_crawl[i] = embedding_vector

# Drop the full lookup dict; only the matrix is needed from here on.
del embeddings_index_crawl

ok
CPU times: user 1min 44s, sys: 2.69 s, total: 1min 46s
Wall time: 1min 46s


In [12]:
# Disabled GloVe embedding loader, kept only as a string literal (it is never executed).
# NOTE(review): inside the string, the loop assigns to `embeddings_index_glove[i]`
# (the lookup dict) rather than `embeddings_matrix_glove[i]` (the zero matrix
# allocated just above it) - fix this before re-enabling the code.
'''embeddings_index_glove = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open("../data/glove.840B.300d.txt"))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embeddings_matrix_glove = np.zeros((nb_words, embed_size))

for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index_glove.get(word)
    if embedding_vector is not None: embeddings_index_glove[i] = embedding_vector
        
del(embeddings_index_glove)'''

In [12]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after an epoch.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; it is unpacked in ``__init__``, so anything
        that is not a 2-item iterable raises ``ValueError``.
    interval : int
        The score is computed when ``epoch % interval == 0`` (``epoch`` is
        the 0-based index Keras passes to ``on_epoch_end``).
    """

    def __init__(self, validation_data=(), interval=1):
        # The original called super(Callback, self).__init__(), which starts
        # the MRO lookup *after* Callback and therefore skipped
        # Callback.__init__ altogether.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score."""
        # `logs` is unused; None replaces the shared mutable default `{}`.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def _pooled_gru_conv_branch(sequence_input, embedding_matrix, unit):
    """Build one frozen-embedding -> BiGRU -> Conv1D branch and pool it.

    Returns the ``(average_pool, max_pool)`` tensors computed over the
    convolution output.
    """
    branch = Embedding(max_features, embed_size, weights=[embedding_matrix],
                       trainable=False)(sequence_input)
    branch = SpatialDropout1D(0.2)(branch)
    branch = Bidirectional(GRU(unit, return_sequences=True,
                               dropout=0.1, recurrent_dropout=0.1))(branch)
    branch = Conv1D(64, kernel_size=3, padding="valid",
                    kernel_initializer="glorot_uniform")(branch)
    return GlobalAveragePooling1D()(branch), GlobalMaxPooling1D()(branch)


def get_model(unit=80):
    """Build and compile the two-embedding pooled BiGRU + Conv1D classifier.

    The same integer input sequence is embedded twice, once with the crawl
    vectors (``embedding_matrix_crawl``) and once with the self-trained
    vectors (``embeddings_matrix_self``). Each embedding goes through its own
    BiGRU/Conv1D branch; the four pooled outputs are concatenated and fed to
    a 6-unit sigmoid layer.

    Parameters
    ----------
    unit : int
        Number of GRU units per direction in each branch.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, Adam with lr=1e-3).
    """
    sequence_input = Input(shape=(maxlen, ))

    # Branch order (crawl first, then self-trained) matches the original
    # layer-creation order.
    avg_pool_crawl, max_pool_crawl = _pooled_gru_conv_branch(
        sequence_input, embedding_matrix_crawl, unit)
    avg_pool_self, max_pool_self = _pooled_gru_conv_branch(
        sequence_input, embeddings_matrix_self, unit)

    # A third, GloVe-based branch and an extra Dense(128)/Dropout(0.1) head
    # were tried here and are currently disabled.

    # Concatenation order is kept exactly as before so that previously saved
    # weights for the output layer still line up.
    x = concatenate([avg_pool_self, avg_pool_crawl, max_pool_crawl, max_pool_self])
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])
    return model

# Instantiate the network with the default GRU size (unit=80).
# NOTE: this rebinds `model`, which earlier held the FastText model.
model = get_model()

In [None]:
%%time
batch_size = 300
epochs = 20

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for i in range(1,epochs+1):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/Polled_GRU_conv_double_way/{i}epoch.h5')

    y_pred = model.predict(x_test, batch_size=batch_size)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/Polled_GRU_conv_double_way/{i}epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.986601 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.988228 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989208 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989689 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.990075 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989659 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.989849 

