In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import nltk
from numba import jit
from gensim.models import FastText
from joblib import Parallel, delayed
from tqdm import tqdm
import string
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

EMBEDDING_FILE = '../data/crawl-300d-2M.vec'

train = pd.read_csv('../data/train.csv.zip')
train["comment_text"] = train.comment_text.apply(lambda x: x[:800])
test = pd.read_csv('../data/test.csv.zip')
test["comment_text"] = test.comment_text.apply(lambda x: x[:800])
submission = pd.read_csv('../data/sample_submission.csv.zip')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def lawer(sen):
    count = 0
    for i in sen:
        if i.isupper():
            count += 1
    if len(sen) == 0: 
        return sen
    if count/len(sen) > 0.2: 
        sen.append("gronker")
    return sen

In [3]:
def stamer(sen):
    return list(map(st.stem,sen))
st = nltk.stem.PorterStemmer()

In [4]:
def token_to_sen(sen):
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in sen]).strip()

In [5]:
def token(sen):
    return text.text_to_word_sequence(sen)

In [6]:
%%time
tr = Parallel(n_jobs=16)(delayed(token)(x) for x in tqdm(X_train))
tes = Parallel(n_jobs=16)(delayed(token)(x) for x in tqdm(X_test))
#gro_tr = Parallel(n_jobs=16)(delayed(lawer)(x) for x in tqdm(tr))
#gro_tes = Parallel(n_jobs=16)(delayed(lawer)(x) for x in tqdm(tes))
#lem_tr = Parallel(n_jobs=16)(delayed(stamer)(x) for x in tqdm(gro_tr))
#lem_tes = Parallel(n_jobs=16)(delayed(stamer)(x) for x in tqdm(gro_tes))
keras_ready_tr = Parallel(n_jobs=16)(delayed(token_to_sen)(x) for x in tqdm(tr))
keras_ready_tes = Parallel(n_jobs=16)(delayed(token_to_sen)(x) for x in tqdm(tes))

100%|██████████| 159571/159571 [00:02<00:00, 65906.16it/s]
100%|██████████| 153164/153164 [00:03<00:00, 46993.10it/s]
100%|██████████| 159571/159571 [00:03<00:00, 42071.14it/s]
100%|██████████| 153164/153164 [00:03<00:00, 43295.77it/s]


CPU times: user 11.2 s, sys: 2.28 s, total: 13.5 s
Wall time: 14.4 s


In [10]:
a = [0.05, 0.1, 0.25, 0.5]

In [None]:
%%time
for i in tqdm(a):
    model = FastText(tes+tr, size=300, window=5, workers=16, iter=5,alpha=i)
    model.save(f"../models/FastText/Fasttest_alpha_{i}_5iter.gen")

 25%|██▌       | 1/4 [08:10<24:32, 490.93s/it]

In [11]:
a

[0.05, 0.1, 0.25, 0.5]

In [7]:
model = FastText.load("../models/FastText/Fasttest_alpha_0.5_5iter.gen")

In [8]:
fast_vec = dict(zip(model.wv.index2word, model.wv.syn0))

In [10]:
max_features = 60000
maxlen = 800
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features, lower=False)
tokenizer.fit_on_texts(keras_ready_tr+keras_ready_tes)
X_train = tokenizer.texts_to_sequences(keras_ready_tr)
X_test = tokenizer.texts_to_sequences(keras_ready_tes)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [11]:
%%time
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

CPU times: user 1min 46s, sys: 2.61 s, total: 1min 49s
Wall time: 1min 49s


In [12]:
np.vstack((np.zeros(2),np.zeros(2)))

array([[0., 0.],
       [0., 0.]])

In [13]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size*2))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector_crawl = embeddings_index.get(word)
    embedding_vector_self = fast_vec.get(word)
    if (embedding_vector_crawl is not None) and (embedding_vector_self is not None): 
        embedding_matrix[i] = np.hstack((embedding_vector_crawl,embedding_vector_self))
        
    elif (embedding_vector_crawl is None) and (embedding_vector_self is not None):
        embedding_matrix[i] = np.hstack((np.zeros(300),embedding_vector_self))
    
    else: 
        embedding_matrix[i] = np.hstack((np.zeros(300),np.zeros(300)))

In [14]:
embedding_matrix.shape

(60000, 600)

In [None]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size*2, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNGRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()

In [None]:
batch_size = 300

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
for epoch in range(1,10):
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    model.save(f'../models/Polled_gru_one_layer_0.2_double_embedsize/alpha_0.5_{epoch}epoch.h5')

    y_pred = model.predict(x_test, batch_size=200)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    submission.to_csv(f'../submits/Polled_gru_one_layer_0.2_double_embedsize/alpha_0.5_{epoch}epoch.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.881331 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.967567 

Train on 151592 samples, validate on 7979 samples
Epoch 1/1

 ROC-AUC - epoch: 1 - score: 0.973070 

