In [None]:
import pickle
from statistics import mean

import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.models import load_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

In [None]:
import warnings
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

import os
# Set the OMP_NUM_THREADS environment variable to '4'.
# NOTE(review): numpy and keras are already imported by the time this runs;
# thread-count variables are usually read when the native runtime loads, so
# this may have no effect here -- confirm, and consider setting it before
# those imports.
os.environ['OMP_NUM_THREADS'] = '4'

In [None]:
# Path of the pre-trained word-vector text file parsed later in the notebook.
EMBEDDING_FILE = 'glove.6B.50d.txt'

train = pd.read_csv('input/train.csv')

# Model input: the raw comment strings. Missing comments are replaced by the
# literal placeholder string "fillna".
X_train = train["comment_text"].fillna("fillna").values

# Model targets: one column per label, in this fixed order.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y_train = train[label_columns].values

In [None]:
max_features = 30000  # vocabulary cap handed to the tokenizer (num_words)
maxlen = 100          # length every sequence is padded/truncated to
embed_size = 50       # width of each embedding vector; the vectors loaded
                      # from EMBEDDING_FILE must have this length

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))

# Persist the fitted tokenizer so the same word -> index mapping can be
# reloaded later (e.g. alongside the saved model).
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Keep the raw text in X_train and store the integer sequences under their
# own name: overwriting X_train would make this cell fail on a second run,
# because the tokenizer would then be fitted on lists of integers.
train_sequences = tokenizer.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)

In [None]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: The token; returned unchanged.
        *arr: The remaining fields (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build a {word: vector} lookup from the embedding text file. Each line is
# stripped of trailing whitespace and split on single spaces; get_coefs takes
# the first field as the word and converts the rest to a float32 vector.
# The file is opened in a `with` block so the handle is closed once the
# dictionary has been built (the previous one-liner never closed it).
with open(EMBEDDING_FILE, encoding='UTF-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

In [None]:
# Build the initial embedding weights: row i holds the pre-trained vector of
# the word whose tokenizer index is i; words without a pre-trained vector
# keep an all-zero row.
word_index = tokenizer.word_index

# Keras' Tokenizer numbers words starting at 1 (index 0 is reserved), so the
# matrix needs len(word_index) + 1 rows to hold the highest index. With a
# vocabulary larger than max_features this is still capped at max_features,
# exactly as before; with a smaller vocabulary it avoids an IndexError on the
# last word.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Args:
        validation_data: A ``(X_val, y_val)`` pair. Unpacking anything that is
            not a 2-item sequence (including the default ``()``) raises
            ``ValueError``.
        interval: Evaluate at the end of every epoch whose zero-based index is
            a multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Start the MRO lookup from this class so Callback.__init__ actually
        # runs; super(Callback, self) skipped the base-class initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the resulting ROC-AUC.

        ``logs`` is accepted for the Keras callback signature but not used;
        it defaults to None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Report epochs 1-based to match the training log.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
def get_model():
    """Build and compile the bidirectional-GRU multi-label classifier.

    Architecture: embedding (initialised from ``embedding_matrix``) ->
    spatial dropout -> bidirectional GRU returning the full sequence ->
    concatenated global average and max pooling -> 6 sigmoid outputs.

    Reads the module-level ``maxlen`` and ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimiser,
        accuracy metric).
    """
    # Size the embedding layer from the weight matrix itself so the layer and
    # its initial weights can never disagree, e.g. when the fitted vocabulary
    # is smaller than max_features and the matrix has fewer rows.
    vocab_size, vector_size = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    # Summarise the sequence of GRU states two ways and use both.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    # One independent sigmoid per label.
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Instantiate the compiled network defined by get_model().
model = get_model()

In [None]:
# Training hyperparameters passed to model.fit below.
batch_size = 32  # samples per gradient update
epochs = 2       # full passes over the training split

In [None]:
# Hold out 5% of the padded sequences (and their labels) for validation;
# the fixed random_state keeps the split the same across runs.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

# Callback that prints the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(
    validation_data=(X_val, y_val),
    interval=1,
)

In [None]:
# Train the network; `hist` keeps the object returned by fit. The validation
# split is passed both to Keras and to the ROC-AUC callback.
hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)

In [None]:
# Persist the trained model to disk.
model.save('my_model.h5')  # creates an HDF5 file 'my_model.h5'

In [None]:
# Score the trained model on the held-out split. `y_pred` holds one column of
# predicted probabilities per label and is reused by the per-class evaluation
# further down; roc_auc_score is applied here to the full label matrix at once.
y_pred = model.predict(X_val, verbose=0)
score = roc_auc_score(y_val, y_pred)
print(score)

In [None]:
def confusion_matrix(y_pred,y):
    """Count binary prediction outcomes and return them as a 2x2 array.

    A prediction counts as "positive" when it equals 1; anything else is
    treated as negative. Outcomes are decided by whether the prediction
    equals the true value:

    * equal and positive      -> true positive
    * equal and not positive  -> true negative
    * differ and positive     -> false positive
    * differ and not positive -> false negative

    Args:
        y_pred: Array-like of predicted labels.
        y: Array-like of true labels, same shape as ``y_pred``.

    Returns:
        ``np.array([[tn, fp], [fn, tp]])``, or ``None`` (after printing
        "Dimensions mismatch!") when the two inputs differ in shape.
    """
    # Accept lists/tuples as well as arrays.
    y_pred = np.asarray(y_pred)
    y = np.asarray(y)
    if y_pred.shape != y.shape:
        print("Dimensions mismatch!")
        return None

    # Vectorised counting replaces the per-element Python loop.
    predicted_positive = (y_pred == 1)
    agrees = (y_pred == y)
    tp = np.count_nonzero(agrees & predicted_positive)
    tn = np.count_nonzero(agrees & ~predicted_positive)
    fp = np.count_nonzero(~agrees & predicted_positive)
    fn = np.count_nonzero(~agrees & ~predicted_positive)

    return np.array([tn, fp, fn, tp]).reshape(2, 2)

In [None]:
def threshold(value,thr):
    """Return 1 when ``value`` is strictly greater than ``thr``, otherwise 0."""
    return 1 if value > thr else 0

# Element-wise version of `threshold` for arrays of scores. Declaring the
# output type up front means numpy does not have to probe the function on the
# first element, and lets the call succeed on empty input (without otypes,
# np.vectorize raises ValueError for size-0 arrays).
my_thr=np.vectorize(threshold, otypes=[int])

In [None]:
# Per-label evaluation on the validation split: ROC-AUC plus a confusion
# matrix at a 0.5 cut-off for each class, followed by the mean ROC-AUC.

# Candidate cut-offs for a threshold sweep; only 0.5 is applied below.
thresholds = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
scores = []
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for i, c in enumerate(classes):
    print(c)
    # y_val and y_pred are 2-D arrays with one column per label, in the same
    # order as `classes`, so the column is selected by position. Indexing a
    # numpy array with the label name (y_val[c]) raises IndexError.
    y_true_c = y_val[:, i]
    y_prob_c = y_pred[:, i]
    score = roc_auc_score(y_true_c, y_prob_c)
    print(score)
    print(confusion_matrix(my_thr(y_prob_c, 0.5), y_true_c))
    scores.append(score)
print('\n')
print("Mean")
print(mean(scores))
print('\n')