Model used to make predictions for the Antidote web extension.

The basic model structure is forked from [this Kaggle kernel](https://www.kaggle.com/joviis/keras-2bilstm-ensemble).

The model is trained on the dataset from the [Jigsaw Toxic Comment Competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview).

In [None]:
import json
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, CuDNNLSTM, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# Load the two CSV files. Note that the second frame is read from val.csv
# but is referred to as `test_df` / `test_X` throughout the notebook.
train_df = pd.read_csv("../input/google-data/train.csv")
test_df = pd.read_csv("../input/google-data/val.csv")
# Print the (rows, columns) shape of each frame as a quick sanity check.
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)




# Hyperparameters shared by the preprocessing and training cells below.
# Embedding width; reassigned further down from the loaded embedding array.
embed_size = 300
# Passed to the Tokenizer as num_words and to the Embedding layer as its input size.
max_features = 100000 
# Length that pad_sequences pads/truncates every tokenised comment to.
maxlen = 60
# Number of StratifiedKFold splits, i.e. how many models get trained.
num_folds = 2
# Epoch count passed to fit() for each fold (EarlyStopping is among the callbacks).
num_epochs = 20

# Pull the raw comment strings out of both frames; missing comments are
# replaced by the literal placeholder "_na_".
train_X = train_df["comment_text"].fillna("_na_").values
test_X = test_df["comment_text"].fillna("_na_").values

# The vocabulary is fitted on the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

# Words -> integer ids, then pad/truncate every comment to `maxlen` ids.
train_X = pad_sequences(tokenizer.texts_to_sequences(train_X), maxlen=maxlen)
test_X = pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=maxlen)

# Training labels: the 'toxic' column.
train_y = train_df['toxic'].values

# Persist the word -> id mapping next to the notebook as JSON.
with open('word_index.json', 'w') as vocab_file:
    json.dump(tokenizer.word_index, vocab_file)


EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt'


def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    Args:
        word: First field of a split embedding-file line.
        *arr: Remaining fields of that line (the vector components).

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {word: float32 vector}. Each line is split on
# single spaces: the first field is the word, the rest are vector components.
# The `with` block closes the handle even if a line fails to parse, which the
# previous manual open()/close() pair did not guarantee.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as f:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in f)

# Stack every pretrained vector into one 2-D array to get global statistics.
# dict.values() is a view, not a sequence, so it is materialised as a list
# first: recent NumPy releases refuse non-sequence inputs to np.stack.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is never assigned to a word), so a vocabulary of
# N words needs N + 1 rows. Without the "+ 1" the highest id indexed one past
# the end of the matrix whenever the vocabulary was smaller than max_features.
# NOTE: get_model() declares its Embedding layer with max_features rows, so
# the matrix only matches that layer when nb_words == max_features.
nb_words = min(max_features, len(word_index) + 1)

# Rows for words without a pretrained vector keep a random draw that follows
# the mean/std of the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
        
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy arrays and NumPy scalars.

    Arrays are emitted as (nested) lists; NumPy integer, floating and boolean
    scalars are emitted as the matching plain Python value. Any other object
    the base encoder cannot handle is passed on to
    ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Args:
            obj: Object the standard encoder could not serialise.

        Returns:
            A list for ``np.ndarray`` input, a Python scalar for NumPy
            integer/floating/boolean scalars.

        Raises:
            TypeError: For every other unsupported type (raised by the base
                class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (np.integer, np.floating, np.bool_)):
            # e.g. np.float32 / np.int64 values taken out of an array
            # element by element; .item() yields the plain Python scalar.
            return obj.item()
        return super().default(obj)

    
# Export the embedding matrix as nested JSON lists; NumpyEncoder performs the
# ndarray -> list conversion.
with open('embedding_matrix.json', 'w') as matrix_file:
    json.dump(embedding_matrix, matrix_file, cls=NumpyEncoder)


# Drop the large intermediates and force a garbage-collection pass.
del (
    train_df,
    test_df,
    tokenizer,
    embeddings_index,
    all_embs,
    word_index,
)
import gc

gc.collect()
# 10 s pause kept from the original; presumably gives the runtime time to
# release memory before training -- TODO confirm it is still needed.
time.sleep(10)

In [None]:
from keras import regularizers 
from keras.layers import BatchNormalization,Activation
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import keras

# Adam optimiser with an initial learning rate of 0.002.
adam = keras.optimizers.Adam(
    lr=0.002,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.0,
    amsgrad=False,
)

# Halve the learning rate (floor 1e-4) after one epoch without improvement of
# the monitored quantity.
# NOTE(review): 'val_acc' is the key older Keras versions log for
# metrics=['accuracy']; newer releases log 'val_accuracy' -- confirm against
# the installed version.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=1,
    min_lr=0.0001,
    verbose=1,
)

# Stop after 3 epochs without a better val_loss, keep the best-val_loss
# weights in best_model.h5, and apply the learning-rate schedule above.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True),
    learning_rate_reduction,
]

In [None]:
def get_model():
    """Build the uncompiled two-layer bidirectional-LSTM classifier.

    The network reads ``maxlen`` token ids, looks them up in a frozen
    Embedding layer initialised from ``embedding_matrix``, runs two stacked
    bidirectional LSTMs, max-pools over the sequence axis and finishes with
    a 128 -> 64 -> 1 dense head ending in a sigmoid.

    Returns:
        A Keras ``Model``; compiling it is left to the caller.
    """
    tokens = Input(shape=(maxlen,))

    # Pretrained vectors, not updated during training.
    h = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    h = Dropout(0.3)(h)

    # Two stacked BiLSTMs, both returning the full sequence for pooling.
    for _ in range(2):
        h = Bidirectional(LSTM(128, return_sequences=True))(h)
    h = GlobalMaxPool1D()(h)
    h = Dropout(0.3)(h)

    # Dense head: BatchNorm -> Dense(elu) -> Dropout, for 128 then 64 units.
    for units in (128, 64):
        h = BatchNormalization()(h)
        h = Dense(units, activation="elu")(h)
        h = Dropout(0.3)(h)

    prob = Dense(1, activation="sigmoid")(h)
    return Model(inputs=tokens, outputs=prob)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified K-fold splitter with `num_folds` splits. Shuffling is enabled and
# no random_state is given, so the fold assignment is not pinned to a seed here.
skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
# Bare expression: evaluates to the number of splits (the value is not stored).
skf.get_n_splits(train_X, train_y)

In [None]:
from sklearn.metrics import f1_score
def thresh_search(y_true, y_proba):
    """Find the decision threshold that gives the highest F1 score.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in ascending order; a
    prediction counts as positive when ``y_proba > threshold``. On ties the
    lowest threshold wins because only a strictly better score replaces the
    current best.

    Args:
        y_true: Ground-truth labels accepted by ``f1_score``.
        y_proba: Array of predicted scores, compared element-wise to each
            threshold.

    Returns:
        Tuple ``(best_threshold, best_f1)``; ``(0, 0)`` when no threshold
        scores above zero.
    """
    winner = (0, 0)  # (threshold, F1) of the best candidate so far
    for cutoff in np.arange(0, 1, 0.01):
        candidate = f1_score(y_true, y_proba > cutoff)
        if candidate > winner[1]:
            winner = (cutoff, candidate)
    return winner

In [None]:
from sklearn.metrics import f1_score

# Per-fold results, all indexed by fold number.
models = []       # trained model of each fold
y_pred = []       # predictions on test_X
pred_val_y = []   # predictions on the fold's validation part
tresh_f1 = []     # (best threshold, best F1) on the fold's validation part

# Snapshot the optimiser settings before any training. Compiling every fold
# with the one shared `adam` instance lets optimiser state -- notably a
# learning rate already lowered by ReduceLROnPlateau -- carry over from one
# fold into the next, so each fold gets its own optimiser built from this.
adam_config = adam.get_config()

# NOTE(review): the `callbacks` list is still shared by all folds, and its
# ModelCheckpoint writes every fold to the same 'best_model.h5' -- confirm
# that this is intended.
for i, (train_index, test_index) in enumerate(skf.split(train_X, train_y)):
    X_train, X_val = train_X[train_index], train_X[test_index]
    y_train, y_val = train_y[train_index], train_y[test_index]

    model = get_model()
    models.append(model)
    model.compile(
        loss='binary_crossentropy',
        optimizer=type(adam).from_config(adam_config),
        metrics=['accuracy'],
    )
    history = model.fit(
        X_train,
        y_train,
        batch_size=512,
        callbacks=callbacks,
        epochs=num_epochs,
        validation_data=(X_val, y_val),
    )

    y_pred.append(model.predict(test_X, batch_size=1024, verbose=True))
    pred_val_y.append(model.predict([X_val], batch_size=1024, verbose=1))
    # Tune the decision threshold on this fold's validation predictions.
    tresh_f1.append(thresh_search(y_val, pred_val_y[i]))

# Save each fold's model in its final state after fit().
for i, model in enumerate(models):
    model.save("model_{}".format(i))

In [None]:
# One "(threshold, f1)" tuple per fold, each followed by an empty line.
for i, j in tresh_f1:
    print((i, j), end='\n\n')