In [None]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import LSTM, Bidirectional,  GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import GRU
from keras.preprocessing import text, sequence
from gensim.models import KeyedVectors
from keras.callbacks import LearningRateScheduler

In [None]:
# Read the competition's train and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv',
        '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv',
    )
)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Preview the first 10 training rows.
train.head(10)

In [None]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in every text with a space.

    Adapted from https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Parameters
    ----------
    data : pandas.Series
        Texts to clean. Every value is cast to ``str`` before cleaning.

    Returns
    -------
    pandas.Series
        Series with the same index as ``data``. Each character listed in
        ``punct`` is replaced one-for-one by a single space, so text length
        is preserved and runs of punctuation become runs of spaces.
    '''
    # The backslash in front of the multiplication sign is spelled as an
    # escaped backslash: the previous spelling was an invalid escape sequence,
    # which Python warns about at compile time. The resulting set of
    # characters is unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # Build the mapping once; str.translate then cleans a text in a single
    # pass instead of one full .replace() scan per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))

    def clean_special_chars(text):
        return text.translate(to_space)

    data = data.astype(str).apply(clean_special_chars)
    return data

In [None]:
# Binary label: 1 when the toxicity score is at least 0.5, otherwise 0.
y_train = np.where(train['target'].ge(0.5), 1, 0)
# Model input: comment text with punctuation replaced by spaces.
x_train = preprocess(train['comment_text'])

In [None]:
# Auxiliary target columns kept alongside the main binary label.
aux_columns = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
y_aux_train = train[aux_columns]
# Test comments go through the same cleaning as the training comments.
x_test = preprocess(test['comment_text'])

#### Tokenization

In [None]:
MAX_LEN = 220 # number of token ids per comment; passed as `maxlen` when the sequences are padded

In [None]:
# Build the vocabulary from both splits so test-only words also get an index.
tokenizer = text.Tokenizer()
corpus = list(x_train) + list(x_test)
tokenizer.fit_on_texts(corpus)
del corpus  # temporary list only needed while fitting

In [None]:
# Convert each cleaned comment into its list of vocabulary indices
# (train first, then test).
x_train, x_test = map(tokenizer.texts_to_sequences, (x_train, x_test))

In [None]:
# Bring every index sequence to exactly MAX_LEN entries so the splits become
# rectangular arrays (padding/truncation side follows the Keras defaults).
x_train, x_test = [
    sequence.pad_sequences(token_ids, maxlen=MAX_LEN)
    for token_ids in (x_train, x_test)
]

In [None]:
# Free some memory: x_train, y_train and y_aux_train have already been
# derived from `train`, so the raw frame is dropped here. `test` is kept.
import gc
del train
gc.collect()

#### Embedding

In [None]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip

In [None]:
# Embedding text files to load; one matrix is built per entry and the results
# are concatenated along the feature axis.
# NOTE(review): expected to be the file extracted from the downloaded archive — confirm.
EMBEDDING_FILES = ['glove.840B.300d.txt']

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its vector.

    Parameters
    ----------
    word : str
        The token (first field of an embedding line).
    *arr : str
        The remaining fields of the line, one number per field.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a float32 numpy array.
    """
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    """Load a space-separated embedding text file into a dict.

    Each line is expected to look like ``token v1 v2 ... vN``.

    Parameters
    ----------
    path : str
        Location of the embedding text file.

    Returns
    -------
    dict
        Maps each token to its float32 vector. If a token appears on several
        lines, the last one wins.
    """
    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens,
    # and relying on the platform default encoding makes the load fail (or
    # silently mis-decode) on systems whose default is not UTF-8.
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

In [None]:
def build_matrix(word_index, path):
    """
    Build the embedding matrix for a vocabulary.

    word_index: dict mapping each word to its integer index,
                e.g. {'apple': 123, 'banana': 349}
    path: location of the pre-trained embedding file to read

    Returns an array of shape (len(word_index) + 1, 300). Row i holds the
    pre-trained vector of the word whose index is i; rows of words missing
    from the embedding file stay all-zero.
    """
    pretrained = load_embeddings(path)
    n_rows = len(word_index) + 1
    matrix = np.zeros((n_rows, 300))
    for token, row in word_index.items():
        # Words without a pre-trained vector keep their zero row.
        if token in pretrained:
            matrix[row] = pretrained[token]
    return matrix

In [None]:
# Build one matrix per embedding file and join them along the feature axis.
per_file_matrices = []
for embedding_path in EMBEDDING_FILES:
    per_file_matrices.append(build_matrix(tokenizer.word_index, embedding_path))
embedding_matrix = np.concatenate(per_file_matrices, axis=-1)
del per_file_matrices  # do not keep the per-file copies alive

In [None]:
# The tokenizer was only needed to index the texts and to build
# embedding_matrix; drop it and reclaim the memory.
del tokenizer
gc.collect()

# LSTM Model

In [None]:
# --- training schedule ---
EPOCHS = 5
BATCH_SIZE = 512        # training samples fed to the model in each step

# --- recurrent layer widths ---
LSTM_UNITS = 128        # size of the output vector of each LSTM cell
GRU_UNITS = 128         # size of the output vector of each GRU cell

# --- other settings ---
# NOTE(review): the three values below are not referenced by the cells
# visible in this notebook — confirm whether they are still needed.
NUM_MODELS = 2
MAX_FEATURES = 100000   # maximum number of different words to keep from the texts
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4

In [None]:
from keras.models import Sequential

# Frozen pre-trained embeddings -> LSTM emitting one vector per time step
# -> average over time -> single sigmoid unit.
vocab_size, embedding_dim = embedding_matrix.shape
LSTM_model = Sequential([
    Embedding(vocab_size, embedding_dim, weights = [embedding_matrix],
              input_length = MAX_LEN, trainable = False),
    LSTM(units = LSTM_UNITS, return_sequences = True),
    GlobalAveragePooling1D(),
    Dense(units = 1, activation = 'sigmoid'),
])

In [None]:
# Binary cross-entropy matches the model's single sigmoid output.
LSTM_model.compile(optimizer = 'adam', loss = 'binary_crossentropy')

# The model has exactly one output, so it must receive exactly one target
# array. Passing [y_train, y_aux_train] makes Keras reject the call because
# the number of target arrays does not match the number of outputs.
# Using y_aux_train would require a model with a second output head.
LSTM_model.fit(x_train, y_train, epochs=EPOCHS,
               batch_size = BATCH_SIZE)

In [None]:
# Score the padded test sequences with the trained LSTM model
# (inference batch size is independent of the training BATCH_SIZE).
LSTM_pred = LSTM_model.predict(x_test, batch_size=2048)

# GRU Model

In [None]:
# Same stack as the LSTM model, with a GRU as the recurrent layer:
# frozen embeddings -> GRU (one vector per time step) -> time average -> sigmoid.
gru_layers = [
    Embedding(*embedding_matrix.shape, weights = [embedding_matrix],
              input_length = MAX_LEN, trainable = False),
    GRU(units = GRU_UNITS, return_sequences = True),
    GlobalAveragePooling1D(),
    Dense(units = 1, activation = 'sigmoid'),
]
GRU_model = Sequential(gru_layers)

In [None]:
# Binary cross-entropy matches the model's single sigmoid output.
GRU_model.compile(optimizer = 'adam', loss = 'binary_crossentropy')

# The model has exactly one output, so it must receive exactly one target
# array. Passing [y_train, y_aux_train] makes Keras reject the call because
# the number of target arrays does not match the number of outputs.
# Using y_aux_train would require a model with a second output head.
GRU_model.fit(x_train, y_train, epochs=EPOCHS,
              batch_size = BATCH_SIZE)

In [None]:
# Score the padded test sequences with the trained GRU model
# (inference batch size is independent of the training BATCH_SIZE).
GRU_pred = GRU_model.predict(x_test, batch_size=2048)

In [None]:
# Sanity check: number of predictions, to compare against the test row count.
len(GRU_pred)

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Submission export (currently disabled).
# NOTE(review): GRU_pred comes from a Dense(units=1) output, so it is
# presumably shaped (n_samples, 1); flatten it (e.g. GRU_pred.ravel()) before
# using it as a DataFrame column — confirm when re-enabling.
# submission = pd.DataFrame.from_dict({
#     'id': test['id'],
#     'prediction': GRU_pred
# })

# submission.to_csv('submission.csv', index=False)