In [1]:
import numpy as np
import pandas as pd

In [2]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping,ModelCheckpoint

Using TensorFlow backend.


In [3]:
# Vocabulary cap: used as the Tokenizer's `num_words` and as the Embedding
# layer's input dimension.
max_features = 20000

# Fixed sequence length: used as `maxlen` for pad_sequences and as the
# model's Input shape.
maxlen = 100

In [4]:
# Load the training table; the path is relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer="Data/train.csv")

In [5]:
# Label columns; their order here fixes the column order of `y`.
list_classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
y = train[list_classes].values

# Raw comment texts as an array; missing comments become the placeholder "unknown".
list_sentences_train = (
    train['comment_text']
    .fillna("unknown")
    .values
)

# Test-set counterpart, kept disabled (no `test` frame is loaded in the cells shown here):
#list_sentences_test = test['comment_text'].fillna("unknown").values

In [7]:
# Fit the tokenizer on the training comments only, with `num_words` set to
# `max_features`.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts=list(list_sentences_train))

# Convert each comment into its sequence of integer token ids.
list_tokenized_train = tokenizer.texts_to_sequences(texts=list_sentences_train)
# list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Bring every sequence to exactly `maxlen` entries so they stack into one matrix.
X_t = sequence.pad_sequences(
    sequences=list_tokenized_train,
    maxlen=maxlen,
)
#X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [9]:
def get_model(embed_size=128, lstm_units=50, dense_units=50,
              dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM comment classifier.

    Layer stack: Embedding -> Bidirectional(LSTM, returning the full
    sequence) -> GlobalMaxPool1D -> Dropout -> Dense(relu) -> Dropout ->
    Dense(sigmoid). The model is compiled with binary cross-entropy loss,
    the Adam optimizer and the 'accuracy' metric.

    The input length and the embedding vocabulary size are read from the
    module-level names `maxlen` and `max_features`, which must be defined
    before this function is called.

    Calling `get_model()` with no arguments builds exactly the network the
    original hard-coded version built.

    Args:
        embed_size: Dimension of each embedding vector.
        lstm_units: Units per LSTM direction (the bidirectional wrapper
            yields twice this many features per time step).
        dense_units: Width of the hidden relu layer.
        dropout_rate: Rate applied by both Dropout layers.
        num_classes: Number of sigmoid outputs; must equal the number of
            label columns in the target array passed to `fit`.

    Returns:
        A compiled Keras `Model`.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps one output per time step so the pooling
    # layer below can take the maximum over the whole sequence.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    # Independent sigmoid per output, paired with binary cross-entropy.
    x = Dense(num_classes, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=x)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Build the compiled network and print its layer-by-layer summary
# (output shapes and parameter counts).
model = get_model()
model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 100)          71600     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 100)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
__________

In [None]:
# Training hyperparameters.
batch_size = 32
epochs = 2

# Checkpoint target: the weights are written here whenever validation loss
# improves (save_best_only=True).
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    filepath=file_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# NOTE(review): patience=20 is larger than epochs=2, so with the current
# settings this callback can never stop training early.
early = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
)

callbacks_list = [checkpoint, early]

# validation_split=0.1 holds out 10% of the samples for the `val_loss`
# that both callbacks monitor.
model.fit(
    x=X_t,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)


Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2

In [None]:
# Load the weights stored at `file_path` (the ModelCheckpoint target) into the model.
model.load_weights(file_path)
# NOTE(review): X_t is the padded *training* matrix, so despite its name
# `y_test` holds predictions for the training comments; the test-set
# preprocessing lines are commented out. Confirm this is intended.
y_test = model.predict(X_t)