In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which files are present under ../input by running `ls` and
# decoding its raw byte output before printing.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [2]:
import keras.backend as K
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Embedding, LSTM, Input, Dropout, Flatten
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [3]:
# Read the training set, the test set and the sample submission (in that
# order) from the input directory into three DataFrames.
train_df, test_df, submission_df = map(
    pd.read_csv,
    ('../input/train.csv', '../input/test.csv', '../input/sample_submission.csv'),
)

In [4]:
# Display (rows, columns) for the training frame, then the test frame.
tuple(frame.shape for frame in (train_df, test_df))

In [5]:
# Replace every missing value in both frames with a placeholder token.
# The fill is applied in place and touches all columns, not only the text.
for frame in (train_df, test_df):
    frame.fillna(' UNKNOWN ', inplace = True)

In [6]:
# Text tokenizer shared by the training and test comments.
# NOTE(review): num_words presumably caps the vocabulary used when converting
# texts to index sequences — confirm against the Keras Tokenizer docs.
tokenize = Tokenizer(num_words = 50000)

In [7]:
# Fit the tokenizer on the training comments only; the test comments are
# not used here.
# NOTE(review): fit_on_texts appears to update `tokenize` in place and return
# None, so `train_fit` is presumably None — confirm before relying on it.
train_comments = train_df['comment_text'].values
train_fit = tokenize.fit_on_texts(train_comments)

In [8]:
# Convert the raw comments of each frame into sequences using the
# tokenizer fitted above (training frame first, then test frame).
train_text, test_text = (
    tokenize.texts_to_sequences(frame['comment_text'].values)
    for frame in (train_df, test_df)
)

In [9]:
# Pad both sets of sequences with the same maxlen of 400, giving the
# model inputs for training and for prediction.
X_train, X_test = (
    pad_sequences(sequences, maxlen = 400)
    for sequences in (train_text, test_text)
)

In [11]:
# Target matrix: one column per label, taken from the training frame in
# the order listed below.
y_train = train_df[[
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]].values

In [12]:
# Report the shapes of the training inputs/targets and of the test inputs.
print("Training Data: {} {}".format(*(arr.shape for arr in (X_train, y_train))))
print("Testing Data: {}".format(X_test.shape))

In [13]:
def classification_model(vocab_size = 50000, embed_dim = 32, seq_len = 400, n_labels = 6):
    """Build the (uncompiled) multi-label comment classifier.

    The network receives padded sequences of integer word indices. Those
    indices are arbitrary identifiers rather than magnitudes, so they are
    first mapped to learned dense vectors by an Embedding layer and then
    flattened; the stack of fully connected layers operates on that
    representation instead of on the raw index values.

    Parameters
    ----------
    vocab_size : int, optional
        Number of distinct token indices the Embedding can look up. The
        default matches the Tokenizer(num_words = 50000) used in this notebook.
    embed_dim : int, optional
        Length of the vector learned for each token index.
    seq_len : int, optional
        Length of every input sequence. The default matches the maxlen = 400
        passed to pad_sequences in this notebook.
    n_labels : int, optional
        Number of sigmoid output units, one per target column.

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    # Start from a clean backend session so repeated calls do not pile up
    # state from previously built models.
    K.clear_session()

    model = Sequential()
    # Token ids -> dense vectors; input_length is given so Flatten yields a
    # fixed-size output that the Dense layers can consume.
    model.add(Embedding(vocab_size, embed_dim, input_length = seq_len))
    model.add(Flatten())
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(64, activation = 'relu'))
    model.add(Dense(128, activation = 'relu'))
    model.add(Dense(64, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    # Independent sigmoid per label: a comment may carry several labels at once.
    model.add(Dense(n_labels, activation = 'sigmoid'))

    return model

In [14]:
# Instantiate the network and print its layer-by-layer summary.
model = classification_model()
model.summary()

In [15]:
# Where the best weights seen during training are written.
file_path="weights_base.best.hdf5"

# Save the weights only when the validation loss improves.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Stop once the validation loss has failed to improve for `patience` epochs.
# patience has to stay below the number of epochs passed to fit (5 in this
# notebook); otherwise early stopping can never fire.
early = EarlyStopping(monitor="val_loss", mode="min", patience=2)

callback_list = [checkpoint, early]

In [16]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as an additional metric.
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['acc'],
)

In [17]:
# Training options gathered in one place: mini-batches of 32, 5 epochs,
# 10% of the training data held out for validation, and the
# checkpoint / early-stopping callbacks defined earlier.
fit_options = dict(
    batch_size = 32,
    epochs = 5,
    validation_split = 0.1,
    callbacks = callback_list,
)
model.fit(X_train, y_train, **fit_options)

In [18]:
# Restore the best checkpoint written by ModelCheckpoint. Reusing file_path
# keeps the save and load locations from drifting apart.
model.load_weights(file_path)

In [19]:
# Score the restored model on the same X_train / y_train that were passed to
# fit (including the portion used as validation_split), so this is not a
# held-out estimate.
model.evaluate(X_train, y_train)

In [20]:
# Model outputs for the padded test sequences; the final layer has one
# sigmoid unit per label.
pred = model.predict(X_test)

In [21]:
# Column names for the prediction columns of the submission; same labels and
# order as the columns used to build y_train.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [22]:
# Assemble the submission: the test ids followed by one prediction column per
# label, then write it to submission.csv without the index column.
# The ids are taken as a plain array (.values) so both frames get a fresh
# 0..n-1 index; concat(axis=1) aligns on index labels, and this guarantees
# row i of the ids is paired with row i of the predictions.
submid = pd.DataFrame({'id': test_df["id"].values})
pred_df = pd.DataFrame(pred, columns = label_cols)
submission = pd.concat([submid, pred_df], axis=1)
submission.to_csv('submission.csv', index=False)

In [23]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()