This project is based on the Kaggle competition *Toxic Comment Classification Challenge*, whose goal is to detect toxic comments.  
link: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

Let's import the necessary packages first.

In [1]:
import os
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout,Conv1D,Flatten,Concatenate
from keras.models import Model
from keras import optimizers
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# Paths of the competition CSV files (relative to the notebook's working directory)
TRAIN_DATA_FILE, TEST_DATA_FILE = 'train.csv', 'test.csv'

In [3]:
# Read both competition files into DataFrames in one pass
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE))

In [9]:
# Light preprocessing: pick the label columns and replace missing comments
# with the placeholder string "_na_".
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment text as arrays, one entry per row
sentences_train = train["comment_text"].fillna("_na_").values
sentences_test = test["comment_text"].fillna("_na_").values

# Target matrix: one column per entry of `classes`
y = train[classes].values

In [10]:
# Tokenisation / embedding hyper-parameters
max_features = 20000  # vocabulary size, i.e. number of rows in the embedding matrix
maxlen = 50           # number of words kept per comment
embed_size = 100      # dimensionality of each word vector

In [11]:
# Build the vocabulary from the training comments only, capped at max_features words
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(sentences_train))

# Convert every comment to a sequence of word indices
tokens_train, tokens_test = (
    tokenizer.texts_to_sequences(texts) for texts in (sentences_train, sentences_test)
)

# Pad / truncate every sequence to exactly maxlen entries
X_train, X_test = (
    pad_sequences(tokens, maxlen=maxlen) for tokens in (tokens_train, tokens_test)
)

# Create Keras Model

In [21]:
# Network: embedding -> LSTM -> 1D convolution -> dense head with 6 sigmoid outputs
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
# return_sequences=True keeps one LSTM output per timestep so Conv1D has a sequence to slide over
x = LSTM(4, return_sequences=True, dropout=0.2, recurrent_dropout=0.1)(x)
x = Conv1D(16, 4, activation='relu')(x)
x = Flatten()(x)
x = Dense(100, activation="relu")(x)
x = Dropout(0.1)(x)
# One sigmoid unit per label column of y; paired with binary cross-entropy below
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)

# Use the canonical optimizer class name rather than the lowercase alias
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizers.RMSprop(lr=0.001, decay=1e-06),
    metrics=['accuracy'],
)

# Checkpointing: keep only the weights with the best validation accuracy seen so far.
filepath = "Weights/weights-improvement.hdf5"
# Create the target directory up front; otherwise the first checkpoint save
# fails only after a full training epoch has already been spent.
checkpoint_dir = os.path.dirname(filepath)
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 4)             1680      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 47, 16)            272       
_________________________________________________________________
flatten_3 (Flatten)          (None, 752)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 100)               75300     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
__________

Fit the defined model onto the data:

In [None]:
# Train for 5 epochs, holding out 20% of the training rows for validation;
# the checkpoint callback saves the model whenever validation accuracy improves.
model.fit(
    X_train,
    y,
    batch_size=32,
    epochs=5,
    callbacks=callbacks_list,
    verbose=1,
    validation_split=0.2,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 00001: val_acc improved from -inf to 0.98077, saving model to Weights/weights-improvement.hdf5
Epoch 2/5
Epoch 00002: val_acc did not improve
Epoch 3/5
Epoch 00003: val_acc did not improve
Epoch 4/5
Epoch 00004: val_acc did not improve
Epoch 5/5

# Load the saved weights of your choice and predict on the test data

In [20]:
from keras.models import load_model
# Restore the full model from the HDF5 file written by the ModelCheckpoint callback
saved_model = load_model('Weights/weights-improvement.hdf5')
# Predicted score for each of the 6 output units, one row per test comment
y_test = saved_model.predict(X_test)

In [19]:
# Build the submission file: the test ids followed by one score column per label.
Submit = pd.DataFrame(test.id, columns=['id'])
# Column names come from `classes`, the label list defined during preprocessing
# (the previous name `list_classes` is not defined anywhere in this notebook).
Submit2 = pd.DataFrame(y_test, columns=classes)
# Both frames carry the default 0..n-1 index, so a column-wise concat lines rows up
Submit = pd.concat([Submit, Submit2], axis=1)
Submit.to_csv("Kaggle_Submission_Convolution_LSTM_.csv", index=False)