In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [2]:
# Read both competition CSVs; row 0 of each file supplies the column names.
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [3]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train.head()

In [4]:
# Preview the first rows of the test frame (head() defaults to 5 rows).
test.head()

In [5]:
# Display the (rows, columns) shape of each frame as a pair.
train.shape, test.shape

In [6]:
# For each column, flag whether it contains at least one missing value,
# then print the training report followed by the test report.
train_has_nulls = train.isnull().any()
test_has_nulls = test.isnull().any()
print(train_has_nulls, '\n\n', test_has_nulls)

In [7]:
# Model input: the raw comment text from each split.
train_features, test_features = train['comment_text'], test['comment_text']

# Every training column from the third one onward is treated as a label column.
# NOTE(review): this relies on column position, not names - confirm the CSV layout.
classes = list(train.columns[2:])
train_labels = train[classes]

In [8]:
# Show the label column names picked up from the training frame.
classes

In [9]:
# Vocabulary cap passed to the tokenizer as num_words.
max_features = 20000

# Fit the word index on the training comments only, then encode both splits
# with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_features)
train_tokenized_list, test_tokenized_list = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_features, test_features)
)

In [10]:
# Inspect the raw text of the training comment at index 5.
train_features[5]

In [11]:
# Inspect the integer sequence the tokenizer produced at list position 5.
train_tokenized_list[5]

In [12]:
# Length (number of token ids) of every encoded training comment.
totalNumWords = list(map(len, train_tokenized_list))
# Spot-check the count at position 5.
totalNumWords[5]

In [13]:
# Histogram of encoded comment lengths, using bin edges 0, 10, ..., 500.
length_bins = np.arange(0, 510, 10)
plt.hist(totalNumWords, bins=length_bins)
plt.show()

In [14]:
# Fixed sequence length fed to the network.
pad_len = 200

# Both splits are padded with identical settings: padding is appended
# after the tokens ('post') up to pad_len entries per sequence.
pad_options = dict(maxlen=pad_len, padding='post')
train_pad = pad_sequences(train_tokenized_list, **pad_options)
test_pad = pad_sequences(test_tokenized_list, **pad_options)


In [15]:
# Inspect the padded sequence at row 5 of the training array.
train_pad[5]

In [16]:
# Width of each learned token embedding vector.
embed_size = 128

# Network input: one padded sequence of pad_len token ids per sample.
inp = Input(shape=(pad_len,))

# Token ids -> embeddings -> LSTM that emits an output at every time step.
embedded = Embedding(max_features, embed_size)(inp)
lstm_out = LSTM(60, return_sequences=True, name='lstm_layer')(embedded)

# Reduce the per-step LSTM outputs with a global max pool over the sequence.
pooled = GlobalMaxPool1D()(lstm_out)

# Small dense head with light dropout before and after the hidden layer.
pooled_dropped = Dropout(0.1)(pooled)
hidden = Dense(50, activation="relu")(pooled_dropped)
hidden_dropped = Dropout(0.1)(hidden)

# Six sigmoid outputs; presumably one per entry in `classes` - confirm.
x = Dense(6, activation="sigmoid")(hidden_dropped)

In [17]:
# Wrap the input/output tensors into a trainable model and configure training:
# Adam optimizer, binary cross-entropy loss, accuracy reported as a metric.
model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Training hyper-parameters.
batch_size, epochs = 32, 1

# Train on the padded comments, holding out 10% of the data for validation.
model.fit(
    train_pad,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [19]:
# Print the layer-by-layer summary of the model.
model.summary()

In [20]:
# Run the trained model on the padded test sequences; `pred` is later indexed
# as pred[:, k], i.e. one column of scores per output unit.
pred = model.predict(test_pad)

In [21]:
# Build the submission frame: the test ids first, then one score column per label.
# Column names and order are taken from `classes` instead of being repeated by
# hand, so the frame cannot silently drift from the label columns the model was
# trained on (a mismatch previously produced all-NaN columns; now a shape
# mismatch between `pred` and `classes` raises immediately).
test_labels = pd.DataFrame(pred, columns=classes, index=test.index)
test_labels.insert(0, 'id', test['id'])

# Write the predictions without the row index.
test_labels.to_csv('Toxic_comment.csv', index=False)

In [22]:
# Preview the first rows of the submission frame that was just written.
test_labels.head()