In [12]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [13]:
%matplotlib inline

In [14]:
train_raw  = pd.read_csv('train.csv')
test_raw  = pd.read_csv('test.csv')

In [15]:
import spacy
nlp = spacy.load('en')

In [16]:
comments_train = train_raw.comment_text
comments_test = test_raw.comment_text

In [17]:
lemma_train = comments_train.apply(lambda row: [w.lemma_ for w in nlp(row)])
lemma_test = comments_test.apply(lambda row: [w.lemma_ for w in nlp(row)])

In [18]:
import pickle

In [19]:
with open('train_lemma.pickle', 'wb') as handle:
    pickle.dump(lemma_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('test_lemma.pickle', 'wb') as handle:
    pickle.dump(lemma_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
lemma_train.head()

0    [explanation, \n, why, the, edit, make, under,...
1    [d'aww, !, -PRON-, match, this, background, co...
2    [hey, man, ,, -PRON-, be, really, not, try, to...
3    [", \n, more, \n, -PRON-, can, not, make, any,...
4    [-PRON-, ,, sir, ,, be, -PRON-, hero, ., any, ...
Name: comment_text, dtype: object

In [22]:
list_classes = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]
Y = train_raw[list_classes].values
list_sentences_train = lemma_train
list_sentences_test = lemma_test

In [46]:
joined_train = list_sentences_train.str.join(' ')
joined_test = list_sentences_test.str.join(' ')

In [47]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(joined_train))
list_tokenized_train = tokenizer.texts_to_sequences(joined_train)
list_tokenized_test = tokenizer.texts_to_sequences(joined_test)

In [48]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [49]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPool1D, Dropout

In [50]:
inp = Input(shape=(maxlen, ))

x = Embedding(max_features, 128)(inp)
x = LSTM(60, return_sequences=True,name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(5, activation="sigmoid")(x)


In [54]:
model = Model(inputs=inp, outputs=x)


In [55]:
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [57]:
batch_size = 32
epochs = 2
model.fit(X_t,Y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2b8d401080>

In [62]:
inp = pad_sequences(tokenizer.texts_to_sequences(["I will kill you"]), maxlen=maxlen)
dict(zip(list_classes, model.predict(inp)[0]))

{'identity_hate': 0.18027107,
 'insult': 0.2681649,
 'obscene': 0.19187137,
 'severe_toxic': 0.05276593,
 'threat': 0.51758426}