In [2]:
import pandas as pd
import numpy as np

In [64]:
train = pd.read_csv("./data/train.csv")
train = train.fillna("unknown")

In [65]:
train[:5]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [66]:
target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_classes = len(target_labels)

In [67]:
test = pd.read_csv("./data/test.csv")
test = test.fillna("unknown")

In [68]:
test[:5]

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [69]:
train[train["toxic"] == 1][:5]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
20,225701312,Why hasn't Alitalia been removed rom the allia...,1,0,0,0,0,0
26,293668009,"""\nThe Graceful Slick....\nIs non other than a...",1,0,0,0,0,0
30,341549388,"""\n\n Stupid? \n\nAs soon as I saw the phrase ...",1,0,0,0,0,0
32,345843351,"""\nBan one side of an argument by a bullshit n...",1,0,1,0,1,0


In [70]:
train_y_pd = train[target_labels]
train_y = train_y_pd.values
train_y[0]

array([1, 0, 0, 0, 0, 0])

In [71]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [146]:
max_fatures = 25000
max_len = 100

In [147]:
tokenizer = Tokenizer(num_words=max_fatures)
corpus = train["comment_text"].append(test["comment_text"])
tokenizer.fit_on_texts(list(corpus.values))

In [148]:
def calc_text_uniq_words(samples):
    unique_words = set()
    for sample in samples:
        for word in sample.split():
            unique_words.add(word)
    return len(unique_words)

calc_text_uniq_words(corpus.values)

1021763

In [149]:
def calc_text_len_word(samples):
    lens = []
    for sample in samples:
        for word in sample.split():
            lens.append(len(word))
    return mean(lens)

In [150]:
train_x = tokenizer.texts_to_sequences(train["comment_text"])
train_x = pad_sequences(train_x, maxlen=max_len)

In [151]:
print(train_x[0])
train_x[0].shape

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0   881  2426   307  9418    52     9   216     8   406   331
    22    24   217 13956]


(100,)

In [152]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Sequential

In [182]:
model = Sequential([
    Embedding(max_fatures,256, input_length=train_x.shape[1]),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dense(num_classes, activation="sigmoid")
])

In [183]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics= ["accuracy"])

In [184]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 256)          6400000   
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 100, 256)          394240    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 256)               0         
_________________________________________________________________
dropout_22 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_30 (Dense)             (None, 6)                 774       
Total params: 6,827,910
Trainable params: 6,827,910
Non-trainable params: 0
_________________________________________________________________


In [168]:
print(train_x.shape)
print(train_y.shape)

(95851, 100)
(95851, 6)


In [185]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

stopping = EarlyStopping(patience=1)

checkpoint = ModelCheckpoint("weights.{epoch:03d}--{val_loss:.2f}.hdf5", save_best_only=False)

In [187]:
model.fit(train_x, train_y, validation_split=0.2, batch_size=256, epochs=2, callbacks=[stopping, checkpoint])

Train on 76680 samples, validate on 19171 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2e73b016a0>

In [84]:
model.save("bi_lstm_1.h5")

In [173]:
test_tokenized = tokenizer.texts_to_sequences(test["comment_text"])

In [174]:
from keras.models import load_model

In [97]:
model = load_model("bi_lstm.h5")

In [175]:
test_tokenized = pad_sequences(test_tokenized, maxlen=max_len)

In [176]:
predicted  = model.predict(test_tokenized)

In [177]:
predicted[0]

array([  1.54495640e-02,   1.92406114e-05,   2.35622609e-03,
         1.31144683e-04,   7.44883087e-04,   2.72818113e-04], dtype=float32)

In [178]:
predicted[:5]

array([[  1.54495640e-02,   1.92406114e-05,   2.35622609e-03,
          1.31144683e-04,   7.44883087e-04,   2.72818113e-04],
       [  7.65317091e-05,   1.77033507e-06,   6.08848204e-05,
          7.15530314e-06,   1.14226059e-05,   1.21743296e-05],
       [  1.08290507e-04,   1.72356772e-06,   8.28837001e-05,
          6.99986549e-06,   1.62713131e-05,   1.37219795e-05],
       [  1.16326229e-03,   1.74534216e-06,   4.30701184e-04,
          1.35800547e-05,   7.55119399e-05,   3.25274705e-05],
       [  1.75380308e-04,   1.20401512e-06,   7.05508792e-05,
          7.16337354e-06,   1.34465063e-05,   1.22750207e-05]], dtype=float32)

In [179]:
submission = pd.DataFrame(data=predicted,columns=target_labels,index=test["id"] )
submission[:5]

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6044863,0.01545,1.9e-05,0.002356,0.000131,0.000745,0.000273
6102620,7.7e-05,2e-06,6.1e-05,7e-06,1.1e-05,1.2e-05
14563293,0.000108,2e-06,8.3e-05,7e-06,1.6e-05,1.4e-05
21086297,0.001163,2e-06,0.000431,1.4e-05,7.6e-05,3.3e-05
22982444,0.000175,1e-06,7.1e-05,7e-06,1.3e-05,1.2e-05


In [180]:
submission.to_csv("./submission_lstm2.csv")