In [11]:
import pandas as pd
import numpy as np

In [105]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

train = train.fillna("unknown")
test = test.fillna("unknown")

In [106]:
train["comment_text"] = train["comment_text"].apply(lambda x: x.replace("\n", " "))
test["comment_text"] = test["comment_text"].apply(lambda x: x.replace("\n", " "))

train["comment_text"] = train["comment_text"].str.lower()
test["comment_text"] = test["comment_text"].str.lower()

train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" more i can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0


In [107]:
target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_classes = len(target_labels)

In [108]:
train[train["toxic"] == 1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0
12,0005c987bdfc9d4b,hey... what is it.. @ | talk . what is it... a...,1,0,0,0,0,0
16,0007e25b2121310b,"bye! don't look, come or think of comming ba...",1,0,0,0,0,0
42,001810bf8c45bf5f,you are gay or antisemmitian? archangel whit...,1,0,1,0,1,1
43,00190820581d90ce,"fuck your filthy mother in the ass, dry!",1,0,1,0,1,0


In [109]:
train_y_pd = train[target_labels]
train_y = train_y_pd.values
train_y[0]

array([0, 0, 0, 0, 0, 0])

In [110]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [111]:
max_fatures = 100000
max_len = 50

In [113]:
tokenizer = Tokenizer(num_words=max_fatures)
corpus = train["comment_text"].append(test["comment_text"])
tokenizer.fit_on_texts(corpus)

In [29]:
def calc_text_len_word(samples):
    lens = []
    for sample in samples:
        for word in sample.split():
            lens.append(len(word))
    return np.array(lens).mean()
calc_text_len_word(corpus.values)

4.8292019915471629

In [114]:
train_x = tokenizer.texts_to_sequences(train["comment_text"])
train_x = pad_sequences(train_x, maxlen=max_len)

In [115]:
print(train_x[0])
train_x[0].shape

[    0     0     0   733    78     1   140   131   182    30   712  4438
 10284  1252    86   368    51  2230 14039    49  6744    15    60  2624
   151     7  2832    33   115  1246 16129  2517     5    50    59   256
     1   370    31     1    46    29   144    72  3931    89  4208  6368
  2687  1183]


(50,)

In [35]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Sequential

In [36]:
model = Sequential([
    Embedding(max_fatures,256, input_length=train_x.shape[1]),
    Bidirectional(LSTM(200, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dropout(0.4),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(16, activation="relu"),
    Dense(num_classes, activation="sigmoid")
])

In [37]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics= ["accuracy"])

In [38]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           20480000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 400)           731200    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 400)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               51328     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
__________

In [39]:
print(train_x.shape)
print(train_y.shape)

(159571, 10)
(159571, 6)


In [40]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

stopping = EarlyStopping(patience=2)

checkpoint = ModelCheckpoint("weights.{epoch:03d}--{val_loss:.2f}.hdf5", save_best_only=True)

In [41]:
model.fit(train_x, train_y, validation_split=0.2, batch_size=64, epochs=10, callbacks=[stopping, checkpoint])

Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x7fe2f966e320>

In [23]:
model.save("bi_lstm_1.h5")

In [32]:
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.layers import GRU

In [72]:
model = Sequential([
    Embedding(max_fatures,256, input_length=train_x.shape[1]),
    Conv1D(32, kernel_size=5, padding="same", activation="relu"),
    MaxPooling1D(pool_size=3),
    Dropout(0.3),
    Conv1D(64, kernel_size=5, padding="same", activation="relu"),
    MaxPooling1D(pool_size=3),
    Dropout(0.4),
    Conv1D(128, kernel_size=5, padding="same", activation="relu"),
    MaxPooling1D(pool_size=3),
    Dropout(0.3),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation="relu"),
    Dropout(0.4),
    Dense(6, activation="sigmoid")
])

In [73]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [74]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 128)          3200000   
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 100, 32)           20512     
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 33, 32)            0         
_________________________________________________________________
dropout_34 (Dropout)         (None, 33, 32)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 33, 64)            10304     
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 11, 64)            0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 11, 64)            0         
__________

In [75]:
model.fit(train_x, train_y, validation_split=0.2, batch_size=512, epochs=5, callbacks=[stopping, checkpoint])

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5


<keras.callbacks.History at 0x7f9ae28def60>

In [63]:
model.save('conv_bi_lstm.h5')

In [48]:
def load_vectors(path):
    with open(path) as lines:
        w2v = {line.split()[0]: np.array( list( map(float,line.split()[1:]) ))
           for line in lines}
        embedding_matrix = np.zeros((len(w2v) + 1, 300))
        for i,word in enumerate(w2v):
            embedding_vector = w2v[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return w2v, embedding_matrix
    

In [49]:
w2v, embedding_matrix = load_vectors("/home/radoslav/ML/glove.6B.300d.txt")

In [50]:
len(w2v)

400000

In [116]:
model = Sequential([
    Embedding(len(w2v) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False),
    Bidirectional(LSTM(200, return_sequences=True)),
#     Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dropout(0.4),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(16, activation="relu"),
    Dropout(0.1),
    Dense(num_classes, activation="sigmoid")
])

In [117]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 300)           120000300 
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 50, 400)           801600    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 400)               0         
_________________________________________________________________
dropout_24 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 128)               51328     
_________________________________________________________________
dropout_25 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 64)                8256      
__________

In [118]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics= ["accuracy"])

In [119]:
checkpoint = ModelCheckpoint("./models/glove_my_vectors_bi_lstm.h5", save_best_only=True)

In [120]:
model.fit(train_x, train_y, validation_split=0.2, batch_size=64, epochs=10, callbacks=[stopping, checkpoint])

Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.callbacks.History at 0x7fe2e337bb70>

In [63]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [122]:
test_tokenized = tokenizer.texts_to_sequences(test["comment_text"])

In [44]:
from keras.models import load_model

In [45]:
MODEL = "./models/glove_my_vectors_bi_lstm.h5"

In [46]:
model = load_model(MODEL)

In [123]:
test_tokenized = pad_sequences(test_tokenized, maxlen=max_len)

In [124]:
predicted  = model.predict(test_tokenized)

In [125]:
test["comment_text"][0]

"yo bitch ja rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"

In [128]:
predicted[0]

array([ 0.97311425,  0.17160489,  0.74296844,  0.06184849,  0.64987212,
        0.14731732], dtype=float32)

In [126]:
predicted[:5]

array([[  9.73114252e-01,   1.71604887e-01,   7.42968440e-01,
          6.18484914e-02,   6.49872124e-01,   1.47317320e-01],
       [  2.16275192e-04,   4.94657116e-21,   4.51902736e-07,
          1.92833628e-16,   3.49699896e-08,   1.65242778e-10],
       [  7.06344610e-04,   2.00268884e-18,   2.93375797e-06,
          2.03965483e-14,   3.56071126e-07,   2.93819702e-09],
       [  1.33714348e-04,   5.26414884e-22,   2.30417697e-07,
          2.82608870e-17,   1.42138656e-08,   5.60467193e-11],
       [  3.16889072e-03,   2.74070957e-15,   2.85188398e-05,
          7.29193008e-12,   6.28011503e-06,   9.11141171e-08]], dtype=float32)

In [127]:
submission = pd.DataFrame(data=predicted,columns=target_labels,index=test["id"] )
submission[:5]

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.973114,0.1716049,0.7429684,0.06184849,0.6498721,0.1473173
0000247867823ef7,0.000216,4.946571e-21,4.519027e-07,1.928336e-16,3.496999e-08,1.652428e-10
00013b17ad220c46,0.000706,2.002689e-18,2.933758e-06,2.039655e-14,3.560711e-07,2.938197e-09
00017563c3f7919a,0.000134,5.264149000000001e-22,2.304177e-07,2.826089e-17,1.421387e-08,5.604672e-11
00017695ad8997eb,0.003169,2.74071e-15,2.851884e-05,7.29193e-12,6.280115e-06,9.111412e-08


In [85]:
submission.to_csv("./submission_glove_bi_lstm_300.csv")