In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("../data/train.csv")
train = train.fillna("unknown")

In [3]:
train[:5]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_classes = len(target_labels)

In [5]:
test = pd.read_csv("../data/test.csv")
test = test.fillna("unknown")

In [6]:
test[:5]

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [7]:
train[train["toxic"] == 1][:5]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [8]:
train_y_pd = train[target_labels]
train_y = train_y_pd.values
train_y[0]

array([0, 0, 0, 0, 0, 0])

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [10]:
max_fatures = 25000
max_len = 100

In [11]:
tokenizer = Tokenizer(num_words=max_fatures)
corpus = train["comment_text"].append(test["comment_text"])
tokenizer.fit_on_texts(list(corpus.values))

In [12]:
def calc_text_uniq_words(samples):
    unique_words = set()
    for sample in samples:
        for word in sample.split():
            unique_words.add(word)
    return len(unique_words)

calc_text_uniq_words(corpus.values)

962229

In [13]:
def calc_text_len_word(samples):
    lens = []
    for sample in samples:
        for word in sample.split():
            lens.append(len(word))
    return mean(lens)

In [14]:
train_x = tokenizer.texts_to_sequences(train["comment_text"])
train_x = pad_sequences(train_x, maxlen=max_len)

In [15]:
print(train_x[0])
train_x[0].shape

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0   733    78     1   140   131   182    30
   712  4438 10284  1252    86   368    51  2230 14039    49  6744    15
    60  2624   151     7  2832    33   115  1246 16129  2517     5    50
    59   256     1   370    31     1    46    29   144    72  3931    89
  4208  6368  2687  1183]


(100,)

In [16]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Sequential

In [17]:
model = Sequential([
    Embedding(max_fatures,256, input_length=train_x.shape[1]),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dense(num_classes, activation="sigmoid")
])

In [18]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics= ["accuracy"])

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 256)          6400000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 256)          394240    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774       
Total params: 6,827,910
Trainable params: 6,827,910
Non-trainable params: 0
_________________________________________________________________


In [20]:
print(train_x.shape)
print(train_y.shape)

(159571, 100)
(159571, 6)


In [21]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

stopping = EarlyStopping(patience=2)

checkpoint = ModelCheckpoint("weights.{epoch:03d}--{val_loss:.2f}.hdf5", save_best_only=False)

In [22]:
model.fit(train_x, train_y, validation_split=0.2, batch_size=256, epochs=5, callbacks=[stopping, checkpoint])

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5


<keras.callbacks.History at 0x7f9b4ed20a20>

In [23]:
model.save("bi_lstm_1.h5")

In [71]:
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.layers import GRU

In [72]:
model = Sequential([
    Embedding(max_fatures,256, input_length=train_x.shape[1]),
    Conv1D(32, kernel_size=5, padding="same", activation="relu"),
    MaxPooling1D(pool_size=3),
    Dropout(0.3),
    Conv1D(64, kernel_size=5, padding="same", activation="relu"),
    MaxPooling1D(pool_size=3),
    Dropout(0.4),
    Conv1D(128, kernel_size=5, padding="same", activation="relu"),
    MaxPooling1D(pool_size=3),
    Dropout(0.3),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation="relu"),
    Dropout(0.4),
    Dense(6, activation="sigmoid")
])

In [73]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [74]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 128)          3200000   
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 100, 32)           20512     
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 33, 32)            0         
_________________________________________________________________
dropout_34 (Dropout)         (None, 33, 32)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 33, 64)            10304     
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 11, 64)            0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 11, 64)            0         
__________

In [75]:
model.fit(train_x, train_y, validation_split=0.2, batch_size=512, epochs=5, callbacks=[stopping, checkpoint])

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5


<keras.callbacks.History at 0x7f9ae28def60>

In [63]:
model.save('conv_bi_lstm.h5')

In [89]:
def load_vectors(path):
    with open(path) as lines:
        w2v = {line.split()[0].decode("utf-8"): np.array( list( map(float,line.split()[1:]) ))
           for line in lines}
        coefs = [np.array( list( map(float,line.split()[1:]))) for line in lines]
        return w2v,coefs
    

In [90]:
load_vectors("/media/radoslav/6906F83679A14133/Download/glove/glove.840B.300d.txt")

AttributeError: 'str' object has no attribute 'decode'

In [None]:
len(embeddings_index)

In [81]:
coefs[0]

'-'

In [24]:
test_tokenized = tokenizer.texts_to_sequences(test["comment_text"])

In [25]:
from keras.models import load_model

In [64]:
MODEL = "conv_bi_lstm.h5"

In [65]:
model = load_model(MODEL)

In [27]:
test_tokenized = pad_sequences(test_tokenized, maxlen=max_len)

In [66]:
predicted  = model.predict(test_tokenized)

In [67]:
predicted[0]

array([ 0.98309785,  0.32500991,  0.89437902,  0.06073291,  0.79196614,
        0.18210582], dtype=float32)

In [68]:
predicted[:5]

array([[  9.83097851e-01,   3.25009912e-01,   8.94379020e-01,
          6.07329085e-02,   7.91966140e-01,   1.82105824e-01],
       [  1.42182072e-03,   4.30916316e-07,   1.18122560e-04,
          3.68104565e-06,   7.13195113e-05,   2.10982926e-05],
       [  6.11998420e-03,   4.95374115e-06,   5.66422357e-04,
          2.95630689e-05,   4.45642159e-04,   1.24110054e-04],
       [  3.54118703e-04,   2.00482404e-08,   2.30939095e-05,
          2.66141996e-07,   9.23551124e-06,   2.38507482e-06],
       [  2.21202406e-03,   8.81990275e-07,   1.89342623e-04,
          6.89633271e-06,   1.23796111e-04,   3.54671211e-05]], dtype=float32)

In [69]:
submission = pd.DataFrame(data=predicted,columns=target_labels,index=test["id"] )
submission[:5]

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.983098,0.3250099,0.894379,0.06073291,0.791966,0.182106
0000247867823ef7,0.001422,4.309163e-07,0.000118,3.681046e-06,7.1e-05,2.1e-05
00013b17ad220c46,0.00612,4.953741e-06,0.000566,2.956307e-05,0.000446,0.000124
00017563c3f7919a,0.000354,2.004824e-08,2.3e-05,2.66142e-07,9e-06,2e-06
00017695ad8997eb,0.002212,8.819903e-07,0.000189,6.896333e-06,0.000124,3.5e-05


In [70]:
submission.to_csv("./submission_conv_bi_lstm.csv")