In [1]:
import os

import pandas as pd

In [2]:
# Read the train and test CSVs from ./data into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_cleaned.csv', './data/test_cleaned.csv')
)

In [3]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the
# Embedding layer's input dimension.
max_features = 30000
# Fixed sequence length every comment is padded/truncated to.
maxlen = 100
# Dimensionality of each word vector in the embedding matrix.
embed_size = 300
# Location of the pre-trained word-vector file. Set the
# FASTTEXT_EMBEDDING_FILE environment variable to point somewhere else
# without editing the notebook; the default is the original local path.
EMBEDDING_FILE = os.environ.get(
    'FASTTEXT_EMBEDDING_FILE',
    '/media/radoslav/ce763dbf-b2a6-4110-960f-2ef10c8c6bde/MachineLearning/crawl-300d-2M.vec',
)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
# Fit the vocabulary on the comments of both splits so that words occurring
# only in the test set still receive an index. `num_words` caps how many of
# the most frequent words are kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words=max_features)
# pd.concat is used instead of Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the concatenated values are identical.
tokenizer.fit_on_texts(pd.concat([train['comment_text'], test['comment_text']]))

In [6]:
def get_features(comments):
    """Turn raw comment texts into fixed-length integer sequences.

    Each text is mapped to word indices with the module-level ``tokenizer``
    and the resulting sequences are padded/truncated to ``maxlen`` entries.

    Args:
        comments: iterable of comment strings (e.g. a pandas Series).

    Returns:
        The array produced by ``pad_sequences``, one row per comment.
    """
    return pad_sequences(tokenizer.texts_to_sequences(comments), maxlen=maxlen)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Keep 90% of the labelled rows for fitting and hold the rest out for
# validation; the fixed random_state makes the split repeatable.
train_fold, val_fold = train_test_split(
    train,
    train_size=0.90,
    random_state=1234,
)



In [9]:
# Padded index sequences for the training fold.
train_seq = get_features(train_fold['comment_text'])

In [10]:
# Padded index sequences for the held-out validation fold.
val_seq = get_features(val_fold['comment_text'])

In [11]:
import numpy as np
from util import labels, RocAucEvaluation, get_fastext_embedding

In [12]:
# Build the initial weights for the Embedding layer from the pre-trained
# vector file, using the tokenizer's word -> index mapping.
# NOTE(review): get_fastext_embedding is defined in util.py (not shown here);
# presumably it returns a (max_features, embed_size) array, since the result
# is passed as the weights of Embedding(max_features, embed_size) - confirm.
embedding_matrix = get_fastext_embedding(
    EMBEDDING_FILE,
    embed_size,
    tokenizer.word_index,
    max_features,
)

In [13]:
from keras.layers import Input, Bidirectional, CuDNNGRU, Dense, Dropout, Embedding, GlobalAveragePooling1D,GlobalMaxPooling1D,Concatenate, SpatialDropout1D
from keras.models import Model

In [14]:
def get_model(rnn_units=80, spatial_dropout=0.2, n_outputs=None):
    """Build and compile the pooled bidirectional-GRU classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``) ->
    SpatialDropout1D -> bidirectional CuDNNGRU returning the full sequence ->
    global average and global max pooling over time, concatenated ->
    sigmoid Dense layer with one unit per label.

    Args:
        rnn_units: number of units in each direction of the GRU.
        spatial_dropout: drop rate applied to the embedded sequence.
        n_outputs: number of sigmoid outputs; defaults to ``len(labels)`` so
            the output width always matches the label columns used for
            training and for the submission frame.

    Returns:
        A compiled ``keras.models.Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    if n_outputs is None:
        # Derive the output width from the label list instead of a magic 6.
        n_outputs = len(labels)

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(spatial_dropout)(x)
    # return_sequences=True keeps every timestep so both poolings below can
    # summarise the whole sequence.
    x = Bidirectional(CuDNNGRU(rnn_units, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = Concatenate()([avg_pool, max_pool])
    # Independent sigmoid per label: a comment may carry several labels at once.
    outp = Dense(n_outputs, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
    

In [15]:
# Instantiate the compiled network returned by get_model().
model = get_model()

In [16]:
# Callback from util.py that is given the validation fold (inputs and label
# matrix); the training log shows it printing a ROC-AUC score each epoch.
roc_auc = RocAucEvaluation(
    validation_data=(val_seq, val_fold[labels].values),
)

In [17]:
# Train for two epochs on the training fold. The validation fold is passed
# both to Keras (for its own validation metrics) and, via the roc_auc
# callback, to the per-epoch ROC-AUC report.
model.fit(
    x=train_seq,
    y=train_fold[labels].values,
    batch_size=64,
    epochs=2,
    callbacks=[roc_auc],
    validation_data=(val_seq, val_fold[labels].values),
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.983949 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.982877 



<keras.callbacks.History at 0x7f611818ecc0>

In [18]:
# Padded index sequences for the test comments, built with the same tokenizer.
test_seq = get_features(comments=test['comment_text'])

In [19]:
# Run the trained model on the padded test sequences; the result is later
# wrapped in a DataFrame with one row per test id and one column per label.
preds = model.predict(test_seq)

In [20]:
# Assemble the submission frame: rows keyed by the test ids, one column of
# predicted scores per label.
subm = pd.DataFrame(
    data=preds,
    index=test['id'],
    columns=labels,
)

In [21]:
# Preview the first rows of the submission frame as a quick sanity check.
subm.head()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.996313,0.375589,0.988004,0.044333,0.958232,0.103217
0000247867823ef7,0.000225,1e-05,8.3e-05,4e-06,7.9e-05,7e-06
00013b17ad220c46,0.001614,5.5e-05,0.000453,1.3e-05,0.000519,2.4e-05
00017563c3f7919a,0.000538,1.7e-05,0.000234,4.9e-05,0.000353,6e-06
00017695ad8997eb,0.006324,3.5e-05,0.000533,3.4e-05,0.000437,1.8e-05


In [22]:
# Create the output directory first so the write cannot fail with a missing
# directory after training and prediction have already been run.
submission_path = './submission-tmp/pooled_gru_fastext.csv'
submission_dir = os.path.dirname(submission_path)
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)
# The `id` index is written as the first column (to_csv keeps the index by default).
subm.to_csv(submission_path)