In [1]:
import os

import pandas as pd

In [3]:
# Read the train and test splits from the local ./data directory
# (the file names suggest they were cleaned in an earlier step).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_cleaned.csv', './data/test_cleaned.csv')
)

In [4]:
# Width of each pretrained word vector; must match the vectors stored in embedd_file.
embedd_dim = 300
# Path to the pretrained word-vector file. The default is the original
# machine-specific location; set the FASTTEXT_VEC_PATH environment variable
# to run the notebook elsewhere without editing this cell.
embedd_file = os.environ.get(
    'FASTTEXT_VEC_PATH',
    '/media/radoslav/ce763dbf-b2a6-4110-960f-2ef10c8c6bde/MachineLearning/crawl-300d-2M.vec',
)
# Vocabulary size cap, shared by the tokenizer and the embedding layer.
max_features = 30000
# Every comment is padded to exactly this many tokens (longer ones are cut).
maxlen = 100

In [5]:
from util import labels, RocAucEvaluation, AttentionWithContext, get_fastext_embedding

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [13]:
# Fit the vocabulary on train and test comments together so the word index
# covers tokens from both splits; num_words caps the vocabulary that is
# actually used when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=max_features)
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the concatenated series is the same.
tokenizer.fit_on_texts(pd.concat([train['comment_text'], test['comment_text']]))

In [11]:
def get_features(comments):
    """Turn raw comment texts into fixed-length integer sequences.

    Uses the notebook-level ``tokenizer`` to map each text to a list of word
    indices, then pads or truncates every list to ``maxlen`` entries with
    ``pad_sequences`` (default padding/truncating settings).

    Args:
        comments: iterable of comment strings.

    Returns:
        The padded sequences as returned by ``pad_sequences``, one row per
        comment and ``maxlen`` columns.
    """
    token_ids = tokenizer.texts_to_sequences(comments)
    return pad_sequences(token_ids, maxlen=maxlen)

In [14]:
# Build the pretrained embedding weights with the project helper from util.
# NOTE(review): presumably returns a (max_features, embedd_dim) matrix; the
# model summary (9,000,000 embedding params = 30000 * 300) is consistent with that.
embedding_matrix = get_fastext_embedding(
    embed_file=embedd_file,
    embed_size=embedd_dim,
    max_features=max_features,
    word_index=tokenizer.word_index,
)

In [19]:
from keras.layers import Input, Dense, CuDNNGRU, Bidirectional, SpatialDropout1D, Embedding
from keras.models import Model

In [31]:
def get_model():
    """Build and compile the attention-over-BiGRU classifier.

    Architecture (shapes as printed by ``model.summary()``):
      * input of ``maxlen`` token ids
      * frozen embedding initialised from ``embedding_matrix``
      * spatial dropout (rate 0.2) on the embedded sequence
      * bidirectional CuDNNGRU with 100 units per direction, full sequence out
      * ``AttentionWithContext`` reducing the sequence to one vector per sample
      * dense layer with 6 sigmoid outputs, one per target label

    The summary is printed as a side effect. The model is compiled with the
    Adam optimizer, binary cross-entropy loss and accuracy as a metric.

    Returns:
        The compiled Keras ``Model``.
    """
    tokens = Input(shape=(maxlen,))

    # Pretrained vectors are kept fixed (trainable=False) during training.
    embedded = Embedding(
        max_features,
        embedd_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    encoded = Bidirectional(CuDNNGRU(100, return_sequences=True))(embedded)
    attended = AttentionWithContext()(encoded)
    probabilities = Dense(6, activation='sigmoid')(attended)

    network = Model(inputs=tokens, outputs=probabilities)
    network.summary()
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [35]:
# Instantiate the network; get_model() also prints the layer summary shown below.
model = get_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 300)          9000000   
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 100, 300)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 200)          241200    
_________________________________________________________________
attention_with_context_4 (At (None, 200)               40400     
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 1206      
Total params: 9,282,806
Trainable params: 282,806
Non-trainable params: 9,000,000
____________________________________________________________

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 10% of the training frame for validation; the fixed random_state
# keeps the split identical across runs.
train_fold, val_fold = train_test_split(train, train_size=0.90, random_state=12345)



In [24]:
# Tokenise and pad the comments of both folds with the shared get_features helper.
train_seq, val_seq = (
    get_features(fold['comment_text'])
    for fold in (train_fold, val_fold)
)

In [25]:
# Callback from util that is given the validation inputs and label matrix;
# the training log shows it reporting a ROC-AUC score after each epoch.
roc_auc = RocAucEvaluation(validation_data=(val_seq, val_fold[labels].values))

In [36]:
# Train for 4 epochs on the training fold; Keras evaluates loss/accuracy on the
# validation fold each epoch and the roc_auc callback adds its own score.
model.fit(
    x=train_seq,
    y=train_fold[labels].values,
    validation_data=(val_seq, val_fold[labels].values),
    batch_size=64,
    epochs=4,
    callbacks=[roc_auc],
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
 ROC-AUC - epoch: 1 - score: 0.985301 

Epoch 2/4
 ROC-AUC - epoch: 2 - score: 0.985116 

Epoch 3/4
 ROC-AUC - epoch: 3 - score: 0.986720 

Epoch 4/4
 ROC-AUC - epoch: 4 - score: 0.987339 



<keras.callbacks.History at 0x7f53eb135f98>

In [37]:
# Encode the test comments with the same tokenizer and padding used for training.
test_seq = get_features(test['comment_text'])

In [38]:
# Sigmoid outputs for every test comment, one column per output unit of the model.
preds = model.predict(test_seq)

In [39]:
# Assemble the submission frame: one column per label, indexed by the test ids.
subm = pd.DataFrame(preds, columns=labels, index=test['id'])

In [40]:
# Quick visual sanity check of the first few predictions.
subm.head()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.993797,0.470241,0.983761,0.210092,0.937724,0.518181
0000247867823ef7,0.000411,3.2e-05,0.000108,6e-06,0.000138,3.5e-05
00013b17ad220c46,0.000125,0.000104,0.000115,1.9e-05,0.000117,2.9e-05
00017563c3f7919a,0.000725,3.7e-05,0.000272,3.6e-05,0.000286,1.7e-05
00017695ad8997eb,0.001608,6.8e-05,0.000461,2.9e-05,0.000121,1.2e-05


In [43]:
# Write the submission file. The output directory is created first so that a
# missing folder cannot make the write fail after the whole training run.
submission_path = './submission-tmp/att_gru_fastext.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
subm.to_csv(submission_path)