In [1]:
import pandas as pd

In [2]:
# Load the pre-cleaned train and test splits (train first, then test).
train, test = map(pd.read_csv, ('./data/train_cleaned.csv', './data/test_cleaned.csv'))

In [3]:
# Replace missing comments with a placeholder token before the text is handed
# to the TF-IDF vectorizer.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: that chained in-place form emits a
# warning on pandas 2.x and, under copy-on-write, modifies a temporary object
# and leaves the DataFrame unchanged.
train["comment_text"] = train["comment_text"].fillna("unknown")
test["comment_text"] = test["comment_text"].fillna("unknown")

In [4]:
from util import labels, RocAucEvaluation

Using TensorFlow backend.


In [5]:
# Configuration shared by the feature-extraction and model cells below.
# One output unit per target label.
num_classes = len(labels)
# Cap on the TF-IDF vocabulary size; also the width of the network input.
max_features = 30000
# Number of units in the hidden Dense layer.
hidden = 300
# Dropout rate applied after batch normalization.
dropout = 0.4
# Mini-batch size used when fitting the model.
batch_size = 1000

In [6]:
# Reverse lookup: label name -> its position in `labels`.
labels_to_id = dict(zip(labels, range(len(labels))))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Fit the TF-IDF vocabulary and IDF weights on the train and test comments
# together, so both splits share one feature space.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; with default arguments the concatenated Series is the same.
vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english').fit(
    pd.concat([train['comment_text'], test['comment_text']])
)

In [9]:
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.models import Model

In [10]:
def get_model():
    """Build the uncompiled feed-forward classifier over TF-IDF features.

    Architecture: sparse input of width ``max_features`` -> Dense(``hidden``,
    relu) -> BatchNormalization -> Dropout(``dropout``) ->
    Dense(``num_classes``, sigmoid). The sizes are read from the notebook-level
    configuration variables each time the function is called.

    Returns:
        A Keras ``Model`` mapping the input tensor to one sigmoid output per class.
    """
    features = Input(shape=(max_features,), sparse=True)

    # Layers are listed in the order they are applied to the input.
    stack = [
        Dense(hidden, activation='relu'),
        BatchNormalization(),
        Dropout(dropout),
        Dense(num_classes, activation='sigmoid'),
    ]

    tensor = features
    for layer in stack:
        tensor = layer(tensor)

    return Model(inputs=features, outputs=tensor)

In [11]:
# Instantiate the network; it still has to be compiled before it can be trained.
model = get_model()

In [12]:
# Print per-layer output shapes and parameter counts as a sanity check.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               9000300   
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 1806      
Total params: 9,003,306
Trainable params: 9,002,706
Non-trainable params: 600
_________________________________________________________________


In [13]:
# The output layer has one sigmoid unit per label, so the loss is binary
# cross-entropy. ROC-AUC on the held-out fold is reported separately by the
# RocAucEvaluation callback passed to fit(); the accuracy metric here is only
# an auxiliary training-time readout.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Keep 95% of the labelled rows for training and hold out the rest for
# validation; the fixed random_state makes the split repeatable across runs.
train_fold, val_fold = train_test_split(train, train_size=0.95, random_state=123456)



In [16]:
# Display the (rows, columns) of the training and validation folds.
train_fold.shape, val_fold.shape

((151592, 8), (7979, 8))

In [17]:
# Encode the training-fold comments with the already-fitted TF-IDF vectorizer.
train_vect = vectorizer.transform(train_fold['comment_text'])

In [18]:
# Encode the validation-fold comments with the same fitted vectorizer.
val_fold_features = vectorizer.transform(val_fold['comment_text'])

In [19]:
# Callback from the project's util module; the training log shows it printing a
# ROC-AUC score for every epoch, presumably computed on this validation data.
# NOTE(review): the labels are passed as a DataFrame here, whereas fit() gets
# train_fold[labels].values (a NumPy array) - confirm RocAucEvaluation accepts both.
roc_auc = RocAucEvaluation(validation_data=(val_fold_features, val_fold[labels]))

In [20]:
# Train for 4 epochs on the TF-IDF features, reporting ROC-AUC through the callback.
# NOTE(review): in the recorded run the score peaks at epoch 2 (0.973836) and
# falls to 0.969529 by epoch 4, so the weights used for the test predictions
# are not the best-scoring ones. Consider fewer epochs or keeping the best
# checkpoint. No random seed is set for Keras, so the scores vary between runs.
model.fit(train_vect, train_fold[labels].values, batch_size=batch_size, epochs=4, callbacks=[roc_auc])

Epoch 1/4
 ROC-AUC - epoch: 1 - score: 0.969580 

Epoch 2/4
 ROC-AUC - epoch: 2 - score: 0.973836 

Epoch 3/4
 ROC-AUC - epoch: 3 - score: 0.972107 

Epoch 4/4
 ROC-AUC - epoch: 4 - score: 0.969529 



<keras.callbacks.History at 0x7f9e63a65390>

In [21]:
# Encode the test comments with the same fitted TF-IDF vectorizer used for training.
test_vect = vectorizer.transform(test['comment_text'])

In [22]:
# Score the test set. The configured batch_size is passed explicitly so that
# inference uses the same mini-batch size as training instead of Keras'
# much smaller default, which cuts the number of forward passes considerably.
# Predictions are computed row by row, so the batch size does not change them
# beyond floating-point rounding.
preds = model.predict(test_vect, batch_size=batch_size)

In [23]:
# Assemble the prediction table: one row per test id, one probability column per label.
subm = pd.DataFrame(preds, columns=labels, index=test['id'])

In [24]:
# Preview the first 20 rows of predicted probabilities.
subm.head(20)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.98256,0.011348,0.891086,0.003658,0.36814,0.01979
0000247867823ef7,0.000934,0.000541,0.000489,0.0006,0.000726,0.000677
00013b17ad220c46,0.00389,0.001247,0.001868,0.001195,0.003235,0.001298
00017563c3f7919a,0.000719,0.000682,0.000694,0.000711,0.001033,0.00048
00017695ad8997eb,0.000958,0.000419,0.000323,0.000589,0.000585,0.000722
0001ea8717f6de06,0.00285,0.001014,0.00147,0.000881,0.002315,0.001152
00024115d4cbde0f,0.001339,0.000939,0.00079,0.000983,0.001035,0.000723
000247e83dcc1211,0.132324,0.000697,0.004448,0.000493,0.010202,0.000764
00025358d4737918,0.001686,0.00094,0.001402,0.001141,0.00161,0.001316
00026d1092fe71cc,0.000912,0.000906,0.000887,0.000782,0.001335,0.00076
