In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
import re, string
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_class = len(label_cols)
X = data[['comment_text']]
y = data[label_cols]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [22]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r'\1', s).split()


vec = TfidfVectorizer(tokenizer=tokenize, stop_words={'english'}, max_features=10000,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )

In [23]:
vec.fit(X['comment_text'])
train_vec = vec.transform(X_train['comment_text'])
test_vec = vec.transform(X_test['comment_text'])

In [24]:
train_vec.shape, test_vec.shape

((111699, 10000), (47872, 10000))

In [29]:
model = Sequential()
model.add(Dense(1000, input_dim = 10000, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(150, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dense(num_class, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 1000)              10001000  
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_6 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 200)               100200    
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 150)               30150     
__________

In [30]:
#define callbacks
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [31]:
batch_size = 256 
num_epochs = 8 

In [32]:
hist = model.fit(train_vec, y_train, batch_size=batch_size, epochs=num_epochs, 
                 callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 100529 samples, validate on 11170 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 00005: early stopping


In [33]:
y_pred = model.predict(test_vec)

In [34]:
y_pred

array([[9.8568225e-01, 5.8932900e-03, 5.3210425e-01, 2.8428435e-04,
        3.5953483e-01, 7.5185895e-03],
       [8.9406967e-08, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0961586e-01, 6.3955784e-05, 1.2990028e-02, 8.1628561e-05,
        1.8287510e-02, 1.2849271e-03],
       ...,
       [2.3356512e-01, 7.3406100e-03, 1.1590782e-01, 1.2461960e-02,
        1.3968307e-01, 4.3352753e-02],
       [7.9572201e-06, 0.0000000e+00, 5.9604645e-07, 0.0000000e+00,
        3.5762787e-07, 0.0000000e+00],
       [1.0290742e-04, 0.0000000e+00, 9.7155571e-06, 0.0000000e+00,
        6.8545341e-06, 1.7881393e-07]], dtype=float32)

In [37]:
p = y_pred.round()

In [38]:
p

array([[1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32)

In [39]:
for i, j in enumerate(label_cols):
    print(j)
    print(confusion_matrix(y_test[j], p[:, i]))
    print(f1_score(y_test[j], p[:, i]))
    print('\n')

toxic
[[41994  1296]
 [ 1421  3161]]
0.6994136519526496


severe_toxic
[[47200   186]
 [  341   145]]
0.3549571603427173


obscene
[[44854   462]
 [  834  1722]]
0.7265822784810126


threat
[[47712    24]
 [  111    25]]
0.2702702702702703


insult
[[44896   587]
 [ 1001  1388]]
0.6361136571952337


identity_hate
[[47306   134]
 [  351    81]]
0.250386398763524


