In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
import keras
from keras.preprocessing.text import Tokenizer
from keras import Sequential
import tensorflow as tf
from keras import backend as K

from keras.layers import Dense, Activation, Dropout



In [None]:
# Read the training and test tables from the input directory.
train, test = (
    pd.read_csv('../input/train.csv'),
    pd.read_csv('../input/test.csv'),
)

In [None]:
# Upper bound on the tokenizer vocabulary; the model's input layer is declared
# with the same width.
max_words = 20000

# NOTE(review): char_level=True makes every character a token, and the
# `filters` string may not be applied in that mode - confirm that a
# character-level vocabulary is what is intended here.
tokenizer = Tokenizer(
    num_words=max_words,
    char_level=True,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)

# After fitting, the tokenizer knows the vocabulary of this text corpus.
tokenizer.fit_on_texts(train['comment_text'])

In [None]:
# Vectorise the comments of both tables with the fitted tokenizer, using
# token frequencies ('freq' mode) as the feature values.
x_train, x_test = (
    tokenizer.texts_to_matrix(frame['comment_text'], mode='freq')
    for frame in (train, test)
)

In [None]:
# Display the shapes of the train and test feature matrices as the cell output.
tuple(matrix.shape for matrix in (x_train, x_test))

In [None]:
# Each label is modelled as a two-class problem (label absent / label present).
num_classes = 2

# Fully connected network: three ReLU hidden layers of decreasing width,
# dropout before the output, and a softmax over `num_classes` units.
# Only the first layer declares `input_shape`; every later layer takes its
# input width from the layer before it, so repeating input_shape=(max_words,)
# on the hidden layers was redundant and misleading (their real inputs are
# 512 and 256 wide).
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dense(256),
    Activation('relu'),
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

In [None]:
# AUC for a binary classifier
def auc(y_true, y_pred):
    """Approximate the area under the ROC curve as a backend tensor.

    The true-alert rate (binary_PTA) and false-alert rate (binary_PFA) are
    evaluated at 1000 evenly spaced thresholds in [0, 1]. Each true-alert
    rate is weighted by the drop in false-alert rate relative to the previous
    threshold, and the weighted values are summed.
    """
    thresholds = np.linspace(0, 1, 1000)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, t) for t in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, t) for t in thresholds], axis=0)

    # Prepend a false-alert rate of 1 so that every threshold has a
    # predecessor to take the difference against.
    false_alert_rates = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    bin_widths = -(false_alert_rates[1:] - false_alert_rates[:-1])

    return K.sum(true_alert_rates * bin_widths, axis=0)

#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PFA: probability of a false alert for a binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return the false-alert rate of `y_pred` at `threshold`.

    Predictions greater than or equal to `threshold` count as alerts. The
    result is the sum of the alerts that fall outside `y_true`, divided by
    the sum of `1 - y_true` (the total of negative labels).
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Total number of negative labels.
    negatives = K.sum(1 - y_true)
    # Alerts raised on the negative class labels.
    false_alerts = K.sum(alerts - alerts * y_true)
    return false_alerts / negatives
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PTA: probability of a true alert for a binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return the true-alert rate of `y_pred` at `threshold`.

    Predictions greater than or equal to `threshold` count as alerts. The
    result is the sum of the alerts that coincide with `y_true`, divided by
    the sum of `y_true` (the total of positive labels).
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Total number of positive labels.
    positives = K.sum(y_true)
    # Alerts raised on the positive class labels.
    true_alerts = K.sum(alerts * y_true)
    return true_alerts / positives


# Configure training: Adam optimizer, binary cross-entropy loss, and both
# accuracy and the custom `auc` function reported as metrics.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', auc],
)

In [None]:
# Names of the label columns to predict, and the matching slice of the
# training table.
target_col = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
y = train.loc[:, target_col]

In [None]:
# Test-set predictions: one row per test sample, one column per target label.
prd = np.zeros((x_test.shape[0], y.shape[1]))
cv_score = []  # never filled in this cell

batch_size = 2000
epochs = 4
num_classes = 2

for i, col in enumerate(target_col):
    # One-hot encode the label so it matches the `num_classes`-unit softmax output.
    y_train = keras.utils.to_categorical(train[col], num_classes)

    # Fit a fresh network for every label. Re-fitting the single shared `model`
    # made each label's classifier start from the weights (and optimizer state)
    # learned for the previous label. clone_model rebuilds the same architecture
    # with newly initialised weights; the clone is compiled with the same
    # loss, optimizer and metrics as `model`.
    label_model = keras.models.clone_model(model)
    label_model.compile(loss='binary_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy', auc])

    history = label_model.fit(x_train, y_train,
                              batch_size=batch_size,
                              epochs=epochs,
                              verbose=1,
                              validation_split=0.4)

    # predict() returns the softmax output; column 1 corresponds to label value 1.
    # (predict_proba is not available in recent Keras releases.)
    prd[:, i] = label_model.predict(x_test)[:, 1]

In [None]:
# Label the prediction matrix with the target column names, put the test ids
# in front of it, write the result to CSV and preview the first rows.
prd_1 = pd.DataFrame(data=prd, columns=y.columns)
submit = pd.concat([test['id'], prd_1], axis='columns')
submit.to_csv('nn_by_keras.csv', index=False)
submit.head()