In [1]:
import time
start_time = time.time()

import gc
gc.enable()
import os 
import numpy as np 
import pandas as pd 
import multiprocessing
from tqdm.notebook import tqdm 
from kaggle_datasets import KaggleDatasets 

import transformers 
from transformers import (TFAutoModel, 
                          AutoTokenizer) 
from tokenizers import (Tokenizer, 
                        models, 
                        pre_tokenizers, 
                        decoders, 
                        processors, 
                        BertWordPieceTokenizer, 
                        SentencePieceBPETokenizer) 
from transformers import AdamW

import tensorflow as tf 
from tensorflow.keras import backend
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input 
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.models import Model 
from tensorflow.keras.callbacks import ModelCheckpoint 
tf.config.experimental_run_functions_eagerly(False)
from tensorflow.keras.mixed_precision import experimental as mixed_precision

from nltk.tokenize.treebank import TreebankWordTokenizer
Ttokenizer = TreebankWordTokenizer()

from sklearn.metrics import roc_auc_score as roc, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

import warnings
warnings.simplefilter("ignore")
print('Import done! Time past %.2f secs' % (time.time() - start_time))

Import done! Time past 8.77 secs


In [2]:
# Look for a TPU; a ValueError from the resolver is treated as "no TPU here".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU resolved: use whatever strategy TensorFlow currently considers default.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [3]:
# USE_FLOAT16 = True
# XLA_ACCELERATE = False
# if USE_FLOAT16:
#     if tpu: 
#         policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
#     else: 
#         policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
#     mixed_precision.set_policy(policy)
#     print('Mixed precision enabled')

# if XLA_ACCELERATE:
#     tf.config.optimizer.set_jit(True)
#     print('Accelerated Linear Algebra enabled')

In [4]:
def binary_focal_loss(gamma=2., alpha=.25):
    """Build a binary focal-loss function.

    Args:
        gamma: exponent applied to the modulating factor of each term.
        alpha: weight of the positive-class term; the negative-class term
            is weighted by ``1 - alpha``.

    Returns:
        A callable ``(y_true, y_pred) -> scalar`` that sums (not averages)
        the focal loss over every element of the batch.
    """
    def binary_focal_loss_fixed(y_true, y_pred):
        eps = K.epsilon()

        # For positives keep the prediction, elsewhere substitute 1 so the
        # positive term contributes (almost) nothing for those elements.
        positive_mask = tf.equal(y_true, 1)
        p_pos = tf.where(positive_mask, y_pred, tf.ones_like(y_pred))
        # For negatives keep the prediction, elsewhere substitute 0.
        negative_mask = tf.equal(y_true, 0)
        p_neg = tf.where(negative_mask, y_pred, tf.zeros_like(y_pred))

        # Clip away exact 0/1 so the logarithms below stay finite.
        p_pos = K.clip(p_pos, eps, 1. - eps)
        p_neg = K.clip(p_neg, eps, 1. - eps)

        positive_term = K.sum(alpha * K.pow(1. - p_pos, gamma) * K.log(p_pos))
        negative_term = K.sum((1 - alpha) * K.pow(p_neg, gamma) * K.log(1. - p_neg))
        return -positive_term - negative_term
    return binary_focal_loss_fixed

In [5]:
def transform(tensor):
    """Return the mean and the max of `tensor` over axis 1, concatenated on axis 1."""
    pooled = (backend.mean(tensor, 1), backend.max(tensor, 1))
    return backend.concatenate(pooled, 1)

def build_model(transformer, max_len=192, learning_rate=1e-5):
    """Wrap a transformer encoder in a compiled binary classifier.

    Args:
        transformer: callable Keras-compatible encoder invoked as
            ``transformer(ids, attention_mask=..., token_type_ids=...)``;
            its first output is used as the per-token sequence output.
        max_len: fixed sequence length of the three int32 inputs.
        learning_rate: Adam learning rate (defaults to the previously
            hard-coded 1e-5).

    Returns:
        A compiled ``Model`` taking ``[ids, att, tok]`` and emitting one
        sigmoid probability per example. It is compiled with binary
        cross-entropy and reports accuracy and AUC.
    """
    ids = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    att = tf.keras.layers.Input((max_len,), dtype=tf.int32)
    tok = tf.keras.layers.Input((max_len,), dtype=tf.int32)

    # Index the first output instead of unpacking a 2-tuple: the second output
    # was never used, and indexing also works when the encoder returns a
    # dict-like output object rather than a plain tuple.
    sequence_output = transformer(ids, attention_mask=att, token_type_ids=tok)[0]

    # Classify from the representation of the first token only.
    # (A mean+max pooling head via `transform` followed by dropout was tried
    # earlier and is currently disabled.)
    first_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid',
                kernel_initializer=tf.keras.initializers.GlorotUniform(),
                bias_initializer=tf.keras.initializers.Zeros())(first_token)

    model = Model(inputs=[ids, att, tok], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    model.compile(Adam(learning_rate=learning_rate), loss="binary_crossentropy",
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    return model

In [6]:
# Pretrained checkpoint shared by the tokenizer and the model, and the fixed
# sequence length every comment is encoded to.
MODEL = 'jplu/tf-xlm-roberta-large'
MAX_LEN = 192

tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [7]:
# Single source of truth for the competition input directory (previously
# repeated verbatim in every read_csv call).
DATA_DIR = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"

train1 = pd.read_csv(os.path.join(DATA_DIR, "jigsaw-toxic-comment-train.csv"))
train2 = pd.read_csv(os.path.join(DATA_DIR, "jigsaw-unintended-bias-train.csv"))
# Round the `toxic` column of the second set to the nearest integer so it can
# be used as a 0/1 label. Item assignment is used because attribute assignment
# silently creates a plain attribute if the column name is ever wrong.
train2['toxic'] = train2['toxic'].round().astype(int)

valid = pd.read_csv(os.path.join(DATA_DIR, 'validation.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [8]:
# Training frame = all of train1
#                + every toxic row of train2
#                + a fixed-seed sample of 250465 non-toxic rows of train2.
# Only the text and label columns are kept.
LABELLED_COLS = ['comment_text', 'toxic']

train = pd.concat([
    train1[LABELLED_COLS],
    train2.loc[train2.toxic == 1, LABELLED_COLS],
    train2.loc[train2.toxic == 0, LABELLED_COLS].sample(n=250465, random_state=0),
])

In [9]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Batch-encode `texts` and return the encodings as NumPy arrays.

    Args:
        texts: sequence of strings handed to ``tokenizer.batch_encode_plus``.
        tokenizer: object exposing ``batch_encode_plus``; it is asked for
            attention masks and token type ids, padded to ``maxlen``.
        maxlen: value passed to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` of ``np.ndarray``.
    """
    encoded = tokenizer.batch_encode_plus(
        texts,
        return_attention_masks=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    wanted_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    token_ids, attention_mask, type_ids = (np.array(encoded[key]) for key in wanted_keys)
    return token_ids, attention_mask, type_ids

In [10]:
# Labels for the training and validation splits.
y_train, y_valid = train.toxic.values, valid.toxic.values

# Encode each split to MAX_LEN tokens: (ids, attention mask, token type ids).
# Suffix _v = validation, _t = test. The test text lives in the `content` column.
input_ids, input_mask, input_type_ids = regular_encode(
    train.comment_text.values, tokenizer, maxlen=MAX_LEN)
input_ids_v, input_mask_v, input_type_ids_v = regular_encode(
    valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
input_ids_t, input_mask_t, input_type_ids_t = regular_encode(
    test.content.values, tokenizer, maxlen=MAX_LEN)

In [11]:
# The encoded arrays and label vectors are all that is needed from here on,
# so drop the references to the raw DataFrames.
del train, train1, train2, valid, test
# Version tag; the training cell formats it into the checkpoint filename.
VER = 'large'
# Reset TensorFlow/Keras global state before the model is built.
tf.compat.v1.reset_default_graph()
backend.clear_session()
# Last expression of the cell: its return value is shown as the cell output.
gc.collect()

20

In [12]:
%%time
# Load the pretrained encoder named by MODEL and wrap it with build_model,
# both inside the distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
# Print the layer-by-layer summary of the assembled classifier.
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 192)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 192)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 192)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 192, 1024),  559890432   input_1[0][0]                    
_____________________________________________________________________________________________

In [13]:
VER = 'large'

# Checkpoint: write weights only, and only when val_loss improves.
sv = tf.keras.callbacks.ModelCheckpoint(
    'xlm-roberta-%s.h5'%(VER),
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
    verbose=1,
)

# Early stopping: give up after 2 epochs without a val_loss improvement and
# roll the model back to the best epoch's weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

train_inputs = [input_ids, input_mask, input_type_ids]
valid_inputs = [input_ids_v, input_mask_v, input_type_ids_v]

train_history = model.fit(
    train_inputs, [y_train],
    validation_data=(valid_inputs, [y_valid]),
    epochs=4,
    batch_size=128,
    verbose=1,
    callbacks=[sv, es],
)

Train on 586240 samples, validate on 8000 samples
Epoch 1/4
Epoch 00001: val_loss improved from inf to 0.30436, saving model to xlm-roberta-large.h5
Epoch 2/4
Epoch 00002: val_loss did not improve from 0.30436
Epoch 3/4
Epoch 00003: val_loss did not improve from 0.30436
Restoring model weights from the end of the best epoch.
Epoch 00003: early stopping


In [14]:
# Second stage: continue training on the labelled validation split itself.
#
# The `sv` / `es` callbacks are deliberately NOT passed here. Both monitor
# 'val_loss', and this fit has no validation data, so that metric is never
# produced: the checkpoint would never be written, early stopping would never
# trigger, and each epoch would only log "metric not available" warnings.
# All 5 epochs therefore always run, exactly as before.
valid_history = model.fit([input_ids_v, input_mask_v, input_type_ids_v], [y_valid],
                    epochs=5, batch_size=64, verbose=1,
                    validation_data=None)

Train on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Score the test set with the fine-tuned model.
preds = model.predict([input_ids_t, input_mask_t, input_type_ids_t], verbose=1, batch_size=64)

# Use the same absolute input path as the data-loading cell so this cell does
# not depend on the notebook's current working directory.
sample = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

# The model has a single output unit, so `preds` is 2-D with one column;
# flatten it explicitly and assign by item rather than by attribute.
sample['toxic'] = np.ravel(preds)
sample.to_csv('submission.csv',index=False)



In [16]:
# train_history = model.fit(
#     train_dataset,
#     steps_per_epoch=n_steps,
#     validation_data=valid_dataset,
#     epochs=EPOCHS
# )

In [17]:
# VER = 'large'
# tf.tpu.experimental.initialize_tpu_system(tpu)
# tf.compat.v1.reset_default_graph()
# backend.clear_session()
# gc.collect()
# with strategy.scope():
#     transformer = TFAutoModel.from_pretrained("jplu/tf-xlm-roberta-large")
#     model = build_model(transformer, max_len=196)

# sv = tf.keras.callbacks.ModelCheckpoint(
#         'xlm-roberta-%s.h5'%(VER), monitor='val_loss', verbose=1, save_best_only=True,
#         save_weights_only=True, mode='auto', save_freq='epoch')
# es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', verbose=1, patience=2,
#                                       restore_best_weights=True, mode='auto')
# train_history = model.fit([input_ids, input_mask, input_type_ids], [y_train], 
#                     epochs=5, batch_size=128, verbose=1, callbacks=[sv,es],
#                     validation_split=0.1)

In [18]:
# start_time = time.time()
# EPOCHS = 2
# VERBOSE = 1
# BATCH_SIZE = 128
# MAX_LEN = 196
# VER = 'large'
# oof = np.zeros((input_ids.shape[0],1))
# preds = np.zeros((input_ids_t.shape[0],1))
# skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=2020)

# for fold,(idxT, idxV) in enumerate(skf.split(input_ids, input_y)):
#     ft = time.time()
#     print('#'*25)
#     print('### FOLD %i'%(fold+1))
#     print('#'*25)
    
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     backend.clear_session()
#     tf.compat.v1.reset_default_graph()
    
#     print('Building Model...')
#     st = time.time()
#     with strategy.scope():
#         transformer = TFAutoModel.from_pretrained("jplu/tf-xlm-roberta-large")
#         model = build_model(transformer, max_len=196)
#     print('Building Model Done! Time past %.2f secs' % (time.time() - st))
#     print()
    
#     sv = tf.keras.callbacks.ModelCheckpoint(
#         'xlm-roberta-%s.h5'%(VER), monitor='val_loss', verbose=1, save_best_only=True,
#         save_weights_only=True, mode='auto', save_freq='epoch')
#     es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', verbose=1, patience=1,
#                                           restore_best_weights=True, mode='auto')
#     train_history = model.fit([input_ids[idxT,], input_mask[idxT,], input_type_ids[idxT,]], [input_y[idxT]], 
#                     epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=VERBOSE, callbacks=[sv],
#                     validation_data=([input_ids[idxV,],input_mask[idxV,],input_type_ids[idxV,]], 
#                     [input_y[idxV]]))
    
#     print()
#     print('Loading model...')
#     model.load_weights('xlm-roberta-%s.h5'%(VER))
    
#     print('Predicting OOF...')
#     oof[idxV] = model.predict([input_ids[idxV,],input_mask[idxV,],input_type_ids[idxV,]],verbose=VERBOSE, batch_size=50)
    
#     print('Predicting Test...')
#     preds += model.predict([input_ids_t, input_mask_t, input_type_ids_t],verbose=VERBOSE, batch_size=64)/skf.n_splits
    
#     del model, transformer
#     gc.collect()
    
#     print('>>>> FOLD %i ROC AUC ='%(fold+1),roc(input_y[idxV], oof[idxV]))
    
#     print(f'Fold {fold} completed. Time past %.2f secs'%(time.time()-ft))
#     print()
    
# print('Training Done! Time past %.2f secs' % (time.time() - start_time))

In [19]:
# sample = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
# sample.toxic = preds
# sample.to_csv('submission.csv',index=False)

Batch Size / GPU = 16 on 8 GPUs (Effective BS = 128)
Adam with a LR of 0.000005
We run validation after each epoch - where the epoch consists of 5K batches with data randomly sampled from the training set - and select the checkpoint with the best validation set result. This is quite important.
We run training for 30 epochs with early stopping (stop if the validation accuracy has not improved for 5 epochs) where epoch is defined as above.

In [20]:
# with strategy.scope():
#     transformer = TFAutoModel.from_pretrained("jplu/tf-xlm-roberta-base")
#     model = build_model(transformer, max_len=196)

In [21]:
# EPOCHS = 5
# VERBOSE = 1
# BATCH_SIZE = 100

# train_history = model.fit(
#     [train_ids, train_mask, train_type_ids],
#     [y_train],
#     batch_size=BATCH_SIZE,
#     epochs=EPOCHS,    
#     validation_data=([valid_ids, valid_mask, valid_type_ids ], [y_valid]),
#     verbose=VERBOSE
# )

In [22]:
# model.evaluate([valid_ids, valid_mask, valid_type_ids ], [y_valid])
# preds = model.predict([valid_ids, valid_mask, valid_type_ids ])
# from sklearn.metrics import roc_auc_score as roc
# roc(y_valid, preds)

## Results

### Selected model

<p style="color:darkgreen;font-size:18px;font-style:italic"><b>Batch size 100, Seq Length 196, Learning Rate 3e-5 </b></p>

    Train on 50000 samples, validate on 4484 samples
    Epoch 1/5
    50000/50000 [==============================] - 239s 5ms/sample - loss: 0.3019 - accuracy: 0.8668 - val_loss: 0.3219 - val_accuracy: 0.9012
    Epoch 2/5
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.2005 - accuracy: 0.9152 - val_loss: 0.2922 - val_accuracy: 0.9092
    Epoch 3/5
    50000/50000 [==============================] - 102s 2ms/sample - loss: 0.1634 - accuracy: 0.9314 - val_loss: 0.2704 - val_accuracy: 0.9061
    Epoch 4/5
    50000/50000 [==============================] - 102s 2ms/sample - loss: 0.1328 - accuracy: 0.9455 - val_loss: 0.2231 - val_accuracy: 0.9088
    Epoch 5/5
    50000/50000 [==============================] - 103s 2ms/sample - loss: 0.1057 - accuracy: 0.9565 - val_loss: 0.2250 - val_accuracy: 0.9045

Train on 50000 samples, validate on 4484 samples

    Epoch 1/5
    50000/50000 [==============================] - 187s 4ms/sample - loss: 0.2961 - accuracy: 0.8722 - auc: 0.9471 - val_loss: 0.3097 - val_accuracy: 0.8965 - val_auc: 0.9708
    Epoch 2/5
    50000/50000 [==============================] - 101s 2ms/sample - loss: 0.2044 - accuracy: 0.9143 - auc: 0.9742 - val_loss: 0.2766 - val_accuracy: 0.9110 - val_auc: 0.9730
    Epoch 3/5
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.1664 - accuracy: 0.9313 - auc: 0.9826 - val_loss: 0.2308 - val_accuracy: 0.9112 - val_auc: 0.9741
    Epoch 4/5
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.1403 - accuracy: 0.9423 - auc: 0.9873 - val_loss: 0.2434 - val_accuracy: 0.9070 - val_auc: 0.9713
    Epoch 5/5
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.1111 - accuracy: 0.9563 - auc: 0.9918 - val_loss: 0.2170 - val_accuracy: 0.9066 - val_auc: 0.9730

4484/4484 [==============================] - 19s 4ms/sample - loss: 0.2173 - accuracy: 0.9066 - auc: 0.9730
            
0.9730803671701838

### Experimenting with Models

### Batch size 200, Seq Length 196
    Train on 50000 samples, validate on 4484 samples
    Epoch 1/3
    50000/50000 [==============================] - 156s 3ms/sample - loss: 0.3874 - accuracy: 0.8212 - val_loss: 0.3751 - val_accuracy: 0.8539
    Epoch 2/3
    50000/50000 [==============================] - 57s 1ms/sample - loss: 0.2459 - accuracy: 0.8970 - val_loss: 0.3186 - val_accuracy: 0.8831
    Epoch 3/3
    50000/50000 [==============================] - 57s 1ms/sample - loss: 0.2128 - accuracy: 0.9117 - val_loss: 0.2848 - val_accuracy: 0.8934


### Batch size 100, Seq Length 196, Dropout 0.3 

    Train on 50000 samples, validate on 4484 samples
    Epoch 1/3
    50000/50000 [==============================] - 198s 4ms/sample - loss: 0.3514 - accuracy: 0.8440 - val_loss: 0.2428 - val_accuracy: 0.9003
    Epoch 2/3
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.2392 - accuracy: 0.9009 - val_loss: 0.2402 - val_accuracy: 0.9066
    Epoch 3/3
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.2062 - accuracy: 0.9134 - val_loss: 0.2132 - val_accuracy: 0.9088

### Batch size 100, Seq Length 196, Learning Rate 3e-5 - Best Model

    Train on 50000 samples, validate on 4484 samples
    Epoch 1/3
    50000/50000 [==============================] - 187s 4ms/sample - loss: 0.2952 - accuracy: 0.8719 - val_loss: 0.2591 - val_accuracy: 0.9019
    Epoch 2/3
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.2019 - accuracy: 0.9158 - val_loss: 0.2485 - val_accuracy: 0.9106
    Epoch 3/3
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.1650 - accuracy: 0.9331 - val_loss: 0.2188 - val_accuracy: 0.9099

### Batch size 100, Seq Length 196 - Best Model

    Train on 50000 samples, validate on 4484 samples
    Epoch 1/3
    50000/50000 [==============================] - 204s 4ms/sample - loss: 0.3417 - accuracy: 0.8490 - val_loss: 0.2572 - val_accuracy: 0.9034
    Epoch 2/3
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.2305 - accuracy: 0.9037 - val_loss: 0.2574 - val_accuracy: 0.9045
    Epoch 3/3
    50000/50000 [==============================] - 100s 2ms/sample - loss: 0.2002 - accuracy: 0.9171 - val_loss: 0.2275 - val_accuracy: 0.9112

### Batch size 100, Seq Length 256
    Train on 50000 samples, validate on 4484 samples
    Epoch 1/3
    50000/50000 [==============================] - 206s 4ms/sample - loss: 0.4001 - accuracy: 0.8248 - val_loss: 0.3113 - val_accuracy: 0.8758
    Epoch 2/3
    50000/50000 [==============================] - 107s 2ms/sample - loss: 0.2505 - accuracy: 0.8959 - val_loss: 0.2523 - val_accuracy: 0.9028
    Epoch 3/3
    50000/50000 [==============================] - 106s 2ms/sample - loss: 0.2132 - accuracy: 0.9118 - val_loss: 0.2587 - val_accuracy: 0.8992

### Batch size 200, Seq Length 256
    Train on 50000 samples, validate on 4484 samples
    Epoch 1/3
    50000/50000 [==============================] - 161s 3ms/sample - loss: 0.4465 - accuracy: 0.8008 - val_loss: 0.2882 - val_accuracy: 0.8954
    Epoch 2/3
    50000/50000 [==============================] - 63s 1ms/sample - loss: 0.2732 - accuracy: 0.8876 - val_loss: 0.3020 - val_accuracy: 0.8892
    Epoch 3/3
    50000/50000 [==============================] - 63s 1ms/sample - loss: 0.2402 - accuracy: 0.9014 - val_loss: 0.2540 - val_accuracy: 0.9045

### Batch size 100

Train on 7000 samples, validate on 1000 samples
7000/7000 [==============================] - 106s 15ms/sample - loss: 0.3744 - accuracy: 0.8457 - val_loss: 0.2753 - val_accuracy: 0.8714

#### Label smoothing

Train on 7000 samples, validate on 1000 samples
7000/7000 [==============================] - 106s 15ms/sample - loss: 0.4207 - accuracy: 0.8121 - val_loss: 0.2752 - val_accuracy: 0.8765

### Batch size 50

Train on 7000 samples, validate on 1000 samples
7000/7000 [==============================] - 116s 17ms/sample - loss: 0.3597 - accuracy: 0.8460 - val_loss: 0.2887 - val_accuracy: 0.8807

Train on 7000 samples, validate on 1000 samples
7000/7000 [==============================] - 112s 16ms/sample - loss: 0.4206 - accuracy: 0.8151 - val_loss: 0.2791 - val_accuracy: 0.8820

Train on 7000 samples, validate on 1000 samples
7000/7000 [==============================] - 108s 15ms/sample - loss: 0.4366 - accuracy: 0.8020 - val_loss: 0.3113 - val_accuracy: 0.8825

In [23]:
# def auc_maximization(y_pred, y_true):
#     cost = - tf.reduce_mean(tf.sigmoid(y_pred @ tf.transpose(y_pred)) * np.maximum(y_true @ np.ones(y_true.shape).T - np.ones(y_true.shape) @ y_true.T, 0))
#     return cost


# def roc_auc_score(y_pred, y_true):
#     """ ROC AUC Score.
#     Approximates the Area Under Curve score, using approximation based on
#     the Wilcoxon-Mann-Whitney U statistic.
#     Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003).
#     Optimizing Classifier Performance via an Approximation to the Wilcoxon-Mann-Whitney Statistic.
#     Measures overall performance for a full range of threshold levels.
#     Arguments:
#         y_pred: `Tensor`. Predicted values.
#         y_true: `Tensor` . Targets (labels), a probability distribution.
#     """

    
#     pos = tf.boolean_mask(y_pred, tf.cast(y_true, tf.bool))
#     neg = tf.boolean_mask(y_pred, ~tf.cast(y_true, tf.bool))

#     pos = tf.expand_dims(pos, 0)
#     neg = tf.expand_dims(neg, 1)

#     # original paper suggests performance is robust to exact parameter choice
#     gamma = 0.2
#     p     = 3

#     difference = tf.zeros_like(pos * neg) + pos - neg - gamma

#     masked = tf.boolean_mask(difference, difference < 0.0)

#     return tf.reduce_sum(tf.pow(-masked, p))




# import tensorflow.keras.backend as K
# def binary_crossentropy_with_ranking(y_true, y_pred):
#     """ Trying to combine ranking loss with numeric precision"""
#     y_true, y_pred = tf.cast(y_true, tf.float32), tf.cast(y_pred, tf.float32)
#     logloss = K.mean(K.binary_crossentropy(y_pred, y_true), axis=-1)
#     y_pred_clipped = K.clip(y_pred, K.epsilon(), 1-K.epsilon())
#     y_pred_score = K.log(y_pred_clipped / (1 - y_pred_clipped))
#     y_pred_score_zerooutcome_max = K.max(y_pred_score * (y_true <1))
#     rankloss = y_pred_score - y_pred_score_zerooutcome_max
#     rankloss = rankloss * y_true
#     rankloss = K.square(K.clip(rankloss, -100, 0))
#     rankloss = K.sum(rankloss, axis=-1) / (K.sum(y_true > 0) + 1)
#     return rankloss + logloss