I have finished my 5-day challenge for this competition.  
Here I share my solution, a character-level CNN.  
0.87 on the LB without external data is not so bad, is it?
In this notebook I use pre-trained weights.  
The training procedure is also included in this notebook (commented out).

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import GroupKFold
from keras.preprocessing.text import Tokenizer

In [None]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim.lr_scheduler import _LRScheduler
import torchvision.models as models

In [None]:
# ---- Global configuration -------------------------------------------------
cudnn.benchmark = True   # enable cuDNN benchmark mode
NUM_FOLD = 5             # number of GroupKFold splits
SEED = 42                # seed for the NumPy and PyTorch random generators
BATCH_SIZE = 64          # training batch size (inference uses BATCH_SIZE * 4)
NUM_CLASS = 1            # width of the model output / target vector
LR_RANGE = [1e-3, 1e-6]  # [initial LR for Adam, minimum LR of the cosine schedule]
NUM_EPOCH = 64           # number of (sub-)epochs; also part of the weight file name
NUM_CYCLE = 2            # used with EPOCH_DEVIDE to size one cosine cycle / checkpoint interval
SIZE_REDUCE = -1         # cap on the number of training rows; -1 keeps all of them
VERBOSE = 100            # print a training log line every VERBOSE steps (-1 disables)
NUM_TOKEN = 256          # one-hot depth; character indices are clipped to NUM_TOKEN - 1
LEN_TOKEN = 1024         # fixed sequence length after padding / cropping
EPOCH_DEVIDE = 16        # one "epoch" in the training loop is 1/EPOCH_DEVIDE of a data pass
FOLD_LIST = [1]          # 1-based fold numbers to train / predict
np.random.seed(SEED)
# Seed PyTorch as well, so weight initialisation and DataLoader shuffling are
# repeatable rather than only the NumPy-driven parts.
torch.manual_seed(SEED)

In [None]:
# Read the competition train and test tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv",
        "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv",
    )
)
# The test table gets a placeholder 'target' column so both frames expose
# the same column name to the code that reads df['target'].
df_test['target'] = 0
print(df_train.shape, df_test.shape)
df_train.head()

In [None]:
# Tokenizer: build a character-level vocabulary from the train and test
# comments together (no filtering, case preserved).
starttime = time.time()
df_traintest = pd.concat([df_train, df_test], ignore_index=True)
tokenizer = Tokenizer(num_words=None, filters='', char_level=True, lower=False)
tokenizer.fit_on_texts(list(df_traintest['comment_text']))
print("done. {:.1f} sec".format(time.time() - starttime))

In [None]:
# Use the character stored at index NUM_TOKEN-1 as the out-of-vocabulary token.
# NOTE(review): presumably this makes the Keras tokenizer map characters that
# were not seen during fitting to index NUM_TOKEN-1 — confirm against the
# Tokenizer implementation. The lookup fails if the fitted vocabulary has no
# entry for NUM_TOKEN-1.
tokenizer.oov_token = tokenizer.index_word[NUM_TOKEN-1]
print(tokenizer.oov_token)
# Sanity check: encode a short string that ends with a non-ASCII character
# and print the resulting index sequence.
tmp = '12345あ'
tmp = np.array(tokenizer.texts_to_sequences([tmp])[0])
print(tmp)

In [None]:
# GroupKFold
# A comment with a parent joins its parent's group; a comment without one
# forms a group keyed by its own id. A parent and its direct replies therefore
# share a group and always land in the same fold.
# A single column assignment replaces the chained `df[col][mask] = ...`
# pattern, which pandas does not guarantee to write through.
df_train['group'] = (
    df_train['parent_id']
    .fillna(df_train['id'])
    .astype(df_train['id'].dtype)
)

# Shuffle rows (driven by the NumPy seed) before splitting.
shuffle_idx = np.arange(len(df_train))
shuffle_idx = np.random.permutation(shuffle_idx)
# The permutation holds positions, so select positionally.
df_train_shuffle = df_train.iloc[shuffle_idx].reset_index(drop=True)
folds = list(GroupKFold(n_splits=NUM_FOLD).split(
    np.arange(len(df_train_shuffle)),
    df_train_shuffle['target'],
    df_train_shuffle['group']))

# Record, for every row, the fold in which it is part of the validation split.
df_train_shuffle['fold'] = 0
for i in range(NUM_FOLD):
    # folds[i][1] holds the validation row positions of fold i; the index was
    # just reset, so positions and labels coincide and .loc is safe here.
    df_train_shuffle.loc[folds[i][1], 'fold'] = i

# Restore a deterministic row order.
df_train = df_train_shuffle.sort_values('id').reset_index(drop=True)
df_train[['id', 'group','fold']].head(10)

In [None]:
# Annotation columns that decide whether a row is flagged with valid == 1.
IDENTITY_COLUMNS = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]
# valid == 1 when at least one of the columns above is strictly positive
# (missing values compare as False), otherwise 0. The column is assigned in
# one step instead of through chained indexing, which pandas does not
# guarantee to write back into df_train.
df_train['valid'] = (df_train[IDENTITY_COLUMNS] > 0).any(axis=1).astype(int)
df_train.head()

In [None]:
from math import cos, pi


def cycle(iterable):
    """Yield the items of ``iterable`` endlessly.

    Whenever the iterable is exhausted a new iteration over it is started;
    nothing is cached, so an object that produces a different order on each
    pass will do so here as well.

    :param iterable: any re-iterable object
    :return: a generator that never terminates on its own
    """
    while True:
        yield from iterable

class CosineLR(_LRScheduler):
    """Cosine-annealed learning rate with restarts.

    Within a cycle of length ``t0`` the learning rate of each parameter group
    follows ``step_size_min + 0.5 * (base_lr - step_size_min) * (1 + cos(pi * k / t0))``
    where ``k`` counts the ``get_lr`` calls since the last restart. Once ``k``
    exceeds ``t0`` the cycle length is multiplied by ``tmult`` and ``k`` is
    reset to zero.

    :param optimizer: wrapped optimizer
    :param step_size_min: learning rate reached at the end of a cycle
    :param t0: length of the first cycle, in scheduler steps
    :param tmult: factor applied to the cycle length at every restart
    :param curr_epoch: initial value of the per-cycle counter
    :param last_epoch: forwarded unchanged to ``_LRScheduler``
    """

    def __init__(self, optimizer, step_size_min=1e-5, t0=100, tmult=2, curr_epoch=-1, last_epoch=-1):
        # Assign the schedule state before delegating to the base class.
        # NOTE(review): the base constructor is expected to call get_lr()
        # once during construction — keep this ordering.
        self.step_size_min, self.t0, self.tmult = step_size_min, t0, tmult
        self.epochs_since_restart = curr_epoch
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Every call advances the position inside the current cycle.
        position = self.epochs_since_restart + 1
        if position > self.t0:
            # Cycle finished: stretch the next one and start over.
            self.t0 *= self.tmult
            position = 0
        self.epochs_since_restart = position

        cosine_term = cos(position * pi / self.t0)
        floor = self.step_size_min
        return [
            floor + (0.5 * (base_lr - floor) * (1 + cosine_term))
            for base_lr in self.base_lrs
        ]

    
class AverageMeter:
    """Track the latest value and the running weighted mean of a series.

    Attributes:
        val: value passed to the most recent ``update`` call
        sum: weighted sum of all values seen since the last ``reset``
        count: total weight seen since the last ``reset``
        avg: ``sum / count``
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def make_layer(input, output, filter_size):
    """Build a Conv1d -> BatchNorm1d -> ReLU stage.

    :param input: number of input channels
    :param output: number of output channels
    :param filter_size: convolution kernel width; the padding is
        ``(filter_size - 1) // 2``, so an odd width keeps the sequence length
    :return: an ``nn.Sequential`` holding the three modules in that order
    """
    half_width = (filter_size - 1) // 2
    conv = nn.Conv1d(input, output, kernel_size=filter_size, padding=half_width)
    return nn.Sequential(conv, nn.BatchNorm1d(output), nn.ReLU())

class CharacterLevel1DCNN(nn.Module):
    """Character-level 1D CNN classifier.

    The network applies five ``make_layer`` stages (kernel width 7), each
    followed by ``MaxPool1d(2)``, then a global max pool and a three-layer
    dense head. Every pooling stage halves the sequence length, so the input
    must be at least 32 positions long.

    :param num_classes: width of the final linear layer (number of logits)
    :param num_token: number of input channels, i.e. the one-hot depth

    Input:  float tensor of shape ``(batch, num_token, length)``.
    Output: raw logits of shape ``(batch, num_classes)``; no sigmoid is applied.
    """

    def __init__(self, num_classes=53, num_token=205):
        super(CharacterLevel1DCNN, self).__init__()

        self.num_classes = num_classes
        self.mode = 'train'
        # Attribute names and Sequential positions are kept as they were so
        # that previously saved state dicts still load.
        self.conv1 = make_layer(num_token, 16, 7)
        self.conv2 = make_layer(16, 32, 7)
        self.conv3 = make_layer(32, 64, 7)
        self.conv4 = make_layer(64, 128, 7)
        self.conv5 = make_layer(128, 256, 7)
        self.mp = nn.MaxPool1d(2)
        self.gmp = nn.AdaptiveMaxPool1d(1)
        self.dense_layers = nn.Sequential(
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            # Honour the constructor argument instead of a module-level
            # constant, so the head width follows what the caller asked for.
            nn.Linear(1024, num_classes),
        )

    def feature(self, input):
        """Return the pooled convolutional features, shape ``(batch, 256)``."""
        x = input
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = self.mp(conv(x))
        # Global max over the remaining positions, flattened to (batch, 256).
        return self.gmp(x).view(input.size(0), -1)

    def forward(self, input):
        """Return the logits for ``input``, shape ``(batch, num_classes)``."""
        return self.dense_layers(self.feature(input))

In [None]:
class TokenDataset(Dataset):
    """Dataset of one-hot encoded character sequences with binary targets.

    Each item is ``(input, target)``:

    * ``input``: float32 array of shape ``(num_token, sequence_length)``, the
      one-hot encoding of the comment's character indices. Indices at or
      above ``num_token - 1`` share the last channel and padding positions
      hold index 0.
    * ``target``: float32 array of shape ``(NUM_CLASS,)`` holding 1.0 when
      the row's ``target`` value exceeds 0.5, else 0.0.

    Texts are encoded with the module-level ``tokenizer``.

    :param df: frame with ``comment_text`` and ``target`` columns
    :param idx: index labels of the rows of ``df`` that make up the dataset
    :param len_token: fixed sequence length; ``-1`` keeps the natural length
    :param num_token: one-hot depth
    :param aug: when True, the position of short texts inside the padded
        window and the crop offset of long texts are drawn at random; when
        False both are 0, which makes the encoding deterministic
    """

    def __init__(self, df, idx, len_token=LEN_TOKEN, num_token=NUM_TOKEN,
                 aug=True):
        self.X_train = df['comment_text']
        self.y_train = df['target']>0.5
        self.idx = idx
        self.aug = aug
        self.len_token = len_token
        self.num_token = num_token
        # Lookup table for one-hot rows, built once in the final dtype rather
        # than as a fresh float64 identity matrix on every item.
        self._one_hot = np.eye(num_token, dtype=np.float32)

    def do_slice(self, input):
        """Pad or crop the 1-D index array ``input`` to ``len_token`` entries.

        Sequences that already have the requested length, or any sequence
        when ``len_token == -1``, are returned unchanged.
        """
        if self.len_token == -1:
            return input
        length = input.shape[0]
        if length == self.len_token:
            return input

        # Number of distinct offsets minus one; every offset in [0, slack]
        # is valid for both padding and cropping.
        slack = abs(self.len_token - length)
        # A random offset is applied only when augmentation is on; before
        # this fix the test was inverted and the offset was random exactly
        # when aug was False.
        shift = np.random.randint(0, slack + 1) if self.aug else 0

        if length < self.len_token:
            padded = np.zeros(self.len_token, np.int64)
            padded[shift:shift + length] = input
            return padded
        return input[shift:shift + self.len_token]

    def __getitem__(self, index):
        idx = self.idx[index]
        text = self.X_train[idx]
        # Explicit integer dtype so that an empty text still yields a valid
        # index array.
        tokens = np.array(tokenizer.texts_to_sequences([text])[0], dtype=np.int64)
        # Fold every index beyond the one-hot depth into the last channel.
        tokens = np.minimum(tokens, self.num_token - 1)
        tokens = self.do_slice(tokens)
        # (length, num_token) -> (num_token, length)
        encoded = np.ascontiguousarray(self._one_hot[tokens].T)
        target = np.asarray(self.y_train[idx], dtype=np.float32).reshape([NUM_CLASS])

        return encoded, target

    def __len__(self):
        return len(self.idx)

In [None]:
from sklearn import metrics

def train(train_loader, model, optimizer, scheduler, epoch, verbose, steps):
    """Run ``steps`` optimisation steps and report loss and AUC.

    :param train_loader: iterator yielding ``(input, target)`` batches; it is
        advanced with ``next`` exactly ``steps`` times
    :param model: network returning logits
    :param optimizer: optimizer stepped once per batch
    :param scheduler: learning-rate scheduler stepped once per batch
    :param epoch: unused; kept for interface compatibility
    :param verbose: print a progress line every ``verbose`` steps; values
        that are not positive disable logging
    :param steps: number of batches to process
    :return: ``(mean binary cross-entropy, ROC AUC)`` over the processed batches
    """
    loss_meter = AverageMeter()
    criterion_bce = nn.BCELoss()
    # Run on whatever device the model lives on (CUDA when it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    # switch to train mode
    model.train()

    starttime = time.time()
    pred_chunks, target_chunks = [], []
    for i in range(steps):
        # prepare batches
        inputs, target = next(train_loader)
        # `async` is a reserved word since Python 3.7; non_blocking is the
        # supported spelling of the same option.
        inputs = inputs.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        # get model outputs
        output = torch.sigmoid(model(inputs))
        loss = criterion_bce(output, target)

        # backward pass and parameter / learning-rate update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # log (store a Python float so no autograd/GPU tensor is retained)
        loss_meter.update(loss.item(), inputs.size(0))
        pred_chunks.append(output.detach().cpu().numpy())
        target_chunks.append(target.cpu().numpy())
        if verbose > 0 and (i + 1) % verbose == 0:
            # The value labelled "MSE" is the binary cross-entropy.
            print("step: {}/{} ".format(i + 1, steps)
                  + "MSE: {:.3f} ".format(loss_meter.avg)
                  + "Sec: {:.1f} ".format(time.time()-starttime)
                  )

    # Join the per-batch arrays once instead of re-copying them every step.
    if pred_chunks:
        preds = np.concatenate(pred_chunks)
        y_true = np.concatenate(target_chunks)
    else:
        preds = np.zeros([0, NUM_CLASS], np.float32)
        y_true = np.zeros([0, NUM_CLASS], np.float32)
    auc = metrics.roc_auc_score(y_true[:,0]>0.5, preds[:,0])
    return loss_meter.avg, auc


def validate(val_loader, model):
    """Evaluate ``model`` on ``val_loader`` without updating it.

    :param val_loader: iterable of ``(input, target)`` batches
    :param model: network returning logits
    :return: ``(mean binary cross-entropy, ROC AUC)`` over all batches
    """
    loss_meter = AverageMeter()
    criterion_bce = nn.BCELoss()
    # Run on whatever device the model lives on (CUDA when it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    # switch to eval mode
    model.eval()

    pred_chunks, target_chunks = [], []
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for inputs, target in val_loader:
            # `async` is a reserved word since Python 3.7; non_blocking is
            # the supported spelling of the same option.
            inputs = inputs.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            # get model outputs
            output = torch.sigmoid(model(inputs))
            loss = criterion_bce(output, target)

            # log
            loss_meter.update(loss.item(), inputs.size(0))
            pred_chunks.append(output.cpu().numpy())
            target_chunks.append(target.cpu().numpy())

    # Join the per-batch arrays once instead of re-copying them every batch.
    if pred_chunks:
        preds = np.concatenate(pred_chunks)
        y_true = np.concatenate(target_chunks)
    else:
        preds = np.zeros([0, NUM_CLASS], np.float32)
        y_true = np.zeros([0, NUM_CLASS], np.float32)
    auc = metrics.roc_auc_score(y_true[:,0]>0.5, preds[:,0])
    return loss_meter.avg, auc

In [None]:
# # Training
# log_columns = ['epoch', 'mse', 'auc', 'val_mse', 'val_auc', 'time']

# for fold in range(NUM_FOLD):
#     if fold+1 not in FOLD_LIST: continue
#     starttime = time.time()
#     print("fold: {}".format(fold + 1))

#     # build model
#     model = CharacterLevel1DCNN(num_classes=NUM_CLASS, num_token=NUM_TOKEN).cuda()  # student model

#     ### prepare batch generator
#     idx_train = df_train[df_train['fold']!=fold].index
#     if SIZE_REDUCE!=-1:
#         idx_train = idx_train[:SIZE_REDUCE]
#     idx_valid = df_train[(df_train['fold']==fold) & (df_train['valid']==1)].index[:10000]
# #     idx_valid = df_train[(df_train['fold']==fold)].index[:10000]

#     # train dataset
#     dataset_train = TokenDataset(df_train, idx_train, aug=True, len_token=1024)
#     # dataloader
#     train_loader = DataLoader(dataset_train,
#                               batch_size=BATCH_SIZE,
#                               shuffle=True,
#                               num_workers=1,
#                               pin_memory=True,
#                               )
#     steps = int(len(train_loader)/EPOCH_DEVIDE)
#     train_itr = cycle(train_loader)  # dataloader to generator

#     # valid dataset
#     dataset_valid = TokenDataset(df_train, idx_valid, aug=False, len_token=1024)
#     valid_loader = DataLoader(dataset_valid,
#                               batch_size=BATCH_SIZE,
#                               shuffle=False,
#                               num_workers=1,
#                               pin_memory=True
#                               )

#     optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR_RANGE[0])  # Adam
#     scheduler = CosineLR(optimizer, step_size_min=LR_RANGE[1], t0=len(train_loader) * NUM_CYCLE * EPOCH_DEVIDE,
#                          tmult=1)  # Cyclic lr

#     train_log = pd.DataFrame(columns=log_columns)
#     for epoch in range(NUM_EPOCH):  # number of epochs = NUM_EPOCH
#         # train 1 epoch
#         mse, auc = train(train_itr, model, optimizer, scheduler, epoch, VERBOSE, steps)

#         #  validation
#         val_mse, val_auc = validate(valid_loader, model)

#         # save log
#         endtime = time.time() - starttime
#         train_log_epoch = pd.DataFrame([[epoch+1, mse, auc, val_mse, val_auc, endtime]],
#                                        columns=log_columns)
#         train_log = pd.concat([train_log, train_log_epoch])
#         train_log.to_csv("train_log_fold{}.csv".format(fold + 1), index=False)

#         # display log
#         print("Epoch: {}/{} ".format(epoch + 1, NUM_EPOCH)
#               + "MSE: {:.3f} ".format(mse)
#               + "AUC: {:.3f} ".format(auc)
#               + "Valid MSE: {:.3f} ".format(val_mse)
#               + "Valid AUC: {:.3f} ".format(val_auc)
#               + "Sec: {:.1f} ".format(time.time()-starttime)
#               )
#         if (epoch+1)%(NUM_CYCLE*EPOCH_DEVIDE)==0:
#             torch.save(model.state_dict(), "weight_fold_{}_epoch_{}.pth".format(fold+1, epoch+1))
            
#     torch.save(model.state_dict(), "weight_fold_{}_epoch_{}.pth".format(fold+1, epoch+1))
#     torch.save(optimizer.state_dict(), 'optimizer_fold_{}_epoch_{}.pth'.format(fold+1, epoch+1))

In [None]:
def predict(val_loader, model):
    """Return sigmoid probabilities of ``model`` for every batch in ``val_loader``.

    :param val_loader: sized iterable of ``(input, _)`` batches; the second
        element of each batch is ignored
    :param model: network returning logits
    :return: array of shape ``(num_samples, num_outputs)`` in loader order
    """
    # Run on whatever device the model lives on (CUDA when it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    # switch to eval mode
    model.eval()

    starttime = time.time()
    pred_chunks = []
    # Inference only: no gradients are needed, which saves memory.
    with torch.no_grad():
        for i, (inputs, _) in enumerate(val_loader):
            # `async` is a reserved word since Python 3.7; non_blocking is
            # the supported spelling of the same option.
            inputs = inputs.to(device, non_blocking=True)

            # get model outputs
            output = torch.sigmoid(model(inputs))

            # collect the batch predictions on the host
            pred_chunks.append(output.cpu().numpy())
            if i%100==0:
                print("step: {}/{} ".format(i + 1, len(val_loader))
                      + "Sec: {:.1f} ".format(time.time()-starttime)
                      )

    if not pred_chunks:
        return np.zeros([0, NUM_CLASS], np.float32)
    # Join the per-batch arrays once instead of re-copying them every batch.
    return np.concatenate(pred_chunks)

In [None]:
model = CharacterLevel1DCNN(num_classes=NUM_CLASS, num_token=NUM_TOKEN).cuda()
# One row of test predictions per fold; rows of folds that are not run stay 0.
preds_test = np.zeros([NUM_FOLD, len(df_test)])

# The test loader does not depend on the fold, so build it once.
dataset_test = TokenDataset(df_test, np.arange(len(df_test)), aug=False, len_token=1024)
test_loader = DataLoader(dataset_test,
                          batch_size=BATCH_SIZE*4,
                          shuffle=False,
                          num_workers=1,
                          pin_memory=True
                          )

for fold in range(NUM_FOLD):
    # Skip folds that were not requested. `continue` (rather than `break`)
    # keeps later folds reachable when FOLD_LIST does not start at fold 1.
    if fold+1 not in FOLD_LIST: continue
    starttime = time.time()
    print("fold: {}".format(fold + 1))

    # load the trained weights of this fold
    model.load_state_dict(
        torch.load("../input/characerlevel-cnn-weights/models/weight_fold_{}_epoch_{}.pth".format(fold+1, NUM_EPOCH)))

    preds_test[fold] = predict(test_loader, model)[:,0]
np.save("preds_test.npy", preds_test)

In [None]:
# Build the submission from the folds listed in FOLD_LIST only (FOLD_LIST is
# 1-based, the rows of preds_test are 0-based). Averaging over every row would
# mix in the all-zero rows of folds that were never predicted and scale the
# scores down.
# .copy() gives df_sub its own data, so adding a column does not act on a
# slice of df_test.
df_sub = df_test[['id']].copy()
df_sub['prediction'] = preds_test[[fold_no - 1 for fold_no in FOLD_LIST]].mean(axis=0)
df_sub.to_csv("submission.csv", index=False)
df_sub.head()

In [None]:
# Shape of the mean over the requested folds. FOLD_LIST is 1-based while the
# rows of preds_test are 0-based, and a flat index list avoids the extra axis
# that a nested list would add.
preds_test[[fold_no - 1 for fold_no in FOLD_LIST]].mean(axis=0).shape