This kernel is a slightly modified version of the solution shared in the [Jigsaw Unintended Bias in Toxicity Classification discussion](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/discussion/97471#latest-582610), originally written by [sakami](https://www.kaggle.com/sakami). It is meant to give anyone who opens this kernel an easy, quick, hands-on, runnable version for experiencing how Optuna performs hyperparameter tuning when blending GPT-2 and BERT model weights.

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# Install PyTorch, torchvision and torchaudio from the `pytorch` conda channel,
# pinned to the CUDA 10.2.89 toolkit; `-y` skips the confirmation prompt.
!conda install pytorch torchvision torchaudio cudatoolkit=10.2.89 -c pytorch -y

In [None]:
# Sanity-check the PyTorch install: print the CUDA version it was built with
# and the PyTorch version, then show whether a CUDA device is usable.
import torch
print(torch.version.cuda)
print(torch.__version__)
torch.cuda.is_available()

In [None]:
# Clone the original solution repository; later cells import
# `kaggle_jigsaw.config.base` and read JSON configs from `./kaggle_jigsaw/config/`.
!git clone https://github.com/sakami0000/kaggle_jigsaw.git

In [None]:
# Build and install apex from the attached input directory with its C++ and
# CUDA extensions enabled, then install `transformers` from PyPI.
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/apex-master/apex/
!pip install --no-cache-dir transformers

In [None]:
from apex import amp

In [None]:
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm

In [None]:
# Print the version of the CUDA compiler (nvcc) available in this environment.
!nvcc -V

In [None]:
# Install `pytorch-pretrained-bert` from PyPI; later cells import the BERT and
# GPT-2 models, tokenizers and optimizers from `pytorch_pretrained_bert`.
!pip install pytorch-pretrained-bert

In [None]:
# Install a fork of pytorch-pretrained-BERT straight from GitHub.
# NOTE(review): this runs after the PyPI install in the previous cell and
# presumably replaces that package — confirm which version is actually imported.
!pip install git+https://github.com/pronkinnikita/pytorch-pretrained-BERT

In [None]:
import random
from typing import List
import numpy as np
import torch
from torch.utils.data import Dataset, Sampler, DataLoader


class TextDataset(Dataset):
    """Dataset of pre-tokenized texts with optional training labels.

    Without ``targets`` an item is ``(tokens, index)``. With ``targets`` an
    item is ``(tokens, index, annotator_count, target, identity)``, so
    ``identities`` and ``annotator_counts`` must be supplied whenever
    ``targets`` is.
    """

    def __init__(self, token_lists: List[List[int]], targets: np.ndarray = None, identities: np.ndarray = None,
                 annotator_counts: np.ndarray = None):
        """Store the token id lists and the optional per-example label arrays."""
        assert targets is None or isinstance(targets, np.ndarray)
        assert identities is None or isinstance(identities, np.ndarray)
        super(TextDataset, self).__init__()
        self.token_lists = token_lists
        self.targets = targets
        self.identities = identities
        self.annotator_counts = annotator_counts

    def __len__(self) -> int:
        return len(self.token_lists)

    def __getitem__(self, item):
        """Return the example at ``item`` together with its own index."""
        if self.targets is None:
            return self.token_lists[item], item
        return self.token_lists[item], item, self.annotator_counts[item], self.targets[item], self.identities[item]

    def collate_fn(self, batch):
        """Turn a list of items into a list of tensors.

        Returns ``[tokens, indices, *extras]`` where ``tokens`` is an int64
        matrix right-padded with zeros to the longest row of the batch,
        ``indices`` is an int32 vector and every remaining column of the
        items becomes a float32 tensor.
        """
        columns = list(zip(*batch))
        token_rows = columns[0]
        max_len = max(len(row) for row in token_rows)
        tokens = np.zeros((len(batch), max_len), dtype=np.int64)
        for i, row in enumerate(token_rows):
            tokens[i, :len(row)] = row

        # Build the index tensor from integers directly: a detour through a
        # float32 tensor silently rounds indices above 2**24.
        tensors = [
            torch.from_numpy(tokens),
            torch.as_tensor(np.asarray(columns[1], dtype=np.int64)).to(torch.int32),
        ]
        for column in columns[2:]:
            tensors.append(torch.as_tensor(np.asarray(column), dtype=torch.float32))
        return tensors


class LengthBucketingDataLoader(object):
    """Iterate a ``TextDataset`` in mini-batches of similar sequence length.

    A wrapped ``DataLoader`` fetches "large" batches of ``batch_size * 100``
    raw examples. Each large batch is sorted by token count, cut into
    mini-batches of ``batch_size``, collated with ``dataset.collate_fn`` and
    yielded in random order.
    """

    def __init__(self, dataset: TextDataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None,
                 num_workers=0, pin_memory=False, drop_last=False,
                 timeout=0, worker_init_fn=None):
        # DataLoader treats `batch_sampler` as mutually exclusive with a
        # non-default `batch_size`; when a batch sampler is given it defines
        # the large batches, so leave batch_size at its default of 1.
        large_batch_size = batch_size * 100 if batch_sampler is None else 1
        # drop_last is applied to the mini-batches in __iter__, never to the
        # large batches themselves.
        self.large_bucket_loader = DataLoader(dataset=dataset, batch_size=large_batch_size, shuffle=shuffle,
                                              sampler=sampler, batch_sampler=batch_sampler, num_workers=num_workers,
                                              collate_fn=self.nop_collate_fn, pin_memory=pin_memory, drop_last=False,
                                              timeout=timeout, worker_init_fn=worker_init_fn)
        self.drop_last = drop_last
        self.batch_size = batch_size
        self.collate_fn = dataset.collate_fn

    @staticmethod
    def nop_collate_fn(batch):
        """Return the raw list of examples so bucketing can happen before collation."""
        return batch

    def __iter__(self):
        for large_batch in self.large_bucket_loader:
            # Neighbours in the sorted list have similar length, which keeps
            # the padding added by collate_fn small.
            large_batch = sorted(large_batch, key=lambda example: len(example[0]))

            small_batches = []
            for start_idx in range(0, len(large_batch), self.batch_size):
                small_batch = large_batch[start_idx:start_idx + self.batch_size]
                if len(small_batch) == self.batch_size or not self.drop_last:
                    small_batches.append(self.collate_fn(small_batch))
            # Shuffle so that batches are not always served shortest-first.
            random.shuffle(small_batches)

            yield from small_batches


class TokenDataset(Dataset):
    """Index-aware dataset of token sequences with their targets."""

    def __init__(self, seqs, targets=None, maxlen=200):
        # When no targets are supplied, random 0/1 placeholders are generated
        # so that every item still carries a target slot.
        if targets is None:
            targets = np.random.randint(2, size=(len(seqs),))
        self.targets = targets
        self.seqs = seqs
        self.maxlen = maxlen

    def __len__(self):
        return len(self.seqs)

    def get_keys(self):
        """Return each sequence length, capped at ``maxlen``, as an int32 array."""
        capped_lengths = (min(self.maxlen, len(seq)) for seq in self.seqs)
        return np.fromiter(capped_lengths, dtype=np.int32)

    def __getitem__(self, index):
        """Return ``(index, sequence, target)`` for the requested position."""
        sequence = self.seqs[index]
        target = self.targets[index]
        return index, sequence, target


def collate_fn(data):
    """Collate ``(index, sequence, target)`` items into batch tensors.

    Sequences are left-padded with zeros to the longest sequence of the
    batch. Returns ``(indices, padded_sequences, targets)`` where the indices
    stay a tuple, the sequences form a long tensor and the targets a float
    tensor.
    """
    index, seqs, targets = zip(*data)

    width = max(len(seq) for seq in seqs)
    padded = torch.zeros(len(seqs), width).long()
    for row, seq in enumerate(seqs):
        # Write each sequence flush against the right edge of its row.
        padded[row, width - len(seq):] = torch.LongTensor(seq)

    return index, padded, torch.FloatTensor(targets)


class BucketSampler(Sampler):
    """Sampler that groups examples of similar length into the same batch.

    Examples are split into buckets of ``bucket_size``; inside a bucket they
    are sorted by descending ``sort_keys`` and cut into batches of
    ``batch_size``. With ``shuffle_data`` the examples are permuted before
    bucketing and the full batches of every bucket are shuffled. The single
    incomplete batch, if any, is always emitted last.
    """

    def __init__(self, data_source, sort_keys, bucket_size=None, batch_size=1048, shuffle_data=True):
        super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_keys = sort_keys
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_keys)
        self.weights = None

        # Without shuffling the order never changes, so compute it once.
        if not shuffle_data:
            self.index = self.prepare_buckets()
        else:
            self.index = None

    def set_weights(self, weights):
        """Set per-example sampling weights; they are normalised to sum to 1.

        Raises:
            ValueError: if the weights sum to zero.
        """
        weights = np.asarray(weights, dtype=np.float64)
        # A bare `assert weights >= 0` is ambiguous for arrays and raises.
        assert np.all(weights >= 0), 'weights must be non-negative'
        total = np.sum(weights)
        if total == 0:
            raise ValueError('weights must not sum to zero')
        if total != 1:
            weights = weights / total
        self.weights = weights

    def __iter__(self):
        indices = None
        if self.weights is not None:
            # Draw a weighted sample (with replacement) of the dataset size.
            total = len(self.sort_keys)
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        """Return the inverse of the current order (position of each example)."""
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_keys)

    def prepare_buckets(self, indices=None):
        """Compute the order in which example indices are emitted.

        Args:
            indices: optional array of example indices to arrange (for
                instance a weighted resample). Defaults to every example,
                permuted first when shuffling is enabled.

        Returns:
            Array of example indices, bucketed and batched by length.
        """
        # Negate so that an ascending argsort yields descending lengths.
        lens = -np.asarray(self.sort_keys)
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lens)

        if indices is None:
            if self.shuffle:
                indices = np.random.permutation(len(lens)).astype(np.int32)
            else:
                indices = np.arange(len(lens), dtype=np.int32)
        else:
            indices = np.asarray(indices)
        # Position p below refers to example indices[p]; its length must be
        # looked up through `indices`, also for caller-supplied samples.
        lens = lens[indices]

        #  bucket iterator
        def divide_chunks(l, n):
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                # looping till length l
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch = None
        for chunk_index, chunk in divide_chunks(lens, self.bucket_size):
            # sort indices in bucket by descending order of length
            indices_sorted = chunk_index[np.argsort(chunk, axis=-1)]
            batches = []
            for _, batch in divide_chunks(indices_sorted, self.batch_size):
                if len(batch) == self.batch_size:
                    batches.append(batch.tolist())
                else:
                    # Only the very last bucket may leave an incomplete batch.
                    assert extra_batch is None
                    assert batch is not None
                    extra_batch = batch

            # shuffling batches within buckets
            if self.shuffle:
                order = np.random.permutation(len(batches))
                batches = [batches[i] for i in order]
            for batch in batches:
                new_indices.extend(batch)

        if extra_batch is not None:
            new_indices.extend(extra_batch.tolist())
        return indices[np.asarray(new_indices, dtype=np.intp)]

In [None]:
from multiprocessing.pool import Pool
from typing import List, TypeVar

from transformers import PreTrainedTokenizer
from tqdm import tqdm

Tokenizer = TypeVar('Tokenizer', bound=PreTrainedTokenizer)


class MyTokenizer:
    """Wrap a language-model tokenizer with head + tail truncation.

    Texts longer than ``max_len`` tokens keep their first ``max_head_len``
    tokens and fill the remaining budget from the end of the text. In
    ``'bert'`` mode two positions of the budget are reserved for the
    ``[CLS]`` and ``[SEP]`` markers.
    """

    def __init__(self, tokenizer: 'Tokenizer', max_len=220, max_head_len=128, mode='bert'):
        assert max_len >= max_head_len
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.max_head_len = max_head_len
        self.mode = mode
        if self.mode == 'bert':
            # Leave room for [CLS] and [SEP].
            self.max_len -= 2

    def _tokenize_one(self, text: str) -> List[int]:
        """
        when the sentence is longer then `max_len`,
        the first `max_head_len` and the last `max_len` - `max_head_len` words will 
        be used to train or inference
        """
        tokens = self.tokenizer.tokenize(text)
        if len(tokens) > self.max_len:
            # Clamp the head to the budget and take the tail only when there
            # is room left. A plain `tokens[head - max_len:]` degenerates to
            # `tokens[0:]` (or a positive start) when the head already uses
            # the whole budget, which would re-append most of the text.
            head_len = min(self.max_head_len, self.max_len)
            tail_len = self.max_len - head_len
            tail = tokens[-tail_len:] if tail_len > 0 else []
            tokens = tokens[:head_len] + tail
        if self.mode == 'bert':
            tokens = ['[CLS]'] + tokens + ['[SEP]']
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def tokenize(self, examples: List[str], num_threads=1, chunksize=1000):
        """Tokenize every text in ``examples`` and return the id lists in order.

        Args:
            examples: texts to tokenize.
            num_threads: number of worker processes; 1 runs in this process.
            chunksize: number of texts handed to a worker at a time.

        Raises:
            ValueError: if ``num_threads`` is smaller than 1.
        """
        if num_threads < 1:
            raise ValueError('num_threads must be positive integer.')
        total = len(examples)
        if num_threads == 1:
            return [self._tokenize_one(text) for text in tqdm(examples, total=total)]
        with Pool(num_threads) as pool:
            # imap preserves the input order while streaming results.
            return list(tqdm(pool.imap(self._tokenize_one, examples, chunksize=chunksize), total=total))

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score

from kaggle_jigsaw.config.base import IDENTITY_COLUMNS


class JigsawEvaluator:
    """Score predictions with overall AUC blended with per-subgroup bias AUCs.

    Targets and identity annotations are binarised at 0.5. For every identity
    column three AUCs are computed (subgroup, BPSN, BNSP); each family is
    reduced with a power mean and the three means are averaged into the bias
    score, which is combined with the overall AUC using
    ``overall_model_weight``.
    """

    def __init__(self, y_target: np.ndarray, y_identity: np.ndarray, power=-5, overall_model_weight=0.25):
        self.y = (y_target >= 0.5).astype(int)
        self.y_i = (y_identity >= 0.5).astype(int)
        self.n_subgroups = self.y_i.shape[1]
        self.power = power
        self.overall_model_weight = overall_model_weight

    @staticmethod
    def _compute_auc(y_true, y_pred):
        """ROC AUC, or a tiny positive value when roc_auc_score raises ValueError."""
        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError:
            auc = 1e-15
        return auc

    def _compute_subgroup_auc(self, i, y_pred):
        """AUC restricted to rows that belong to identity column ``i``."""
        in_subgroup = self.y_i[:, i] == 1
        return self._compute_auc(self.y[in_subgroup], y_pred[in_subgroup])

    def _compute_bpsn_auc(self, i, y_pred):
        """AUC over rows where exactly one of (identity ``i``, target) is set."""
        exactly_one = self.y_i[:, i] + self.y == 1
        return self._compute_auc(self.y[exactly_one], y_pred[exactly_one])

    def _compute_bnsp_auc(self, i, y_pred):
        """AUC over rows where identity ``i`` and target are both set or both unset."""
        both_or_neither = self.y_i[:, i] + self.y != 1
        return self._compute_auc(self.y[both_or_neither], y_pred[both_or_neither])

    def _compute_bias_metrics_for_model(self, y_pred):
        """Return a ``(3, n_subgroups)`` metric matrix and a per-subgroup record."""
        scorers = (
            ('subgroup_auc', self._compute_subgroup_auc),
            ('bpsn_auc', self._compute_bpsn_auc),
            ('bnsp_auc', self._compute_bnsp_auc),
        )
        metrics = np.zeros((3, self.n_subgroups))
        record = {name: {} for name, _ in scorers}
        for i in range(self.n_subgroups):
            for row, (_, scorer) in enumerate(scorers):
                metrics[row, i] = scorer(i, y_pred)
            subgroup_name = IDENTITY_COLUMNS[i]
            for row, (name, _) in enumerate(scorers):
                record[name][subgroup_name] = metrics[row, i]
        return metrics, record

    def _calculate_overall_auc(self, y_pred):
        return roc_auc_score(self.y, y_pred)

    def _power_mean(self, array):
        """Generalised mean of ``array`` with exponent ``self.power``."""
        powered_total = sum(np.power(array, self.power))
        return np.power(powered_total / len(array), 1 / self.power)

    def get_final_metric(self, y_pred: np.ndarray):
        """Return ``(final_score, record)`` for the given predictions."""
        bias_metrics, bias_record = self._compute_bias_metrics_for_model(y_pred)
        mean_subgroup, mean_bpsn, mean_bnsp = (self._power_mean(row) for row in bias_metrics)
        bias_score = np.average([mean_subgroup, mean_bpsn, mean_bnsp])

        overall_auc = self._calculate_overall_auc(y_pred)
        overall_score = self.overall_model_weight * overall_auc
        bias_score = (1 - self.overall_model_weight) * bias_score
        final_score = overall_score + bias_score

        bias_record['overall_auc'] = overall_auc
        bias_record['final_score'] = final_score
        bias_record['mean_subgroup_auc'] = mean_subgroup
        bias_record['mean_bpsn_auc'] = mean_bpsn
        bias_record['mean_bnsp_auc'] = mean_bnsp
        return final_score, bias_record


def accuracy(ys, ps):
    """Fraction of positions where ``ps`` and ``ys`` fall on the same side of 0.5."""
    predicted_positive = ps >= 0.5
    actual_positive = ys >= 0.5
    matches = (predicted_positive == actual_positive).type(torch.FloatTensor)
    return torch.mean(matches).item()

In [None]:
from torch import nn


class CustomLoss(nn.Module):
    """BCE-with-logits loss over the main target plus auxiliary targets.

    Optionally weights the main-target term per instance and, when annotator
    counts are used, smooths every target towards
    ``alpha / (alpha + beta)`` in proportion to how few annotators rated it.
    """

    def __init__(self, loss_weight=None, alpha=1, beta=1, use_annotator_counts=False,
                 weight_from_annotator_counts=None):
        super(CustomLoss, self).__init__()
        self.loss_weight = loss_weight
        self.alpha = alpha
        self.beta = beta
        self.use_annotator_counts = use_annotator_counts
        self.weight_from_annotator_counts = weight_from_annotator_counts

    def forward(self, logits, targets, annotator_counts=None):
        """
        logits[:, 0] = "prediction for target labels"
        logits[:, 1:] = "prediction for auxiliary target labels"
        targets[:, 0] = "target labels"
        targets[:, 1] = "instance weight"
        targets[:, 2:] = "auxiliary target labels"
        """
        weighted = self.loss_weight is not None
        weight = targets[:, 1:2] if weighted else None
        loss_weight = self.loss_weight if weighted else 1

        main_logits = logits[:, :1]
        aux_logits = logits[:, 1:]

        if annotator_counts is not None and self.use_annotator_counts:
            counts = annotator_counts.view(-1, 1)
            pseudo_total = counts + self.alpha + self.beta

            # Smooth every column at once; the (N, 1) counts broadcast across
            # the columns. The instance-weight column is smoothed too but is
            # never read back: `weight` comes from the raw targets.
            smoothed = targets.clone()
            smoothed[:, :] = (targets * counts + self.alpha) / pseudo_total

            main_loss = nn.BCEWithLogitsLoss(weight=weight, reduction='none')(
                main_logits, smoothed[:, :1])
            aux_loss = nn.BCEWithLogitsLoss(reduction='none')(
                aux_logits, smoothed[:, 2:]).mean(1).view(-1, 1)
            per_sample = (main_loss * loss_weight) + aux_loss

            if self.weight_from_annotator_counts is None:
                return per_sample.mean()
            return (per_sample * self.weight_from_annotator_counts(pseudo_total)).mean()

        main_loss = nn.BCEWithLogitsLoss(weight=weight)(main_logits, targets[:, :1])
        aux_loss = nn.BCEWithLogitsLoss()(aux_logits, targets[:, 2:])
        return (main_loss * loss_weight) + aux_loss

In [None]:
!pip install attrdict==2.0.1

In [None]:
from contextlib import contextmanager
import json
import os
import random
import time

from attrdict import AttrDict
import numpy as np

@contextmanager
def timer(msg):
    """Context manager that prints a start line and the elapsed minutes on exit.

    The closing line is printed only when the wrapped block finishes without
    raising.
    """
    started_at = time.time()
    print(f'[{msg}] start.')
    yield
    minutes = (time.time() - started_at) / 60
    print(f'[{msg}] done in {minutes:.2f} min.')


def seed_torch(seed=1029):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs for reproducible runs.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


def load_config(config_path: str) -> AttrDict:
    """Read the JSON file at ``config_path``; every JSON object becomes an AttrDict."""
    with open(config_path) as fp:
        return json.load(fp, object_hook=AttrDict)

In [None]:
import numpy as np


def training_weights(df_train, toxicity_column, identity_columns):
    """Compute per-example training weights in the range [0.25, 1.0].

    Every example starts at 0.25 and gains:
      * the fraction of identity columns that are >= 0.5, divided by 4;
      * 0.25 when it is toxic (>= 0.5) and at least one identity column is
        below 0.5;
      * 0.25 when it is non-toxic (< 0.5) and at least one identity column
        is >= 0.5.

    Missing identity annotations are treated as 0.

    Args:
        df_train: frame holding the toxicity and identity columns.
        toxicity_column: name of the toxicity column.
        identity_columns: list of identity column names.

    Returns:
        1-D float array with one weight per row of ``df_train``.
    """
    identity_values = df_train[identity_columns].fillna(0).values
    toxicity_values = df_train[toxicity_column].values

    # The builtin `int` replaces the `np.int` alias removed in NumPy 1.24.
    identity_mentioned = identity_values >= 0.5
    subgroup_positive = identity_mentioned.any(axis=1).astype(int)
    subgroup_negative = (identity_values < 0.5).any(axis=1).astype(int)

    background_positive = (toxicity_values >= 0.5).astype(int)
    background_negative = (toxicity_values < 0.5).astype(int)

    weights = np.ones((len(df_train),)) / 4
    weights += identity_mentioned.mean(axis=1) / 4
    weights += ((background_positive + subgroup_negative) > 1).astype(int) / 4
    weights += ((background_negative + subgroup_positive) > 1).astype(int) / 4
    return weights


def training_weights_s(df_train, toxicity_column, identity_columns):
    """Simple weighting: 1 + 3 * (sum of identity values) + 8 * toxicity, scaled to max 1.

    Missing identity annotations count as 0.
    """
    identity_total = df_train[identity_columns].fillna(0).values.sum(axis=1)
    toxicity = df_train[toxicity_column].values

    raw = np.ones((len(df_train),))
    raw = raw + identity_total * 3
    raw = raw + toxicity * 8
    return raw / raw.max()

In [None]:
import numpy as np
from tqdm import tqdm

def get_optimizer_params(model, lr, lr_weight_decay_coef, num_layers):
    """Build optimizer parameter groups with optional layer-wise learning-rate decay.

    Parameters whose name contains ``bias``, ``LayerNorm.bias`` or
    ``LayerNorm.weight`` get no weight decay; all others use 0.01.

    With ``lr_weight_decay_coef < 1.0`` the learning rate shrinks
    geometrically with depth below the head: encoder layer ``i`` uses
    ``lr * coef ** (num_layers - i)`` and the embeddings use
    ``lr * coef ** (num_layers + 1)``. Parameters outside ``bert.embeddings``
    and ``bert.encoder`` carry no explicit ``lr`` and fall back to the
    optimizer default. Otherwise only the two weight-decay groups are built.

    Args:
        model: object exposing ``named_parameters()``.
        lr: base learning rate.
        lr_weight_decay_coef: per-layer learning-rate multiplier.
        num_layers: number of encoder layers.

    Returns:
        List of parameter-group dicts.
    """
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    def skips_decay(name):
        return any(nd in name for nd in no_decay)

    def split_by_decay(belongs, group_lr=None):
        # Two groups for the selected parameters: decayed first, then not.
        groups = []
        for exempt, weight_decay in ((False, 0.01), (True, 0.0)):
            group = {'params': [p for n, p in param_optimizer
                                if belongs(n) and skips_decay(n) == exempt]}
            if group_lr is not None:
                group['lr'] = group_lr
            group['weight_decay'] = weight_decay
            groups.append(group)
        return groups

    if lr_weight_decay_coef < 1.0:
        optimizer_grouped_parameters = split_by_decay(
            lambda n: 'bert.embeddings' not in n and 'bert.encoder' not in n)
        optimizer_grouped_parameters += split_by_decay(
            lambda n: 'bert.embeddings' in n,
            group_lr=lr * lr_weight_decay_coef ** (num_layers + 1))
        for i in range(num_layers):
            layer_tag = 'bert.encoder.layer.{}.'.format(i)
            # Each layer needs both a decayed and a non-decayed group;
            # emitting the non-decayed one twice would leave the layer's
            # weight matrices out of the optimizer altogether.
            optimizer_grouped_parameters += split_by_decay(
                lambda n, tag=layer_tag: tag in n,
                group_lr=lr * lr_weight_decay_coef ** (num_layers - i))
    else:
        optimizer_grouped_parameters = split_by_decay(lambda n: True)
    return optimizer_grouped_parameters


def predict(model: nn.Module, dataset: TextDataset, device, batch_size=32) -> np.ndarray:
    """Run ``model`` over ``dataset`` and return sigmoid scores in dataset order.

    Only the first output column of the model is used. Batches arrive in
    length-bucketed, shuffled order, so the scores are put back into the
    original order using the indices carried by each batch.
    """
    model.eval()
    loader = LengthBucketingDataLoader(dataset=dataset, batch_size=batch_size,
                                       shuffle=False, drop_last=False)
    logit_chunks = []
    index_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader, total=len(dataset) // batch_size):
            token_ids, example_ids = batch[0], batch[1]
            logits = model(token_ids.type(torch.LongTensor).to(device))
            logit_chunks.append(logits.detach().cpu())
            index_chunks.append(example_ids.detach().cpu())

    scores = torch.sigmoid(torch.cat(logit_chunks, 0)[:, 0]).numpy().ravel()
    positions = torch.cat(index_chunks, 0).numpy().ravel()
    # Reorder the scores by ascending example index.
    return scores[np.argsort(positions, kind='stable')]

In [None]:
from pytorch_pretrained_bert import GPT2Model

class GPT2ClassificationHeadModel(GPT2Model):
    """GPT-2 transformer followed by a linear classification head.

    The head sees the concatenation of the mean-pooled, max-pooled and
    last-position hidden states (``3 * config.n_embd`` features) and outputs
    ``n_class`` logits.

    NOTE(review): the class subclasses ``GPT2Model`` and also owns a separate
    ``GPT2Model`` in ``self.transformer``; only ``self.transformer`` is used
    in ``forward``. The parent constructor presumably builds a second, unused
    set of transformer weights — confirm whether a different base class was
    intended.
    """

    def __init__(self, config, clf_dropout=0.4, n_class=8):
        super(GPT2ClassificationHeadModel, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.dropout = nn.Dropout(clf_dropout)
        self.linear = nn.Linear(config.n_embd * 3, n_class)

        nn.init.normal_(self.linear.weight, std=0.02)
        # NOTE(review): this draws the bias from a normal distribution with
        # mean 0 rather than setting it to zero — confirm the intent.
        nn.init.normal_(self.linear.bias, 0)
        
        # NOTE(review): applied to every submodule after the explicit inits
        # above, so it may overwrite them — depends on the library's
        # `init_weights`; confirm.
        self.apply(self.init_weights)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
        """Return classification logits for ``input_ids``.

        ``lm_labels`` is accepted but not used; the second output of the
        transformer is discarded.
        """
        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
        # Pool over dimension 1 — assumes hidden_states is
        # (batch, seq, n_embd); TODO confirm.
        # NOTE(review): no padding mask is applied, so padded positions would
        # take part in the pooling and in the last-position feature — verify
        # against how callers pad their batches.
        avg_pool = torch.mean(hidden_states, 1)
        max_pool, _ = torch.max(hidden_states, 1)
        h_conc = torch.cat((avg_pool, max_pool, hidden_states[:, -1, :]), 1)
        logits = self.linear(self.dropout(h_conc))
        return logits

In [None]:
from pathlib import Path
import warnings

from apex import amp
import numpy as np
import pandas as pd
from tqdm import tqdm

from kaggle_jigsaw.config.base import (
    TOXICITY_COLUMN, IDENTITY_COLUMNS, AUX_TOXICITY_COLUMNS,
    OLD_TOXICITY_COLUMN, OLD_IDENTITY_COLUMNS, OLD_AUX_TOXICITY_COLUMNS,
    TRAIN_DATA, TEST_DATA, SAMPLE_SUBMISSION,
    TRAIN_OLD, TEST_OLD, SAMPLE_OLD)

# Load the run configuration and fill in defaults for any key the JSON omits.
config_file = Path('./kaggle_jigsaw/config/bert_large_cased.json')
config = load_config(config_file)

config.setdefault('max_len', 220)
config.setdefault('max_head_len', 128)
config.setdefault('epochs', 2)
config.setdefault('down_sample_frac', 0.5)
config.setdefault('lr', 1.5e-5)
config.setdefault('batch_size', 16)
config.setdefault('accumulation_steps', 4)
config.setdefault('lr_weight_decay_coef', 1.0)
config.setdefault('warmup', 0.05)
config.setdefault('old_data', False)
config.setdefault('old_fine_tuned', False)
config.setdefault('device', 'cuda')
config.setdefault('seed', 1234)

# Sanity checks on the configuration.
# NOTE(review): 'lm_model_name' must be present in the config, yet the name
# used below is derived from the config file's stem — confirm which is meant.
assert 'lm_model_name' in config
assert not (config.old_fine_tuned and config.old_data)
assert config.max_len >= config.max_head_len
assert config.epochs <= 2
# Notebook-level switches: `valid` holds out a validation split, `old`
# selects the *_OLD data files and column names.
valid = False
old = False

lm_model_name = config_file.stem
if config.old_fine_tuned:
    PRETRAINED_PATH = Path(f'../output/{lm_model_name}_old_fine_tune/')
    assert PRETRAINED_PATH.exists()
else:
    PRETRAINED_PATH = lm_model_name
# The first four characters of the name select the model family
# ('bert' for bert_large_cased).
MODE = lm_model_name[:4]
LOWER_CASE = 'uncased' in lm_model_name
LARGE_MODEL = 'large' in lm_model_name
DEVICE = torch.device(config.device)

if config.old_data:
    lm_model_name += '_old_fine_tune'

if valid:
    valid_size = 200000
    shuffle_seed = 1029
    lm_model_name += '_valid'
else:
    valid_size = 0
    shuffle_seed = config.seed

# Output locations; the directory name carries the suffixes appended above.
OUT_DIR = Path(f'../output/{lm_model_name}/')
TEST_SUBMISSION = OUT_DIR / 'submission.csv'
VALID_SUBMISSION = OUT_DIR / 'valid_submission.csv'
OUT_DIR.mkdir(parents=True, exist_ok=True)

warnings.filterwarnings('ignore')
seed_torch(config.seed)

# Pick the data files and the training-set size for the chosen data source.
# The literals are presumably the row counts of the respective training
# CSVs — TODO confirm.
if not old:
    train_data = TRAIN_DATA
    test_data = TEST_DATA
    sample_submission = SAMPLE_SUBMISSION
    train_size = 1804874 - valid_size
else:
    train_data = TRAIN_OLD
    test_data = TEST_OLD
    sample_submission = SAMPLE_OLD
    train_size = 159571 - valid_size

    TOXICITY_COLUMN = OLD_TOXICITY_COLUMN
    IDENTITY_COLUMNS = OLD_IDENTITY_COLUMNS
    AUX_TOXICITY_COLUMNS = OLD_AUX_TOXICITY_COLUMNS

# Instantiate the tokenizer, model and optimizer class for the selected model
# family. The model emits one logit for the main target plus one per
# auxiliary toxicity column.
# NOTE(review): the pretrained names ('bert-large-cased', 'gpt2') are
# hard-coded here; PRETRAINED_PATH computed earlier is not used in this
# block — confirm that is intended.
if MODE == 'bert':
    from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam

    lm_tokenizer = BertTokenizer.from_pretrained('bert-large-cased', cache_dir=None, do_lower_case=LOWER_CASE)
    model = BertForSequenceClassification.from_pretrained("bert-large-cased", cache_dir=None, num_labels=1 + len(AUX_TOXICITY_COLUMNS))
    optimizer_class = BertAdam
else:
    from pytorch_pretrained_bert import GPT2Tokenizer, OpenAIAdam, GPT2Model

    lm_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=None)
    model = GPT2ClassificationHeadModel.from_pretrained("gpt2", clf_dropout=config.get('dropout_rate', 0.1),n_class=1 + len(AUX_TOXICITY_COLUMNS))
    optimizer_class = OpenAIAdam
    # Layer-wise learning-rate decay is only wired up for parameter names
    # starting with 'bert.' (see get_optimizer_params), so it must be off here.
    assert config.lr_weight_decay_coef == 1.0  
    
# Tokenize the train/test texts, build the label matrices, split off the
# validation tail and choose which negative samples to drop in each epoch.
with timer('preprocess'):
    tokenizer = MyTokenizer(lm_tokenizer, config.max_len, config.max_head_len, MODE)
    # Read the files selected by the `old` switch (train_data / test_data)
    # rather than always the new-competition files.
    df_train = pd.read_csv(train_data).sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    df_train['comment_text'] = df_train['comment_text'].astype(str)
    df_train = df_train.fillna(0)
    X_train = tokenizer.tokenize(df_train['comment_text'].fillna('DUMMY_VALUE'), num_threads=16, chunksize=5000)

    df_test = pd.read_csv(test_data)
    df_test['comment_text'] = df_test['comment_text'].astype(str)
    df_test = df_test.fillna(0)
    X_test = tokenizer.tokenize(df_test['comment_text'].fillna('DUMMY_VALUE'), num_threads=16, chunksize=5000)

    # The raw text is no longer needed once it has been tokenized.
    df_train.drop(['comment_text'], axis=1, inplace=True)
    df_test.drop(['comment_text'], axis=1, inplace=True)

    X_valid = X_train[train_size:]
    X_train = X_train[:train_size]

    y_identity_train = df_train[IDENTITY_COLUMNS].values
    y_annotator_counts_train = df_train['toxicity_annotator_count'].values

    # Target layout expected by CustomLoss:
    # column 0 = main target, column 1 = instance weight, columns 2+ = auxiliary targets.
    weights = training_weights(df_train, TOXICITY_COLUMN, IDENTITY_COLUMNS)
    y_train = np.hstack((
        df_train[TOXICITY_COLUMN].values.reshape(-1, 1),
        weights.reshape(-1, 1),
        df_train[AUX_TOXICITY_COLUMNS].values,
    ))

    y_valid = y_train[train_size:]
    y_train = y_train[:train_size]
    y_identity_valid = y_identity_train[train_size:]
    y_identity_train = y_identity_train[:train_size]
    y_annotator_counts_valid = y_annotator_counts_train[train_size:]
    y_annotator_counts_train = y_annotator_counts_train[:train_size]
    loss_weight = 1.0 / weights.mean() if not old else None

    # drop negative samples here
    frac = config.down_sample_frac
    # A row is "target negative" when the only positive entry of y_train is a
    # single column — in practice the instance weight, which is >= 0.25.
    target_negative = (y_train > 0.0).sum(axis=1) == 1
    identity_negative = (y_identity_train > 0.0).sum(axis=1) == 0
    negative_mask = identity_negative & target_negative
    negative_indices = np.arange(len(y_train))[negative_mask]
    # Epoch 0 drops the leading share of the pure negatives, epoch 1 the
    # trailing share.
    drop_indices_0 = set(negative_indices[:int(len(negative_indices) * frac)])
    drop_indices_1 = set(negative_indices[int(len(negative_indices) * (1 - frac)):])
    drop_indices_list = [drop_indices_0, drop_indices_1]
    len_train = len(y_train) - len(drop_indices_0)

In [None]:
# Fine-tune the model with mixed precision (apex amp, opt level O1) and
# gradient accumulation, dropping a different slice of the pure-negative
# samples in each epoch.
with timer('train'):
    model.zero_grad()
    model = model.to(DEVICE)
    num_layers = 24 if LARGE_MODEL else 12
    optimizer_grouped_parameters = get_optimizer_params(model, config.lr, config.lr_weight_decay_coef, num_layers)
    # Number of optimizer updates: one per `accumulation_steps` mini-batches.
    num_train_optimization_steps = int(config.epochs * len_train / config.batch_size / config.accumulation_steps)

    optimizer = optimizer_class(optimizer_grouped_parameters,
                                    lr=config.lr,
                                    warmup=config.warmup,
                                    t_total=num_train_optimization_steps)

    model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
    model = model.train()

    # Used only as the progress-bar total below.
    batch_count = len_train // config.batch_size
    loss_fn = CustomLoss(loss_weight)
    # zip() stops at the shorter input, so at most len(drop_indices_list)
    # epochs run.
    for epoch, drop_indices in zip(range(config.epochs), drop_indices_list):
        sample_indices = np.array([i for i in range(len(y_train)) if i not in drop_indices])
        X_sampled_train = [X_train[i] for i in sample_indices]
        y_sampled_train = y_train[sample_indices]
        y_sampled_identity_train = y_identity_train[sample_indices]
        y_sampled_annotator_counts_train = y_annotator_counts_train[sample_indices]
        train_dataset = TextDataset(X_sampled_train, y_sampled_train,
                                        y_sampled_identity_train, y_sampled_annotator_counts_train)
        train_loader = LengthBucketingDataLoader(
                train_dataset, shuffle=True, drop_last=True, batch_size=config.batch_size)
        tk0 = tqdm(enumerate(train_loader), total=batch_count)
        # NOTE(review): gradients left over from a trailing group of fewer
        # than `accumulation_steps` batches are cleared here (or never
        # applied after the last epoch) — confirm this is acceptable.
        optimizer.zero_grad()
        # The annotator counts (a_batch) and identities (y_identity_batch)
        # are unpacked but not used by this loop.
        for i, (x_batch, _, a_batch, y_batch, y_identity_batch) in tk0:
            # Token id 0 is padding, so the attention mask is `x_batch > 0`.
            # NOTE(review): GPT2ClassificationHeadModel.forward in this file
            # takes neither `attention_mask` nor `labels`; this call
            # presumably only works in 'bert' mode — confirm.
            y_pred = model(x_batch.to(DEVICE), attention_mask=(x_batch > 0).to(DEVICE), labels=None)
            loss = loss_fn(y_pred, y_batch.to(DEVICE))
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (i + 1) % config.accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

    # NOTE(review): assumes the model class provides `save_pretrained` —
    # verify for the installed library version.
    model.save_pretrained(OUT_DIR)

In [None]:
# Score the held-out validation tail (when enabled) and write the validation
# and test predictions to CSV.
with timer('evaluate'):
    if valid:
        valid_dataset = TextDataset(X_valid, y_valid, y_identity_valid, y_annotator_counts_valid)
        valid_preds = predict(model, valid_dataset, device=DEVICE)

        # Copy the slice so that adding a column does not write into a view
        # of df_train.
        df_valid = df_train.tail(valid_size).copy()
        df_valid['model1'] = valid_preds
        evaluator = JigsawEvaluator(df_valid[TOXICITY_COLUMN].values, df_valid[IDENTITY_COLUMNS].values)
        final_score, _ = evaluator.get_final_metric(df_valid['model1'].values)

        # predict() only reads the tokens and indices of each batch, so the
        # scores computed above are reused instead of running a second full
        # inference pass over the same texts.
        valid_submission = pd.DataFrame({
            'id': df_valid['id'],
            'prediction': valid_preds,
        })
        valid_submission.to_csv(VALID_SUBMISSION, index=False)
        print(f'validation score: {final_score:.5f}')

    test_prediction = predict(model, TextDataset(X_test), device=DEVICE)
    submission = pd.DataFrame({
        'id': df_test['id'],
        'prediction': test_prediction,
    })
    submission.to_csv(TEST_SUBMISSION, index=False)

In [None]:
# Preview the first rows of the training frame (the `comment_text` column was
# dropped during preprocessing).
df_train.head()

In [None]:
from pathlib import Path

import numpy as np
import optuna
import pandas as pd

from kaggle_jigsaw.config.base import TOXICITY_COLUMN, IDENTITY_COLUMNS

# Discover the validation output directories (`<model>_valid`) to blend.
OUT_DIR = Path('../output/')
# Materialise the glob: it is a one-shot iterator, and the directories are
# iterated again later when the validation predictions are loaded. Sorting
# keeps the model order stable between runs.
VALID_DIR = sorted(OUT_DIR.glob('*_valid/'))
# Strip the '_valid' suffix from the directory name (`name`, not `stem`, so
# that dots inside a model name are kept).
models = [path.name[:-len('_valid')] for path in VALID_DIR]


def objective(trial, train_fold, evaluator):
    """Optuna objective: one minus the metric of a weighted blend of model predictions.

    Args:
        trial: Optuna trial used to sample one weight in [0, 1] per model.
        train_fold: 2-D array-like with one column of predictions per entry
            of ``models``, in the same order.
        evaluator: object whose ``get_final_metric(preds)`` returns
            ``(score, record)``.

    Returns:
        ``1 - score``, so that minimising the objective maximises the score.
    """
    params = {
        model: trial.suggest_uniform(model, 0.0, 1.0)
        for model in models
    }
    train_fold = np.array(train_fold)
    # np.average needs a real sequence of weights; a dict view is not
    # converted to a 1-D array.
    blend_weights = list(params.values())
    train_preds = np.average(train_fold, weights=blend_weights, axis=1)
    score, _ = evaluator.get_final_metric(train_preds)
    return 1 - score



# Load the blending configuration and fill in defaults for missing keys.
config = load_config('./kaggle_jigsaw/config/blend.json')
config.setdefault('n_folds', 10)
config.setdefault('n_trials', 300)
config.setdefault('threshold', 0.03)

# NOTE(review): VALID_DIR was already iterated once to build `models`; if it
# is a one-shot iterator this comprehension sees no paths and pd.concat
# receives an empty list — confirm.
# NOTE(review): the per-model frames are stacked row-wise (axis=0), while the
# loop below reads TOXICITY_COLUMN, IDENTITY_COLUMNS and one column per model
# name from df_valid — confirm those columns exist in the concatenated frame.
df_valid = pd.concat([pd.read_csv(path / 'valid_submission.csv', index_col='id') for path in VALID_DIR], axis=0)
train_scores = []
valid_scores = []
params = {model: [] for model in models}

# Repeated random 50/50 splits: tune the blend weights on one half and score
# them on the other.
for i in range(config.n_folds):
    df_valid = df_valid.sample(frac=1, random_state=i).reset_index(drop=True)
    train_fold = df_valid[:len(df_valid) // 2]
    valid_fold = df_valid[len(df_valid) // 2:]
    train_evaluator = JigsawEvaluator(
        train_fold[TOXICITY_COLUMN].values, train_fold[IDENTITY_COLUMNS].values)
    valid_evaluator = JigsawEvaluator(
        valid_fold[TOXICITY_COLUMN].values, valid_fold[IDENTITY_COLUMNS].values)
        
    # NOTE(review): the objective receives every column of train_fold and
    # averages along axis 1 with one weight per model, so the column count
    # must equal len(models) — confirm.
    study = optuna.create_study()
    study.optimize(lambda trial: objective(trial, train_fold.values, train_evaluator),
                       n_trials=config.n_trials)
    trial = study.best_trial
    # The objective returns 1 - score, so convert back to a score.
    train_scores.append(1 - trial.value)
    # Record the best weights normalised to sum to 1.
    values = np.array(list(trial.params.values()))
    values /= values.sum()
    for key, value in zip(trial.params.keys(), values):
        params[key].append(value)
        
    # The held-out blend uses the raw (un-normalised) best weights.
    valid_preds = np.zeros((len(valid_fold)))
    for key, value in trial.params.items():
        valid_preds += valid_fold[key].values * value
    score, _ = valid_evaluator.get_final_metric(valid_preds)
    valid_scores.append(score)

# Per-fold scores.
for i, (train_score, valid_score) in enumerate(zip(train_scores, valid_scores)):
    print(f'fold {str(i + 1):2s} - train: {train_score:.5f}, valid: {valid_score:.5f}')

print('-' * 20)
print(f'train mean: {np.mean(train_scores):.5f}, var: {np.var(train_scores):.7f}')
print(f'valid mean: {np.mean(valid_scores):.5f}, var: {np.var(valid_scores):.7f}')

# Mean and variance of each model's normalised blend weight over all folds.
print('-' * 20)
for key, values in params.items():
    print(f'{key:25s} {np.mean(values):.6f} {np.var(values):.6f}')

# A fold is "robust" when its train and valid scores differ by less than the
# configured threshold.
print('-' * 20)
print(f'robust folds: threshold {config.threshold}')
robust_folds = []
robust_train_scores = []
robust_valid_scores = []
for i, (train_score, valid_score) in enumerate(zip(train_scores, valid_scores)):
    if np.abs(train_score - valid_score) < config.threshold:
        robust_folds.append(i)
        robust_train_scores.append(train_score)
        robust_valid_scores.append(valid_score)
# Print the list once, after every fold has been checked, instead of
# re-printing the partial list on each iteration.
print(' '.join(map(str, robust_folds)))

print('-' * 20)
print(f'train mean: {np.mean(robust_train_scores):.5f}, var: {np.var(robust_train_scores):.7f}')
print(f'valid mean: {np.mean(robust_valid_scores):.5f}, var: {np.var(robust_valid_scores):.7f}')

# Blend-weight statistics restricted to the robust folds.
print('-' * 20)
for key, values in params.items():
    robust_values = np.array(values)[robust_folds]
    print(f'{key:25s} {np.mean(robust_values):.6f} {np.var(robust_values):.6f}')

References:  
https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/discussion/97471#latest-582610