## <font color="DarkViolet">パラメータ設定</font>

In [None]:
# Name of the directory under ../input that holds the checkpoint to resume from
# (only read when CONTINUE is True).
CONT_MODEL_KERNEL = ''
# Directory containing the pre-tokenized, pickled 'train_dataset' and 'train_df'.
TOKENIZED = "../input/jigsaw-bert-tokenize-x-len-y-yaux-01-01-02/"

# When True, model/optimizer weights are reloaded from CONT_MODEL_KERNEL before training.
CONTINUE=False

# 1-based number of the fold trained in this run (used as fold_list[N_FOLD-1]).
N_FOLD = 1
# Number of cross-validation splits.
FOLD = 5

# Width of the token-id sequences handed to BERT.
MAX_SEQUENCE_LENGTH = 300
# Seed is offset by the fold number so that each fold gets a distinct seed.
SEED = 100+N_FOLD-1
EPOCHS = 1
Data_dir = "../input/jigsaw-unintended-bias-in-toxicity-classification"
Input_dir = "../input"
WORK_DIR = "../working/"
# Column names in train.csv.
TARGET = 'target'
TEXT_COL = 'comment_text'

# Record count used to estimate the number of optimizer steps
# (presumably the row count of train.csv -- TODO confirm).
TOTAL_RECORD = 1804874
# File name of the checkpoint that is written after training / read when resuming.
filename = 'checkpoint.pth'

## <font color="DarkViolet">ライブラリのインポート</font>

In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", 300)
import os
print(os.listdir("../input/nvidiaapex/repository/NVIDIA-apex-39e153a"))

In [None]:
# NVIDIA Apex のインストール (FP16: 半精度浮動小数点)
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats
import gc
import re
import operator 
import sys
from sklearn import metrics
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score
%load_ext autoreload
%autoreload 2
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
from apex import amp
import shutil

## <font color="DarkViolet">pickle用ヘルパーの定義とseed固定</font>

In [None]:
def save_obj(obj, name):
    """Pickle `obj` to the file path `name` using the highest pickle protocol."""
    with open(name, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file path `name`.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [None]:
# Note: on Kaggle kernels the instance changes on every start, so fixing the
# PyTorch seed does not make runs fully reproducible there.
def seed_torch(s):
    """Seed every random number generator this notebook can reach.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and puts cuDNN into deterministic mode.

    Args:
        s: integer seed.
    """
    import random  # local import: the notebook's import cell does not import it

    os.environ['PYTHONHASHSEED'] = str(s)
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    torch.cuda.manual_seed(s)
    # Also seed the devices other than the current one.
    torch.cuda.manual_seed_all(s)
    # Disable the cuDNN auto-tuner and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
seed_torch(SEED)

## <font color="DarkViolet">BERTのライブラリへのパス接続とTF→pytorchの変換</font>

In [None]:
device = torch.device('cuda')

# Add the BERT Pytorch repo to the PATH
# using files from: https://github.com/huggingface/pytorch-pretrained-BERT
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, package_dir_a)

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam

In [None]:
# Convert the TensorFlow BERT checkpoint into PyTorch weights stored in WORK_DIR,
# and place the matching config file next to them.
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
tf_checkpoint_path = BERT_MODEL_PATH + 'bert_model.ckpt'
bert_config_path = BERT_MODEL_PATH + 'bert_config.json'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path, bert_config_path, WORK_DIR + 'pytorch_model.bin')

shutil.copyfile(bert_config_path, WORK_DIR + 'bert_config.json')

In [None]:
os.listdir("../working")

In [None]:
# Load the BERT configuration that ships with the pretrained checkpoint
from pytorch_pretrained_bert import BertConfig

bert_config = BertConfig(BERT_MODEL_PATH + 'bert_config.json')

## <font color="DarkViolet">関数・クラスの定義</font>

**AUCをscikit-learnよりも速く計算できる関数の定義（速度改善は未確認。Contributed by Giba）**

In [None]:
from numba import jit
@jit
def fast_auc(y_true, y_prob):
    """Compute ROC AUC by counting correctly ordered (negative, positive) pairs.

    Args:
        y_true: binary labels (0/1), array-like.
        y_prob: predicted scores, same length as `y_true`.

    Returns:
        Number of (negative, positive) pairs in which the negative has the
        lower rank, divided by the total number of such pairs.

    Note:
        Tied scores get no special treatment; their order is whatever
        `np.argsort` yields. When `y_true` is empty or contains a single
        class the denominator is zero.
    """
    y_true = np.asarray(y_true)
    # Reorder the labels by ascending predicted score.
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        # Running count of negatives seen so far.
        nfalse += (1 - y_i)
        # A positive contributes one correct pair per negative ranked below it.
        auc += y_i * nfalse
    # Normalise by (#negatives * #positives).
    auc = auc / (nfalse * (n - nfalse))
    return auc

def eval_auc(preds, dtrain):
    """Custom eval callback: return ('auc', AUC of `preds` vs. `dtrain.get_label()`, True)."""
    score = fast_auc(dtrain.get_label(), preds)
    return ('auc', score, True)

**今回のコンペにおける評価関数を計算するクラスの定義**

In [None]:
cols_identity = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
class JigsawEvaluator:
    """Competition metric: overall AUC blended with bias AUCs per identity subgroup.

    Args:
        y_true: 1-D array of target scores; binarised at 0.5.
        y_identity: 2-D array (rows x subgroups) of identity scores; binarised at 0.5.
        power: exponent of the generalised mean used to aggregate per-subgroup AUCs.
        overall_model_weight: weight of the overall AUC in the final score; the
            remaining weight goes to the averaged bias score.
    """

    def __init__(self, y_true, y_identity, power=-5, overall_model_weight=0.25):
        self.y = (y_true >= 0.5).astype('int8')
        self.y_i = (y_identity >= 0.5).astype('int8')
        self.n_subgroups = self.y_i.shape[1]
        self.power = power
        self.overall_model_weight = overall_model_weight

    @staticmethod
    def _compute_auc(y_true, y_pred):
        """Return the AUC, or NaN when it is undefined for this slice."""
        try:
            return fast_auc(y_true, y_pred)
        except (ValueError, ZeroDivisionError):
            # An empty or single-class slice has no (negative, positive) pairs;
            # fast_auc then divides by zero instead of raising ValueError.
            return np.nan

    def _compute_subgroup_auc(self, i, y_pred):
        """AUC restricted to the rows that belong to subgroup `i`."""
        mask = self.y_i[:, i] == 1
        return self._compute_auc(self.y[mask], y_pred[mask])

    def _compute_bpsn_auc(self, i, y_pred):
        """AUC over rows where exactly one of (subgroup `i` flag, label) is set."""
        mask = self.y_i[:, i] + self.y == 1
        return self._compute_auc(self.y[mask], y_pred[mask])

    def _compute_bnsp_auc(self, i, y_pred):
        """AUC over rows where the subgroup `i` flag and the label are both set or both unset."""
        mask = self.y_i[:, i] + self.y != 1
        return self._compute_auc(self.y[mask], y_pred[mask])

    def compute_bias_metrics_for_model(self, y_pred):
        """Return a (3, n_subgroups) array; rows are subgroup, BPSN and BNSP AUC."""
        records = np.zeros((3, self.n_subgroups))
        for i in range(self.n_subgroups):
            records[0, i] = self._compute_subgroup_auc(i, y_pred)
            records[1, i] = self._compute_bpsn_auc(i, y_pred)
            records[2, i] = self._compute_bnsp_auc(i, y_pred)
        return records

    def _calculate_overall_auc(self, y_pred):
        """AUC over all rows."""
        return fast_auc(self.y, y_pred)

    def _power_mean(self, array):
        """Generalised mean of `array` with exponent `self.power`."""
        total = sum(np.power(array, self.power))
        return np.power(total / len(array), 1 / self.power)

    def get_final_metric(self, y_pred):
        """Return [final score, overall AUC, subgroup, BPSN, BNSP power means]."""
        bias_metrics = self.compute_bias_metrics_for_model(y_pred)
        # Aggregate each bias family once and reuse the values below.
        subgroup_auc, bpsn_auc, bnsp_auc = (self._power_mean(row) for row in bias_metrics)
        overall_auc = self._calculate_overall_auc(y_pred)

        overall_score = self.overall_model_weight * overall_auc
        bias_score = (1 - self.overall_model_weight) * np.average([subgroup_auc, bpsn_auc, bnsp_auc])
        return [
            overall_score + bias_score,  # auc
            overall_auc,                 # overall_auc
            subgroup_auc, bpsn_auc, bnsp_auc,
        ]

**Validationデータの推論**

In [None]:
def create_valid(X_val):
    """Run the global `model` over a validation dataset.

    The caller is expected to have put `model` into eval mode beforehand.

    Args:
        X_val: dataset whose items are (token ids, length, label vector); the
            label vector has one more column than the model has outputs.

    Returns:
        (probabilities, loss): sigmoid of the logits as an array of shape
        (len(X_val), n_outputs), and `custom_loss` of the logits against all labels.
    """
    batch_size = 16
    valid_preds = np.zeros((len(X_val), len(X_val[0][2])-1))
    valid_collator = SequenceBucketCollator(lambda lengths: lengths.max(), sequence_index=0, length_index=1, label_index=2)
    valid_loader = torch.utils.data.DataLoader(X_val, batch_size=batch_size, shuffle=False, collate_fn=valid_collator)

    tk0 = tqdm_notebook(enumerate(valid_loader), total=len(valid_loader), leave=True)
    # Inference only: do not record the autograd graph.
    with torch.no_grad():
        for i, (x_batch, _) in tk0:
            input_ids = x_batch[0].to(device)
            pred = model(input_ids, attention_mask=(input_ids > 0), labels=None)
            n_rows = pred.shape[0]
            start = i * batch_size
            # Size the slice from the actual batch so a short last batch fits.
            valid_preds[start:start + n_rows, :] = pred.float().cpu().numpy().reshape(n_rows, -1)

        avg_val_loss = custom_loss(
                            torch.as_tensor(valid_preds, dtype=torch.float32).to(device),
                            torch.as_tensor(X_val[:][2], dtype=torch.float32).to(device) # X_val[:][2][:,0] <- delete [:,0] when using multi-label
                        ).item()
    return torch.sigmoid(torch.tensor(valid_preds)).numpy(), avg_val_loss

**バッチ内で最長トークン長に合わせてマスキング**

In [None]:
class SequenceBucketCollator():
    """Collate function that trims padded sequences to a per-batch length.

    Args:
        choose_length: callable mapping the stacked lengths tensor of a batch
            to the length (a tensor) that the batch is trimmed to.
        sequence_index: position of the token-id sequence inside each item.
        length_index: position of the sequence length inside each item.
        label_index: position of the labels inside each item, or None when the
            items carry no labels.
    """

    def __init__(self, choose_length, sequence_index, length_index, label_index=None):
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.label_index = label_index

    def __call__(self, batch):
        """Return (features, labels).

        `features` is the list of stacked item fields without the length and
        label fields, with the sequence field trimmed. `labels` is the stacked
        label field, or None when `label_index` is None.
        """
        batch = [torch.stack(field) for field in zip(*batch)]

        sequences = batch[self.sequence_index]
        lengths = batch[self.length_index]

        length = self.choose_length(lengths).cpu().long()
        # Keep the first `length` columns; the mask width follows the data, so
        # no global maximum sequence length is needed.
        keep = torch.arange(sequences.size(1)) < length
        batch[self.sequence_index] = sequences[:, keep]

        dropped = {self.length_index}
        labels = None
        if self.label_index is not None:
            labels = batch[self.label_index]
            dropped.add(self.label_index)

        features = [x for i, x in enumerate(batch) if i not in dropped]
        return features, labels

**損失関数の定義**

In [None]:
loss_weight = 3.209226860170181
def custom_loss(data, targets):
    """Weighted BCE on the main output plus plain BCE on the remaining outputs.

    Args:
        data: logits; column 0 is the main output, columns 1.. the other outputs.
        targets: column 0 is the main label, column 1 the per-sample weight for
            the main loss, columns 2.. the labels for the remaining outputs.

    Returns:
        Scalar tensor: `loss_weight` * main loss + loss of the remaining outputs.
    """
    main_loss = F.binary_cross_entropy_with_logits(
        data[:, :1], targets[:, :1], weight=targets[:, 1:2])
    aux_loss = F.binary_cross_entropy_with_logits(data[:, 1:], targets[:, 2:])
    return loss_weight * main_loss + aux_loss

## <font color="DarkViolet">トークナイゼーション</font>

**トークン化関数とトークン化されたコメントのチェック関数**

In [None]:
# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(example, max_seq_length, tokenizer):
    """Tokenize texts into fixed-width BERT token-id rows, printing each tokenization.

    Args:
        example: iterable of texts.
        max_seq_length: width of every output row, including [CLS] and [SEP].
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        np.ndarray with one row of token ids per text, right-padded with 0.
    """
    budget = max_seq_length - 2  # room left once [CLS] and [SEP] are added
    rows = []
    n_truncated = 0
    for text in example:
        pieces = tokenizer.tokenize(text)
        print('\n')
        print('=== トークン化 ===')
        print(pieces)
        if len(pieces) > budget:
            pieces = pieces[:budget]
            n_truncated += 1
        ids = tokenizer.convert_tokens_to_ids(["[CLS]", *pieces, "[SEP]"])
        padding = [0] * (budget - len(pieces))
        rows.append(ids + padding)
    print('\n')
    print('=== トークン番号 ===')
    return np.array(rows)

def check_tokenizer(i):
    """Print row `i` of the global `df` as raw text, as tokens and as token ids."""
    print('=== オリジナル ===')
    raw_text = df[TEXT_COL].iloc[i]
    print(raw_text)

    # Missing comments are replaced by a placeholder before tokenizing.
    filled = df[TEXT_COL].fillna("DUMMY_VALUE")
    token_ids = convert_lines(pd.Series(filled.iloc[i]), MAX_SEQUENCE_LENGTH, tokenizer)
    print(token_ids)

**トークン化を実際にチェックする**

In [None]:
# Build the tokenizer from the pretrained BERT directory (lower-casing enabled).
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
# Read only the first 10 training rows and shuffle them reproducibly.
df = pd.read_csv(os.path.join(Data_dir, "train.csv"), nrows=10).sample(frac=1, random_state=SEED).reset_index(drop=True)
# Cast the comments to str.
df[TEXT_COL] = df[TEXT_COL].astype(str)
df[['id', 'target', 'comment_text']].head()

In [None]:
check_tokenizer(0)

In [None]:
check_tokenizer(2)

In [None]:
check_tokenizer(4)

**あらかじめトークン化しておいたものをpickleで読み込む (3分クッキング)**

In [None]:
# Load the pre-tokenized dataset and its companion frame from pickles.
# NOTE: unpickling can execute arbitrary code -- only point TOKENIZED at trusted data.
train_dataset = load_obj(TOKENIZED+'train_dataset')
train_df = load_obj(TOKENIZED+'train_df')

## <font color="DarkViolet">BERTのトレーニング</font>

**5-fold分のindex作成 (sklearnのクラスでcallableに持っておくのもよいが、デバッグしやすいのでリストで持っておくのが個人的に好き)**

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
stratified = True
stratified_by = TARGET
# Pick the splitter class once, then build it with the shared settings.
splitter_cls = StratifiedKFold if stratified else KFold
folds = splitter_cls(n_splits=FOLD, shuffle=True, random_state=SEED)

# Column 0 of the label tensor is used both as dummy features and as the
# stratification target.
fold_labels = train_dataset[:][2][:,0].numpy()
fold_list = []
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(fold_labels, fold_labels)):
    fold_list.append([n_fold, train_idx, valid_idx])
del folds; gc.collect()
fold_list

**速度改善のため、同じバッチ内でトークン長が同じになるようにシャッフルする<font color="Red">8.5時間→3.5時間 (/epoch)</font>**

In [None]:
batch_size = 32

R_TRAIN = 0.01 # fraction of the training batches to run per epoch -- adjust the amount of training data here
N_VALID = 100000 # upper bound on the number of validation rows
train_idx, valid_idx = fold_list[N_FOLD-1][1], fold_list[N_FOLD-1][2][:N_VALID]

## Sort train by lengths --------
# make sure every batch has similar sentences length, and shuffle the batchs
np.random.seed(SEED)
total_len = len(train_idx)
# Random permutation of the training indices, truncated to a whole number of batches.
train_idx = np.random.choice(train_idx, total_len-total_len%batch_size, replace=False)
# Element 1 of a dataset item is presumably the token length (the collators use length_index=1).
sort_idx = np.argsort(train_dataset[train_idx][1].numpy().reshape(-1))
# One row per batch; shuffling the rows reorders batches but keeps each batch intact.
sort_idx = sort_idx.reshape(-1, batch_size)
np.random.shuffle(sort_idx)
sort_idx = sort_idx.reshape(-1)
train_idx = train_idx[sort_idx]
## ------------------------------

## Sort valid by lengths --------
valid_idx_df = pd.DataFrame.from_dict({
    'idx': valid_idx,
    'length': train_dataset[valid_idx][1].numpy().reshape(-1)
}).sort_values('length')
valid_idx = valid_idx_df['idx'].values
## ------------------------------

train, valid = torch.utils.data.Subset(train_dataset, train_idx), torch.utils.data.Subset(train_dataset, valid_idx)
# Rows of train_df in the same order as `valid`.
valid_df = train_df.iloc[valid_idx, :]

In [None]:
output_model_file = "bert_pytorch.bin"
y_columns = [TARGET]

# Number of batches whose gradients are combined into one optimizer step.
accumulation_steps = 1

**モデル・オプティマイザーの作成とAPEXでFP16化する**

In [None]:
lr = 2e-5
warmup = 0.05
# Total optimizer steps over all epochs; this is the horizon of BertAdam's
# warmup/decay schedule, so it has to cover every epoch, not just one.
num_train_optimization_steps = int(EPOCHS*TOTAL_RECORD*((FOLD-1)/FOLD)*R_TRAIN/(batch_size*accumulation_steps))
print(f'lr: {lr}\nwarmup: {warmup}\nnum_train_optimization_steps: {num_train_optimization_steps}')

## Define model & optimizer ----------------------------
# One output per label column except the sample-weight column.
model = BertForSequenceClassification.from_pretrained("../working", cache_dir=None, num_labels=train[0:1][2].shape[1]-1)

param_optimizer = list(model.named_parameters())
# Biases and LayerNorm parameters are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

model.zero_grad();
model = model.to(device);
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=warmup,
                     t_total=num_train_optimization_steps)
## -----------------------------------------------------

## Enable mixed precision (fp16) -----------------------
# The training loop always goes through amp.scale_loss, so amp has to be
# initialised on a fresh start AND on resume, and before any state is reloaded.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
## -----------------------------------------------------

if CONTINUE:
    ## Reload model ----------------------------------------
    checkpoint = torch.load('../input/'+CONT_MODEL_KERNEL+'/'+filename, map_location=device)
    model.load_state_dict(checkpoint['state_dict']);
    if 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
        # Make sure the restored optimizer state lives on the training device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
    else:
        # Checkpoints that only hold model weights can still be resumed from.
        print("checkpoint has no 'optimizer' entry: optimizer state starts fresh")
    ## -----------------------------------------------------

del param_optimizer

In [None]:
gc.collect()
torch.cuda.empty_cache()

**学習**

In [None]:
# The evaluator, collator and loader do not depend on the epoch: build them once.
jigsaw_evel_valid = JigsawEvaluator(valid[:][2][:,2].numpy(), valid_df[cols_identity].values)
train_collator = SequenceBucketCollator(lambda lengths: lengths.max(), sequence_index=0, length_index=1, label_index=2)
# shuffle=False: the order was already fixed when the indices were bucketed by length.
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False, collate_fn=train_collator)
n_iterations = len(train_loader)

log_columns = ['Iteration', 'Time', 'Score_type', 'Score', 'LR']
log_frames = []  # one small frame per validation pass, across all epochs

tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
    start_time = time.time()

    lossf = None  # exponentially smoothed training loss
    tk0 = tqdm_notebook(enumerate(train_loader), total=n_iterations, leave=True)
    print(f'Iterations: {n_iterations}')
    optimizer.zero_grad()
    for i,(x_batch, y_batch) in tk0:
        model = model.train()
        y_pred = model(x_batch[0].to(device), attention_mask=(x_batch[0]>0).to(device), labels=None)
        loss = custom_loss(y_pred, y_batch.to(device))
        # Scale so that the accumulated gradient is the mean over the window.
        with amp.scale_loss(loss / accumulation_steps, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            # Clear gradients only AFTER the step, so the whole window contributes.
            optimizer.zero_grad()
        if lossf is not None:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()

        # Validate every 20000 iterations and once more when the training budget is used up.
        reached_budget = i/n_iterations >= R_TRAIN
        if (i!=0 and i%20000==0) or reached_budget:
            model = model.eval()
            valid_preds, avg_val_loss = create_valid(valid)
            valid_score = jigsaw_evel_valid.get_final_metric(valid_preds[:,0])
            elapsed_time = time.time()-start_time
            print(f'iter: {i}\t loss: {lossf:.6f} \t val_loss: {avg_val_loss:.6f} \t time: {elapsed_time:.2f}s')
            print(f'Valid:\n\t \
                auc:\t{valid_score[0]:.6f}\n\t \
                overall:\t{valid_score[1]:.6f}\n\t \
                subgroup:\t{valid_score[2]:.6f}\n\t \
                bpsn:\t{valid_score[3]:.6f}\n\t \
                bnsp:\t{valid_score[4]:.6f}\n\t')
            scores = 7
            log_frames.append(
                pd.DataFrame.from_dict(
                    {'Iteration': [i]*scores,
                     'Time': [elapsed_time]*scores,
                     'Score_type': ['loss_train', 'loss_valid', 'auc', 'overall', 'subgroup', 'bpsn', 'bnsp'],
                     'Score': [lossf, avg_val_loss]+[valid_score[k] for k in range(5)],
                     'LR': [optimizer.get_lr()[0]]*scores
                    }))

        if reached_budget:
            break

# Store the optimizer state as well: the resume path reads checkpoint['optimizer'].
checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
torch.save(checkpoint, filename)
# Concatenate once; fall back to an empty, correctly-shaped frame if no validation ran.
log_df = pd.concat(log_frames, axis=0) if log_frames else pd.DataFrame(columns=log_columns)
log_df[log_columns].to_csv('log.csv', index=False)

In [None]:
del train_dataset, train_df

gc.collect()
torch.cuda.empty_cache()

In [None]:
del model

gc.collect()
torch.cuda.empty_cache()

**1 epoch分(3.5時間)学習させたモデルのインポート (3分クッキング)**

In [None]:
# Re-read the full training set, shuffled with the same seed.
df = pd.read_csv(os.path.join(Data_dir, "train.csv")).sample(frac=1, random_state=SEED).reset_index(drop=True)
## Leave the block below commented out when running the pre-baked ("3-minute cooking") demo
# error_analysis = pd.concat([df.loc[valid_idx, [TARGET, TEXT_COL]+cols_identity].reset_index(drop=True), pd.DataFrame(valid_preds[:,0], columns=['Prediction'])], axis=1)
# error_analysis = error_analysis[[TARGET, 'Prediction', TEXT_COL]+cols_identity].fillna(0)
# error_analysis.to_csv('error_analysis.csv')

## Run the following instead: load the error analysis prepared beforehand
error_analysis = pd.read_csv('../input/jigsaw-bert-demo-prep/error_analysis.csv')

In [None]:
# Rebuild the classifier from the config and load the weights trained beforehand.
num_labels = 8
checkpoint = torch.load('../input/jigsaw-bert-demo-prep/checkpoint.pth')
model = BertForSequenceClassification(bert_config, num_labels=num_labels)
model.load_state_dict(checkpoint['state_dict']);

# Inference only: move to the device, freeze every parameter, switch to eval mode.
model = model.to(device);
for param in model.parameters():
    param.requires_grad_(False)
model = model.eval()


## 推論

In [None]:
def human_vs_ai():
    """Pick a random validation position, show its tokenization and its error-analysis row."""
    n_valid = len(valid_idx)
    i = np.random.randint(n_valid)
    print('=== Validation通し番号 ===')
    print(f'{i}/{n_valid}\n')
    check_tokenizer(valid_idx[i])
    picked_row = error_analysis.iloc[i:i+1,:]
    display(pd.DataFrame(picked_row))

In [None]:
human_vs_ai()

In [None]:
def create_test(X_test):
    """Tokenize the given text, run the global `model` on it and print the first toxicity probability.

    Args:
        X_test: text to score (wrapped into a pandas Series before tokenizing).
    """
    token_ids = convert_lines(pd.Series(X_test), MAX_SEQUENCE_LENGTH, tokenizer)
    print(token_ids)
    # Number of non-padding (non-zero) token ids per row, as an (N, 1) int16 column.
    lengths = np.count_nonzero(token_ids, axis=1).astype('int16').reshape(-1, 1)

    dataset = torch.utils.data.TensorDataset(
        torch.tensor(token_ids, dtype=torch.long),
        torch.tensor(lengths, dtype=torch.int16),
    )

    batch_size = 16
    test_preds = np.zeros(len(dataset))
    test_collator = SequenceBucketCollator(lambda lengths: lengths.max(), sequence_index=0, length_index=1)
    test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collator)

    for batch_no, (x_batch, _) in enumerate(test_loader):
        input_ids = x_batch[0]
        logits = model(input_ids.to(device), attention_mask=(input_ids > 0).to(device), labels=None)
        lo = batch_no * batch_size
        hi = lo + batch_size
        # Only the first output column is kept.
        test_preds[lo:hi] = logits[:, 0].detach().cpu().squeeze().numpy()

    probabilities = torch.sigmoid(torch.tensor(test_preds)).numpy()
    print('\n=== Toxic判定結果 ===')
    print(f'{probabilities[0]}')

In [None]:
create_test('Yuta is a great soccer player.')
# create_test('Yuta is a wonderful soccer player.')
# create_test('Yuta is a bad soccer player.')
# create_test('Yuta is a garbage soccer player.')
# create_test('Yuta took out the garbage in the morning.')
# create_test('Yuta talked to a garbage man in the morning.')