In [None]:
# List the attached dataset that provides the previously trained checkpoint.
!ls ../input/fastai-sigsaw-1-epoch

In [None]:
# (Re)load the autoreload extension and reload edited modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
%%time
# Install NVIDIA Apex from the source snapshot attached under ../input,
# building its C++ and CUDA extensions (no pip cache, verbose output).
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a

In [None]:
import sys

# Put the attached copy of pytorch-pretrained-BERT first on sys.path so that
# `pytorch_pretrained_bert` is imported from it.
package_dir_a = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, package_dir_a)

In [None]:
# library
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

import numpy as np
import pandas as pd

from typing import *
from pathlib import Path

import torch
import torch.optim as optim

from fastai import *
from fastai.text import *
from fastai.vision import *
from fastai.callbacks import *
from sklearn.model_selection import train_test_split

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="my bar!")
import torch.utils.data
from sklearn import metrics
from scipy.stats import rankdata
from tqdm import tqdm_notebook as tqdm
from nltk.tokenize.treebank import TreebankWordTokenizer
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertConfig, convert_tf_checkpoint_to_pytorch

import re
import psutil
import multiprocessing as mp
from multiprocessing import Pool

from gensim.models import KeyedVectors

import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

In [None]:
# Translate model from tensorflow to pytorch:
# read the TensorFlow checkpoint and its config from BERT_MODEL_PATH and write
# the converted weights to WORK_DIR/pytorch_model.bin.
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
WORK_DIR = '../working/'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(BERT_MODEL_PATH + 'bert_model.ckpt',
                                                                  BERT_MODEL_PATH + 'bert_config.json',
                                                                  WORK_DIR + 'pytorch_model.bin')

# Copy the config next to the converted weights so WORK_DIR holds both files.
# NOTE(review): `shutil` is never imported explicitly in the visible notebook;
# presumably it arrives through the fastai star imports — confirm, or add `import shutil`.
shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')

In [None]:
# Run-wide configuration and random seeding.
# Show each warning once again (the import cell installed blanket 'ignore' filters).
warnings.filterwarnings(action='once')
# NOTE(review): `device` is not referenced again in the visible cells — confirm it is still needed.
device = torch.device('cuda')
# Token budget per comment; passed to FastAiBertTokenizer as max_seq_len.
MAX_SEQUENCE_LENGTH = 220
SEED = 620402
BATCH_SIZE = 16
# Directory that receives the converted pytorch_model.bin and bert_config.json
# (same location as WORK_DIR, written without the trailing slash).
BERT_MODEL_BIN = '../working'
# NOTE(review): `bert_config` is not referenced again in the visible cells.
bert_config = BertConfig(BERT_MODEL_PATH + 'bert_config.json')
# Seed NumPy and PyTorch (CPU and CUDA) and request deterministic cuDNN kernels.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
# Build the lower-casing BERT WordPiece tokenizer from the files under
# BERT_MODEL_PATH, without a download cache directory.
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)

In [None]:
class FastAiBertTokenizer(BaseTokenizer):
    """Adapter that exposes a pretrained `BertTokenizer` as a fastai `BaseTokenizer`.

    Each tokenized text is truncated and wrapped in the "[CLS]" / "[SEP]"
    markers, so the result holds at most `max_seq_len` tokens (for
    `max_seq_len` >= 2).
    """

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=300, **kwargs):
        # Additional keyword arguments are accepted but not used.
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        """Return this very instance, ignoring all arguments.

        Presumably this lets an instance be handed over where fastai expects
        a tokenizer factory (see its use as `tok_func`) — confirm.
        """
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, keep the first `max_seq_len - 2` pieces and add the markers."""
        # Two slots are reserved for the leading "[CLS]" and trailing "[SEP]".
        budget = self.max_seq_len - 2
        pieces = self._pretrained_tokenizer.tokenize(t)[:budget]
        return ["[CLS]", *pieces, "[SEP]"]

In [None]:
# Tokenizer for fastai: wrap the BERT tokenizer and disable all of fastai's
# pre- and post-processing rules so the text reaches BERT's tokenizer unchanged.
fastai_tokenizer = Tokenizer(
    tok_func = FastAiBertTokenizer(bert_tokenizer, max_seq_len = MAX_SEQUENCE_LENGTH), 
    pre_rules=[], 
    post_rules=[]
)

In [None]:
# Build the fastai Vocab from the BERT tokenizer's vocabulary keys, in their stored order.
fastai_bert_vocab = Vocab(list(bert_tokenizer.vocab.keys()))

In [None]:
%%time
# Load the competition's train and test CSV files into DataFrames.
train_origin = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
test_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")

In [None]:
# Row counts; len_test is later used to size the resampled training set.
len_test = len(test_df)
len_train_origin = len(train_origin)

In [None]:
# Display the number of test rows and original training rows.
len_test, len_train_origin

In [None]:
# Binarize the score in place: 1 when target >= 0.5, otherwise 0.
train_origin['target']=(train_origin['target']>=0.5).astype(int)
# Separate the rows by class: label 0 and label 1.
train_orgin_0 = train_origin[train_origin['target'] == 0]
train_orgin_1 = train_origin[train_origin['target'] > 0]
len_orgin_0 = len(train_orgin_0)
len_orgin_1 = len(train_orgin_1)
# Integer ratio of label-0 rows to label-1 rows.
# NOTE(review): n_div_01 is not used in the visible cells.
n_div_01 = len_orgin_0 // len_orgin_1
# Display the per-class row counts.
len_orgin_0, len_orgin_1

In [None]:
# Draw len_test*10 rows with replacement from each class, so both classes
# contribute the same number of rows to the training frame.
# NOTE(review): no random_state is passed; presumably the draw follows the global
# NumPy RNG seeded earlier, so re-running this cell alone yields a different sample — confirm.
train_df_0 = train_orgin_0.sample(len_test*10, replace=True)
train_df_1 = train_orgin_1.sample(len_test*10, replace=True)
train_df = pd.concat([train_df_0, train_df_1]).reset_index(drop=True)
# 'target' was already binarized on train_origin before the class frames were
# taken from it, so this leaves the 0/1 values as they are.
train_df['target']=(train_df['target']>=0.5).astype(int)
# Display the combined row count and the first rows.
len(train_df), train_df.head()

In [None]:
def get_sum(train_df):
    """Print the shapes of the target == 0 rows and the target == 1 rows, in that order.

    Each shape is the `(rows, columns)` tuple of the filtered frame; nothing is returned.
    """
    shapes = [train_df[train_df['target'] == label].shape for label in (0, 1)]
    print(*shapes)
# Report the class balance of the combined frame and of each resampled class frame.
for _frame in (train_df, train_df_0, train_df_1):
    get_sum(_frame)

In [None]:
def preprocess(data):
    '''
    Replace punctuation and special symbols in every entry with a space.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Parameters
    ----------
    data : pandas.Series
        Raw texts. Entries are converted with ``astype(str)`` first, so
        non-string values are processed as their string form.

    Returns
    -------
    pandas.Series
        A new series in which each character listed in ``punct`` has been
        replaced by one space. The input series is not modified.
    '''
    # The backslash in front of '×' is spelled '\\' because '\×' is an invalid
    # escape sequence; the characters in the resulting string are the same.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # A single translation table replaces all listed characters in one pass per
    # text, instead of one full str.replace scan per punctuation character.
    to_space = str.maketrans({char: ' ' for char in punct})
    return data.astype(str).apply(lambda text: text.translate(to_space))

In [None]:
%%time
# Replace punctuation in the comment text of both frames (see `preprocess`);
# the 'comment_text' column is overwritten in place.
train_df['comment_text'] = preprocess(train_df['comment_text']).astype(str) 
test_df['comment_text'] = preprocess(test_df['comment_text']).astype(str) 

In [None]:
# Spot-check the cleaned comment text.
train_df.head(2)

In [None]:
# Hold out a validation set using scikit-learn's default split proportions.
# random_state pins the shuffle to SEED so that re-running this cell on the
# same train_df always produces the same split.
# NOTE(review): train_df was built by sampling with replacement, so copies of the
# same comment can land in both train and val; validation metrics may be optimistic.
train, val = train_test_split(train_df, random_state=SEED)
test = test_df

In [None]:
# Display the shapes of the train, validation and test frames.
train.shape, val.shape, test.shape

In [None]:
# Print the distinct label values present in the training frame.
print(set(train_df['target']))

In [None]:
# Drop the names of frames that are no longer needed and run a garbage collection.
# `test` still refers to the test frame, and train_orgin_0/1 and train_df_0/1 are
# not deleted here, so those objects stay alive.
# NOTE(review): `gc` is never imported explicitly in the visible notebook;
# presumably it arrives through the fastai star imports — confirm, or add `import gc`.
del train_df, test_df, train_origin
gc.collect()

In [None]:
class BertTokenizeProcessor(TokenizeProcessor):
    """`TokenizeProcessor` set up to add neither a BOS nor an EOS marker."""

    def __init__(self, tokenizer):
        # The wrapped tokenizer is passed through; only the two marker flags are fixed.
        marker_flags = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_flags)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """`NumericalizeProcessor` whose vocabulary always comes from the global `bert_tokenizer`."""

    def __init__(self, *args, **kwargs):
        # The vocabulary is rebuilt from the tokenizer's keys on every construction;
        # all other arguments are forwarded untouched.
        bert_vocab = Vocab(list(bert_tokenizer.vocab.keys()))
        super().__init__(*args, vocab=bert_vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Build the two-step preprocessing pipeline used for BERT inputs.

    The first step tokenizes without adding bos/eos markers, because the
    tokenizer inserts BERT's own markers itself. The second step numericalizes
    with the supplied vocabulary so that ids match the original BERT model.

    Returns a list: [tokenize processor, numericalize processor].
    """
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

In [None]:
class BertDataBunch(TextDataBunch):
    """`TextDataBunch` whose DataFrame factory uses the processors from `get_bert_processor`."""

    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
                tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
                label_cols:IntsOrStrs=0, label_delim:str=None, **kwargs) -> DataBunch:
        """Create a `TextDataBunch` from DataFrames using the BERT processors.

        `train_df` and `valid_df` are read from `text_cols` and labelled from
        `label_cols`; `test_df`, when given, is attached as an unlabelled test
        set. `tokenizer` and `vocab` are handed to `get_bert_processor`.
        `label_delim`, when not None, is forwarded to `label_from_df`.
        Remaining `kwargs` that `get_bert_processor` does not accept are
        passed to `databunch`.
        """
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        # Several label columns with no explicit classes: the column names become the classes.
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                        TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        if cls==TextLMDataBunch:
            src = src.label_for_lm()
        else:
            label_kwargs = dict(cols=label_cols, classes=classes)
            # Previously `label_delim` was accepted but never used. It is forwarded
            # only when set, so the default call stays exactly as before.
            if label_delim is not None: label_kwargs['label_delim'] = label_delim
            src = src.label_from_df(**label_kwargs)
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

In [None]:
# Build the DataBunch from the train/val/test frames with the BERT tokenizer and
# vocabulary. The first argument is the DataBunch `path`. Batches are padded at
# the end of each sequence with index 0.
# NOTE(review): pad_idx=0 presumably matches BERT's padding token id — confirm against the vocab.
databunch = BertDataBunch.from_df("../input/jigsaw-unintended-bias-in-toxicity-classification", train, val, test,
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  text_cols="comment_text",
                  label_cols='target',
                  bs=BATCH_SIZE,
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
                )

In [None]:
# Display a sample batch to check tokenization and labels by eye.
databunch.show_batch()

In [None]:
# Load the converted BERT weights from BERT_MODEL_BIN ('../working') with a
# two-label classification head, and use cross-entropy as the training loss.
bert_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_BIN, num_labels=2)
loss_func = nn.CrossEntropyLoss()

In [None]:
# Wrap data, model and loss in a fastai Learner that reports accuracy.
# model_dir points at /kaggle/working, which is where the checkpoint is copied
# to before `learn.load` is called.
learn = Learner(databunch,
           bert_model,
           loss_func=loss_func,
           metrics=[accuracy],
           model_dir='/kaggle/working')

In [None]:
# List the checkpoint dataset again before copying from it (same command as the first cell).
!ls ../input/fastai-sigsaw-1-epoch

In [None]:
# Copy the epoch-8 checkpoint into /kaggle/working, the Learner's model_dir.
!cp ../input/fastai-sigsaw-1-epoch/train_epoch_8.pth /kaggle/working/train_epoch_8.pth

In [None]:
# Restore the epoch-8 weights. The checkpoint file was copied into
# /kaggle/working (the Learner's model_dir), so learn.path does not need
# to be redirected to the input dataset.
learn.load('train_epoch_8')

In [None]:
# Switch the learner to mixed-precision (fp16) training.
learn = learn.to_fp16()

In [None]:
# Continue training for one more cycle of length 1 with 1e-6 as the learning-rate
# argument and momentum values 0.8 and 0.7.
learn.fit_one_cycle(1, 1e-6, moms=[0.8, 0.7])

In [None]:
# Save the epoch-9 checkpoint, then point learn.path at the current directory
# before exporting the learner to ./train_epoch_9.pkl.
# NOTE(review): the path is presumably changed because the original path under
# ../input is not writable — confirm.
learn.save('train_epoch_9')
learn.path = Path('.')
learn.export('./train_epoch_9.pkl')