# Submission: BERT + GPT2

## Setup

In [None]:
# List ../input to see which data directories the hard-coded paths in later cells can rely on.
%ls ../input

In [None]:
import sys
# Put the bundled pytorch-pretrained-BERT source tree at the front of sys.path
# so the `pytorch_pretrained_bert` imports in the next cell are looked up there first.
package_dir_a = "../input/huggingfacepytorchpretrainedbert/pytorch-pretrained-bert-master/pytorch-pretrained-BERT-master"
sys.path.insert(0, package_dir_a)

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import warnings
import numpy as np
import gc

import torch
import torch.nn as nn
import torch.utils.data

# BERT 
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
from pytorch_pretrained_bert import BertConfig

# GPT2
from pytorch_pretrained_bert.modeling_gpt2 import GPT2PreTrainedModel, GPT2Model, GPT2Config
from pytorch_pretrained_bert import GPT2Tokenizer
from pytorch_pretrained_bert import OpenAIAdam

# Report each distinct warning only the first time it is raised.
warnings.filterwarnings(action='once')
# Prefer the GPU, but fall back to the CPU so the notebook can still run
# (slowly) on a machine without CUDA instead of failing as soon as a model or
# batch is moved to a device that does not exist.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import pdb

In [None]:
# Root directory of the uploaded model files; the checkpoint file names listed
# in the BERT and GPT2 sections are joined onto it when weights are loaded.
MODELS_PATH = '../input/submission-toxicity-classification'

In [None]:
# Directory handed to BertTokenizer.from_pretrained when the BERT tokenizer is built.
# The trailing slash is part of the value.
BERT_MODEL_PATH = '../input/submission-toxicity-classification/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

In [None]:
# bert uncased
!mkdir ../input/bert_uncased
!cp ../input/submission-toxicity-classification/bert-models/bert-models/bert_config.json  ../input/bert_uncased/bert_config.json
!cp ../input/submission-toxicity-classification/bert-models/bert-models/pytorch_model.bin ../input/bert_uncased/pytorch_model.bin

## Data

In [None]:
# Read the competition test set and force the comment column to `str` in one chain.
# NOTE(review): astype(str) also stringifies missing values (NaN becomes the
# text 'nan'), so the later fillna("DUMMY_VALUE") calls presumably never fire --
# confirm this matches the preprocessing used at training time.
test_df = (
    pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
    .assign(comment_text=lambda frame: frame['comment_text'].astype(str))
)

In [None]:
# Debugging aid: uncomment to run the rest of the notebook on only the first 100 rows.
#test_df = test_df.iloc[:100]

## Helpers

In [None]:
def bert_get_preds(model, x_test, batch_size=32):
    """Score every row of ``x_test`` with ``model`` and return probabilities.

    Parameters
    ----------
    model : callable
        Called as ``model(input_ids, attention_mask=..., labels=None)``. Its
        result is indexed as ``pred[:, 0]``; only that column is kept.
        Batches are moved to the global ``device`` before the call, so the
        model is expected to live there already.
    x_test : array-like of int
        Token-id matrix, one row per example. Positions whose id is greater
        than 0 are marked as attended in the attention mask.
    batch_size : int, optional
        Rows per forward pass. Defaults to 32, the value that used to be
        hard-coded.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(x_test)`` holding the sigmoid of column 0.
    """
    test_preds = np.zeros(len(x_test))
    dataset = torch.utils.data.TensorDataset(torch.tensor(x_test, dtype=torch.long))
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    offset = 0
    # Inference only: never build an autograd graph, even when the caller has
    # not frozen the model's parameters beforehand.
    with torch.no_grad():
        for (x_batch,) in tqdm(loader):
            x_batch = x_batch.to(device)
            pred = model(x_batch, attention_mask=(x_batch > 0), labels=None)
            n_rows = x_batch.size(0)
            # A running offset keeps the write position correct for any batch size,
            # including a short final batch.
            test_preds[offset:offset + n_rows] = pred[:, 0].detach().cpu().numpy()
            offset += n_rows

    return torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()

In [None]:
def gpt2_get_preds(model, x_test, batch_size=32):
    """Score every row of ``x_test`` with ``model`` and return probabilities.

    Parameters
    ----------
    model : callable
        Called as ``model(input_ids)`` with no attention mask. Its result is
        indexed as ``pred[:, 0]``; only that column is kept. Batches are moved
        to the global ``device`` before the call, so the model is expected to
        live there already.
    x_test : array-like of int
        Token-id matrix, one row per example.
    batch_size : int, optional
        Rows per forward pass. Defaults to 32, the value that used to be
        hard-coded.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(x_test)`` holding the sigmoid of column 0.
    """
    test_preds = np.zeros(len(x_test))
    dataset = torch.utils.data.TensorDataset(torch.tensor(x_test, dtype=torch.long))
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    offset = 0
    # Inference only: never build an autograd graph, even when the caller has
    # not frozen the model's parameters beforehand.
    with torch.no_grad():
        for (x_batch,) in tqdm(loader):
            pred = model(x_batch.to(device))
            n_rows = x_batch.size(0)
            # A running offset keeps the write position correct for any batch size,
            # including a short final batch.
            test_preds[offset:offset + n_rows] = pred[:, 0].detach().cpu().numpy()
            offset += n_rows

    return torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()

In [None]:
def bert_convert_lines(example, max_seq_length, tokenizer):
    """Turn texts into fixed-width rows of token ids framed by [CLS]/[SEP].

    Each text is tokenized, cut to its first ``max_seq_length - 2`` tokens,
    wrapped as ``[CLS] tokens [SEP]``, converted to ids, and right-padded with
    id 0 so that every row holds ``max_seq_length`` entries.

    Parameters
    ----------
    example : iterable of str
        Texts to convert; iterated once, wrapped in a progress bar.
    max_seq_length : int
        Row width, counting the two marker tokens.
    tokenizer : object
        Must provide ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.

    Returns
    -------
    numpy.ndarray
        The rows stacked with ``np.array``.
    """
    token_budget = max_seq_length - 2  # room left once [CLS] and [SEP] are added
    rows = []
    for text in tqdm(example):
        # Slicing is a no-op for texts that already fit, so no length check is needed.
        tokens = tokenizer.tokenize(text)[:token_budget]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
        rows.append(ids + [0] * (token_budget - len(tokens)))
    return np.array(rows)

In [None]:
def gpt2_convert_lines(example, max_seq_length, tokenizer):
    """Turn texts into fixed-width rows of token ids, keeping the tail of long texts.

    Each text is tokenized; when it has more than ``max_seq_length`` tokens only
    the LAST ``max_seq_length`` are kept. The tokens are converted to ids and
    right-padded with id 0 up to ``max_seq_length`` entries. No marker tokens
    are added.

    Parameters
    ----------
    example : iterable of str
        Texts to convert; iterated once, wrapped in a progress bar.
    max_seq_length : int
        Row width.
    tokenizer : object
        Must provide ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.

    Returns
    -------
    numpy.ndarray
        The rows stacked with ``np.array``.
    """
    rows = []
    for text in tqdm(example):
        tokens = tokenizer.tokenize(text)
        if len(tokens) > max_seq_length:
            tokens = tokens[-max_seq_length:]  # truncate from the front, keep the end
        padding = [0] * (max_seq_length - len(tokens))
        rows.append(tokenizer.convert_tokens_to_ids(tokens) + padding)
    return np.array(rows)

## BERT

In [None]:
class BertHead(nn.Module):
    """BERT encoder with a main head and an auxiliary head on the pooled output.

    Parameters
    ----------
    config :
        Despite the name, this value is passed unchanged as the first argument
        of ``BertForSequenceClassification.from_pretrained`` (this notebook
        passes a directory path).
    hidden_units : int
        Forwarded to ``from_pretrained`` as ``num_labels``. ``forward`` only
        uses the wrapped model's ``.bert`` and ``.dropout``.
    num_aux_targets : int
        Number of auxiliary outputs appended after the main output.

    Attributes
    ----------
    optimizer_grouped_parameters : list of dict
        Four parameter groups: encoder parameters with weight decay 0.01,
        encoder parameters whose name contains a ``no_decay`` pattern with
        weight decay 0.0, and one group per head with weight decay 0.0.
    """

    def __init__(self, config, hidden_units=256, num_aux_targets=6):
        super(BertHead, self).__init__()
        self.hidden_units = hidden_units
        self.num_aux_targets = num_aux_targets
        self.config = config

        self.bert_model = BertForSequenceClassification.from_pretrained(config, cache_dir=None, num_labels=hidden_units)
        param_optimizer = list(self.bert_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        # Size the heads from the loaded encoder rather than a literal 768, so a
        # checkpoint with a different hidden size does not silently mismatch.
        encoder_width = self.bert_model.config.hidden_size
        self.fc_out = nn.Linear(encoder_width, 1)
        self.fc_aux_out = nn.Linear(encoder_width, num_aux_targets)

        self.fc_dp = nn.Dropout(p=0.4)

        self.optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
            {'params': [p for p in self.fc_out.parameters()], 'weight_decay': 0.0},
            {'params': [p for p in self.fc_aux_out.parameters()], 'weight_decay': 0.0}
        ]

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return main and auxiliary outputs concatenated along dim 1.

        ``labels`` is accepted for call compatibility but is not used.
        """
        _, pooled_output = self.bert_model.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        # Two dropout layers in a row: the wrapped model's own, then this head's.
        features = self.fc_dp(self.bert_model.dropout(pooled_output))
        out = self.fc_out(features)
        out_aux = self.fc_aux_out(features)
        return torch.cat([out, out_aux], 1)

In [None]:
# Tokenizer loaded from BERT_MODEL_PATH with lower-casing enabled; it is used
# to build the BERT input matrix in the "Base" section.
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)

### Base

In [None]:
# Checkpoint file names; each one is joined onto MODELS_PATH and loaded in turn
# by the inference loop. The `lb_0939xx` suffix is presumably a leaderboard
# score -- unverified from this notebook.
bert_base_models = ['bert_epoch_2_lb_093967.bin', 'bert_epoch_2_lb_093916.bin', 'bert_epoch_1_lb_093970.bin']

In [None]:
# BERT token-id matrix, 220 ids per row ([CLS]/[SEP] included).
# The GPT2 section later rebinds the name `x_test`, so re-run this cell before
# re-running the BERT inference loop.
# NOTE(review): comment_text was already cast to str when test_df was loaded,
# so fillna is presumably a no-op here -- confirm.
x_test = bert_convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), 220, bert_tokenizer)

In [None]:
# One column of predictions per checkpoint, one row per test example.
bert_base_preds = np.zeros((x_test.shape[0], len(bert_base_models)))

for i, bert_model in enumerate(bert_base_models):
    model = BertHead('../input/bert_uncased')
    # map_location='cpu' makes loading independent of the device the checkpoint
    # was saved from; the weights are moved to `device` right after.
    model.load_state_dict(torch.load(os.path.join(MODELS_PATH, bert_model), map_location='cpu'))
    model.to(device)
    # Inference only: freeze every parameter and switch off dropout.
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    preds = bert_get_preds(model, x_test)
    bert_base_preds[:,i] = preds
    
    # Drop the model before the next checkpoint is built so two never coexist.
    del model
    gc.collect()

## GPT2

In [None]:
class GPT2ClassificationHeadModel(GPT2PreTrainedModel):
    """GPT-2 transformer with two linear heads fed by pooled hidden states.

    The transformer's hidden states are mean-pooled and max-pooled over
    dimension 1; the two pooled vectors are concatenated (hence the
    ``2 * n_embd`` head input width), passed through dropout, and projected to
    one main output plus ``num_aux_targets`` auxiliary outputs.
    """

    def __init__(self, config, clf_dropout=0.4, num_aux_targets=6):
        super(GPT2ClassificationHeadModel, self).__init__(config)
        pooled_width = 2 * config.n_embd  # mean-pool and max-pool side by side

        self.transformer = GPT2Model(config)
        self.dropout = nn.Dropout(clf_dropout)
        self.fc_out = nn.Linear(pooled_width, 1)
        self.fc_aux_out = nn.Linear(pooled_width, num_aux_targets)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
        """Return main and auxiliary outputs concatenated along dim 1.

        ``lm_labels`` is accepted but not used; the second value returned by
        the transformer is discarded.
        """
        hidden_states, _ = self.transformer(input_ids, position_ids, token_type_ids, past)
        mean_feats = hidden_states.mean(dim=1)
        max_feats = hidden_states.max(dim=1)[0]
        pooled = self.dropout(torch.cat([mean_feats, max_feats], dim=1))
        return torch.cat([self.fc_out(pooled), self.fc_aux_out(pooled)], dim=1)

In [None]:
# Tokenizer loaded from the same directory that the GPT-2 models are later
# loaded from; it is used to build the GPT-2 input matrix.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('../input/submission-toxicity-classification/gpt2-models/')

### Base

In [None]:
# Checkpoint file names; each one is joined onto MODELS_PATH and loaded in turn
# by the inference loop. The `lb_0939xx` suffix is presumably a leaderboard
# score -- unverified from this notebook.
gpt2_base_models = ['gpt2_epoch_1_lb_093912.bin', 'gpt2_epoch_1_lb_093904.bin', 'gpt2_k_epoch_2_lb_093902.bin']

In [None]:
# GPT-2 token-id matrix, 220 ids per row. This rebinds `x_test`, replacing the
# BERT matrix built earlier under the same name.
# NOTE(review): comment_text was already cast to str when test_df was loaded,
# so fillna is presumably a no-op here -- confirm.
x_test = gpt2_convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), 220, gpt2_tokenizer)

In [None]:
# One column of predictions per checkpoint, one row per test example.
gpt2_base_preds = np.zeros((x_test.shape[0], len(gpt2_base_models)))

for i, gpt2_model in enumerate(gpt2_base_models):
    model = GPT2ClassificationHeadModel.from_pretrained('../input/submission-toxicity-classification/gpt2-models/')
    # map_location='cpu' makes loading independent of the device the checkpoint
    # was saved from; the weights are moved to `device` right after.
    model.load_state_dict(torch.load(os.path.join(MODELS_PATH, gpt2_model), map_location='cpu'))
    model.to(device)
    # Inference only: freeze every parameter and switch off dropout.
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    preds = gpt2_get_preds(model, x_test)
    gpt2_base_preds[:,i] = preds
    
    # Drop the model before the next checkpoint is built so two never coexist.
    del model
    gc.collect()

## Submission

In [None]:
# Simple average for now: mean over each family's checkpoints, then the two
# family means are blended with equal weight.
test_pred = 0.5 * (bert_base_preds.mean(axis=1) + gpt2_base_preds.mean(axis=1))

In [None]:
# Pair each test id with its blended prediction and write the file without the index column.
submission = pd.DataFrame({'id': test_df['id'], 'prediction': test_pred})
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the frame that was just written to submission.csv.
submission.head()