In [None]:
import sys
# Put the vendored library directory on the import path (presumably it holds the
# pytorch_pretrained_bert package imported further down -- confirm against the
# dataset). The membership check keeps sys.path free of duplicate entries when
# this cell is executed more than once.
package_dir = "../input/privatepytorchpretrainedbert/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
if package_dir not in sys.path:
    sys.path.append(package_dir)

In [None]:
import torch.utils.data
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import warnings
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
from pytorch_pretrained_bert import BertConfig
import math
import gc

# Report each distinct warning a single time instead of on every occurrence.
warnings.filterwarnings(action='once')
# Run on the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still executes (slowly) on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def convert_lines(example, max_seq_length, tokenizer):
    """Convert texts into fixed-length rows of token ids.

    Each text is tokenized, wrapped as ``[CLS] tokens [SEP]`` and right-padded
    with id 0 so that every row holds exactly ``max_seq_length`` ids. A text
    with more tokens than fit is truncated by keeping the first quarter
    (rounded up) of the token budget from its start and the remainder of the
    budget from its end.

    Args:
        example: Iterable of strings to encode.
        max_seq_length: Row length, including the ``[CLS]`` and ``[SEP]``
            positions. Must be at least 2.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        ``np.ndarray`` built from the list of rows (one row per input text).

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2, which leaves no
            room for the two special tokens.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")

    token_budget = max_seq_length - 2  # positions left after [CLS] and [SEP]
    # The head/tail split depends only on the budget, so compute it once
    # instead of once per over-long text.
    head_len = int(math.ceil(token_budget * 0.25))
    tail_len = token_budget - head_len

    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > token_budget:
            # Slice the tail from an explicit start index: tokens_a[-tail_len:]
            # would return the *whole* list when tail_len is 0.
            tail_start = len(tokens_a) - tail_len
            tokens_a = tokens_a[:head_len] + tokens_a[tail_start:]
            longer += 1
        padding = [0] * (token_budget - len(tokens_a))
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) + padding
        all_tokens.append(one_token)
    print("Number of sequences longer: ", longer)
    return np.array(all_tokens)

In [None]:
# Ensemble set-up: the four lists below are parallel, one entry per model.

# Collects one array of per-row predictions for every ensemble member.
all_preds = []

# Fine-tuned checkpoint file for each ensemble member.
model_weights = [
    "../input/bestbert-360-e1/bestbert_360_len1.bin",
    "../input/bestbert-360-m2-e1/bestbert_360_2_len1.bin",
    "../input/bert-360-lin-inc-lr-m2-e1/bestbert_360_lin_dec_m2_1.bin",
    "../input/bert360-lin-dec-e1-m1/bert360_lin_dec_pytorch1.bin",
    "../input/bert-2-epoch/bert_pytorch_2_epoch/bert_pytorch.bin",
]

# Every member uses the same pretrained-model directory ...
_uncased_bert_dir = "../input/bertpretraineduncasedmodel/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12"
model_paths = [_uncased_bert_dir for _ in model_weights]

# ... and the same configuration file.
_bert_config_json = '../input/cicero-bert-config/cicero_bert_config.json'
model_configs = [_bert_config_json for _ in model_weights]

# Sequence length used for each member.
max_seq_lengths = [360] * 4 + [220]

# Load the test set and force the comment column to plain strings.
test_df = pd.read_csv(
    "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"
).assign(comment_text=lambda frame: frame['comment_text'].astype(str))

# Batch size shared by the DataLoader and by the slice arithmetic that writes
# each batch of predictions into its place in the output vector.
BATCH_SIZE = 32

# Token-id matrices keyed by (vocabulary directory, sequence length). Several
# ensemble members share both, so the test set is tokenised once per distinct
# pair instead of once per model.
tokenized_inputs = {}

for idx in range(len(model_weights)):

    MAX_SEQUENCE_LENGTH = max_seq_lengths[idx]
    BERT_MODEL_PATH = model_paths[idx]

    bert_config = BertConfig(model_configs[idx])

    cache_key = (BERT_MODEL_PATH, MAX_SEQUENCE_LENGTH)
    if cache_key not in tokenized_inputs:
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
        tokenized_inputs[cache_key] = convert_lines(
            test_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer
        )
    X_test = tokenized_inputs[cache_key]

    model = BertForSequenceClassification(bert_config, num_labels=7)
    # Deserialize onto the CPU first so loading works regardless of the device
    # the checkpoint was saved from; the model is moved to `device` right after.
    model.load_state_dict(torch.load(model_weights[idx], map_location="cpu"))
    model.to(device)
    model.eval()

    test_preds = np.zeros((len(X_test)))
    test = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
    test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)
    tk0 = tqdm(test_loader)
    # Inference only: no_grad() skips autograd bookkeeping for the forward pass.
    with torch.no_grad():
        for i, (x_batch,) in enumerate(tk0):
            x_batch = x_batch.to(device)
            # Id 0 is the padding value written by convert_lines, so positions
            # holding it are masked out of attention.
            pred = model(x_batch, attention_mask=(x_batch > 0), labels=None)
            # Of the seven output columns only the first one is kept.
            test_preds[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] = pred[:, 0].cpu().numpy()

    # Turn the stored raw outputs into probabilities for this ensemble member.
    all_preds.append(torch.sigmoid(torch.tensor(test_preds)).numpy().ravel())

    # Drop the model before the next one is built and hand cached GPU blocks back.
    del model
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

In [None]:
# Average the per-model predictions element-wise to get the ensemble score,
# then write the id/prediction table to submission.csv without the index.
ensemble_scores = np.mean(all_preds, axis=0)
submission = pd.DataFrame({'id': test_df['id'], 'prediction': ensemble_scores})
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the first rows of the submission table as a quick sanity check.
submission.head()