**PyTorch BERT baseline**

### bert-base (uncased-v2, cased-v2) swa + xlnet (5 folds) + bert-base-uncased (question + answer) swa + bert-base-cased (question + answer) swa + xlnet (question + answer) swa + roberta (question + answer) + postprocessing (seefun's version)

In [None]:
!pip install ../input/sacremoses/sacremoses-master/
!pip install ../input/transformers/transformers-master/

### Required Imports

The imports below also include the ones that will be used for training.

In [None]:
import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt
# Directory that holds the competition CSVs read below
# (sample_submission.csv, train.csv and test.csv).
DATA_DIR = '../input/google-quest-challenge'

In [None]:
!ls ../input

In [None]:
# Sanity check: list the contents of the attached input directories that the
# code further down reads tokenizer files, model weights and fold checkpoints
# from. The listings are display-only; nothing is assigned.
os.listdir("../input/roberta-transformers-pytorch/roberta-base")

In [None]:
os.listdir("../input/qaxlnetbasecasedaugdiffswaanswer")

In [None]:
os.listdir("../input/qaxlnetbasecasedaugdiffswaquestion")

In [None]:
os.listdir("../input/qabertbaseuncasedaugdiffswaanswer")

In [None]:
os.listdir("../input/qabertbaseuncasedaugdiffswaquestion")

In [None]:
os.listdir("../input/qabertbasecasedaugdiffswaanswer")

In [None]:
os.listdir("../input/qabertbasecasedaugdiffswaquestion")

In [None]:
os.listdir("../input/qabertbasecasedaugdiffv2swa")

In [None]:
os.listdir("../input/qabertuncasedaugdiffv2swa")

In [None]:
os.listdir("../input/qaxlnetbasecasedaugdiff")

In [None]:
os.listdir("../input/qarobertabasecasedaugdiffswaquestion")

In [None]:
# Load the sample submission; it is later filled with predictions and joined
# to the test set on its qa_id column.
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
sub.head()

In [None]:
# Every column after the first one is treated as a prediction target.
TARGET_COLUMNS = sub.columns.values[1:].tolist()
TARGET_COLUMNS

### Define dataset

In [None]:
# Load the training CSV and preview its first rows.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
train.head()

In [None]:
# Load the test CSV and preview its first rows.
test = pd.read_csv(f'{DATA_DIR}/test.csv')
test.head()

In [None]:
import torch
import html
#import torch.utils.data as data
from torchvision import datasets, models, transforms
from transformers import *
from sklearn.utils import shuffle
import random
from math import floor, ceil
from sklearn.model_selection import GroupKFold

MAX_LEN = 512
#MAX_Q_LEN = 250
#MAX_A_LEN = 259
SEP_TOKEN_ID = 102

class QuestDataset(torch.utils.data.Dataset):
    """Tokenised view of a QUEST dataframe for BERT / XLNet / RoBERTa models.

    Each item is ``(token_ids, seg_ids)`` or, when ``labeled`` is true,
    ``(token_ids, seg_ids, labels)``. ``token_ids`` is padded with 0 up to
    ``max_len``.

    Args:
        df: dataframe with ``question_title``, ``question_body`` and
            ``answer`` columns (plus the ``TARGET_COLUMNS`` when labeled).
        model_type: one of ``"bert-base-uncased"``, ``"bert-base-cased"``,
            ``"xlnet-base-cased"`` or ``"roberta-base"``; selects the tokenizer.
        max_len: length every token sequence is padded to.
        content: ``"Question_Answer"``, ``"Question"`` or ``"Answer"``;
            selects which text fields follow the title.
        train_mode: only read by ``select_tokens`` (random vs. head/tail cut).
        labeled: whether items carry a label tensor.

    Raises:
        NotImplementedError: if ``model_type`` has no tokenizer configured.

    NOTE(review): the literal '[CLS]'/'[SEP]' strings and the pad id 0 are
    BERT conventions and are used for every tokenizer here. This is kept
    unchanged on the assumption that the released checkpoints were trained
    with exactly this encoding -- confirm before changing it.
    """

    def __init__(self, df, model_type="bert-base-cased", max_len=512, content="Question_Answer", train_mode=True, labeled=True):
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        self.max_len = max_len
        self.content = content
        bert_tokenizer_path = '../input/pretrained-bert-models-for-pytorch/' + model_type + '-vocab.txt'
        xlnet_tokenizer_path = '../input/xlnet-pretrained-models-pytorch/' + model_type + '-spiece.model'
        roberta_tokenizer_path = '../input/roberta-transformers-pytorch/roberta-base/vocab.json'
        roberta_tokenizer_merges_file = '../input/roberta-transformers-pytorch/roberta-base/merges.txt'
        if model_type in ("bert-base-uncased", "bert-base-cased"):
            self.tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_path)
        elif model_type == "xlnet-base-cased":
            self.tokenizer = XLNetTokenizer.from_pretrained(xlnet_tokenizer_path)
        elif model_type == "roberta-base":
            self.tokenizer = RobertaTokenizer(vocab_file=roberta_tokenizer_path, merges_file=roberta_tokenizer_merges_file)
        else:
            # Fail here instead of with an AttributeError on the first
            # __getitem__ call, where the cause is much harder to spot.
            raise NotImplementedError("Unsupported model_type: %r" % (model_type,))

    def __getitem__(self, index):
        """Return the encoded sample at ``index`` (with labels if labeled)."""
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_label(row)
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.df)

    def select_tokens(self, tokens, max_num):
        """Shrink ``tokens`` to at most ``max_num`` entries.

        In train mode a random contiguous span is removed; otherwise the
        first ``max_num // 2`` and the last ``max_num - max_num // 2`` tokens
        are kept.
        """
        if len(tokens) <= max_num:
            return tokens
        if self.train_mode:
            num_remove = len(tokens) - max_num
            remove_start = random.randint(0, len(tokens)-num_remove-1)
            return tokens[:remove_start] + tokens[remove_start + num_remove:]
        else:
            return tokens[:max_num//2] + tokens[-(max_num - max_num//2):]

    @staticmethod
    def _head_tail(tokens, new_len):
        """Cut ``tokens`` to ``new_len``: first quarter from the head, rest from the tail.

        Sequences that are not longer than ``new_len`` are returned via a
        plain ``tokens[:new_len]`` slice.
        """
        if len(tokens) > new_len:
            head = new_len // 4
            return tokens[:head] + tokens[len(tokens) - new_len + head:]
        return tokens[:new_len]

    def trim_input_single_content(self, title, content, max_sequence_length=512, 
                t_max_len=30, c_max_len=512-30-4, num_token=3):
        """Tokenise a title and one body text and trim them to fit the budget.

        Args:
            title, content: raw strings; HTML entities are unescaped first.
            max_sequence_length: total budget including ``num_token`` specials.
            t_max_len: maximum number of title tokens.
            c_max_len: maximum number of content tokens.
            num_token: number of special tokens added by the caller.

        Returns:
            ``(title_tokens, content_tokens)``.

        Raises:
            ValueError: if the computed lengths still exceed the budget.
        """
        content = html.unescape(content)
        title = html.unescape(title)

        t = self.tokenizer.tokenize(title)
        c = self.tokenizer.tokenize(content)

        t_len = len(t)
        c_len = len(c)

        if (t_len + c_len + num_token) > max_sequence_length:

            if t_max_len > t_len:
                t_new_len = t_len
                # Only half of the unused title budget is handed to the
                # content; kept as-is so inputs match the trained checkpoints.
                c_max_len = c_max_len + floor((t_max_len - t_len)/2)
            else:
                t_new_len = t_max_len

            c_new_len = c_len if c_max_len > c_len else c_max_len

            if t_new_len+c_new_len+num_token > max_sequence_length:
                raise ValueError("New sequence length should be less or equal than %d, but is %d" 
                                 % (max_sequence_length, (t_new_len+c_new_len+num_token)))

            # truncate
            t = self._head_tail(t, t_new_len)
            c = self._head_tail(c, c_new_len)

        # some bad cases: drop whatever still overflows from the content tail
        more_token = len(t) + len(c) + num_token - max_sequence_length
        if more_token > 0:
            c = c[:(len(c)-more_token)]

        return t, c

    def trim_input(self, title, question, answer, max_sequence_length=MAX_LEN, 
                t_max_len=30, q_max_len=239, a_max_len=239, num_token=4):
        """Tokenise title, question and answer and trim them to fit the budget.

        Budget that the title (then the answer, then the question) does not
        use is redistributed to the other fields before truncation.

        Returns:
            ``(title_tokens, question_tokens, answer_tokens)``.

        Raises:
            ValueError: if the computed lengths still exceed the budget.
        """
        question = html.unescape(question)
        answer = html.unescape(answer)
        title = html.unescape(title)

        t = self.tokenizer.tokenize(title)
        q = self.tokenizer.tokenize(question)
        a = self.tokenizer.tokenize(answer)

        t_len = len(t)
        q_len = len(q)
        a_len = len(a)

        if (t_len+q_len+a_len+num_token) > max_sequence_length:

            if t_max_len > t_len:
                t_new_len = t_len
                # Split the unused title budget between answer and question.
                a_max_len = a_max_len + floor((t_max_len - t_len)/2)
                q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
            else:
                t_new_len = t_max_len

            if a_max_len > a_len:
                # Short answer: the question inherits the spare answer budget.
                a_new_len = a_len 
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                # Short question: the answer inherits the spare question budget.
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len

            if t_new_len+a_new_len+q_new_len+num_token > max_sequence_length:
                raise ValueError("New sequence length should be %d, but is %d" 
                                 % (max_sequence_length, (t_new_len+a_new_len+q_new_len+num_token)))

            # truncate
            t = self._head_tail(t, t_new_len)
            q = self._head_tail(q, q_new_len)
            a = self._head_tail(a, a_new_len)

        return t, q, a

    def get_token_ids(self, row):
        """Encode one dataframe row into padded token ids and segment ids.

        Returns:
            ``(ids, seg_ids)`` tensors; ``ids`` is zero-padded to ``max_len``.

        Raises:
            NotImplementedError: if ``self.content`` is not a known mode.
        """
        t_max_len = 30

        if self.content == "Question_Answer":
            # [CLS] title [SEP] question [SEP] answer [SEP]
            num_token = 4
            budget = self.max_len - t_max_len - num_token
            q_max_len = int(budget / 2)
            a_max_len = budget - q_max_len
            t_tokens, q_tokens, a_tokens = self.trim_input(
                row.question_title, row.question_body, row.answer,
                max_sequence_length=self.max_len,
                t_max_len=t_max_len, q_max_len=q_max_len, a_max_len=a_max_len,
                num_token=num_token)
            tokens = ['[CLS]'] + t_tokens + ['[SEP]'] + q_tokens + ['[SEP]'] + a_tokens + ['[SEP]']
        elif self.content == "Question" or self.content == "Answer":
            # [CLS] title [SEP] body [SEP]
            num_token = 3
            c_max_len = self.max_len - t_max_len - num_token
            body = row.question_body if self.content == "Question" else row.answer
            t_tokens, c_tokens = self.trim_input_single_content(
                row.question_title, body,
                max_sequence_length=self.max_len,
                t_max_len=t_max_len, c_max_len=c_max_len, num_token=num_token)
            tokens = ['[CLS]'] + t_tokens + ['[SEP]'] + c_tokens + ['[SEP]']
        else:
            raise NotImplementedError

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < self.max_len:
            token_ids += [0] * (self.max_len - len(token_ids))
        ids = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(ids)

        return ids, seg_ids

    def get_seg_ids(self, ids):
        """Return segment ids for a 1-D tensor of token ids.

        A position gets segment 1 when at least two separator tokens
        (``self.tokenizer.sep_token_id``) occur strictly before it; every
        other position, and every padding position (id 0), gets 0.
        """
        sep_id = self.tokenizer.sep_token_id
        if sep_id is None:
            # No separator configured: nothing can ever switch to segment 1.
            return torch.zeros_like(ids)

        is_sep = (ids == sep_id).to(torch.long)
        # Exclusive running count: separators seen strictly before each position.
        seps_before = torch.cumsum(is_sep, dim=0) - is_sep
        seg_ids = (seps_before >= 2).to(ids.dtype)
        seg_ids[ids == 0] = 0

        return seg_ids

    def get_label(self, row):
        """Return the row's ``TARGET_COLUMNS`` values as a float32 tensor."""
        #print(row[TARGET_COLUMNS].values)
        return torch.tensor(row[TARGET_COLUMNS].values.astype(np.float32))

    def collate_fn(self, batch):
        """Stack per-sample tensors into batch tensors (labels only if labeled)."""
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])

        if self.labeled:
            labels = torch.stack([x[2] for x in batch])
            return token_ids, seg_ids, labels
        else:
            return token_ids, seg_ids

def get_test_loader(model_type="bert-base-cased", max_len=512, content="Question_Answer", batch_size=4):
    """Build an unshuffled, unlabeled DataLoader over test.csv.

    Returns:
        ``(loader, tokenizer)`` where ``loader.num`` holds the number of test
        rows and ``tokenizer`` is the one created by the dataset.
    """
    test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
    dataset = QuestDataset(
        test_df,
        model_type,
        max_len=max_len,
        content=content,
        train_mode=False,
        labeled=False,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        collate_fn=dataset.collate_fn,
        drop_last=False,
    )
    # Row count is attached for callers that size their progress bars from it.
    test_loader.num = len(test_df)

    return test_loader, dataset.tokenizer
        
def get_train_val_loaders(model_type="bert-base-cased", max_len=512, content="Question_Answer", batch_size=4, val_batch_size=4, ifold=0):
    """Build train/validation DataLoaders for one fold of a 5-fold GroupKFold.

    train.csv is shuffled with a fixed seed and split with ``question_body``
    as the group key, so identical question bodies never straddle the
    train/validation boundary.

    Args:
        model_type, max_len, content: forwarded to ``QuestDataset``.
        batch_size: training batch size (shuffled, last partial batch dropped).
        val_batch_size: validation batch size (unshuffled, nothing dropped).
        ifold: index of the fold used for validation, 0..4.

    Returns:
        ``(train_loader, val_loader, tokenizer)``. Both loaders carry a
        ``num`` attribute (row count); ``val_loader.df`` is the validation
        dataframe.

    Raises:
        ValueError: if ``ifold`` is not a valid fold index.
    """
    n_splits = 5
    # Validate first: an out-of-range fold used to fall through the loop
    # below and crash later with an unbound ``df_train``.
    if ifold not in range(n_splits):
        raise ValueError("ifold must be an integer in [0, %d), got %r" % (n_splits, ifold))

    df = pd.read_csv(f'{DATA_DIR}/train.csv')
    df = shuffle(df, random_state=42)
    #split_index = int(len(df) * (1-val_percent))
    gkf = GroupKFold(n_splits=n_splits).split(X=df.question_body, groups=df.question_body)
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        if fold == ifold:
            df_train = df.iloc[train_idx]
            df_val = df.iloc[valid_idx]
            break

    #print(df_val.head())
    #df_train = df[:split_index]
    #df_val = df[split_index:]

    print(df_train.shape)
    print(df_val.shape)

    ds_train = QuestDataset(df_train, model_type, max_len=max_len, content=content)
    train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=2, collate_fn=ds_train.collate_fn, drop_last=True)
    train_loader.num = len(df_train)

    ds_val = QuestDataset(df_val, model_type, max_len=max_len, content=content, train_mode=False)
    val_loader = torch.utils.data.DataLoader(ds_val, batch_size=val_batch_size, shuffle=False, num_workers=2, collate_fn=ds_val.collate_fn, drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val

    return train_loader, val_loader, ds_train.tokenizer

def test_train_loader():
    """Smoke test: print the first training batch of the XLNet question loader (fold 1)."""
    train_loader = get_train_val_loaders("xlnet-base-cased", 512, "Question", 4, 4, 1)[0]
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        ids, seg_ids, labels = first_batch
        print(ids)
        print(seg_ids.numpy())
        print(labels)
def test_test_loader():
    """Smoke test: print the first test batch of the RoBERTa question loader."""
    test_loader = get_test_loader("roberta-base", 512, "Question", 4)[0]
    first_batch = next(iter(test_loader), None)
    if first_batch is not None:
        ids, seg_ids = first_batch
        print(ids)
        print(seg_ids)

In [None]:
# Smoke-test the data pipeline: print one batch from the test loader ...
test_test_loader()

In [None]:
# ... and one batch from the training loader.
test_train_loader()

## Build Model

In [None]:
from transformers import *
import torch
import torch.nn as nn
import torch.nn.functional as F

class QuestModel(nn.Module):
    """Transformer backbone plus a small regression head.

    The first-token hidden state of each layer listed in ``hidden_layers``
    is concatenated, projected back to the hidden size with ``fc_1`` + ReLU
    and mapped to ``n_classes`` logits. The final logits are the average of
    ``fc`` applied to five independent dropout masks of that projection.

    Args:
        model_type: backbone identifier; see ``_HIDDEN_SIZES`` for the
            accepted names.
        tokenizer: only read for ``"roberta-base"``, where the token
            embeddings are resized to ``len(tokenizer)`` when it is given.
        n_classes: number of output logits.
        hidden_layers: indices into the backbone's tuple of hidden states.

    Raises:
        NotImplementedError: if ``model_type`` is not listed in
            ``_HIDDEN_SIZES``.

    The submodule attribute names (``bert_model``, ``xlnet_model``,
    ``roberta_model``, ``fc_1``, ``fc``) are part of the checkpoint format
    consumed by ``load_state_dict`` and must not be renamed.
    """

    # Hidden-state width for every model_type the constructor accepts.
    # NOTE(review): "bert-large-uncased" has a size here but no backbone is
    # loaded for it (same as before), so forward() cannot run for it.
    _HIDDEN_SIZES = {
        "bert-base-uncased": 768,
        "bert-large-uncased": 1024,
        "bert-base-cased": 768,
        "xlnet-base-cased": 768,
        "xlnet-large-cased": 1024,
        "roberta-base": 768,
    }

    _BERT_TYPES = ("bert-base-uncased", "bert-base-cased", "bert-large-uncased", "bert-large-cased")
    _XLNET_TYPES = ("xlnet-base-cased", "xlnet-large-cased")

    def __init__(self, model_type="xlnet-base-cased", tokenizer=None, n_classes=30, hidden_layers=[-1, -3, -5, -7, -9]):
        super(QuestModel, self).__init__()
        if model_type not in self._HIDDEN_SIZES:
            raise NotImplementedError("Unsupported model_type: %r" % (model_type,))

        self.model_name = 'QuestModel'
        self.model_type = model_type
        # Copy so the shared default list can never be mutated through self.
        self.hidden_layers = list(hidden_layers)

        if model_type in ("bert-base-uncased", "bert-base-cased"):
            bert_model_config = '../input/pretrained-bert-models-for-pytorch/' + model_type + '/bert_config.json'
            bert_config = BertConfig.from_json_file(bert_model_config)
            bert_config.output_hidden_states = True
            model_path = os.path.join('../input/pretrained-bert-models-for-pytorch/' + model_type)
            self.bert_model = BertModel.from_pretrained(model_path, config=bert_config)
        elif model_type in self._XLNET_TYPES:
            xlnet_model_config = '../input/xlnet-pretrained-models-pytorch/' + model_type + '-config.json'
            xlnet_config = XLNetConfig.from_json_file(xlnet_model_config)
            xlnet_config.output_hidden_states = True
            xlnet_config.hidden_dropout_prob = 0
            model_path = os.path.join('../input/xlnet-pretrained-models-pytorch/' + model_type + '-pytorch_model.bin')
            self.xlnet_model = XLNetModel.from_pretrained(model_path, config=xlnet_config)
        elif model_type == "roberta-base":
            roberta_model_config = '../input/roberta-transformers-pytorch/roberta-base/config.json'
            roberta_config = RobertaConfig.from_json_file(roberta_model_config)
            roberta_config.output_hidden_states = True
            roberta_config.hidden_dropout_prob = 0
            model_path = os.path.join('../input/roberta-transformers-pytorch/roberta-base/pytorch_model.bin')
            self.roberta_model = RobertaModel.from_pretrained(model_path, config=roberta_config)
            # tokenizer defaults to None; len(None) used to raise TypeError.
            if tokenizer is not None:
                self.roberta_model.resize_token_embeddings(len(tokenizer))

        self.hidden_size = self._HIDDEN_SIZES[model_type]

        self.fc_1 = nn.Linear(self.hidden_size * len(self.hidden_layers), self.hidden_size)
        self.fc = nn.Linear(self.hidden_size, n_classes)

        self.selu = nn.SELU()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.dropouts = nn.ModuleList([
            nn.Dropout(0.5) for _ in range(5)
        ])

    @staticmethod
    def _fuse_cls_states(hidden_states, hidden_layers):
        """Concatenate the first-token state of the selected layers.

        Each selected ``hidden_states[layer][:, 0]`` is stacked on a new last
        axis and the result is flattened per sample, so the layer index
        varies fastest in the returned ``(N, hidden * len(hidden_layers))``
        tensor.
        """
        first_tokens = [hidden_states[layer][:, 0] for layer in hidden_layers]
        fused = torch.stack(first_tokens, dim=-1)
        return fused.reshape(fused.shape[0], -1)

    def forward(self, ids, seg_ids):
        """Return ``n_classes`` logits per sample.

        Args:
            ids: token ids; positions with id 0 are masked out of attention.
            seg_ids: token type ids, same shape as ``ids``.

        Raises:
            NotImplementedError: if ``self.model_type`` has no forward path.
        """
        attention_mask = (ids > 0)

        if self.model_type in self._BERT_TYPES:
            outputs = self.bert_model(input_ids=ids, token_type_ids=seg_ids, attention_mask=attention_mask)
            # With output_hidden_states enabled the per-layer states are the
            # third element of the BERT output tuple.
            hidden_states = outputs[2]
        elif self.model_type in self._XLNET_TYPES:
            attention_mask = attention_mask.float()
            outputs = self.xlnet_model(input_ids=ids, token_type_ids=seg_ids, attention_mask=attention_mask)
            # XLNet has no pooled output, so the states sit one slot earlier.
            hidden_states = outputs[1]
        elif self.model_type == "roberta-base":
            attention_mask = attention_mask.float()
            outputs = self.roberta_model(input_ids=ids, token_type_ids=seg_ids, attention_mask=attention_mask)
            hidden_states = outputs[2]
        else:
            raise NotImplementedError("Unsupported model_type: %r" % (self.model_type,))

        fuse_hidden = self._fuse_cls_states(hidden_states, self.hidden_layers)
        h = self.relu(self.fc_1(fuse_hidden))

        # Multi-sample dropout: average the head over independent masks.
        logit = self.fc(self.dropouts[0](h))
        for dropout in list(self.dropouts)[1:]:
            logit += self.fc(dropout(h))

        return logit / len(self.dropouts)
    
def test_model(model_type="bert-base-cased", hidden_layers=[-1, -3, -5, -7, -9]):
    """Smoke test: run a two-sample dummy batch through a fresh QuestModel and print the logits."""
    token_row = [1, 2, 3, 4, 5, 0, 0]
    x = torch.tensor([token_row, token_row])
    # All tokens belong to segment 0.
    seg_ids = torch.zeros_like(x)
    net = QuestModel(model_type=model_type, hidden_layers=hidden_layers)

    print(net(x, seg_ids))

In [None]:
# Smoke-test the model head with bert-base-cased and a non-default layer selection.
test_model(model_type="bert-base-cased", hidden_layers=[-3, -4, -5, -6, -7])

In [None]:
def create_bert_base_uncased_models():
    """Load the 10 bert-base-uncased SWA fold checkpoints (default 30-output head).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..9.
    """
    models = []
    for i in range(10):
        model = QuestModel(model_type="bert-base-uncased", hidden_layers=[-1, -3, -5, -7, -9])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qabertuncasedaugdiffv2swa/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_bert_base_cased_models():
    """Load the 10 bert-base-cased SWA fold checkpoints (default 30-output head).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..9.
    """
    models = []
    for i in range(10):
        model = QuestModel(model_type="bert-base-cased", hidden_layers=[-1, -3, -5, -7, -9])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qabertbasecasedaugdiffv2swa/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_xlnet_base_cased_models():
    """Load the 5 xlnet-base-cased fold checkpoints (default 30-output head).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="xlnet-base-cased", hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qaxlnetbasecasedaugdiff/fold_{i}_checkpoint.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_xlnet_base_cased_question_models():
    """Load the 5 xlnet-base-cased question-only SWA checkpoints (21 outputs).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="xlnet-base-cased", n_classes=21, hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qaxlnetbasecasedaugdiffswaquestion/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_xlnet_base_cased_answer_models():
    """Load the 5 xlnet-base-cased answer-only SWA checkpoints (9 outputs).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="xlnet-base-cased", n_classes=9, hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qaxlnetbasecasedaugdiffswaanswer/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models


def create_bert_base_uncased_question_models():
    """Load the 5 bert-base-uncased question-only SWA checkpoints (21 outputs).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="bert-base-uncased", n_classes=21, hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qabertbaseuncasedaugdiffswaquestion/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_bert_base_uncased_answer_models():
    """Load the 5 bert-base-uncased answer-only SWA checkpoints (9 outputs).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="bert-base-uncased", n_classes=9, hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qabertbaseuncasedaugdiffswaanswer/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_bert_base_cased_question_models():
    """Load the 5 bert-base-cased question-only SWA checkpoints (21 outputs).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="bert-base-cased", n_classes=21, hidden_layers=[-2, -4, -6, -8, -10])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qabertbasecasedaugdiffswaquestion/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_bert_base_cased_answer_models():
    """Load the 5 bert-base-cased answer-only SWA checkpoints (9 outputs).

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="bert-base-cased", n_classes=9, hidden_layers=[-2, -4, -6, -8, -10])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qabertbasecasedaugdiffswaanswer/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_roberta_base_question_models(tokenizer):
    """Load the 5 roberta-base question-only SWA checkpoints (21 outputs).

    Args:
        tokenizer: forwarded to ``QuestModel``, which sizes the token
            embeddings from it before the checkpoint is loaded.

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="roberta-base", tokenizer=tokenizer, n_classes=21, hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qarobertabasecasedaugdiffswaquestion/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

def create_roberta_base_answer_models(tokenizer):
    """Load the 5 roberta-base answer-only SWA checkpoints (9 outputs).

    Args:
        tokenizer: forwarded to ``QuestModel``, which sizes the token
            embeddings from it before the checkpoint is loaded.

    Returns:
        List of ``QuestModel`` instances in eval mode, one per fold 0..4.
    """
    models = []
    for i in range(5):
        model = QuestModel(model_type="roberta-base", tokenizer=tokenizer, n_classes=9, hidden_layers=[-3, -4, -5, -6, -7])
        # map_location="cpu": do not restore tensors onto the device they were
        # saved from; the model itself is still on the CPU at this point.
        state_dict = torch.load(f'../input/qarobertabasecasedaugdiffswaanswer/fold_{i}_checkpoint_swa.pth', map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()
        models.append(model)
    return models

In [None]:
from tqdm import tqdm
import torch
def predict(models, test_loader):
    """Average the sigmoid outputs of ``models`` over every batch of ``test_loader``.

    Args:
        models: iterable of modules called as ``model(ids, seg_ids)``; they
            are moved to the GPU when one is available, otherwise they stay
            on the CPU.
        test_loader: sized iterable yielding ``(ids, seg_ids)`` batches.

    Returns:
        NumPy array of the ensemble-mean scores, batches concatenated along
        axis 0 in loader order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move every model once up front instead of re-issuing the transfer for
    # each model on each batch.
    models = [model.to(device) for model in models]

    all_scores = []
    with torch.no_grad():
        # len(test_loader) counts the final partial batch as well, which the
        # previous ``num // batch_size`` total dropped from the progress bar.
        for ids, seg_ids in tqdm(test_loader, total=len(test_loader)):
            ids, seg_ids = ids.to(device), seg_ids.to(device)
            scores = [torch.sigmoid(model(ids, seg_ids)).cpu() for model in models]
            all_scores.append(torch.mean(torch.stack(scores), 0))

    all_scores = torch.cat(all_scores, 0).numpy()

    return all_scores

## predict with xlnet-base-cased

In [None]:
# test_loader, _ = get_test_loader(model_type="xlnet-base-cased", batch_size=32)

In [None]:
# xlnet_base_cased_models = create_xlnet_base_cased_models()
# xlnet_base_cased_preds = predict(xlnet_base_cased_models, test_loader)

In [None]:
# del xlnet_base_cased_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

## predict with xlnet-base-cased question and answer

In [None]:
# test_loader, _ = get_test_loader(model_type="xlnet-base-cased", content="Question", batch_size=32)

In [None]:
# xlnet_base_cased_question_models = create_xlnet_base_cased_question_models()
# xlnet_base_cased_question_preds = predict(xlnet_base_cased_question_models, test_loader)

In [None]:
# del xlnet_base_cased_question_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# test_loader, _ = get_test_loader(model_type="xlnet-base-cased", content="Answer", batch_size=32)

In [None]:
# xlnet_base_cased_answer_models = create_xlnet_base_cased_answer_models()
# xlnet_base_cased_answer_preds = predict(xlnet_base_cased_answer_models, test_loader)

In [None]:
# del xlnet_base_cased_answer_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# xlnet_base_cased_question_answer_preds = np.concatenate([xlnet_base_cased_question_preds, xlnet_base_cased_answer_preds], axis=1)

## predict with roberta-base question and answer

In [None]:
# Question-only pass: build the test loader and keep its tokenizer, which the
# RoBERTa model constructor needs.
test_loader, tokenizer = get_test_loader(model_type="roberta-base", content="Question", batch_size=32)

In [None]:
# Load the question-head fold models and average their predictions.
roberta_base_question_models = create_roberta_base_question_models(tokenizer)
roberta_base_question_preds = predict(roberta_base_question_models, test_loader)

In [None]:
# Drop the models and loader, then release cached GPU memory before the next pass.
del roberta_base_question_models, test_loader, tokenizer
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Answer-only pass: same steps with the answer text instead of the question body.
test_loader, tokenizer = get_test_loader(model_type="roberta-base", content="Answer", batch_size=32)

In [None]:
roberta_base_answer_models = create_roberta_base_answer_models(tokenizer)
roberta_base_answer_preds = predict(roberta_base_answer_models, test_loader)

In [None]:
del roberta_base_answer_models, test_loader, tokenizer
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Column-wise concatenation: the question-head outputs (models built with
# n_classes=21) followed by the answer-head outputs (n_classes=9).
roberta_base_question_answer_preds = np.concatenate([roberta_base_question_preds, roberta_base_answer_preds], axis=1)

## predict with bert-base-cased question and answer

In [None]:
# test_loader, _ = get_test_loader(model_type="bert-base-cased", content="Question", batch_size=32)

In [None]:
# bert_base_cased_question_models = create_bert_base_cased_question_models()
# bert_base_cased_question_preds = predict(bert_base_cased_question_models, test_loader)

In [None]:
# del bert_base_cased_question_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# test_loader, _ = get_test_loader(model_type="bert-base-cased", content="Answer", batch_size=32)

In [None]:
# bert_base_cased_answer_models = create_bert_base_cased_answer_models()
# bert_base_cased_answer_preds = predict(bert_base_cased_answer_models, test_loader)

In [None]:
# del bert_base_cased_answer_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# bert_base_cased_question_answer_preds = np.concatenate([bert_base_cased_question_preds, bert_base_cased_answer_preds], axis=1)

## predict with bert-base-uncased question and answer

In [None]:
# test_loader, _ = get_test_loader(model_type="bert-base-uncased", content="Question", batch_size=32)

In [None]:
# bert_base_uncased_question_models = create_bert_base_uncased_question_models()
# bert_base_uncased_question_preds = predict(bert_base_uncased_question_models, test_loader)

In [None]:
# del bert_base_uncased_question_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# test_loader, _ = get_test_loader(model_type="bert-base-uncased", content="Answer", batch_size=32)

In [None]:
# bert_base_uncased_answer_models = create_bert_base_uncased_answer_models()
# bert_base_uncased_answer_preds = predict(bert_base_uncased_answer_models, test_loader)

In [None]:
# del bert_base_uncased_answer_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# bert_base_uncased_question_answer_preds = np.concatenate([bert_base_uncased_question_preds, bert_base_uncased_answer_preds], axis=1)

## predict with bert-base-cased

In [None]:
# test_loader, _ = get_test_loader(model_type="bert-base-cased", batch_size=32)

In [None]:
# bert_base_cased_models = create_bert_base_cased_models()
# bert_base_cased_preds = predict(bert_base_cased_models, test_loader)

In [None]:
# del bert_base_cased_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

## predict with bert-base-uncased

In [None]:
# test_loader, _ = get_test_loader(model_type="bert-base-uncased", batch_size=32)

In [None]:
# bert_base_uncased_models = create_bert_base_uncased_models()
# bert_base_uncased_preds = predict(bert_base_uncased_models, test_loader)

In [None]:
# del bert_base_uncased_models, test_loader
# torch.cuda.empty_cache()
# gc.collect()

In [None]:
# preds = bert_base_uncased_question_answer_preds
# preds = ((bert_base_uncased_preds + bert_base_cased_preds)/2.0 \
#          + (xlnet_base_cased_preds + xlnet_base_cased_question_answer_preds)/2.0 \
#          + (bert_base_uncased_question_answer_preds + bert_base_cased_question_answer_preds)/2.0 \
#           ) / 3.0
# preds = bert_base_uncased_preds
# Only the RoBERTa question+answer predictions are active in this run.
preds = roberta_base_question_answer_preds

### Generate Submission

In [None]:
# sub[TARGET_COLUMNS] = bert_base_uncased_preds
# sub.to_csv('submission_bert_base_uncased.csv', index=False)
# sub[TARGET_COLUMNS] = bert_base_cased_preds
# sub.to_csv('submission_bert_base_cased.csv', index=False)
# sub[TARGET_COLUMNS] = xlnet_base_cased_preds
# sub.to_csv('submission_xlnet_base_cased.csv', index=False)

In [None]:
# pred = np.copy(preds)

In [None]:
# test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# import pandas as pd
# optimization_results = pd.read_csv("../input/optyxx/optimization_resultsX.csv")

In [None]:
# ALL_COLUMNS = ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
#                      'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
#                      'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
#                      'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
#                      'question_type_compare', 'question_type_consequence', 'question_type_definition',
#                      'question_type_entity', 'question_type_instructions', 'question_type_procedure',
#                      'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
#                      'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
#                      'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
#                      'answer_type_reason_explanation', 'answer_well_written']

# OPTIMIZED_COLUMNS = [
#     'question_conversational',
#     'question_has_commonly_accepted_answer',
#     'question_not_really_a_question',
#     'question_type_choice',
#     'question_type_compare',
#     'question_type_consequence',
#     'question_type_definition',
#     'question_type_entity',
#     'question_type_instructions',
#     'question_interestingness_self', 
#     'answer_satisfaction'
# ]

# NON_OPTIMIZED_COLUMNS = list(set(ALL_COLUMNS) - set(OPTIMIZED_COLUMNS))

In [None]:
# for col in NON_OPTIMIZED_COLUMNS:
#     coeffs = optimization_results.loc[optimization_results.col==col, 'coeffs']
        
#     changerow = int(len(test) * coeffs)
#     colidx = NON_OPTIMIZED_COLUMNS.index(col)
    
#     if optimization_results.loc[optimization_results.col==col, 'choice'].values =='decrease':
#         rowidx = pred[:, colidx].argsort()[:changerow]
#         pred[rowidx, colidx] = pred[rowidx, colidx] * 0.9
#     elif optimization_results.loc[optimization_results.col==col, 'choice'].values =='increase':
#         rowidx = pred[:, colidx].argsort()[-changerow:]
#         pred[rowidx, colidx] = pred[rowidx, colidx] * 1.1
#     else:
#         pass

In [None]:
# Write the predictions into the target columns of the sample submission.
sub[TARGET_COLUMNS] = preds

In [None]:
sub.head()

In [None]:
# Reload the raw test set ...
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# ... and attach the predicted target columns to each test row via qa_id.
test = test.set_index('qa_id').join(sub.set_index('qa_id'))

In [None]:
test.head()

# Postprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler
    
def _snap_to_levels(frame, column, cuts, levels):
    """Replace the values of ``frame[column]`` in place by discrete levels.

    ``cuts`` are ascending bin edges and ``levels`` holds exactly one more
    entry than ``cuts``: values ``<= cuts[0]`` become ``levels[0]``, values in
    ``(cuts[i-1], cuts[i]]`` become ``levels[i]`` and values ``> cuts[-1]``
    become ``levels[-1]``. NaN matches no bin and is left untouched.
    """
    # Snapshot the column so every mask is computed on the pre-snap values,
    # never on a level written by an earlier iteration.
    values = frame[column].copy()
    last = len(cuts)
    for i, level in enumerate(levels):
        if i == 0:
            mask = values <= cuts[0]
        elif i == last:
            mask = values > cuts[-1]
        else:
            mask = (values > cuts[i - 1]) & (values <= cuts[i])
        frame.loc[mask, column] = level


def postprocessing(oof_df):
    """Snap selected prediction columns to discrete levels, then round all targets.

    Steps, in order:
      1. "type 1" columns are min-max scaled to [0, 1] and snapped to
         {0, 0.333333, 0.5, 0.666667, 1}.
      2. "type 3" column (question_interestingness_self) is min-max scaled to
         [0, 1] and snapped to its level ladder.
      3. "type 4" column (answer_satisfaction) is min-max scaled to [0.2, 1]
         and snapped to its level ladder.
      4. Every column in the module-level ``TARGET_COLUMNS`` is rounded to the
         nearest multiple of ``1 / DEGREE`` with ``DEGREE = len(oof_df) // 45 * 9``.

    Args:
        oof_df: DataFrame containing all ``TARGET_COLUMNS``. It is modified
            in place.

    Returns:
        The same ``oof_df`` object, after modification.
    """
    # Level ladders of the individual column types:
    # type 1 column [0, 0.333333, 0.5, 0.666667, 1]
    # type 2 column [0, 0.333333, 0.666667]
    # type 3 column [0.333333, 0.444444, 0.5, 0.555556, 0.666667, 0.777778, 0.833333, 0.888889, 1]
    # type 4 column [0.200000, 0.266667, 0.300000, 0.333333, 0.400000,
    #                0.466667, 0.5, 0.533333, 0.600000, 0.666667, 0.700000,
    #                0.733333, 0.800000, 0.866667, 0.900000, 0.933333, 1]

    # comment some columns based on oof result

    ################################################# handle type 1 columns
    type_one_column_list = [
        'question_conversational',
        'question_has_commonly_accepted_answer',
        'question_not_really_a_question',
        'question_type_choice',
        'question_type_compare',
        'question_type_consequence',
        'question_type_definition',
        'question_type_entity',
        'question_type_instructions',
    ]
    type_one_cuts = [0.16667, 0.41667, 0.58333, 0.73333]
    type_one_levels = [0, 0.333333, 0.500000, 0.666667, 1]

    # Each column is scaled independently to [0, 1] before thresholding.
    oof_df[type_one_column_list] = MinMaxScaler().fit_transform(oof_df[type_one_column_list])
    for column in type_one_column_list:
        _snap_to_levels(oof_df, column, type_one_cuts, type_one_levels)

    ################################################# handle type 2 columns
    # 'question_type_spelling' (type 2) is not snapped here; the notebook
    # overwrites that column after postprocessing with a URL-based rule.

    ################################################# handle type 3 columns
    type_three_column_list = [
        'question_interestingness_self',
    ]
    type_three_cuts = [0.385, 0.47, 0.525, 0.605, 0.715, 0.8, 0.94]
    # NOTE(review): the level 0.777778 listed for type 3 above is never
    # emitted: (0.715, 0.8] maps to 0.833333. Left unchanged because it is
    # unclear whether these cut points were tuned this way on purpose.
    type_three_levels = [0.333333, 0.444444, 0.5, 0.555556, 0.666667, 0.833333, 0.888889, 1]

    oof_df[type_three_column_list] = MinMaxScaler(feature_range=(0, 1)).fit_transform(
        oof_df[type_three_column_list]
    )
    for column in type_three_column_list:
        _snap_to_levels(oof_df, column, type_three_cuts, type_three_levels)

    ################################################# handle type 4 columns
    type_four_column_list = [
        'answer_satisfaction'
    ]
    type_four_cuts = [
        0.233, 0.283, 0.315, 0.365, 0.433, 0.483, 0.517, 0.567,
        0.633, 0.683, 0.715, 0.767, 0.833, 0.883, 0.915, 0.967,
    ]
    # Includes 0.866667 for (0.833, 0.883]: that bin used to be missing, so
    # values falling into it were left as raw scaled numbers.
    type_four_levels = [
        0.200000, 0.266667, 0.300000, 0.333333, 0.400000, 0.466667,
        0.500000, 0.533333, 0.600000, 0.666667, 0.700000, 0.733333,
        0.800000, 0.866667, 0.900000, 0.933333, 1,
    ]

    oof_df[type_four_column_list] = MinMaxScaler(feature_range=(0.2, 1)).fit_transform(
        oof_df[type_four_column_list]
    )
    for column in type_four_column_list:
        _snap_to_levels(oof_df, column, type_four_cuts, type_four_levels)

    ################################################# round to i / DEGREE
    # DEGREE grows with the number of rows (90 for 450-494 rows).
    DEGREE = len(oof_df) // 45 * 9
    # With fewer than 45 rows DEGREE is 0 and the division below would turn
    # every target into NaN, so the rounding step is skipped in that case.
    if DEGREE > 0:
        oof_values = oof_df[TARGET_COLUMNS].values
        oof_df[TARGET_COLUMNS] = np.around(oof_values * DEGREE) / DEGREE

    return oof_df

In [None]:
# Snap and round the joined predictions; postprocessing() modifies the frame
# it receives and returns that same frame.
test = postprocessing(test)

In [None]:
# Print how often each distinct postprocessed value occurs, target by target.
for column in TARGET_COLUMNS:
    counts = test[column].value_counts()
    print(counts)

# Assign postprocessed result

In [None]:
# Rebuild the submission frame from the postprocessed target columns;
# reset_index() moves the index of `test` back into a regular column.
sub = test[TARGET_COLUMNS].reset_index()

In [None]:
# Cap every target value at 1.0; non-target columns are left as they are.
sub[TARGET_COLUMNS] = sub[TARGET_COLUMNS].clip(upper=1.0)

In [None]:
# Preview the first rows of the postprocessed submission frame.
sub.head()

In [None]:
# Re-read test.csv so `test` is the raw test frame again (the `url` column is
# read from it below).
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Flag rows whose URL contains one of the two English-language
# Stack Exchange host names.
n = [
    ('ell.stackexchange.com' in url) or ('english.stackexchange.com' in url)
    for url in test['url']
]
# Flagged rows get a spelling score of 0.5, every other row gets 0.
spelling = [0.5 if is_english_site else 0. for is_english_site in n]

In [None]:
# Replace the question_type_spelling column with the URL-based values.
# The list is assigned by position, so this assumes `sub` rows are in the
# same order as the rows of test.csv — TODO confirm.
sub['question_type_spelling'] = spelling

In [None]:
# Write the final submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)