# Solution by XY

## All in One
I used this notebook for both training and prediction.

## Model
My model is an ensemble of the following seven BERT-based models.

- 0206S0A1-squadBL CV413
- 0206S0A1E0 CV412 
- 0207S11A1E1 CV417
- 0206S3A1E0 CV402
- 0129SNA0 CV0403
- 0127aS0A1 CV0425
- 0129S11A1-squad2BL CV416


In [None]:
# --- Post-processing switches (consumed by the clipping cell) ---
conf_spell_host_clip = True      # gate for the host-based 'question_type_spelling' clipping
conf_special_user_page = True    # gate for the user-page based post-processing
conf_extra_clip = False          # when True, the near-zero clipping is applied to extra columns

# Fraction of the train/test rows to keep; values below 1 trigger sub-sampling.
conf_frac = 1

# --- Run mode ---
conf_train = False
conf_save = True  # forced to False further down whenever conf_train is False

conf_save_dir = 'pth-0206S3A1E0'
conf_pretrain_dir = 'pth-0206S3A1E0'

# --- Model / optimisation hyper-parameters ---
conf_headerN = 2                      # number of token windows ("headers") built per sample
conf_lr = 3e-5
conf_max_epoch = 15
conf_plot_result = True
conf_batch_size = 8 // conf_headerN   # each sample carries conf_headerN sequences
conf_num_fold = 2
conf_cat_emb_dim = 3

# Checkpoints are only written while training.
if not conf_train:
    conf_save = False

conf_max_roll = 11  # max 11 since we have max 12 same question

In [None]:
%%capture
!pip install ../input/sacremoses/sacremoses-master/
!pip install ../input/transformers/transformers-2.2.2/

In [None]:
import pandas as pd
import numpy as np
import os,gc,random,glob 
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from scipy.stats import spearmanr

def _mkdir(directory):
    """Create *directory* (and any missing parents) if it does not exist yet.

    Uses ``exist_ok=True`` instead of a separate existence check so that the
    call cannot fail when the directory appears between the check and the
    creation.
    """
    os.makedirs(directory, exist_ok=True)
        
# Input locations (relative paths under ../input).
DATA_DIR = '../input/google-quest-challenge'  # train.csv / test.csv are read from here
TRANSFORMERS_DIR='../input/bert-data'         # holds the bert-base-uncased files loaded below
TRANSFORMERS2_DIR='../input/albert'           # not referenced in the visible part of this file

In [None]:
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test  = pd.read_csv(f'{DATA_DIR}/test.csv')
# Every column from position 11 onwards is treated as a prediction target.
output_categories=df_train.columns[11:]
# Columns 11..31 are the targets rolled by rollQuestion
# (presumably the question-level targets -- confirm against train.csv).
qcols=df_train.columns[11:32]

# QuestDataset 

In [None]:
import torch
import torch.optim as optim
from torchvision import datasets, models, transforms
from transformers import *
from math import floor, ceil
from sklearn.model_selection import GroupKFold

# Known values of the 'category' column; addFeatrure one-hot encodes against this list.
conf_CATEGORIES=['LIFE_ARTS', 'CULTURE', 'SCIENCE', 'STACKOVERFLOW', 'TECHNOLOGY']
# Lower-cased 3-character title prefixes that get their own one-hot slot in addFeatrure.
conf_CHAR=['why']

def df_average(_dfs):
    """Return the element-wise, NaN-ignoring mean of equally shaped DataFrames.

    The result is a copy of the first frame (same index and columns) whose
    values are replaced by ``np.nanmean`` taken across all frames.
    """
    n_rows, n_cols = _dfs[0].shape[0], _dfs[0].shape[1]

    # One layer per input frame along the last axis.
    cube = np.zeros((n_rows, n_cols, len(_dfs)))
    for layer, frame in enumerate(_dfs):
        cube[:, :, layer] = frame.values

    averaged = _dfs[0].copy()
    averaged.iloc[:, :] = np.nanmean(cube, axis=-1)
    return averaged

def compute_spearmanr(trues, preds,returnArray=False):
    """Column-wise Spearman correlation between targets and predictions.

    Predictions are rounded to two decimals before ranking. With
    ``returnArray=True`` the list of per-column correlations is returned
    (non-finite entries included); otherwise the mean over the columns whose
    correlation is finite is returned.
    """
    collected = []
    for target_col, predicted_col in zip(trues.T, preds.T):
        rounded = np.around(predicted_col, decimals=2)
        rho = spearmanr(target_col, rounded).correlation
        # Non-finite correlations are only kept in the per-column listing.
        if returnArray or np.isfinite(rho):
            collected.append(rho)

    if returnArray:
        return collected
    return np.mean(collected)

def rollQuestion(_df):
    """Augment a group of rows by cyclically shifting its ``qcols`` values.

    For a group with more than one row, the group is replicated
    ``min(conf_max_roll + 1, len(_df))`` times and, in the i-th replica, the
    ``qcols`` columns are rolled by i positions, so every row also appears
    with the ``qcols`` values of the other rows in the group. A single-row
    group is returned unchanged apart from a reset index.
    """
    if len(_df) <= 1:
        return _df.reset_index(drop=True)

    n_copies = min(conf_max_roll + 1, len(_df))
    augmented = pd.concat([_df] * n_copies).reset_index(drop=True)

    # Shift 0 keeps the original pairing; shifts 1..n_copies-1 rotate it.
    rolled = {
        col: np.concatenate([np.roll(_df[col], shift) for shift in range(n_copies)])
        for col in qcols
    }
    augmented[qcols] = pd.DataFrame(rolled).reset_index(drop=True)
    return augmented

def seed_everything(seed: int):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
def masked_mean(_x,_m,_dim=1):
    """Mean of ``_x`` along ``_dim`` weighted by the mask ``_m``.

    Computes ``sum(_x * _m) / sum(_m)`` along ``_dim``; ``_m`` must broadcast
    against ``_x``. No guard is applied for an all-zero mask.
    """
    weighted_total = torch.sum(_x * _m, _dim)
    mask_total = torch.sum(_m, _dim)
    return weighted_total / mask_total
    
# Seed all RNGs once, before any dataset or model object is created.
seed_everything(42)
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Total sequence length fed to BERT; A_MAX_LEN / Q_MAX_LEN are not referenced in the
# visible code (trim_input carries its own 239 defaults).
MAX_LEN = 512;A_MAX_LEN=239;Q_MAX_LEN=239
SEP_TOKEN_ID = 102  # by checking self.tokenizer.convert_tokens_to_ids('[SEP]')

class QuestDataset(torch.utils.data.Dataset):
    """Dataset turning (title, question body, answer) rows into BERT inputs.

    Every sample yields ``header_n`` token sequences of length ``MAX_LEN``
    (different windows over the question/answer text when the text is too
    long), the matching segment ids, the row's ``aux`` feature vector and,
    when ``labeled`` is True, the target vector.
    """

    def __init__(self, df, header_n, train_mode=True, labeled=True):
        """Store the frame and options and load the bert-base-uncased tokenizer.

        ``train_mode`` is stored but not read anywhere inside this class.
        """
        self.df = df
        self.header_n = header_n
        self.train_mode = train_mode
        self.labeled = labeled
        self.tokenizer = BertTokenizer.from_pretrained(f'{TRANSFORMERS_DIR}/bert-base-uncased')
        
    def __getitem__(self, index):
        """Return ``(token_ids, seg_ids, aux[, labels])`` for the row at position ``index``."""
        row = self.df.iloc[index]
        aux = self.get_aux(row)
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_label(row)
            return token_ids, seg_ids, aux,labels
        else:
            return token_ids, seg_ids, aux

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def trim_input(self, title, question, answer, max_sequence_length=MAX_LEN, 
                t_max_len=30, q_max_len=239, a_max_len=239):
        """Tokenize the three texts and cut them down to the sequence budget.

        Returns three lists (title, question, answer), each holding
        ``header_n`` token lists. If everything fits (with 4 special tokens)
        the same untrimmed tokens are repeated for every header. Otherwise
        the budget is redistributed: unused title budget is split between
        answer (floor) and question (ceil), and whichever of answer/question
        is shorter than its budget hands the surplus to the other one.

        Raises:
            ValueError: if the computed budgets do not add up to
                ``max_sequence_length``.
        """
        
        ret_t=[];ret_q=[];ret_a=[]
        t = self.tokenizer.tokenize(title)
        q = self.tokenizer.tokenize(question)
        a = self.tokenizer.tokenize(answer)
        t_len = len(t);q_len = len(q);a_len = len(a)

        # +4 accounts for [CLS] and the three [SEP] tokens added in get_token_ids.
        if (t_len+q_len+a_len+4) > max_sequence_length:
            if t_max_len > t_len:
                # Short title: give its spare budget to answer and question.
                t_new_len = t_len
                a_max_len = a_max_len + floor((t_max_len - t_len)/2)
                q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
            else:
                t_new_len = t_max_len
            if a_max_len > a_len:
                # Short answer: the question may use what the answer leaves over.
                a_new_len = a_len 
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                # Short question: the answer may use what the question leaves over.
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len
            if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
                raise ValueError("New sequence length should be %d, but is %d"% (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))

            for idx in range(self.header_n):
                
                ret_t.append(t[:t_new_len])
                # Headers come in pairs: even idx takes the head of the (rolled) text,
                # odd idx takes its tail; each pair (idx//2) rolls the text one
                # window further to the left.
                if idx%2 == 0:
                    ret_q.append(list(np.roll(q, -(idx//2)*q_new_len))[:q_new_len])
                    ret_a.append(list(np.roll(a, -(idx//2)*a_new_len))[:a_new_len])
                else:
                    ret_q.append(list(np.roll(q, -(idx//2)*q_new_len))[-q_new_len:])
                    ret_a.append(list(np.roll(a, -(idx//2)*a_new_len))[-a_new_len:])
        else:
           
            # Everything fits: every header sees the same full text.
            ret_t=[t]*self.header_n ; ret_q=[q]*self.header_n ; ret_a=[a]*self.header_n
                    
        return ret_t, ret_q, ret_a
        
    def get_token_ids(self, row):
        """Build the padded token-id and segment-id tensors for one row.

        Each header is laid out as ``[CLS] title [SEP] question [SEP] answer [SEP]``
        and right-padded with 0 up to ``MAX_LEN``. Returns two stacked tensors
        with one row per header.
        """
        
        t_tokens, q_tokens, a_tokens = self.trim_input(row.question_title, row.question_body, row.answer)
        tokens=[];token_ids=[];ids=[];seg_ids=[]
        for idx in range(self.header_n):
        
            tokens.append(['[CLS]'] + t_tokens[idx] + ['[SEP]'] + q_tokens[idx] + ['[SEP]'] + a_tokens[idx] + ['[SEP]'])
                          
            token_ids.append( self.tokenizer.convert_tokens_to_ids(tokens[idx]) )
            
            if len(token_ids[idx]) < MAX_LEN:
                token_ids[idx] += [0] * (MAX_LEN - len(token_ids[idx]))
            ids.append(torch.tensor(token_ids[idx]))
            seg_ids.append(self.get_seg_ids(ids[idx]))
            
        return torch.stack(ids),torch.stack(seg_ids)
    
    def get_seg_ids(self, ids):
        """Derive segment ids from a token-id tensor.

        Positions up to and including the second ``SEP_TOKEN_ID`` get segment
        0 (title + question); everything after it gets segment 1 (answer).
        Padding positions (id 0) are reset to 0 at the end.
        """
        seg_ids = torch.zeros_like(ids)
        seg_idx = 0
        first_sep = True
        for i, e in enumerate(ids):
            # The current position is labelled before the switch, so the
            # separator that triggers the switch still belongs to segment 0.
            seg_ids[i] = seg_idx
            if e == SEP_TOKEN_ID:
                if first_sep:
                    first_sep = False
                else:
                    seg_idx = 1
        pad_idx = torch.nonzero(ids == 0)
        seg_ids[pad_idx] = 0

        return seg_ids

    def get_label(self, row):
        """Return the row's ``output_categories`` values as a float32 tensor."""
        return torch.tensor(row[output_categories].values.astype(np.float32))
     
    def get_aux(self, row):
        """Return the row's precomputed ``aux`` feature vector as a float32 tensor."""
        return torch.tensor(row['aux'].astype(np.float32))

    def collate_fn(self, batch):
        """Stack per-sample tuples into batch tensors (labels only when ``labeled``)."""
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        aux   = torch.stack([x[2] for x in batch])
        if self.labeled:
            labels = torch.stack([x[3] for x in batch])
            return token_ids, seg_ids, aux, labels
        else:
            return token_ids, seg_ids, aux

def get_tst_loader(df,header_n,batch_size=conf_batch_size):
    """Build an ordered, unlabeled DataLoader over ``df`` for inference.

    The returned loader carries the row count in its ``num`` attribute.
    """
    dataset = QuestDataset(df, header_n, train_mode=False, labeled=False)
    options = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        drop_last=False,
    )
    test_loader = torch.utils.data.DataLoader(dataset, **options)
    test_loader.num = len(df)
    return test_loader
        
def get_trn_loader(df,header_n,batch_size=conf_batch_size,shuffle=True, drop_last=True):
    """Build a labeled DataLoader over ``df`` for training.

    By default the data is shuffled and the last incomplete batch is dropped.
    The returned loader carries the row count in its ``num`` attribute.
    """
    dataset = QuestDataset(df, header_n)
    options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        drop_last=drop_last,
    )
    train_loader = torch.utils.data.DataLoader(dataset, **options)
    train_loader.num = len(df)
    return train_loader

def get_val_loader(df,header_n,batch_size=conf_batch_size):
    """Build an ordered, labeled DataLoader over ``df`` for validation.

    The returned loader carries the row count in ``num`` and the source
    frame in ``df``.
    """
    dataset = QuestDataset(df, header_n, train_mode=False)
    options = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        drop_last=False,
    )
    val_loader = torch.utils.data.DataLoader(dataset, **options)
    val_loader.num = len(df)
    val_loader.df = df
    return val_loader

# Encode Labels 

In [None]:
# For every target column: its sorted distinct training values, minus the smallest one.
# These act as the thresholds for the ordinal encoding in encodeLabels.
uniqueLabels = {}
for col in output_categories:
    uniqueLabels[col] = np.sort(df_train[col].unique())[1:]
    
def encodeLabels(_labels,igCols=[],returnOrig=False):
    """Encode label columns as cumulative ("label >= threshold") indicators.

    For each column of ``output_categories`` not listed in ``igCols``, the
    label is compared against every threshold in ``uniqueLabels[col]``,
    giving one 0/1 feature per threshold. The per-column blocks are
    concatenated horizontally.

    With ``returnOrig=True`` the original labels are prepended to the
    encoded matrix.
    """
    n_rows = _labels.shape[0]

    indicator_blocks = []
    for position, name in enumerate(output_categories):
        if name in igCols:
            continue
        label_column = np.expand_dims(_labels[:, position], 1)
        thresholds = np.tile(uniqueLabels[name], (n_rows, 1))
        indicator_blocks.append(label_column >= thresholds)

    encoded = np.hstack(indicator_blocks).astype('float')

    if returnOrig:
        return np.hstack([_labels, encoded])
    return encoded
    

def decodeLabels(_binaries,igCols=[]):
    """Collapse threshold indicators back to one score per label column.

    The columns of ``_binaries`` are consumed block by block, in the order
    of ``output_categories`` (skipping ``igCols``); each block is as wide as
    ``uniqueLabels[col]`` and is reduced to its row-wise mean. The result
    has one column per decoded label.
    """
    decoded_columns = []
    cursor = 0
    for name in output_categories:
        if name in igCols:
            continue
        width = len(uniqueLabels[name])
        decoded_columns.append(np.mean(_binaries[:, cursor:cursor + width], axis=-1))
        cursor += width

    return np.vstack(decoded_columns).T


# AddFeatures

In [None]:
class QuestDataset4Plugin(torch.utils.data.Dataset):
    """Dataset producing (title, question) and (title, answer) pair encodings.

    Used to feed an auxiliary ("plugin") transformer whose tokenizer is
    loaded from ``qmconf['pgPath']``. Each item is a 4-tuple of tensors:
    question input ids, question segment ids, answer input ids and answer
    segment ids, each clipped to ``MAX_LEN`` entries.
    """

    def __init__(self, df, qmconf):
        self.df = df ; self.cfg = qmconf
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg['pgPath'])
        self.MAX_LEN=512
        # Id of the separator token, looked up once from the tokenizer.
        self.SEP_ID = self.tokenizer.encode(self.tokenizer.sep_token,add_special_tokens=False)[0]

    def clip(self,x):
        """Truncate ``x`` to ``MAX_LEN`` entries and return it as a tensor."""
        return torch.tensor(x[:self.MAX_LEN])

    def _segment_ids(self, input_ids):
        """Segment ids: 0 up to and including the first separator, 1 afterwards.

        The separator position is looked up once instead of once per
        position, which avoids a quadratic scan over the sequence.
        """
        if not input_ids:
            return []
        first_sep = input_ids.index(self.SEP_ID)
        return [0 if pos <= first_sep else 1 for pos in range(len(input_ids))]

    def __getitem__(self, index):
        row = self.df.iloc[index]
        q_input_ids = self.tokenizer.encode(row.question_title,row.question_body,max_length=self.MAX_LEN,pad_to_max_length=True)
        a_input_ids = self.tokenizer.encode(row.question_title,row.answer,max_length=self.MAX_LEN,pad_to_max_length=True)
        q_seg_ids = self._segment_ids(q_input_ids)
        a_seg_ids = self._segment_ids(a_input_ids)
        return self.clip(q_input_ids), self.clip(q_seg_ids), self.clip(a_input_ids), self.clip(a_seg_ids)

    def __len__(self):
        return len(self.df)

    def collate_fn(self, batch):
        """Stack the four per-sample tensors into four batch tensors."""
        q_input_ids = torch.stack([x[0] for x in batch]);q_seg_ids = torch.stack([x[1] for x in batch])
        a_input_ids = torch.stack([x[2] for x in batch]);a_seg_ids = torch.stack([x[3] for x in batch])
        return q_input_ids, q_seg_ids, a_input_ids, a_seg_ids

def get_plugin_loader(df,qmconf,batch_size=1):
    """Build an ordered DataLoader over ``df`` for the plugin transformer.

    The returned loader carries the row count in its ``num`` attribute.
    """
    dataset = QuestDataset4Plugin(df, qmconf)
    options = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        drop_last=False,
    )
    plugin_loader = torch.utils.data.DataLoader(dataset, **options)
    plugin_loader.num = len(df)
    return plugin_loader


In [None]:
def append_pluginFeatures(_trn,_tst,qmconf):
    """Attach mean-pooled hidden states of an auxiliary transformer to the frames.

    The model at ``qmconf['pgPath']`` is run over (title, question) and
    (title, answer) pairs; the masked mean of the last entry of the returned
    hidden states is stored per row in a new ``aux_plugin`` column (question
    vector followed by answer vector). ``qmconf['plugin-size']`` is set to
    the length of that vector.

    Args:
        _trn: training frame; only processed when ``qmconf['train']`` is truthy.
        _tst: test frame; always processed.
        qmconf: model configuration dict (reads ``pgPath``, ``pgBatchSize``,
            ``train``; writes ``plugin-size``).

    Returns:
        ``(_trn, _tst, qmconf)`` when training, otherwise ``(_tst, qmconf)``.
    """

    _config=AutoConfig.from_pretrained(qmconf['pgPath'], output_hidden_states=True, output_attentions=False)
    _model=AutoModel.from_pretrained(qmconf['pgPath'],config=_config)
    _SIZE=_config.hidden_size ; _UseQ = True

    _model.eval();_model.to(device)

    # Question and answer vectors are concatenated, hence twice the hidden size.
    qmconf['plugin-size'] = _SIZE*(_UseQ+1)

    def append_pluginFeatureToDf(_df):
        # Compute the pooled features for every row of _df and store them in 'aux_plugin'.
        aux_plg = np.zeros((len(_df),_SIZE*(_UseQ+1)))

        q_hidden=[] ; a_hidden=[]
        with torch.no_grad():

            loader = get_plugin_loader(_df, qmconf,batch_size=qmconf['pgBatchSize'])
            for q_ids, q_seg_ids, a_ids, a_seg_ids in tqdm(loader):

                q_ids, q_seg_ids, a_ids, a_seg_ids = q_ids.to(device), q_seg_ids.to(device), a_ids.to(device), a_seg_ids.to(device)
                q_outputs = _model(q_ids, token_type_ids=q_seg_ids) ; a_outputs = _model(a_ids, token_type_ids=a_seg_ids)
                # With output_hidden_states=True the last output element holds the per-layer states.
                q_hidden_states=q_outputs[-1] ; a_hidden_states=a_outputs[-1]

                # Treats token id 0 as padding -- assumes the tokenizer's pad id is 0.
                q_mask = (q_ids>0).unsqueeze(-1).float()
                a_mask = (a_ids>0).unsqueeze(-1).float()

                q_hidden.append( masked_mean(q_hidden_states[-1],q_mask,1).cpu().numpy().astype('float32') )
                a_hidden.append( masked_mean(a_hidden_states[-1],a_mask,1).cpu().numpy().astype('float32') )

        aux_plg[:,:_SIZE] = np.vstack(q_hidden)
        aux_plg[:,_SIZE:] = np.vstack(a_hidden)

        _df['aux_plugin']=[aux_plg[_i,:] for _i in range(len(_df))]

        return _df

    if qmconf['train']:
        _trn = append_pluginFeatureToDf(_trn)

    _tst = append_pluginFeatureToDf(_tst)

    del _config,_model;gc.collect()

    # Compare by device type: an identity test against a freshly built
    # torch.device object is never true, so the cache was never released.
    if device.type == "cuda":
        torch.cuda.empty_cache()
    if qmconf['train']:
        return _trn,_tst,qmconf
    else:
        return _tst,qmconf

In [None]:
def addFeatrure(_trn,_tst,qmconf):
    """Build the per-row ``aux`` feature vector on both frames.

    ``aux`` is the concatenation of
      1. a one-hot of ``category`` over ``conf_CATEGORIES`` (uniform vector
         for an unknown category),
      2. a one-hot of the lower-cased first three title characters over
         ``conf_CHAR`` (last slot for anything else), stored as ``aux_char``,
      3. the ``aux_plugin`` vector when ``qmconf['plugin']`` is enabled
         (for the training frame only when ``qmconf['train']`` is truthy).

    Returns ``(_trn, _tst, qmconf)``.
    """
    n_categories = len(conf_CATEGORIES)
    n_char_slots = 1 + len(conf_CHAR)

    def _category_vector(name):
        if name in conf_CATEGORIES:
            return np.eye(n_categories)[conf_CATEGORIES.index(name)]
        return np.ones(n_categories) / n_categories

    def _prefix_vector(prefix):
        slot = conf_CHAR.index(prefix) if prefix in conf_CHAR else -1
        return np.eye(n_char_slots)[slot]

    def _extend_aux(frame, column):
        # Append the vectors stored in `column` to the vectors stored in 'aux'.
        frame['aux'] = [
            np.concatenate([base, extra])
            for base, extra in zip(list(frame['aux']), list(frame[column]))
        ]

    for frame in (_trn, _tst):
        frame['aux'] = frame['category'].apply(_category_vector)

    for frame in (_trn, _tst):
        frame['aux_char'] = frame.question_title.apply(lambda title: title[0:3].lower()).apply(_prefix_vector)

    for frame in (_trn, _tst):
        _extend_aux(frame, 'aux_char')

    if qmconf['plugin']==True:

        if qmconf['train']:
            _trn, _tst, qmconf = append_pluginFeatures(_trn, _tst, qmconf)
            _extend_aux(_trn, 'aux_plugin')
        else:
            _tst, qmconf = append_pluginFeatures(_trn, _tst, qmconf)

        _extend_aux(_tst, 'aux_plugin')

    return _trn, _tst, qmconf
        

# Model 

In [None]:
from transformers import *
import torch
import torch.nn as nn
import torch.nn.functional as F

class QuestModel(nn.Module):
    """BERT-based multi-target regressor with auxiliary category features.

    For each of ``headerN`` token windows the model collects the [CLS] vector
    of ``num_hidden_cls`` hidden-state entries and the masked mean of
    ``num_hidden_seq`` entries (separately for segment 0 and segment 1 when
    ``cfg['aug']`` is set). These are concatenated with small linear
    embeddings of the category / title-prefix one-hots (and the plugin vector
    when ``cfg['plugin']`` is set) and passed through one linear layer.

    ``cfg`` keys read: ``plugin``, ``plugin-size`` (only with plugin),
    ``enc``, ``enc2``, ``aug``, ``msdropout``, ``train``.
    """

    def __init__(self,headerN, cfg, n_classes=30):
        """Build the layers.

        Args:
            headerN: number of token windows per sample.
            cfg: model configuration dict (see class docstring).
            n_classes: accepted for compatibility but ignored; the output
                width is derived from ``cfg`` below.
        """
        super(QuestModel, self).__init__()

        self.model_name = 'QuestModel'
        self.headerN = headerN
        self.cfg = cfg
        self.pluginSize = self.cfg['plugin-size'] if self.cfg['plugin'] else 0

        # overwrite n_classes: 30 raw targets, or one output per threshold when
        # the encoded-label formulation is used.
        self.n_classes=30
        if self.cfg['enc'] or self.cfg['enc2']:
            self.n_classes = int(np.sum([len(uniqueLabels[col]) for col in output_categories]))

        if self.cfg['enc2']:
            # One tiny decoder per target, turning its threshold probabilities into a
            # single score; initialised as a plain sum (weights 1, bias 0).
            self.decs = nn.ModuleList( [ nn.Linear(len(uniqueLabels[col]),1) for col in output_categories ])
            for m in self.decs:
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        config = BertConfig.from_pretrained(f'{TRANSFORMERS_DIR}/bert-base-uncased/', output_hidden_states=True, output_attentions=False)

        # Don't load pretrain model at inference (weights come from a checkpoint).
        # The flag is read from this instance's own cfg rather than from a
        # module-level variable, so the class does not depend on global state.
        if self.cfg['train']:
            self.bert_model = BertForSequenceClassification.from_pretrained(f'{TRANSFORMERS_DIR}/bert-base-uncased/',config=config)
        else:
            self.bert_model = BertForSequenceClassification(config=config)

        self.num_hidden_cls = 3
        self.num_hidden_seq = 2

        # 2 separate hidden states for Q and A if QA augmentation is used
        self.fc = nn.Linear( self.pluginSize + ((1+len(conf_CHAR))//2) + conf_cat_emb_dim + self.headerN*((1+self.cfg['aug'])*self.num_hidden_seq+self.num_hidden_cls)*768, self.n_classes)

        self.emb  = nn.Linear( len(conf_CATEGORIES), conf_cat_emb_dim )
        self.emb2 = nn.Linear( 1+len(conf_CHAR), (1+len(conf_CHAR))//2)


    def forward(self, ids, seg_ids, aux):
        """Compute the logits for one batch.

        Args:
            ids: token ids, indexed as ``ids[:, header, :]``.
            seg_ids: segment ids with the same layout as ``ids``.
            aux: auxiliary vector per sample: category one-hot, then
                title-prefix one-hot, then (optionally) the plugin features.

        Returns:
            The linear layer's logits; with ``cfg['enc2']`` the decoded
            per-target scores are prepended to those logits.
        """

        # cat emb
        catEmb = self.emb(aux[:,:len(conf_CATEGORIES)])

        # char emb
        charEmb = self.emb2(aux[:,len(conf_CATEGORIES):( len(conf_CATEGORIES) + (1+len(conf_CHAR)) )])

        # Calc fc input for each header, ex (q-head,a-head) (q-tail,a-tail) in 2 header conf
        all_header_fc_input=[]
        for hidx in range(self.headerN):

            mask = (ids[:,hidx,:] > 0)
            _, hidden_states = self.bert_model(input_ids=ids[:,hidx,:], token_type_ids=seg_ids[:,hidx,:], attention_mask=mask)

            # NOTE(review): the index is -_ith with _ith starting at 0, so the first
            # entry used is hidden_states[0] (not the last one), followed by [-1], [-2].
            # Left untouched on purpose: existing checkpoints were trained with it.
            cls_input = []
            for _ith in range( self.num_hidden_cls ):
                cls_input.append(hidden_states[-_ith][:, 0].reshape((-1,  768))) # [cls]

            seq_inputQ = [] ; seq_inputA = [] ; seq_input  = []

            mask = mask.unsqueeze(-1).float()
            # Non-padding tokens of segment 0 / segment 1 respectively.
            maskQ = (( seg_ids[:,hidx,:]==0 )*(ids[:,hidx,:] > 0)).unsqueeze(-1).float()
            maskA = (( seg_ids[:,hidx,:]==1 )*(ids[:,hidx,:] > 0)).unsqueeze(-1).float()

            for _ith in range( self.num_hidden_seq ):

                if self.cfg['aug']:

                    # masked-mean, separately per segment
                    seq_inputQ.append( masked_mean( hidden_states[-_ith],maskQ,1) )
                    seq_inputA.append( masked_mean( hidden_states[-_ith],maskA,1) )

                else:

                    seq_input.append( masked_mean(hidden_states[-_ith],mask,1) )

            cls_input  = torch.cat(cls_input, 1)
            cls_input  = F.dropout(cls_input, p=0.2, training=self.training)

            if self.cfg['aug']:
                seq_inputQ = torch.cat(seq_inputQ, 1)
                seq_inputA = torch.cat(seq_inputA, 1)
                fc_input   = torch.cat([cls_input,seq_inputQ,seq_inputA], 1)
            else:
                seq_input  = torch.cat(seq_input, 1)
                fc_input   = torch.cat([cls_input,seq_input], 1)

            all_header_fc_input.append(fc_input)

        # (header, batch, feat) -> (batch, header, feat) -> (batch, header*feat)
        all_header_fc_input = torch.stack(all_header_fc_input)
        all_header_fc_input=all_header_fc_input.permute(1, 0, 2)
        all_header_fc_input=torch.flatten(all_header_fc_input, start_dim=1)

        # Final FC Input
        if self.cfg['plugin']:
            plugin_aux = aux[:,( len(conf_CATEGORIES) + (1+len(conf_CHAR)) ):]
            final_fc_input= torch.cat([all_header_fc_input,catEmb,charEmb,plugin_aux],1)
        else:
            final_fc_input = torch.cat([all_header_fc_input,catEmb,charEmb],1)

        # FC layer
        if self.cfg['msdropout']:
            # Multi-sample dropout: average the logits of 5 independently dropped-out inputs.
            logits=[]
            for _ in range(5):
                logits.append( self.fc( F.dropout(final_fc_input, p=0.5, training=self.training) ) )
            logit = torch.mean(torch.stack(logits),0)
        else:
            logit = self.fc( final_fc_input )

        # decode
        if self.cfg['enc2']:

            decoded=[]
            st=0
            for _i,col in enumerate(output_categories):
                ed=st+len(uniqueLabels[col])
                decoded.append( self.decs[_i](torch.sigmoid(logit[:,st:ed])) )
                st=ed

            pred  = torch.cat(decoded,-1)
            # Decoded scores first, raw threshold logits after them.
            logit = torch.cat([pred,logit],1)

        return logit


In [None]:
# Optional sub-sampling of both frames with a fixed seed.
if conf_frac < 1:
    df_train = df_train.sample(frac=conf_frac, random_state=2019)
    df_test = df_test.sample(frac=conf_frac, random_state=2019)
    print("SAMPLED")

# Train 

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler((0.01, 0.99))

# Cache of the test batches so they are only built once per cache lifetime.
C_token_ids = []
C_seg_ids = []
C_aux = []


def CC():
    """Clear the cached test batches in place."""
    for cached in (C_token_ids, C_seg_ids, C_aux):
        cached.clear()
    
def train_predict(_df_train,_df_test,_qmconf):
    """Train (or load) one model per fold and predict validation and test rows.

    The training frame is split with ``GroupKFold`` on ``question_body``.
    In training mode each fold is trained for up to ``conf_max_epoch`` epochs
    with early stopping on the validation Spearman score; in inference mode
    the per-epoch checkpoints found under ``../input/<pretrain_dir>*/`` are
    loaded one after another. Test predictions are computed after every
    epoch/checkpoint and the last (up to) three per fold are averaged.

    Args:
        _df_train: training frame including the ``aux`` column.
        _df_test: test frame including the ``aux`` column.
        _qmconf: model configuration dict (reads ``train``, ``aug``, ``enc``,
            ``enc2``, ``BatchSize``, ``pretrain_dir``, ``save_dir``).

    Returns:
        ``(test_predictions, valid_predictions)``. In training mode the
        validation predictions are out-of-fold, in the row order of
        ``_df_train``; in inference mode they are read from the CSV stored
        next to the checkpoints.
    """

    def _decode(raw_pred):
        # Map the sigmoid outputs back to one score per target column.
        if _qmconf['enc']:
            return decodeLabels(raw_pred)
        if _qmconf['enc2']:
            # Average the decoded threshold outputs and the directly predicted
            # scores after rescaling each of them.
            from_encoded = scaler.fit_transform(decodeLabels(raw_pred[:,30:]))
            from_direct = scaler.fit_transform(raw_pred[:,:30])
            return ( from_encoded+from_direct)/2
        return raw_pred

    fold_split=[] ; fold_histories = []
    gkf = GroupKFold(n_splits=conf_num_fold).split(X=_df_train.question_body, groups=_df_train.question_body)

    for fold, (train_idx, valid_idx) in enumerate(gkf):

        trn_df = _df_train.iloc[train_idx].copy()
        val_df = _df_train.iloc[valid_idx].copy()
        fold_split.append((train_idx, valid_idx))

        # Augmentation by swapping same question labels
        if _qmconf['train']:trn_df = trn_df.groupby('question_title').apply(rollQuestion).reset_index(drop=True) if _qmconf['aug']==True else trn_df

        print(f"FOLD{fold} ")
        _history={};_history['val']=[];_history['tst']=[];_history['rho']=[]
        best_rho = -100

        # model
        model = QuestModel(headerN=conf_headerN, cfg=_qmconf)
        plist = [{'params': model.parameters(), 'lr': conf_lr}]
        optimizer = optim.Adam(plist, lr=conf_lr)

        criterion = torch.nn.BCEWithLogitsLoss()

        model.to(device)

        # In inference mode every stored checkpoint part counts as one "epoch".
        _max_epoch = conf_max_epoch if _qmconf['train'] else len(glob.glob(f"../input/{_qmconf['pretrain_dir'].lower()}*/MW_fold{fold}_part*.pth"))

        for epoch in range(_max_epoch):

            torch.cuda.empty_cache()

            print('Epoch {}/{}'.format(epoch, _max_epoch),end=" ")

            # training
            if _qmconf['train']:

                model.train()
                for token_ids, seg_ids, aux, labels in get_trn_loader(trn_df,conf_headerN):

                    if _qmconf['enc'] or _qmconf['enc2']:
                        labels = torch.tensor( encodeLabels(labels,returnOrig=_qmconf['enc2']),dtype=torch.float32 )

                    token_ids, seg_ids, labels, aux = token_ids.to(device), seg_ids.to(device), labels.to(device), aux.to(device)

                    outputs = model(token_ids, seg_ids, aux)

                    if _qmconf['enc2']:
                        # The first 30 columns are the direct scores, the rest the
                        # threshold outputs. Both terms slice along the column axis;
                        # slicing rows here would feed the whole output to the first term.
                        loss = criterion(outputs[:,:30], labels[:,:30]) + criterion(outputs[:,30:], labels[:,30:])

                    else:
                        loss = criterion(outputs, labels)

                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

            else:

                pthFileName = glob.glob(f"../input/{_qmconf['pretrain_dir'].lower()}*/MW_fold{fold}_part{epoch}.pth")[0]
                print(f"Loading {pthFileName} ")
                # map_location lets checkpoints saved on a GPU load on whatever device is in use.
                model.load_state_dict(torch.load(f"{pthFileName}", map_location=device))

            with torch.no_grad():

                model.eval()

                # validate ( just load pre-calculated validation result at prediction mode )
                if  _qmconf['train']:

                    valPred=[];valTrue=[]
                    for token_ids, seg_ids, aux, labels in get_val_loader(val_df,conf_headerN):

                        token_ids, seg_ids, aux= token_ids.to(device), seg_ids.to(device), aux.to(device)
                        valPred.append(torch.sigmoid(model(token_ids, seg_ids, aux)).cpu())
                        valTrue.append(labels)

                    valPred = _decode(np.vstack(valPred));valTrue  = np.vstack(valTrue)

                    rho = compute_spearmanr(valTrue,valPred)

                    _history['rho'].append(rho) ; _history['val'].append(valPred)

                # predict test
                tstPred=[]
                if len(C_token_ids) == 0: # no cache yet: first run

                    for token_ids, seg_ids, aux in get_tst_loader(_df_test,conf_headerN,batch_size=_qmconf['BatchSize']):

                        # fill the cache
                        C_token_ids.append(token_ids.clone());C_seg_ids.append(seg_ids.clone());C_aux.append(aux.clone())

                        token_ids, seg_ids, aux= token_ids.to(device), seg_ids.to(device), aux.to(device)
                        tstPred.append( torch.sigmoid(model(token_ids, seg_ids, aux)).cpu() )

                else: # reuse the cached batches

                    for token_ids, seg_ids, aux in zip(C_token_ids, C_seg_ids, C_aux):

                        token_ids, seg_ids, aux= token_ids.to(device), seg_ids.to(device), aux.to(device)
                        tstPred.append( torch.sigmoid(model(token_ids, seg_ids, aux)).cpu() )

                tstPred = _decode(np.vstack(tstPred))

                _history['tst'].append(tstPred)

                if _qmconf['train']:
                    if conf_save:
                        _mkdir(_qmconf['save_dir']);_mkdir(_qmconf['save_dir']+f'/fold{fold}')
                        if epoch>2:
                            # Keep only the three most recent checkpoints of this fold.
                            stale_ckpt = f"./{_qmconf['save_dir']}/fold{fold}/MW_ep{epoch-3}.pth"
                            if os.path.exists(stale_ckpt):
                                os.remove(stale_ckpt)
                        torch.save( model.state_dict(), f"./{_qmconf['save_dir']}/fold{fold}/MW_ep{epoch}.pth" )

                    # Early stopping: stop at the first non-improving epoch after epoch 1.
                    if rho>best_rho:
                        best_rho=rho
                    else:
                        if epoch>1:
                            print(f"{rho:.4f} Stop training")
                            break

                    print(f"{_history['rho'][-1]:.4f}")


        fold_histories.append(_history)

        del model,optimizer,criterion,trn_df,val_df

        gc.collect();torch.cuda.empty_cache()

    # ckpt ensemble: negative indices of the last (up to) 3 epochs of every fold
    epochs = [ np.arange(-1,-1-min(3,len(fold_histories[_fold]['tst'])),-1) for _fold in range(conf_num_fold) ]

    test_predictions  = np.average( [ fold_histories[_fold]['tst'][_ep] for _fold in range(conf_num_fold) for _ep in epochs[_fold]  ] ,axis=0 )

    if  _qmconf['train']:
        # Sized from the frame that was actually split, so the positional fold
        # indices below always address the right rows.
        valid_predictions = np.zeros((len(_df_train), len(output_categories)))
        for _fold in range(conf_num_fold):
            valid_predictions[fold_split[_fold][1]] = np.average( [ fold_histories[_fold]['val'][_ep] for _ep in  epochs[_fold] ] ,axis=0 )
    else:
        # load validation results calculated at training stage
        precal_val_df = pd.read_csv(glob.glob(f'../input/{_qmconf["pretrain_dir"].lower()}*/*predVal_df.csv')[0])
        valid_predictions = precal_val_df.values if conf_frac==1 else precal_val_df.sample(frac=conf_frac,random_state=2019).values


    return test_predictions, valid_predictions


# Main

In [None]:
%%time
# One dict per model of the ensemble. Entries that only carry
# 'seed': 'CLEAR_CASH' are markers that reset the cached test batches
# before the next model (whose 'aux' vector has a different layout).
questModelConfigs=[]
if conf_train:
    
    # train config 
    questModelConfigs.append({'seed':11,'plugin':False,'aug':True ,'train':conf_train,'pretrain_dir':'pth-0207S11A1E1','save_dir':'pth-0207S11A1E1',"BatchSize":16,'msdropout':True,'enc':False,'enc2':True})
    
else:
    
    # 7 models

    # base model
    questModelConfigs.append({'seed':'N','plugin':False,'aug':False ,'train':conf_train,'pretrain_dir':f'pth-0129SNA0','save_dir':f'pth-0129SNA0',"BatchSize":16,'msdropout':False,'enc':False,'enc2':False})
    
    # with data augmentation
    questModelConfigs.append({'seed':0,'plugin':False,'aug':True ,'train':conf_train,'pretrain_dir':"pth-0127aS0A1",'save_dir':"pth-0127aS0A1","BatchSize":16,'msdropout':False,'enc':False,'enc2':False})
    
    # predicts encoded label
    questModelConfigs.append({'seed':0,'plugin':False,'aug':True ,'train':conf_train,'pretrain_dir':'pth-0206S0A1E0','save_dir':'pth-0206S0A1E0',"BatchSize":16,'msdropout':True,'enc':True,'enc2':False})
    questModelConfigs.append({'seed':3,'plugin':False,'aug':True ,'train':conf_train,'pretrain_dir':'pth-0206S3A1E0','save_dir':'pth-0206S3A1E0',"BatchSize":16,'msdropout':True,'enc':True,'enc2':False})
    
    # predicts label and encoded label
    questModelConfigs.append({'seed':11,'plugin':False,'aug':True ,'train':conf_train,'pretrain_dir':'pth-0207S11A1E1','save_dir':'pth-0207S11A1E1',"BatchSize":16,'msdropout':True,'enc':False,'enc2':True})
    
    # uses hugging face pretrained model's output as aux features 
    questModelConfigs.append({'seed':'CLEAR_CASH'})
    questModelConfigs.append({'seed':0,'plugin':True,'aug':True,'train':conf_train,'pretrain_dir':'pth-0206S0A1-squadBL','save_dir':'pth-0206S0A1-squadBL',
                              'pgPath':'../input/bert-data/bert-large-cased-whole-word-masking-finetuned-squad','pgBatchSize':16,"BatchSize":16,'msdropout':False,'enc':False,'enc2':False})
    
    questModelConfigs.append({'seed':'CLEAR_CASH'})
    questModelConfigs.append({'seed':11,'plugin':True,'aug':True,'train':conf_train,'pretrain_dir':'pth-0129S11A1-squad2BL','save_dir':'pth-0129S11A1-squad2BL',
                               'pgPath':'../input/bert-data/bert-large-uncased-whole-word-masking-squad2','pgBatchSize':16,"BatchSize":16,'msdropout':False,'enc':False,'enc2':False})
    
    
history=[];predVals=[];predTsts=[]

for _qmconf in questModelConfigs:
    
    # Compare the marker by value: identity of equal strings is an
    # implementation detail and must not be relied upon.
    if _qmconf['seed'] == 'CLEAR_CASH':
        CC()
        continue
    
    trn = df_train.copy(); tst = df_test.copy()
    
    # Seed 'N' means: keep the original row order; otherwise shuffle with the model's seed.
    trn=trn.sample(frac=1,random_state=_qmconf['seed']) if _qmconf['seed']!='N' else trn 
    
    trn,tst,_qmconf = addFeatrure(trn,tst,_qmconf) # append 'plugin-size' size to _qmconf
    test_predictions,valid_predictions = train_predict(trn,tst,_qmconf)
    
    # sort_index() brings the (possibly shuffled) validation rows back into df_train order.
    predVal_df = pd.DataFrame(valid_predictions, columns=output_categories, index=trn.index if _qmconf['train'] else df_train.index).sort_index()
    predTst_df = pd.DataFrame(test_predictions,  columns=output_categories, index=df_test.index)
    
    predVals.append(predVal_df)
    predTsts.append(predTst_df)
    
    predVal_df.to_csv(f'{_qmconf["save_dir"]}predVal_df.csv',index=False);predTst_df.to_csv(f'{_qmconf["save_dir"]}predTst_df.csv',index=False)

    del trn,tst;gc.collect()
    

# Blending

In [None]:
# Per-model, per-target Spearman scores on the validation predictions.
# Iterates over however many models were actually run instead of assuming
# exactly seven, so the cell also works in training mode.
cvs=np.vstack([ compute_spearmanr(df_train.iloc[:,11:].values, _predVal.values,returnArray=True) for _predVal in predVals ])
cvs=pd.DataFrame(cvs,columns=output_categories).round(3)
cvs.T

In [None]:
drop={}
drop['question_asker_intent_understanding']=[0]
drop['question_body_critical']=[0]
drop['question_conversational']=[]
drop['question_expect_short_answer']=[]
drop['question_fact_seeking']=[]
drop['question_has_commonly_accepted_answer']=[]
drop['question_interestingness_others']=[0,6]
drop['question_interestingness_self']=[]
drop['question_multi_intent']=[4]

drop['question_not_really_a_question']=[2,3,4,6]

drop['question_opinion_seeking']=[]
drop['question_type_choice']=[]
drop['question_type_compare']=[]
drop['question_type_consequence']=[0,4]
drop['question_type_definition']=[0,4]
drop['question_type_entity']=[2,3,4]
drop['question_type_instructions']=[]
drop['question_type_procedure']=[6]
drop['question_type_reason_explanation']=[]

drop['question_type_spelling']=[0,2,3,4] #[0,2,3,4]

drop['question_well_written']=[]
drop['answer_helpful']=[0]
drop['answer_level_of_information']=[]
drop['answer_plausible']=[0]
drop['answer_relevance']=[]
drop['answer_satisfaction']=[]
drop['answer_type_instructions']=[]
drop['answer_type_procedure']=[6]
drop['answer_type_reason_explanation']=[]
drop['answer_well_written']=[0]

In [None]:
# Blank out the excluded (model, column) pairs so the NaN-aware average skips them.
for col in output_categories:
    for drop_id in drop[col]:
        for _preds in (predVals, predTsts):
            _preds[drop_id].loc[:, col] = np.nan


In [None]:
# Blend: NaN-ignoring mean over the per-model predictions, then persist both frames.
predVal_df = df_average(predVals)
predTst_df = df_average(predTsts)

predVal_df.to_csv('predVal_df.csv', index=False)
predTst_df.to_csv('predTst_df.csv', index=False)

# Normalize

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every target column into [0.01, 0.99].
# The scaler is re-fitted on each frame, so validation and test predictions
# are each stretched to the full range using their own column min/max.
scaler = MinMaxScaler(feature_range=(0.01, 0.99))
for _frame in (predVal_df, predTst_df):
    _frame.iloc[:, :] = scaler.fit_transform(_frame)

# Clipping

In [None]:
# Post-processing of the blended predictions. Every rule is applied in the
# same way to the validation frame (predVal_df) and the test frame
# (predTst_df).

if conf_frac<1:
    # NOTE(review): presumably df_train was sub-sampled when conf_frac < 1 and
    # is re-sorted here so its rows line up with predVal_df -- confirm.
    df_train = df_train.sort_index()

# Targets whose lowest-ranked predictions are snapped to the floor value.
round_target = ['question_conversational','question_type_compare','question_type_consequence','question_type_definition', 'question_type_entity']

hard_target_cols = ['question_type_spelling','question_not_really_a_question']

# Optional extra targets for the same lower-tail clipping.
extra_lower_cols=[]
if conf_extra_clip:
    extra_lower_cols=['question_asker_intent_understanding',
 'question_body_critical',
 'question_expect_short_answer',
 'question_fact_seeking',
 'question_has_commonly_accepted_answer',
 'question_interestingness_others',
 'question_interestingness_self',
 'question_multi_intent',
 'question_type_choice',
 'question_type_instructions',
 'question_type_reason_explanation',
 'question_well_written',
 'answer_helpful',
 'answer_level_of_information',
 'answer_plausible',
 'answer_relevance',
 'answer_satisfaction',
 'answer_type_instructions',
 'answer_type_reason_explanation',
 'answer_well_written']

# --- Lower-tail clipping -----------------------------------------------------
for col in round_target+hard_target_cols+extra_lower_cols:

    # Share of training rows that sit exactly at the column's minimum label.
    threshold = np.sum(df_train[col]==df_train[col].min())/len(df_train)

    # The same share of lowest-ranked predictions is treated as "minimum".
    isNearZeroVal = predVal_df[col].rank(pct=True) < threshold
    isNearZeroTst = predTst_df[col].rank(pct=True) < threshold

    predVal_df.loc[isNearZeroVal,col] = 0.01
    predTst_df.loc[isNearZeroTst,col] = 0.01

# --- Host rule: spelling questions -------------------------------------------
if conf_spell_host_clip:

    # Every row that is not from english.stackexchange.com gets the floor
    # value for 'question_type_spelling'.
    isEnglishHostVal = ( df_train.host=='english.stackexchange.com')
    isEnglishHostTst = ( df_test.host =='english.stackexchange.com')
    predVal_df.loc[~isEnglishHostVal,'question_type_spelling']=0.01
    predTst_df.loc[~isEnglishHostTst,'question_type_spelling']=0.01


# --- User rule: "not really a question" --------------------------------------
if conf_special_user_page:

    # User pages that have at least one training row with a positive
    # 'question_not_really_a_question' label.
    special_user_page = list(df_train[df_train.question_not_really_a_question>0].question_user_page.unique())

    # Vectorised membership test (replaces a per-row `x in list` lambda, which
    # scanned the whole list for every row).
    isSpecialUserPageVal = df_train.question_user_page.isin(special_user_page)
    isSpecialUserPageTst = df_test.question_user_page.isin(special_user_page)

    # Mean training label over all rows written by those users.
    special_user_page_mean = df_train.loc[isSpecialUserPageVal,'question_not_really_a_question'].mean()

    # NOTE(review): the validation rows are overwritten with a value derived
    # from the training labels of the same rows, so the validation score of
    # this column is likely optimistic.
    predVal_df.loc[isSpecialUserPageVal,'question_not_really_a_question']=special_user_page_mean
    predTst_df.loc[isSpecialUserPageTst,'question_not_really_a_question']=special_user_page_mean


# --- Upper-tail clipping (optional) ------------------------------------------
extra_higher_cols=[]
if conf_extra_clip:

    extra_higher_cols=['question_has_commonly_accepted_answer',
 'answer_helpful',
 'answer_plausible',
 'answer_relevance']

for col in extra_higher_cols:

    # Share of training rows that sit exactly at the column's maximum label.
    threshold = np.sum(df_train[col]==df_train[col].max())/len(df_train)

    # Mirror of the lower-tail rule: the top `threshold` share of predictions
    # is everything ranked above the (1 - threshold) percentile. Comparing the
    # percentile rank against `threshold` itself would clip the wrong share.
    isNearOneVal = predVal_df[col].rank(pct=True) > 1 - threshold
    isNearOneTst = predTst_df[col].rank(pct=True) > 1 - threshold

    predVal_df.loc[isNearOneVal,col] = 0.99
    predTst_df.loc[isNearOneTst,col] = 0.99

    

# Rounding

In [None]:
# Keep two decimals in both prediction frames.
predVal_df, predTst_df = predVal_df.round(2), predTst_df.round(2)

# Submission 

In [None]:
# Build the submission frame: the test predictions with qa_id as first column.
sub = predTst_df.copy()
sub.insert(0,'qa_id',df_test.qa_id)

# Sanity check on the validation predictions before writing anything.
# predVal_df holds only target columns (no id column), so every column must be
# inspected; slicing with iloc[:, 1:] would leave the first target unchecked.
has_missing = bool(predVal_df.isnull().any().any())
has_constant_col = bool((predVal_df.std()==0).any())

if has_missing or has_constant_col:
    # Make the skipped write visible instead of passing silently.
    print('submission.csv NOT written: predVal_df contains NaN or a constant column')
else:
    sub.to_csv('submission.csv', index=False)


if conf_plot_result:
    # One histogram per target column of the submission (qa_id excluded).
    sub.iloc[:,1:].hist(bins=100,figsize=(18,18))



# CV 

In [None]:
# Ground-truth labels for the first 30 output categories.
valid_true_df = df_train.loc[:, output_categories[:30]].copy()

# Calculate rho of the validation predictions and print it per target,
# followed by the average.
rhos = compute_spearmanr(valid_true_df.values, predVal_df.values, returnArray=True)
for col, rho in zip(predVal_df.columns, rhos):
    print(f"{col}:{rho:.3f}")
print("-" * 30)
print(f"average:{np.mean(rhos):.3f}")