In [None]:
def get_robin_preds():
    import os
    import math
    import sys
    import numpy as np
    import pandas as pd
    import time
    import datetime
    import gc
    import copy
    import random
    from scipy.stats import spearmanr

    from sklearn.model_selection import KFold, GroupKFold
    from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.utils.data import DataLoader, Dataset

    from tqdm import tqdm_notebook as tqdm

    os.system('pip install ../input/sacremoses/sacremoses-master/ > /dev/null')
    sys.path.insert(0, '../input/transformers/transformers-d46147294852694d1dc701c72b9053ff2e726265/')

    import transformers
    from transformers import BertModel, BertTokenizer, BertConfig, \
        RobertaModel, RobertaTokenizer, RobertaConfig, \
        XLNetModel, XLNetTokenizer, XLNetConfig, \
        AlbertModel, AlbertTokenizer, AlbertConfig


    # Number of predicted columns per sample, and how they split into
    # question-level and answer-level targets (21 + 9 = 30).
    N_TARGETS = 30
    N_Q_TARGETS = 21
    N_A_TARGETS = 9
    # Target column names: the 21 `question_*` columns followed by the 9
    # `answer_*` columns. Not referenced by the rest of the visible code.
    TARGETS = [
        'question_asker_intent_understanding', 'question_body_critical',
        'question_conversational', 'question_expect_short_answer',
        'question_fact_seeking', 'question_has_commonly_accepted_answer',
        'question_interestingness_others', 'question_interestingness_self',
        'question_multi_intent', 'question_not_really_a_question',
        'question_opinion_seeking', 'question_type_choice',
        'question_type_compare', 'question_type_consequence',
        'question_type_definition', 'question_type_entity',
        'question_type_instructions', 'question_type_procedure',
        'question_type_reason_explanation', 'question_type_spelling',
        'question_well_written', 'answer_helpful',
        'answer_level_of_information', 'answer_plausible', 'answer_relevance',
        'answer_satisfaction', 'answer_type_instructions',
        'answer_type_procedure', 'answer_type_reason_explanation',
        'answer_well_written'
    ]
    # Root directory from which tokenizer files and model config.json files are read.
    PRETRAINED_PATH = '../input/pretrained-models/'

    def get_categorical_features(train, test, feature):
        """Label-encode one categorical column consistently across train and test.

        Args:
            train: DataFrame containing column `feature`.
            test: DataFrame containing column `feature`.
            feature: Name of the categorical column to encode.

        Returns:
            Tuple ``(train_feat, test_feat, feat_dict, feat_dict_reverse)``:
            the integer codes for the train and test rows (NumPy arrays),
            the ``code -> value`` mapping and the ``value -> code`` mapping.
            Codes start at 1.
        """
        # Codes are assigned in first-seen order (train values, then any values
        # that only occur in test). Iterating a `set` of strings instead would
        # depend on per-process hash randomisation and make the mapping differ
        # from run to run.
        # NOTE(review): confirm this ordering against the encoding used when the
        # checkpoints were trained.
        unique_vals = list(dict.fromkeys(
            train[feature].unique().tolist() + test[feature].unique().tolist()
        ))
        feat_dict = {i + 1: e for i, e in enumerate(unique_vals)}
        feat_dict_reverse = {v: k for k, v in feat_dict.items()}

        train_feat = train[feature].apply(lambda x: feat_dict_reverse[x]).values
        test_feat = test[feature].apply(lambda x: feat_dict_reverse[x]).values

        return train_feat, test_feat, feat_dict, feat_dict_reverse
    
    
    class TextDataset4(Dataset):
        """Dataset over one tokenised sequence per row plus dense side features.

        Each item is ``((x_feats, ids, seg_ids), target)``. When `targets` is
        not given, an all-zero float32 placeholder with N_TARGETS columns is
        used so that items keep the same structure.
        """

        def __init__(self, x_features, ids, seg_ids, idxs, targets=None):
            # np.int64 is used instead of the `np.long` alias: the alias is not
            # available in every NumPy release (it was removed in 1.24) and its
            # width is platform-dependent, whereas int64 is explicit.
            self.ids = ids[idxs].astype(np.int64)
            self.seg_ids = seg_ids[idxs].astype(np.int64)
            self.x_features = x_features[idxs].astype(np.float32)
            if targets is not None:
                self.targets = targets[idxs].astype(np.float32)
            else:
                self.targets = np.zeros((self.x_features.shape[0], N_TARGETS), dtype=np.float32)

        def __getitem__(self, idx):
            """Return ``((x_feats, ids, seg_ids), target)`` for row `idx`."""
            ids = self.ids[idx]
            seg_ids = self.seg_ids[idx]
            x_feats = self.x_features[idx]
            target = self.targets[idx]
            return (x_feats, ids, seg_ids), target

        def __len__(self):
            """Number of selected rows."""
            return len(self.x_features)
    

    class TextDataset5(Dataset):
        """Dataset over separate question and answer token sequences plus side features.

        Each item is
        ``((x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids), target)``. When
        `targets` is not given, an all-zero float32 placeholder with N_TARGETS
        columns is used so that items keep the same structure.
        """

        def __init__(self, x_features, question_ids, answer_ids, seg_question_ids,
                     seg_answer_ids, idxs, targets=None):
            # np.int64 is used instead of the `np.long` alias: the alias is not
            # available in every NumPy release (it was removed in 1.24) and its
            # width is platform-dependent, whereas int64 is explicit.
            self.question_ids = question_ids[idxs].astype(np.int64)
            self.answer_ids = answer_ids[idxs].astype(np.int64)
            self.seg_question_ids = seg_question_ids[idxs].astype(np.int64)
            self.seg_answer_ids = seg_answer_ids[idxs].astype(np.int64)
            self.x_features = x_features[idxs].astype(np.float32)
            if targets is not None:
                self.targets = targets[idxs].astype(np.float32)
            else:
                self.targets = np.zeros((self.x_features.shape[0], N_TARGETS), dtype=np.float32)

        def __getitem__(self, idx):
            """Return ``((x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids), target)`` for row `idx`."""
            q_ids = self.question_ids[idx]
            a_ids = self.answer_ids[idx]
            seg_q_ids = self.seg_question_ids[idx]
            seg_a_ids = self.seg_answer_ids[idx]
            x_feats = self.x_features[idx]
            target = self.targets[idx]
            return (x_feats, q_ids, a_ids, seg_q_ids, seg_a_ids), target

        def __len__(self):
            """Number of selected rows."""
            return len(self.x_features)


    def to_cpu(x):
        """Return tensor `x` as a contiguous, detached tensor on the CPU."""
        packed = x.contiguous()
        return packed.detach().cpu()


    def to_numpy(x):
        """Convert tensor `x` to a NumPy array via a contiguous, detached CPU tensor."""
        cpu_tensor = x.contiguous().detach().cpu()
        return cpu_tensor.numpy()


    def to_device(xs, device):
        """Move a tensor, or a list/tuple of tensors, to `device`.

        The result is always a list, even for a single tensor.
        """
        batch = xs if isinstance(xs, (tuple, list)) else [xs]
        return [x.to(device) for x in batch]


    def infer_batch(inputs, model, device, to_numpy=True):
        """Run `model` on one batch and return sigmoid-activated predictions.

        Args:
            inputs: A tensor or list/tuple of tensors; they are moved to
                `device` and passed to `model` as positional arguments.
            model: Callable returning raw (pre-sigmoid) outputs.
            device: Device the inputs are moved to.
            to_numpy: When true, return a float32 NumPy array on the host;
                otherwise return the tensor produced by `torch.sigmoid`.

        Returns:
            The predictions, as a NumPy array or a tensor (see `to_numpy`).
        """
        inputs = to_device(inputs, device)
        logits = model(*inputs)
        # The inputs are not copied back to the host: those copies were never
        # used and only added a device-to-host transfer per batch.
        preds = torch.sigmoid(logits)
        if to_numpy:
            preds = preds.cpu().detach().numpy().astype(np.float32)
        return preds


    def infer(model, loader, checkpoint_file=None, device=torch.device('cuda')):
        """Predict every row served by `loader`, optionally restoring a checkpoint first.

        Args:
            model: Module to evaluate; it is cast to float32, moved to `device`
                and put in eval mode.
            loader: DataLoader yielding ``(inputs, target)`` pairs. Rows are
                written to the output by batch position, so the loader must not
                shuffle.
            checkpoint_file: Optional path to a checkpoint whose
                ``'model_state_dict'`` entry is loaded into `model`.
            device: Device used for inference.

        Returns:
            Array of shape ``(len(loader.dataset), N_TARGETS)`` with the
            sigmoid-activated predictions.
        """
        n_obs = len(loader.dataset)
        batch_sz = loader.batch_size
        predictions = np.zeros((n_obs, N_TARGETS))

        if checkpoint_file is not None:
            print(f'Starting inference for model: {checkpoint_file}')
            # Deserialize onto the CPU: without map_location the tensors are
            # restored onto the device they were saved from, which fails when
            # that device is absent and otherwise holds a second copy of the
            # weights there. load_state_dict copies the values into the
            # model's own parameters either way.
            checkpoint = torch.load(checkpoint_file, map_location='cpu')
            model.load_state_dict(checkpoint['model_state_dict'])
        model.float()
        model.to(device)
        model.eval()

        with torch.no_grad():
            for i, (inputs, _) in enumerate(tqdm(loader)):
                start_index = i * batch_sz
                # The last batch may be shorter than batch_sz.
                end_index = min(start_index + batch_sz, n_obs)
                predictions[start_index:end_index, :] = infer_batch(inputs, model, device)

        return predictions


    def init_seed(seed=100):
        """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

        Also stores the seed in the PYTHONHASHSEED environment variable.
        """
        os.environ['PYTHONHASHSEED'] = str(seed)
        seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
        for seed_fn in seeders:
            seed_fn(seed)
        torch.backends.cudnn.deterministic = True


    class GELU(nn.Module):
        """Activation computing ``x * sigmoid(1.702 * x)``."""

        def forward(self, x):
            gate = torch.sigmoid(1.702 * x)
            return x * gate


    def lin_layer(n_in, n_out, dropout):
        """Build a ``Linear(n_in, n_out) -> GELU -> Dropout(dropout)`` stack."""
        stages = [nn.Linear(n_in, n_out), GELU(), nn.Dropout(dropout)]
        return nn.Sequential(*stages)


    class Head2(nn.Module):
        """Prediction head combining side features with question/answer encodings.

        Three hidden branches are computed: one from features + question
        vector, one from features + answer vector, and a joint one from
        features + both vectors. The question targets are predicted from
        [joint, question] and the answer targets from [joint, answer]; the
        output concatenates question targets followed by answer targets.
        """

        def __init__(self, n_h=512, n_feats=74, n_bert=768, dropout=0.2):
            super().__init__()
            n_joint = n_feats + 2 * n_bert   # features + both encoder vectors
            n_single = n_feats + n_bert      # features + one encoder vector
            self.lin = lin_layer(n_in=n_joint, n_out=n_h, dropout=dropout)
            self.lin_q = lin_layer(n_in=n_single, n_out=n_h, dropout=dropout)
            self.lin_a = lin_layer(n_in=n_single, n_out=n_h, dropout=dropout)
            self.head_q = nn.Linear(2 * n_h, N_Q_TARGETS)
            self.head_a = nn.Linear(2 * n_h, N_A_TARGETS)

        def forward(self, x_feats, x_q_bert, x_a_bert):
            """Return raw outputs: N_Q_TARGETS question columns, then N_A_TARGETS answer columns."""
            h_q = self.lin_q(torch.cat((x_feats, x_q_bert), dim=1))
            h_a = self.lin_a(torch.cat((x_feats, x_a_bert), dim=1))
            h_joint = self.lin(torch.cat((x_feats, x_q_bert, x_a_bert), dim=1))
            out_q = self.head_q(torch.cat((h_joint, h_q), dim=1))
            out_a = self.head_a(torch.cat((h_joint, h_a), dim=1))
            return torch.cat((out_q, out_a), dim=1)


    class AvgPooledRoberta(RobertaModel):
        """RobertaModel whose forward returns a mask-weighted mean over the sequence axis."""

        def forward(self, ids, seg_ids=None):
            """Average the first output of the parent model over positions where ``ids > 0``."""
            # NOTE(review): the mask treats token id 0 as the only padding value;
            # confirm this matches the tokenizer and how the checkpoints were trained.
            keep = ids > 0
            hidden = super().forward(ids, keep, token_type_ids=seg_ids)[0]
            keep = keep.unsqueeze(-1)
            masked_sum = (hidden * keep).sum(dim=1)
            return masked_sum / keep.sum(dim=1)

        def resize_type_embeddings(self, new_num_types):
            """Resize the token-type embedding table to `new_num_types` rows and record the new size."""
            resized = self._get_resized_embeddings(
                self.embeddings.token_type_embeddings, new_num_types
            )
            self.embeddings.token_type_embeddings = resized
            self.config.type_vocab_size = new_num_types
            self.type_vocab_size = new_num_types


    class CustomRoberta(nn.Module):
        """Siamese model: one shared AvgPooledRoberta encodes question and answer, then Head2."""

        def __init__(self, n_h, n_feats, head_dropout=0.2):
            super().__init__()
            config_path = PRETRAINED_PATH + 'roberta-base/config.json'
            self.roberta = AvgPooledRoberta(RobertaConfig.from_json_file(config_path))
            # The token-type table is resized to two types after construction.
            self.roberta.resize_type_embeddings(2)
            self.head = Head2(n_h, n_feats, n_bert=768, dropout=head_dropout)

        def forward(self, x_feats, q_ids, a_ids, seg_q_ids=None, seg_a_ids=None):
            """Encode both sequences with the shared encoder and return the head's raw outputs."""
            pooled_q = self.roberta(q_ids, seg_q_ids)
            pooled_a = self.roberta(a_ids, seg_a_ids)
            return self.head(x_feats, pooled_q, pooled_a)


    class AvgPooledBert(BertModel):
        """BertModel whose forward returns a mask-weighted mean over the sequence axis."""

        def forward(self, ids, seg_ids=None):
            """Average the first output of the parent model over positions where ``ids > 0``."""
            keep = ids > 0
            hidden = super().forward(ids, keep, token_type_ids=seg_ids)[0]
            keep = keep.unsqueeze(-1)
            masked_sum = (hidden * keep).sum(dim=1)
            return masked_sum / keep.sum(dim=1)


    class CustomBert3(nn.Module):
        """Siamese model: one shared AvgPooledBert encodes question and answer, then Head2.

        The encoder is built from a default ``BertConfig()`` rather than a
        config file, and the head uses its default dropout.
        """

        def __init__(self, n_h, n_feats):
            super().__init__()
            default_config = BertConfig()
            self.bert = AvgPooledBert(default_config)
            self.head = Head2(n_h, n_feats, n_bert=768)

        def forward(self, x_feats, q_ids, a_ids, seg_q_ids=None, seg_a_ids=None):
            """Encode both sequences with the shared encoder and return the head's raw outputs."""
            pooled_q = self.bert(q_ids, seg_q_ids)
            pooled_a = self.bert(a_ids, seg_a_ids)
            return self.head(x_feats, pooled_q, pooled_a)
        
        
    class AvgPooledXLNet(XLNetModel):
        """XLNetModel whose forward returns a mask-weighted mean over the sequence axis."""

        def forward(self, ids, seg_ids=None):
            """Average the first output of the parent model over positions where ``ids > 0``.

            Unlike the sibling encoders, the mask is converted to float before
            it is handed to the parent model.
            """
            # NOTE(review): the mask treats token id 0 as the only padding value;
            # confirm this matches the tokenizer and how the checkpoints were trained.
            keep = (ids > 0).float()
            hidden = super().forward(ids, keep, token_type_ids=seg_ids)[0]
            keep = keep.unsqueeze(-1)
            masked_sum = (hidden * keep).sum(dim=1)
            return masked_sum / keep.sum(dim=1)


    class CustomXLNet(nn.Module):
        """Siamese model: one shared AvgPooledXLNet encodes question and answer, then Head2."""

        def __init__(self, n_h, n_feats, head_dropout=0.2):
            super().__init__()
            config_path = PRETRAINED_PATH + 'xlnet-base-cased/config.json'
            self.xlnet = AvgPooledXLNet(XLNetConfig.from_json_file(config_path))
            self.head = Head2(n_h, n_feats, n_bert=768, dropout=head_dropout)

        def forward(self, x_feats, q_ids, a_ids, seg_q_ids=None, seg_a_ids=None):
            """Encode both sequences with the shared encoder and return the head's raw outputs."""
            pooled_q = self.xlnet(q_ids, seg_q_ids)
            pooled_a = self.xlnet(a_ids, seg_a_ids)
            return self.head(x_feats, pooled_q, pooled_a)
        
        
    class AvgPooledAlbert(AlbertModel):
        """AlbertModel whose forward returns a mask-weighted mean over the sequence axis."""

        def forward(self, ids, seg_ids=None):
            """Average the first output of the parent model over positions where ``ids > 0``."""
            keep = ids > 0
            hidden = super().forward(ids, keep, token_type_ids=seg_ids)[0]
            keep = keep.unsqueeze(-1)
            masked_sum = (hidden * keep).sum(dim=1)
            return masked_sum / keep.sum(dim=1)

        
    class CustomAlbert(nn.Module):
        """Two-tower model: separate AvgPooledAlbert encoders for question and answer, then Head2.

        Unlike the other Custom* models in this function, the question and the
        answer do not share an encoder; both are built from the same config.
        """

        def __init__(self, n_h, n_feats, head_dropout=0.2):
            super().__init__()
            config_path = PRETRAINED_PATH+'albert-base-v2/config.json'
            shared_config = AlbertConfig.from_json_file(config_path)
            self.q_albert = AvgPooledAlbert(shared_config)
            self.a_albert = AvgPooledAlbert(shared_config)
            self.head = Head2(n_h, n_feats, n_bert=768, dropout=head_dropout)

        def forward(self, x_feats, q_ids, a_ids, seg_q_ids=None, seg_a_ids=None):
            """Encode each sequence with its own encoder and return the head's raw outputs."""
            pooled_q = self.q_albert(q_ids, seg_q_ids)
            pooled_a = self.a_albert(a_ids, seg_a_ids)
            return self.head(x_feats, pooled_q, pooled_a)
        

    # Fully-qualified option names are used: the short patterns 'max_rows' and
    # 'max_columns' match more than one option in newer pandas releases
    # (display.* and styler.render.*), which makes set_option raise.
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    path = '../input/google-quest-challenge/'
    sample_submission = pd.read_csv(f'{path}sample_submission.csv')
    # Missing values are replaced by a single space so every text field is a string.
    test = pd.read_csv(f'{path}test.csv').fillna(' ')
    train = pd.read_csv(f'{path}train.csv').fillna(' ')


    def get_preds(train, test, ModelClass, tokenizer, model_name, checkpoint_dir, folds):
        """Average test predictions of a two-sequence model over several fold checkpoints.

        The test set is tokenised twice per row: (title, body) as the question
        sequence and (title, answer) as the answer sequence, each padded to 512
        tokens. The one-hot encoded `category` column is the side feature.

        Args:
            train: Training DataFrame; only its `category` column is used, to
                fit the encoders.
            test: Test DataFrame to predict.
            ModelClass: Model constructor called as ``ModelClass(256, n_feats)``.
            tokenizer: Tokenizer providing ``encode_plus``.
            model_name: Checkpoint file-name prefix.
            checkpoint_dir: Directory (with trailing slash) holding the checkpoints.
            folds: Zero-based fold ids; the file for fold ``k`` is
                ``{model_name}_fold_{k + 1}_best.pth``.

        Returns:
            Array of shape ``(len(test), N_TARGETS)`` with the fold-averaged predictions.
        """
        max_seq_len = 512
        text_columns = {
            'question': ['question_title', 'question_body'],
            'answer': ['question_title', 'answer'],
        }

        ids_test, seg_ids_test = {}, {}
        for text, cols in text_columns.items():
            encoded_rows = [
                tokenizer.encode_plus(
                    x1, x2, add_special_tokens=True, max_length=max_seq_len, pad_to_max_length=True,
                    return_token_type_ids=True
                )
                for x1, x2 in tqdm(test[cols].values)
            ]
            ids_test[text] = np.array([enc['input_ids'] for enc in encoded_rows])
            seg_ids_test[text] = np.array([enc['token_type_ids'] for enc in encoded_rows])

        # Integer-code the category, then one-hot encode with an encoder fitted on train only.
        train_category, test_category, _, _ = get_categorical_features(train, test, 'category')
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe.fit(train_category.reshape(-1, 1))
        cat_features_test = ohe.transform(test_category.reshape(-1, 1)).toarray()

        device = 'cuda'
        test_dataset = TextDataset5(
            cat_features_test, ids_test['question'], ids_test['answer'],
            seg_ids_test['question'], seg_ids_test['answer'], test.index
        )
        test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, num_workers=8)

        init_seed()
        n_folds = len(folds)
        preds = np.zeros((len(test), N_TARGETS))
        for fold_id in folds:
            checkpoint_file = f'{checkpoint_dir}{model_name}_fold_{fold_id + 1}_best.pth'
            model = ModelClass(256, cat_features_test.shape[1]).to(device)
            preds += infer(model, test_loader, checkpoint_file, device) / n_folds

        return preds

    
    def get_preds2(train, test, ModelClass, tokenizer, model_name, checkpoint_dir, folds):
        """Average test predictions of a single-sequence model over several fold checkpoints.

        Variant of `get_preds` that packs title, body and answer into one
        sequence ("title SEP body" as first text, answer as second) and serves
        it through TextDataset4. No call to this function is visible in this
        notebook.

        Args and return value mirror `get_preds`: `train` is only used to fit
        the category encoders, and the result has shape
        ``(len(test), N_TARGETS)``.
        """

        sep_token = f' {tokenizer.sep_token} '
        max_seq_len = 512
        
        # `sent_ids` is initialised but never filled or read.
        ids, seg_ids, sent_ids = [], [], []
        for x1, x2, x3 in tqdm(test[['question_title', 'question_body', 'answer']].values):
            encoded_inputs = tokenizer.encode_plus(
                x1 + sep_token + x2, x3, add_special_tokens=True, max_length=max_seq_len, pad_to_max_length=True, 
                return_token_type_ids=True
            )
            inp_ids = encoded_inputs['input_ids']
            raw_seg_ids = np.array(encoded_inputs['token_type_ids'])
            # Position just after the first separator token in the encoded sequence.
            qa_split_idx = np.where(np.array(inp_ids) == tokenizer.sep_token_id)[0][0] + 1
            # Every position from there on gets its segment id raised by one ...
            raw_seg_ids[qa_split_idx:] += 1
            # ... and padding positions are reset to segment 0.
            raw_seg_ids = (raw_seg_ids * (np.array(inp_ids) != tokenizer.pad_token_id).astype(int)).tolist()

            ids.append(inp_ids)
            seg_ids.append(raw_seg_ids)
        
        ids_test = np.array(ids)
        seg_ids_test = np.array(seg_ids)

        # Integer-code the category column; only the two code arrays are used below.
        train_category, test_category, category_dict, category_dict_reverse = \
            get_categorical_features(train, test, 'category')

        # One-hot encoder is fitted on the train codes and applied to the test codes.
        cat_features_train = train_category.reshape(-1, 1)
        cat_features_test = test_category.reshape(-1, 1)
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe.fit(cat_features_train)
        cat_features_test = ohe.transform(cat_features_test).toarray()

        num_workers = 8
        device = 'cuda'

        bs_test = 2
        # shuffle=False keeps loader order aligned with the rows of `test`.
        test_loader = DataLoader(
            TextDataset4(cat_features_test, ids_test, seg_ids_test, test.index),
            batch_size=bs_test, shuffle=False, num_workers=num_workers
        )

        init_seed()
        preds = np.zeros((len(test), N_TARGETS))
        for fold_id in folds:
            # Fold ids are zero-based; checkpoint file names are one-based.
            checkpoint_file = f'{checkpoint_dir}{model_name}_fold_{fold_id + 1}_best.pth'
            model = ModelClass(256, cat_features_test.shape[1]).to(device)
            test_preds = infer(model, test_loader, checkpoint_file, device)
            # Equal-weight average across the requested folds.
            preds += test_preds / len(folds)

        return preds
    

    def get_bert_preds(train, test):
        """Fold-averaged test predictions of the siamese BERT model (seven checkpoints)."""
        return get_preds(
            train, test, CustomBert3,
            tokenizer=BertTokenizer.from_pretrained(PRETRAINED_PATH + 'bert-base-uncased/'),
            model_name='siamese_bert_6_comb',
            checkpoint_dir='../input/siamese-bert-models-6/',
            folds=[0, 2, 3, 5, 6, 8, 9],
        )

    
    def get_roberta_preds(train, test):
        """Fold-averaged test predictions of the siamese RoBERTa model (seven checkpoints)."""
        return get_preds(
            train, test, CustomRoberta,
            tokenizer=RobertaTokenizer.from_pretrained(PRETRAINED_PATH + 'roberta-base/'),
            model_name='siamese_roberta_1_comb',
            checkpoint_dir='../input/siamese-roberta-models-1/',
            folds=[0, 1, 3, 4, 6, 7, 9],
        )

    
    def get_xlnet_preds(train, test):
        """Fold-averaged test predictions of the siamese XLNet model (seven checkpoints)."""
        return get_preds(
            train, test, CustomXLNet,
            tokenizer=XLNetTokenizer.from_pretrained(PRETRAINED_PATH + 'xlnet-base-cased/'),
            model_name='siamese_xlnet_1_comb',
            checkpoint_dir='../input/siamese-xlnet-models-1/',
            folds=[0, 1, 2, 4, 5, 7, 8],
        )


    def get_albert_preds(train, test):
        """Fold-averaged test predictions of the two-tower ALBERT model (seven checkpoints)."""
        return get_preds(
            train, test, CustomAlbert,
            tokenizer=AlbertTokenizer.from_pretrained(PRETRAINED_PATH + 'albert-base-v2/'),
            model_name='siamese_albert_1_comb',
            checkpoint_dir='../input/siamese-albert-models-1/',
            folds=[1, 2, 3, 4, 5, 6, 7],
        )

    
    return get_albert_preds(train, test), get_roberta_preds(train, test), \
        get_bert_preds(train, test), get_xlnet_preds(train, test)
    
    
# Run the four transformer models. The unpacking order matches the tuple
# returned by get_robin_preds: ALBERT, RoBERTa, BERT, XLNet.
y_albert, y_roberta, y_bert, y_xlnet = get_robin_preds()

In [None]:
import gc

# Force a garbage-collection pass between the two prediction stages.
gc.collect()

In [None]:
def get_use_preds():

    import tensorflow as tf
    import tensorflow_hub as hub
    import numpy as np
    import pandas as pd
    from nltk import tokenize
    from tqdm import tqdm_notebook

    # Sentence-embedding model loaded from a local directory (per the path name,
    # Universal Sentence Encoder large v4 — confirm against the dataset).
    embed_fn = hub.load('../input/universalsentenceencoderlarge4/')

    # train.csv is read only to obtain the target column names below.
    df = pd.read_csv("../input/google-quest-challenge/train.csv")
    test_df = pd.read_csv("../input/google-quest-challenge/test.csv")
    # Every column from position 11 onward is treated as a prediction target
    # (presumably the 30 label columns — confirm against train.csv).
    outputs = df.columns[11:]


    def count_words(data):
        """Number of whitespace-separated tokens in ``str(data)``."""
        tokens = str(data).split()
        return len(tokens)

    def count_words_unique(data):
        """Number of distinct whitespace-separated tokens in ``str(data)``."""
        distinct_tokens = np.unique(str(data).split())
        return len(distinct_tokens)

    def questionowords(data):
        """Count sentences of `data` whose lower-cased text starts with a question word.

        The check is a plain string-prefix match, so a sentence starting with
        e.g. "island" also counts (prefix "is").
        """
        start_words = ('who', 'what', 'when', 'where', 'why', 'how', 'is', 'am','are','was','were','can','could','may','should','shall','does', 'do','did')
        return sum(
            1
            for sent in tokenize.sent_tokenize(data)
            if sent.lower().startswith(start_words)
        )

    def questionmarks(data):
        """Total number of "?" characters over the tokenised sentences of `data`."""
        return sum(sent.count("?") for sent in tokenize.sent_tokenize(data))


    def get_numeric_features(df):
        """Add ten count features to `df` in place and return it.

        For title, body and answer: word count (``*_wc``) and unique word
        count (``*_wcu``). For body and title: number of sentences starting
        with a question word (``*_qw``) and number of question marks (``*_qm``).
        """
        # (new column, source column, feature function), in creation order.
        feature_specs = [
            ("qt_wc", "question_title", count_words),
            ("qb_wc", "question_body", count_words),
            ("a_wc", "answer", count_words),
            ("qt_wcu", "question_title", count_words_unique),
            ("qb_wcu", "question_body", count_words_unique),
            ("a_wcu", "answer", count_words_unique),
            ('qb_qw', 'question_body', questionowords),
            ('qt_qw', 'question_title', questionowords),
            ('qb_qm', 'question_body', questionmarks),
            ('qt_qm', 'question_title', questionmarks),
        ]
        for new_col, source_col, feature_fn in feature_specs:
            df[new_col] = df[source_col].apply(feature_fn)
        return df

    # Adds the count-feature columns to the test frame (the function also mutates it in place).
    test_df = get_numeric_features(test_df)

    # Names of the count features fed to the model, in input order.
    features = ["qt_wc", "qb_wc", "a_wc", "qt_wcu", "qb_wcu", "a_wcu",
                "qb_qw", "qt_qw", "qb_qm", "qt_qm"]

    # Number of sentences kept per text; shorter texts are padded, longer ones truncated.
    MAX_SEQ = 30

    def get_sentences(x):
        """Split `x` into sentences and return exactly MAX_SEQ strings.

        Extra sentences are dropped; missing ones are filled with "".
        """
        kept = [s for s in tokenize.sent_tokenize(x) if s != ""][:MAX_SEQ]
        padding = [""] * (MAX_SEQ - len(kept))
        return kept + padding


    def get_use(df):
        """Embed titles, answers and question bodies of `df` with `embed_fn`.

        Returns:
            ``(QT, A, QB)``: QT holds one embedding per title; A and QB hold
            MAX_SEQ sentence embeddings of width 512 per row, as float32
            arrays of shape ``(len(df), MAX_SEQ, 512)``.
        """
        def embed_by_sentence(texts):
            # One embed_fn call per row, on that row's padded sentence list.
            stacked = np.zeros((df.shape[0], MAX_SEQ, 512), dtype=np.float32)
            for row, text in tqdm_notebook(list(enumerate(texts))):
                stacked[row] = embed_fn(get_sentences(text))["outputs"].numpy()
            return stacked

        QT = embed_fn(df["question_title"].values)["outputs"].numpy()
        A = embed_by_sentence(df["answer"].values)
        QB = embed_by_sentence(df["question_body"].values)

        return QT, A, QB

    # Embeddings for the test set: titles, answer sentences, question-body sentences.
    QT_test, A_test, QB_test = get_use(test_df)

    import gc

    # The embedding model is no longer needed once the embeddings are
    # computed; drop the reference and collect before building the Keras models.
    del embed_fn

    gc.collect()

    import tensorflow.keras.layers as KL


    def nn_block(input_layer, size, dropout_rate, activation):
        """Apply Dense(size) -> Activation(activation) -> Dropout(dropout_rate) to `input_layer`."""
        projected = KL.Dense(size, activation=None)(input_layer)
        activated = KL.Activation(activation)(projected)
        return KL.Dropout(dropout_rate)(activated)

    def cnn_block(input_layer, size, dropout_rate, activation):
        """Apply Conv1D(size, kernel 1) -> Activation(activation) -> Dropout(dropout_rate)."""
        convolved = KL.Conv1D(size, 1, activation=None)(input_layer)
        activated = KL.Activation(activation)(convolved)
        return KL.Dropout(dropout_rate)(activated)

    def get_model():
        """Build the Keras model over sentence embeddings and count features.

        Inputs, in order: title embedding, answer sentence embeddings,
        question-body sentence embeddings, count features, and a one-column
        dummy input used only to look up two learned 8-wide embeddings.
        Output: one sigmoid unit per entry of `outputs`.

        The layer graph has to match the saved weight files loaded by the
        caller, so the construction order is left untouched.
        """
        qt_input = KL.Input(shape=(QT_test.shape[1],))

        a_input = KL.Input(shape=(A_test.shape[1], A_test.shape[2]))
        qb_input = KL.Input(shape=(QB_test.shape[1], QB_test.shape[2]))

        dummy_input = KL.Input(shape=(1,))

        # Two separate learned vectors, both looked up from the same dummy index:
        # one is tiled over the answer positions, the other over the body positions.
        a_emb = KL.Flatten()(KL.Embedding(2, 8)(dummy_input))
        qb_emb = KL.Flatten()(KL.Embedding(2, 8)(dummy_input))

        embs = KL.concatenate([KL.RepeatVector(MAX_SEQ)(a_emb), KL.RepeatVector(MAX_SEQ)(qb_emb)], axis=-2)

        # Sequence of length 2*MAX_SEQ: answer sentences followed by body sentences,
        # each position also carrying a copy of the title embedding.
        x = KL.concatenate([KL.SpatialDropout1D(0.7)(KL.RepeatVector(2*MAX_SEQ)(qt_input)), 
                            KL.SpatialDropout1D(0.3)(KL.concatenate([a_input, qb_input], axis=-2))])
        x = KL.concatenate([x, embs])

        # Position-wise projection (kernel size 1), then average and max pooling over positions.
        x = cnn_block(x, 256, 0.1, "relu")
        x = KL.concatenate([KL.GlobalAvgPool1D()(x), KL.GlobalMaxPool1D()(x)])

        feature_input = KL.Input(shape=(len(features),))

        # Count features are batch-normalised before being joined with the pooled text vector.
        hidden_layer = KL.concatenate([KL.BatchNormalization()(feature_input), x])
        hidden_layer = nn_block(hidden_layer, 128, 0.1, "relu")

        out = KL.Dense(len(outputs), activation="sigmoid")(hidden_layer)

        model = tf.keras.models.Model(inputs=[qt_input, a_input, qb_input, feature_input, dummy_input], outputs=out)
        return model

    from sklearn.model_selection import KFold, GroupKFold
    from sklearn.metrics import mean_squared_error
    from tensorflow.keras.optimizers import Nadam
    from sklearn.preprocessing import MinMaxScaler
    from tensorflow.keras.backend import epsilon
    import tensorflow.keras.backend as K

    # One saved weight file per fold is expected under MODEL_FOLDER (model0.h5 ... model9.h5).
    NUM_FOLDS = 10
    BATCH_SIZE = 32
    MODEL_FOLDER = "../input/qa-use-model-weights/"

    # Accumulator for the fold-averaged test predictions, one column per target.
    y_test = np.zeros((test_df.shape[0], len(outputs)))

    for fold in range(NUM_FOLDS):
        model_path = "{folder}model{fold}.h5".format(folder=MODEL_FOLDER, fold=fold)

        K.clear_session()
        model = get_model()
        model.load_weights(model_path)
        
        y_test += model.predict([QT_test, A_test, QB_test, test_df[features].values, np.ones(test_df.shape[0])], 
                                batch_size=BATCH_SIZE, verbose=0)/NUM_FOLDS

    K.clear_session()

    for i, col in enumerate(outputs):
        test_df[col] = y_test[:, i]
    
    return test_df, outputs

# Run the sentence-embedding model: `test_df` comes back with one prediction
# column per entry of `outputs`, plus the count-feature columns.
test_df, outputs = get_use_preds()

In [None]:
import itertools
import numpy as np
import pandas as pd



def scale(x, d):
    """Floor `x` onto a grid of step ``1/d``; a falsy `d` (None or 0) returns `x` unchanged."""
    if not d:
        return x
    step = 1/d
    return (x//step)/d

def transform(y_oof, y_roberta, y_bert, y_xlnet, y_albert, params, c):
    """Blend column `c` of the five prediction matrices with per-column settings.

    `params` is ``(d_global, d_local, w_use, w_roberta, w_bert, w_xlnet,
    w_albert)``. Each model's column is first discretised with `d_local`,
    the columns are combined as a weighted average, and the result is
    discretised again with `d_global`.
    """
    d_global, d_local, w_use, w_roberta, w_bert, w_xlnet, w_albert = params
    weighted_sources = (
        (y_oof, w_use),
        (y_roberta, w_roberta),
        (y_bert, w_bert),
        (y_xlnet, w_xlnet),
        (y_albert, w_albert),
    )

    # Accumulate in the listed order so the floating-point sum is unchanged.
    blended = None
    for preds, weight in weighted_sources:
        term = scale(preds[:, c], d_local)*weight
        blended = term if blended is None else blended + term

    blended /= w_use + w_roberta + w_bert + w_xlnet + w_albert
    return scale(blended, d_global)

# Blending settings, one tuple per output column (looked up as param_list[c]).
# Fields, in the order `transform` unpacks them:
#   (d_global, d_local, w_use, w_roberta, w_bert, w_xlnet, w_albert)
# d_* are discretisation levels passed to `scale` (None disables it);
# w_* are the weights of the individual models' predictions.
param_list = [
 (64, 16, 4, 1, 1, 4, 4),
 (None, None, 2, 0, 1, 0, 1),
 (4, 32, 1, 4, 0, 1, 1),
 (16, 64, 1, 4, 2, 0, 1),
 (8, 32, 2, 2, 1, 4, 0),
 (4, None, 1, 4, 2, 1, 4),
 (32, 16, 4, 0, 1, 2, 4),
 (16, 32, 2, 1, 2, 1, 2),
 (8, 16, 2, 1, 1, 2, 2),
 (16, 16, 1, 0, 2, 2, 0),
 (32, 16, 2, 4, 1, 2, 2),
 (16, 4, 2, 2, 0, 1, 4),
 (4, None, 0, 4, 1, 4, 0),
 (4, 8, 0, 1, 0, 4, 0),
 (8, 8, 0, 4, 1, 0, 0),
 (4, 8, 2, 1, 0, 4, 0),
 (4, None, 2, 4, 0, 1, 1),
 (64, 16, 1, 4, 0, 4, 2),
 (32, 64, 2, 4, 1, 2, 4),
 (32, None, 0, 1, 1, 1, 0),
 (32, None, 2, 1, 4, 4, 1),
 (32, 8, 0, 4, 1, 1, 4),
 (64, None, 4, 4, 1, 1, 2),
 (32, None, 1, 2, 0, 0, 2),
 (8, None, 1, 2, 2, 1, 2),
 (32, 16, 1, 4, 0, 4, 4),
 (8, 32, 1, 2, 0, 0, 1),
 (32, None, 1, 2, 1, 1, 0),
 (64, 64, 2, 2, 0, 1, 2),
 (16, None, 0, 2, 4, 1, 2)]


# Start from the sentence-embedding model's predictions and overwrite each
# column with its blended, discretised version.
y_combined = test_df[outputs].values
n_outputs = y_combined.shape[1]

for c in range(n_outputs):
    y_combined[:, c] = transform(y_combined, y_roberta, y_bert, y_xlnet, y_albert, param_list[c], c)

    # Diagnostics per column: index, number of distinct values, and number of
    # rows that do not hold the most frequent value.
    val, counts = np.unique(y_combined[:, c], return_counts=True)
    rows_off_mode = counts.sum() - counts.max()
    print(c, len(val), rows_off_mode)


test_df[outputs] = y_combined

# Disabled post-processing for output column 19, kept for reference:
# test_df["eng"] = test_df["url"].apply(lambda x: x.startswith("http://english.") or x.startswith("http://ell."))
# test_df.loc[~test_df["eng"], outputs[19]] = 0

# test_df.loc[test_df["qa_id"] == 7525, outputs[19]] = 1

# Keep every prediction strictly inside (0, 1).
test_df[outputs] = np.clip(test_df[outputs], 0.00001, 0.999999)

In [None]:
# Write the submission: only qa_id and the target columns are exported, so the
# helper feature columns added to test_df are left out.
test_df.to_csv("submission.csv", index=False, columns=["qa_id"] + outputs.tolist())