In [None]:
from collections import Counter, defaultdict
import copy
from functools import partial
import itertools
import os
from pathlib import Path
import random
import re
import string
import time
from typing import Dict, List, Tuple
import warnings

from gensim.models import Word2Vec
import joblib
from numba import cuda
import numpy as np
import pandas as pd
import plotly_express as px
import scipy

from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GroupKFold, KFold
from sklearn.utils import shuffle

import tensorflow as tf
import tensorflow_hub as hub

import torch
from torch import nn
from torch.optim.optimizer import Optimizer
from torch.utils.data import Dataset, Sampler, DataLoader

In [None]:
# Wall-clock reference; the last cell reports total runtime relative to this.
start_time = time.time()

# Input locations, all relative to the notebook's working directory.
INPUT_DIR = Path('../input/')
DATA_DIR = INPUT_DIR / 'google-quest-challenge'
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'
SAMPLE_SUBMISSION_PATH = DATA_DIR / 'sample_submission.csv'

# Saved Universal Sentence Encoder (QA) module and pickled word-vector lookup.
USE_DIR = INPUT_DIR / 'universal-sentence-encoder/universal-sentence-encoder-qa/universal-sentence-encoder-qa/'
EMBEDDING_PATH = INPUT_DIR / 'fasttext-vector/fasttext.pkl'

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# The 30 label columns: 21 question-level targets followed by 9 answer-level targets.
# `num_targets` must stay equal to len(target_names).
num_targets = 30
target_names = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written']

# Title and body are joined into a single "question" text field.
train_df['question'] = train_df['question_title'] + ' ' + train_df['question_body']
test_df['question'] = test_df['question_title'] + ' ' + test_df['question_body']

# Cross-validation / optimisation settings used by the training loop.
n_splits = 5
n_epochs = 9
batch_size = 32

# Maximum number of tokens kept per question / answer when tokenizing.
max_q_len = 512
max_a_len = 512

# EMA settings: `mu` is the decay passed to EMA; `updates_per_epoch` is used
# together with `batch_size` to derive how many batches pass between EMA updates.
updates_per_epoch = 100
mu = 0.9

# Word-vector dimensionality and vocabulary cap.
embed_size = 300
max_features = 60000

seed = 1029
# NOTE(review): hard-coded to CUDA; the notebook will not run on a CPU-only machine.
device = torch.device('cuda')

# Stemmers used as fallback lookups when a token has no pretrained vector.
ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer('english')

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Seed every RNG in use for reproducibility.
random.seed(seed)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel — confirm if hash order matters.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [None]:
# Lower-case contraction -> expansion table applied before punctuation splitting.
# Keys are matched as plain substrings (no word-boundary check).
misspell_dict = {"aren't": "are not", "can't": "cannot", "couldn't": "could not",
                 "didn't": "did not", "doesn't": "does not", "don't": "do not",
                 "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would", "he'll": "he will", "he's": "he is",
                 "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
                 "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
                 "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
                 "she'd": "she would", "she'll": "she will", "she's": "she is",
                 "shouldn't": "should not", "that's": "that is", "there's": "there is",
                 "they'd": "they would", "they'll": "they will", "they're": "they are",
                 "they've": "they have", "we'd": "we would", "we're": "we are",
                 "weren't": "were not", "we've": "we have", "what'll": "what will",
                 "what're": "what are", "what's": "what is", "what've": "what have",
                 "where's": "where is", "who'd": "who would", "who'll": "who will",
                 "who're": "who are", "who's": "who is", "who've": "who have",
                 "won't": "will not", "wouldn't": "would not", "you'd": "you would",
                 "you'll": "you will", "you're": "you are", "you've": "you have",
                 # "we'll" previously expanded to " will", silently dropping the subject.
                 "'re": " are", "wasn't": "was not", "we'll": "we will", "tryin'": "trying"}

# Built once at definition time instead of on every call; keys are escaped so that
# any regex metacharacter added to the table later is matched literally.
_MISSPELL_RE = re.compile('(%s)' % '|'.join(re.escape(key) for key in misspell_dict))


def replace_typical_misspell(text: str) -> str:
    """Expand every contraction listed in ``misspell_dict`` found in ``text``.

    Args:
        text: Input string; matching is case-sensitive against the lower-case
            keys, so callers are expected to lower-case first.

    Returns:
        ``text`` with each matched key replaced by its expansion.
    """

    def replace(match):
        return misspell_dict[match.group(0)]

    return _MISSPELL_RE.sub(replace, text)


# Symbols that get isolated as their own whitespace-separated token.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']


def clean_text(text: str) -> str:
    """Surround every known punctuation symbol in ``text`` with spaces.

    The input is coerced with ``str()`` first. Symbols come from ``puncts``
    followed by ``string.punctuation``; a symbol present in both sequences is
    padded twice, so it ends up with two spaces on each side.
    """
    spaced = str(text)
    for symbol in itertools.chain(puncts, string.punctuation):
        # str.replace is a no-op when the symbol is absent.
        spaced = spaced.replace(symbol, ' ' + symbol + ' ')
    return spaced


def clean_numbers(text: str) -> str:
    """Replace each run of consecutive digits in ``text`` with a single space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', text)


def preprocess_text(text: str) -> str:
    """Normalise one raw text field.

    Steps, in order: lower-case, expand contractions, isolate punctuation,
    blank out digit runs, and trim surrounding whitespace.
    """
    lowered = text.lower()
    expanded = replace_typical_misspell(lowered)
    spaced = clean_text(expanded)
    return clean_numbers(spaced).strip()

In [None]:
# Columns fed to the sequence model; column order (question, answer) is relied on
# later when the resulting arrays are indexed with [:, 0] and [:, 1].
text_columns = ['question', 'answer']
# Element-wise cleaning; `.values` yields one row per sample with the two text columns.
train_texts = train_df[text_columns].applymap(preprocess_text).values
test_texts = test_df[text_columns].applymap(preprocess_text).values
# Flat list of every cleaned question and answer string from train and test.
all_texts = list(itertools.chain(*train_texts, *test_texts))

In [None]:
def build_vocab(texts: List[str], max_features: int = 100000) -> Dict[str, Dict]:
    """Build token/id lookup tables from whitespace-tokenised texts.

    Args:
        texts: Iterable of strings; each is split on whitespace.
        max_features: Keep only this many most frequent tokens.

    Returns:
        A dict with three entries:
        ``token2id`` — kept tokens numbered from 1 in frequency order, plus
        ``'<PAD>'`` at 0 and ``'<UNK>'`` at the highest id;
        ``id2token`` — the inverse mapping;
        ``word_freq`` — counts of the kept tokens, with both special tokens at 0.
    """
    token_counts = Counter(token for text in texts for token in text.split())
    top_tokens = token_counts.most_common(max_features)

    token2id = {token: rank for rank, (token, _) in enumerate(top_tokens, start=1)}
    token2id['<PAD>'] = 0
    token2id['<UNK>'] = len(token2id)

    id2token = {idx: token for token, idx in token2id.items()}

    word_freq = {'<PAD>': 0, '<UNK>': 0}
    word_freq.update(top_tokens)

    return {'token2id': token2id, 'id2token': id2token, 'word_freq': word_freq}


def tokenize(texts: List[str],
             token2id: Dict[str, int],
             max_len: int = 200) -> List[List[int]]:
    """Convert texts to lists of token ids.

    Each text is split on whitespace and truncated to its first ``max_len``
    tokens. Tokens missing from ``token2id`` are mapped to
    ``len(token2id) - 1``, the id that ``build_vocab`` assigns to ``'<UNK>'``.
    """
    unk_id = len(token2id) - 1
    encoded = []
    for text in texts:
        kept_tokens = text.split()[:max_len]
        encoded.append([token2id.get(token, unk_id) for token in kept_tokens])
    return encoded

In [None]:
def load_embedding(embedding_path: str, word_index: Dict[str, int]) -> np.ndarray:
    """Build an embedding matrix whose row ``i`` holds the vector for token id ``i``.

    The pretrained lookup is loaded with ``joblib.load`` (pickle-based, so the
    file must come from a trusted source). For every token the following
    spellings are tried in order and the first one found is used: as-is,
    lower-case, upper-case, capitalised, then the Porter, Lancaster and
    Snowball stems. Tokens with no match keep an all-zero row.

    Args:
        embedding_path: Path of the serialized token -> vector mapping.
        word_index: Token -> row id mapping.

    Returns:
        Array of shape ``(min(max_features + 2, len(word_index)), embed_size)``.
    """
    pretrained = joblib.load(embedding_path)

    n_rows = min(max_features + 2, len(word_index))
    matrix = np.zeros((n_rows, embed_size))

    # Ordered fallbacks; evaluated lazily so stemming only runs when needed.
    spellings = (
        lambda token: token,
        str.lower,
        str.upper,
        str.capitalize,
        ps.stem,
        lc.stem,
        sb.stem,
    )

    for token, row in word_index.items():
        for respell in spellings:
            vector = pretrained.get(respell(token))
            if vector is not None:
                matrix[row] = vector
                break

    return matrix


def w2v_fine_tune(all_texts: List[str], vocab: Dict, embedding_matrix: np.ndarray) -> np.ndarray:
    """Continue training pretrained word vectors on the competition corpus.

    A Word2Vec model is created over ``vocab['word_freq']``, initialised from
    ``embedding_matrix`` and trained on ``all_texts``.

    Args:
        all_texts: Cleaned texts. Plain strings are split on whitespace here,
            because gensim expects each sentence as a list of tokens; a raw
            string would be iterated character by character.
        vocab: Output of ``build_vocab`` (uses ``word_freq`` and ``token2id``).
        embedding_matrix: Initial vectors, row ``i`` belonging to token id ``i``.

    Returns:
        Array of shape ``(len(vocab['token2id']), dim)`` with the tuned vectors
        placed at their token ids; rows of tokens the Word2Vec model does not
        contain (the zero-frequency ``'<PAD>'`` / ``'<UNK>'``) are zero.
    """
    dim = embedding_matrix.shape[1]
    model = Word2Vec(min_count=1, workers=1, iter=3, size=dim)
    model.build_vocab_from_freq(vocab['word_freq'])

    # Row in the gensim model -> row in our embedding matrix.
    idxmap = np.array(
        [vocab['token2id'][w] for w in model.wv.index2entity])
    model.wv.vectors[:] = embedding_matrix[idxmap]
    model.trainables.syn1neg[:] = embedding_matrix[idxmap]

    # Tokenise strings; already-tokenised sentences are passed through.
    sentences = [
        text.split() if isinstance(text, str) else list(text)
        for text in all_texts]
    model.train(sentences, total_examples=len(sentences), epochs=model.epochs)

    # Scatter back through idxmap rather than assuming gensim kept our id order.
    tuned = np.zeros((len(vocab['token2id']), dim))
    tuned[idxmap] = model.wv.vectors
    return tuned

In [None]:
# Vocabulary over every train + test text, capped at `max_features` tokens.
vocab = build_vocab(itertools.chain(*train_texts, *test_texts), max_features)
# Pretrained vectors aligned to the vocabulary ids, then adjusted on this corpus.
embedding_matrix = load_embedding(EMBEDDING_PATH, vocab['token2id'])
embedding_matrix = w2v_fine_tune(all_texts, vocab, embedding_matrix)

# Column 0 of the text arrays is the question, column 1 the answer.
train_q = tokenize(train_texts[:, 0], vocab['token2id'], max_q_len)
train_a = tokenize(train_texts[:, 1], vocab['token2id'], max_a_len)
# NOTE(review): built from variable-length id lists; presumably this yields an
# (n_samples, 2) object array — recent NumPy versions reject ragged input unless
# dtype=object is given, so confirm against the NumPy version in use.
train_x = np.array([train_q, train_a]).T
train_y = train_df[target_names].values
# Samples sharing a question body share a group (used for GroupKFold).
train_group = train_df['question_body'].values

test_q = tokenize(test_texts[:, 0], vocab['token2id'], max_q_len)
test_a = tokenize(test_texts[:, 1], vocab['token2id'], max_a_len)
test_x = np.array([test_q, test_a]).T

# target scaling: per-target min-max to [0, 1].
# NOTE(review): a target that is constant over the training set would divide by zero here.
t_max = train_y.max(axis=0)[np.newaxis, :]
t_min = train_y.min(axis=0)[np.newaxis, :]
train_y = (train_y - t_min) / (t_max - t_min)

In [None]:
class UniversalSentenceEncoder(object):
    """Callable wrapper that embeds texts in batches with a loaded TF Hub module.

    ``mode='question'`` uses the module's ``question_encoder`` signature;
    ``mode='answer'`` uses ``response_encoder`` and passes the same texts as
    both ``input`` and ``context``.
    """

    def __init__(self, model_dir: str, batch_size: int = 128):
        self.module = hub.load(str(model_dir))
        self.batch_size = batch_size

    def __call__(self, texts: List[str], mode: str) -> torch.FloatTensor:
        """Return one embedding row per text, concatenated along dim 0."""
        assert mode in ['question', 'answer']
        encode_question = mode == 'question'

        chunks = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            if encode_question:
                encoder = self.module.signatures['question_encoder']
                result = encoder(tf.constant(batch))
            else:
                encoder = self.module.signatures['response_encoder']
                result = encoder(
                    input=tf.constant(batch),
                    context=tf.constant(batch))
            # Hand the TF tensor over to torch through NumPy.
            chunks.append(torch.FloatTensor(result['outputs'].numpy()))
        return torch.cat(chunks, 0)


In [None]:
use = UniversalSentenceEncoder(USE_DIR, batch_size=32)

# Encode train and test together; the raw (uncleaned) text columns are used here.
q_texts = list(itertools.chain(train_df['question'].values, test_df['question'].values))
a_texts = list(itertools.chain(train_df['answer'].values, test_df['answer'].values))
q_emb = use(q_texts, mode='question')
a_emb = use(a_texts, mode='answer')

# Train rows come first in the concatenated inputs, so split at len(train_df).
train_q_emb, test_q_emb = q_emb[:len(train_df)], q_emb[len(train_df):]
train_a_emb, test_a_emb = a_emb[:len(train_df)], a_emb[len(train_df):]

# One sentence-level feature row per sample: question embedding followed by answer embedding.
train_text_emb = torch.cat((train_q_emb, train_a_emb), 1)
test_text_emb = torch.cat((test_q_emb, test_a_emb), 1)

# NOTE(review): presumably resets the CUDA context through numba so GPU memory held
# by TensorFlow is released before PyTorch training starts — confirm.
del use
cuda.select_device(0)
cuda.close()

In [None]:
class TextDataset(Dataset):
    """Index-aware dataset over (question ids, answer ids) pairs.

    Every item carries its own position so callers can look up per-sample
    side information and write predictions back in the original order.
    """

    def __init__(self, seqs, targets=None):
        self.seqs = seqs
        self.targets = targets

    def __len__(self):
        return len(self.seqs)

    def get_keys(self):
        """Per-sample sort key: summed length of the sequences in each row."""
        token_counts = np.vectorize(len)(self.seqs)
        return token_counts.sum(axis=1)

    def __getitem__(self, index):
        sample = self.seqs[index]
        if self.targets is not None:
            return index, sample, self.targets[index]
        return index, sample


def collate_fn(data):
    """Batch ``TextDataset`` items into left-padded id tensors.

    Args:
        data: List of ``(index, (q_ids, a_ids))`` or
            ``(index, (q_ids, a_ids), target)`` items.

    Returns:
        ``(index, [q_tensor, a_tensor])`` when the items carry no targets,
        otherwise ``(index, [q_tensor, a_tensor], targets)`` with ``targets``
        as a float tensor. Each id tensor is padded with zeros on the left up
        to the longest sequence in the batch.
    """

    def _left_pad(seqs):
        width = max(len(seq) for seq in seqs)
        padded = torch.zeros(len(seqs), width).long()
        for row, seq in enumerate(seqs):
            # Real tokens occupy the right-hand end of the row.
            padded[row, width - len(seq):] = torch.LongTensor(seq)
        return padded

    columns = list(zip(*data))
    index, pairs = columns[0], columns[1]
    questions, answers = zip(*pairs)
    seqs = [_left_pad(questions), _left_pad(answers)]

    has_targets = len(columns) != 2
    if has_targets:
        return index, seqs, torch.FloatTensor(columns[2])
    return index, seqs


class BucketSampler(Sampler):
    """Sampler that groups samples of similar length into the same batch.

    Samples are split into buckets of ``bucket_size``; inside each bucket they
    are sorted by descending ``sort_keys`` and cut into batches of
    ``batch_size``. With ``shuffle_data=True`` the sample order and the batch
    order inside each bucket are re-drawn on every iteration; otherwise one
    fixed order is computed at construction time. The single incomplete batch,
    if any, is always emitted last.

    Args:
        data_source: Dataset, forwarded to ``Sampler.__init__``.
        sort_keys: NumPy array with one length-like key per sample.
        bucket_size: Samples per bucket; must be a multiple of ``batch_size``.
            Defaults to one bucket holding everything.
        batch_size: Must match the ``batch_size`` given to the ``DataLoader``.
        shuffle_data: Whether to reshuffle on every iteration.
    """

    def __init__(self, data_source, sort_keys, bucket_size=None, batch_size=1048, shuffle_data=True):
        super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_keys = sort_keys
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_keys)
        self.weights = None

        if not shuffle_data:
            self.index = self.prepare_buckets()
        else:
            self.index = None

    def set_weights(self, weights):
        """Set per-sample sampling weights (normalised to sum to 1).

        Weights only take effect when the sampler was built with
        ``shuffle_data=True``.
        """
        weights = np.asarray(weights, dtype=np.float64)
        # Element-wise check; a bare `assert weights >= 0` raises on arrays.
        assert np.all(weights >= 0)
        total = np.sum(weights)
        assert total > 0
        if total != 1:
            weights = weights / total
        self.weights = weights

    def __iter__(self):
        indices = None
        if self.weights is not None:
            # Weighted resampling with replacement, same size as the dataset.
            total = len(self.sort_keys)
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        """Return, for each sample id, its position in the current order."""
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_keys)

    def prepare_buckets(self, indices=None):
        """Compute the sample order.

        Args:
            indices: Optional array of sample ids to order (e.g. a weighted
                resample). When omitted, all samples are used, shuffled first
                if shuffling is enabled.

        Returns:
            Array of sample ids in iteration order.
        """
        lens = - self.sort_keys  # negated so ascending argsort gives longest first
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lens)

        if indices is None:
            if self.shuffle:
                indices = shuffle(np.arange(len(lens), dtype=np.int32))
            else:
                indices = np.arange(len(lens), dtype=np.int32)
        else:
            indices = np.asarray(indices)
        # Keys must follow the chosen samples; previously caller-supplied
        # indices were bucketed using the keys of unrelated positions.
        lens = lens[indices]

        #  bucket iterator: yields (positions, values) for consecutive chunks of size n
        def divide_chunks(l, n):
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                # looping till length l
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch = None
        for chunk_index, chunk in divide_chunks(lens, self.bucket_size):
            # sort indices in bucket by descending order of length
            indices_sorted = chunk_index[np.argsort(chunk, axis=-1)]
            batches = []
            for _, batch in divide_chunks(indices_sorted, self.batch_size):
                if len(batch) == self.batch_size:
                    batches.append(batch.tolist())
                else:
                    # At most one incomplete batch can exist overall.
                    assert extra_batch is None
                    assert batch is not None
                    extra_batch = batch

            # shuffling batches within buckets
            if self.shuffle:
                batches = shuffle(batches)
            for batch in batches:
                new_indices.extend(batch)

        if extra_batch is not None:
            new_indices.extend(extra_batch)
        return indices[new_indices]

In [None]:
# Test-time loader: fixed length-sorted order (no shuffling); predictions are
# written back by sample index, so the iteration order does not affect the output.
test_dataset = TextDataset(test_x)
test_sampler = BucketSampler(test_dataset, test_dataset.get_keys(),
                             batch_size=batch_size, shuffle_data=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler,
                         shuffle=False, num_workers=0, collate_fn=collate_fn)

In [None]:
class LstmUnit(nn.Module):
    """Sequence encoder: frozen embeddings, two stacked BiLSTMs, mean+max pooling.

    Args:
        embedding_matrix: 2-D NumPy array of pretrained vectors, one row per token id.
        lstm_hidden_size: Hidden size of the first bidirectional LSTM.
        gru_hidden_size: Hidden size of the second bidirectional LSTM (an LSTM
            despite the parameter name).

    ``forward`` returns a tensor of shape ``(batch, 4 * gru_hidden_size)``.
    """

    def __init__(self, embedding_matrix, lstm_hidden_size=120, gru_hidden_size=60):
        super().__init__()
        vocab_size, embed_dim = embedding_matrix.shape

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained)
        # Embeddings are not trained.
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = nn.Dropout2d(0.2)

        self.lstm = nn.LSTM(embed_dim, lstm_hidden_size, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_hidden_size * 2, gru_hidden_size, bidirectional=True, batch_first=True)

    def apply_spatial_dropout(self, h_embedding):
        """Drop whole embedding channels across all time steps."""
        # (batch, time, emb) -> (batch, emb, 1, time) so Dropout2d zeroes channels.
        channels_first = h_embedding.transpose(1, 2).unsqueeze(2)
        dropped = self.embedding_dropout(channels_first)
        return dropped.squeeze(2).transpose(1, 2)

    def flatten_parameters(self):
        for recurrent in (self.lstm, self.lstm2):
            recurrent.flatten_parameters()

    def forward(self, x):
        embedded = self.apply_spatial_dropout(self.embedding(x))

        hidden, _ = self.lstm(embedded)
        hidden, _ = self.lstm2(hidden)

        # Pool over the time dimension.
        mean_pooled = hidden.mean(dim=1)
        max_pooled, _ = hidden.max(dim=1)

        return torch.cat((mean_pooled, max_pooled), 1)


class LstmModel(nn.Module):
    """Two-tower model over question and answer token ids plus sentence embeddings.

    Question and answer are encoded by separate ``LstmUnit`` towers, concatenated
    with ``text_emb`` (expected width ``512 * 2``) and passed through a hidden
    layer to ``num_targets`` logits.
    """

    def __init__(self, embedding_matrix):
        super().__init__()
        first_layer_size = 120
        second_layer_size = 120

        self.q_lstm = LstmUnit(embedding_matrix, first_layer_size, second_layer_size)
        self.a_lstm = LstmUnit(embedding_matrix, first_layer_size, second_layer_size)

        # Each tower emits 4 * second_layer_size features (2 directions x 2 poolings).
        towers_width = (second_layer_size + second_layer_size) * 4
        self.linear = nn.Linear(towers_width + 512 * 2, 200)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(200, num_targets)

    def flatten_parameters(self):
        for tower in (self.q_lstm, self.a_lstm):
            tower.flatten_parameters()

    def forward(self, q_seqs, a_seqs, text_emb):
        question_features = self.q_lstm(q_seqs)
        answer_features = self.a_lstm(a_seqs)

        features = torch.cat((question_features, answer_features, text_emb), 1)
        hidden = self.dropout(self.relu(self.linear(features)))
        return self.out(hidden)

In [None]:
class EMA(object):
    """Exponential moving average of a model's trainable parameters.

    The average ("shadow") is updated as
    ``shadow = (1 - mu) * param + mu * shadow``.

    Args:
        model: Model whose ``requires_grad`` parameters are tracked.
        mu: Decay factor; larger values weight the history more.
        level: ``'batch'`` to update from ``on_batch_end`` or ``'epoch'`` to
            update from ``on_epoch_end``.
        n: With ``level='batch'``, update once every ``n`` calls. Values below 1
            are treated as 1 (previously they disabled updates entirely).
    """

    def __init__(self, model, mu, level='batch', n=1):
        self.mu = mu
        self.level = level
        self.n = max(int(n), 1)
        self.cnt = self.n
        # Clone so the shadow is a snapshot; storing `param.data` itself would
        # alias the live weights and follow every in-place optimizer step.
        self.shadow = {
            name: param.data.clone()
            for name, param in model.named_parameters()
            if param.requires_grad}

    def _update(self, model):
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = (1 - self.mu) * param.data + self.mu * self.shadow[name]

    def set_weights(self, ema_model):
        """Copy the averaged weights into ``ema_model`` (same architecture)."""
        for name, param in ema_model.named_parameters():
            if param.requires_grad:
                # Copy values instead of sharing storage with the shadow tensors.
                param.data.copy_(self.shadow[name])

    def on_batch_end(self, model):
        # String comparison must use ==; `is` tests object identity.
        if self.level == 'batch':
            self.cnt -= 1
            if self.cnt == 0:
                self._update(model)
                self.cnt = self.n

    def on_epoch_end(self, model):
        if self.level == 'epoch':
            self._update(model)

In [None]:
def get_scores(y_true, y_pred) -> Dict[str, float]:
    """Spearman rank correlation per target column.

    Both arguments are ``np.ndarray`` of shape ``(sample_size, num_targets)``.
    Returns a dict keyed by the names in ``target_names``.
    """
    assert y_true.shape == y_pred.shape
    n_columns = y_true.shape[1]
    assert n_columns == num_targets
    return {
        target_name: scipy.stats.spearmanr(y_true[:, col], y_pred[:, col])[0]
        for col, target_name in zip(range(n_columns), target_names)}


def predict(model: nn.Module,
            data_loader: DataLoader,
            text_emb: torch.Tensor,
            device: torch.device = torch.device('cuda')) -> np.ndarray:
    """Run ``model`` over ``data_loader`` and return sigmoid probabilities.

    Args:
        model: Network taking ``(q_seqs, a_seqs, text_emb)``; switched to eval
            mode here and left in eval mode on return.
        data_loader: Loader yielding ``(index, [q_seqs, a_seqs])`` batches.
        text_emb: Per-sample embeddings, indexed by the batch's sample indices.
        device: Device the batches are moved to.

    Returns:
        Array of shape ``(len(data_loader.dataset), num_targets)`` where each
        row sits at its sample index regardless of iteration order.
    """
    model.eval()
    n_samples = len(data_loader.dataset)
    preds_fold = np.zeros((n_samples, num_targets))

    with torch.no_grad():
        for index, x_batch in data_loader:
            rows = list(index)
            inputs = [x.to(device) for x in x_batch]
            emb_batch = text_emb[rows].to(device)
            logits = model(*inputs, emb_batch).detach()
            preds_fold[rows] = torch.sigmoid(logits.cpu()).numpy()
    return preds_fold

In [None]:
# Grouped K-fold training. Groups are question bodies (`train_group`), so rows
# sharing a question body never straddle the train/validation split.
gkf = GroupKFold(n_splits=n_splits)
cv_scores = []       # last-epoch validation score of the raw model, one per fold
ema_cv_scores = []   # per-target score dict of the EMA model, one per fold
fold_scores = []     # one record per (fold, epoch) for the learning-curve plot
# Out-of-fold predictions on train, and fold-averaged predictions on test.
train_preds = np.zeros((len(train_x), num_targets))
test_preds = np.zeros((len(test_x), num_targets))
ema_train_preds = np.zeros((len(train_x), num_targets))
ema_test_preds = np.zeros((len(test_x), num_targets))

for i, (train_idx, valid_idx) in enumerate(list(gkf.split(train_x, train_y, train_group))):
    print(f'fold {i + 1}')
    train_fold_x, train_fold_y = train_x[train_idx], train_y[train_idx]
    valid_fold_x, valid_fold_y = train_x[valid_idx], train_y[valid_idx]
    train_fold_emb = train_text_emb[train_idx]
    valid_fold_emb = train_text_emb[valid_idx]
    
    # The validation dataset carries no targets; scoring uses valid_fold_y directly.
    train_dataset = TextDataset(train_fold_x, train_fold_y)
    valid_dataset = TextDataset(valid_fold_x)

    # Training batches are length-bucketed and reshuffled on every pass;
    # validation uses a fixed length-sorted order.
    train_sampler = BucketSampler(train_dataset, train_dataset.get_keys(),
                                  bucket_size=batch_size * 20, batch_size=batch_size)
    valid_sampler = BucketSampler(valid_dataset, valid_dataset.get_keys(),
                                  batch_size=batch_size, shuffle_data=False)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False,
                              sampler=train_sampler, num_workers=0, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                              sampler=valid_sampler, collate_fn=collate_fn)

    # A fresh model and optimizer for every fold.
    model = LstmModel(embedding_matrix).to(device)
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Separate copy that receives the averaged weights after training.
    ema_model = copy.deepcopy(model)
    ema_model.eval()
    
    # Number of batches between EMA updates.
    # NOTE(review): int() truncates, so this is 0 whenever the fold has fewer than
    # updates_per_epoch * batch_size samples — confirm EMA handles n == 0 as intended.
    ema_n = int(len(train_loader.dataset) / (updates_per_epoch * batch_size))
    ema = EMA(model, mu, n=ema_n)

    for epoch in range(n_epochs):
        epoch_start_time = time.time()
        model.train()
        for index, x_batch, y_batch in train_loader:
            x_batch = (x.to(device) for x in x_batch)
            y_batch = y_batch.to(device)
            # `index` holds positions within this fold's dataset.
            emb_batch = train_fold_emb[list(index)].to(device)
            y_preds = model(*x_batch, emb_batch)
            
            # The model outputs logits; targets were min-max scaled to [0, 1].
            loss = nn.BCEWithLogitsLoss()(y_preds, y_batch)
            loss.backward()
            optimizer.step()
            model.zero_grad()
            ema.on_batch_end(model)
        
        # Validate the raw (non-averaged) model after every epoch.
        valid_preds = predict(model, valid_loader, valid_fold_emb, device=device)
        scores = get_scores(valid_fold_y, valid_preds)
        score = np.mean(list(scores.values()))
        fold_scores.append({
            'fold': i + 1,
            'epoch': epoch + 1,
            'score': score
        })
        elapsed_time = time.time() - epoch_start_time
        print('Epoch {}/{} \t score: {:.4f} \t time: {:.2f}s'.format(
            epoch + 1, n_epochs, score, elapsed_time))
        ema.on_epoch_end(model)
        
    # Load the averaged weights into the EMA copy.
    ema.set_weights(ema_model)
    ema_model.flatten_parameters()
    
    # `valid_preds` and `score` here come from the final epoch.
    train_preds[valid_idx] = valid_preds
    ema_valid_preds = predict(ema_model, valid_loader, valid_fold_emb, device=device)
    ema_train_preds[valid_idx] = ema_valid_preds

    cv_scores.append(score)
    ema_scores = get_scores(valid_fold_y, ema_valid_preds)
    ema_score = np.mean(list(ema_scores.values()))
    print(f'EMA score: {ema_score:.4f}')
    ema_cv_scores.append(ema_scores)
    
    # Each fold contributes 1 / n_splits of the final test prediction.
    test_preds += predict(model, test_loader, test_text_emb, device=device) / n_splits
    ema_test_preds += predict(ema_model, test_loader, test_text_emb, device=device) / n_splits

In [None]:
# Learning curves: raw-model validation score per epoch, one line per fold.
fig = px.line(pd.DataFrame(fold_scores), x='epoch', y='score', color='fold')
fig.show()

In [None]:
# Average the per-fold EMA score dicts into one score per target.
# NOTE: this rebinds `ema_cv_scores` from a list of dicts to a DataFrame, so the
# cell is not meant to be re-run on its own; later cells expect the DataFrame form.
ema_cv_scores = pd.DataFrame(ema_cv_scores).mean().reset_index()
ema_cv_scores.columns = ['target_name', 'score']
ema_cv_scores.sort_values('score', inplace=True)
fig = px.bar(ema_cv_scores, x='score', y='target_name', orientation='h')
fig.show()

In [None]:
# Write a submission from the fold-averaged EMA test predictions.
# The same file name is written again by the notebook's final cell.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission[target_names] = ema_test_preds
submission.to_csv('submission.csv', index=False)

In [None]:
# Overall CV: mean last-epoch score of the raw model across folds, versus the
# mean over targets of the fold-averaged EMA scores.
cv_score = np.mean(cv_scores)
ema_score = ema_cv_scores['score'].mean()
print(f'CV score: {cv_score:.4f}')
print(f'EMA score: {ema_score:.4f}')

In [None]:
class OptimizedRounder(BaseEstimator, TransformerMixin):
    """Learn lower/upper clipping thresholds that improve Spearman correlation.

    ``threshold`` holds ``[lower, upper]``; ``transform`` clips predictions to
    that interval. Clipping creates ties at the extremes, which can raise a
    rank correlation against labels that themselves contain many ties.
    ``ab_start`` gives the initial search interval for each threshold.
    """

    def __init__(self):
        self.threshold = [0., 1.]
        self.ab_start = [(0., 0.2), (0.8, 1.)]

    def fit(self, train_labels, train_preds):
        """Search both thresholds on 1-D ``train_labels`` / ``train_preds``.

        A searched threshold is kept only if it beats the best score so far by
        more than 1e-3; otherwise it is reset to its neutral value (0 or 1).

        Returns:
            ``self``, following the scikit-learn estimator convention.
        """
        assert train_labels.shape == train_preds.shape
        assert train_labels.ndim == 1

        self.best_score = self.score(train_labels, train_preds)
        self._golden_section_search(train_labels, train_preds, 0)  # lower threshold
        score = self.score(train_labels, train_preds)
        if score > self.best_score + 1e-3:
            self.best_score = score
        else:
            self.threshold[0] = 0.

        self._golden_section_search(train_labels, train_preds, 1)  # higher threshold
        score = self.score(train_labels, train_preds)
        if score > self.best_score + 1e-3:
            self.best_score = score
        else:
            self.threshold[1] = 1.
        return self

    def _golden_section_search(self, train_labels, train_preds, idx):
        """Shrink the interval ``ab_start[idx]`` for ``threshold[idx]`` in place.

        idx == 0 searches the lower threshold, idx == 1 the higher one. The
        search is deterministic and always starts from ``ab_start[idx]``, so it
        is run once; repeating it from the same start reproduces the same result.
        """
        golden1 = 0.618
        golden2 = 1 - golden1

        a, b = self.ab_start[idx]
        # Losses (negated scores) at both ends of the interval.
        self.threshold[idx] = a
        la = -self.score(train_labels, train_preds)
        self.threshold[idx] = b
        lb = -self.score(train_labels, train_preds)
        for _ in range(20):
            # Move the worse end towards the better one.
            if la > lb:
                a = b - (b - a) * golden1
                self.threshold[idx] = a
                la = -self.score(train_labels, train_preds)
            else:
                b = b - (b - a) * golden2
                self.threshold[idx] = b
                lb = -self.score(train_labels, train_preds)

    def transform(self, preds):
        """Clip ``preds`` to the thresholds, unless that makes them all equal."""
        transformed = np.clip(preds, *self.threshold)
        # A constant vector has no defined rank correlation; fall back to the input.
        if np.unique(transformed).size == 1:
            return preds
        return transformed

    def score(self, labels, preds):
        """Spearman correlation between ``labels`` and the transformed ``preds``."""
        p = self.transform(preds)
        score = scipy.stats.spearmanr(labels, p)[0]
        return score

In [None]:
# Estimate clipping thresholds on the out-of-fold EMA predictions with a separate
# 3-fold split: fit on the train part, check on the held-out part.
kf = KFold(n_splits=3, shuffle=True, random_state=1029)
train_scores = []
valid_scores = []
train_optimized_scores = defaultdict(list)
valid_optimized_scores = defaultdict(list)
thresholds = defaultdict(list)   # target name -> one [lower, upper] pair per split

for train_idx, valid_idx in kf.split(train_y):
    train_fold_preds, train_fold_y = ema_train_preds[train_idx], train_y[train_idx]
    valid_fold_preds, valid_fold_y = ema_train_preds[valid_idx], train_y[valid_idx]
    # Baseline scores without clipping.
    train_scores.append(get_scores(train_fold_y, train_fold_preds))
    valid_scores.append(get_scores(valid_fold_y, valid_fold_preds))
    
    for i, target_name in enumerate(target_names):
        # NOTE: `optimizer` here rebinds the name used for the torch optimizer
        # in the training cell.
        optimizer = OptimizedRounder()
        optimizer.fit(train_y[train_idx, i], ema_train_preds[train_idx, i])
        train_score = optimizer.score(train_y[train_idx, i], ema_train_preds[train_idx, i])
        valid_score = optimizer.score(train_y[valid_idx, i], ema_train_preds[valid_idx, i])
        train_optimized_scores[target_name].append(train_score)
        valid_optimized_scores[target_name].append(valid_score)
        thresholds[target_name].append(optimizer.threshold)

In [None]:
# Held-out scores after clipping, shown in the same target order as the EMA bar chart.
scores = pd.DataFrame(valid_optimized_scores).mean().loc[ema_cv_scores['target_name']].reset_index()
scores.columns = ['target_name', 'score']
fig = px.bar(scores, x='score', y='target_name', orientation='h')
fig.show()

In [None]:
# Mean score over all splits and targets, before -> after threshold clipping.
train_score = pd.DataFrame(train_scores).values.mean()
valid_score = pd.DataFrame(valid_scores).values.mean()
train_optimized_score = np.mean(list(train_optimized_scores.values()))
valid_optimized_score = np.mean(list(valid_optimized_scores.values()))
print(f'train score: {train_score:.4f} -> {train_optimized_score:.4f}')
print(f'valid score: {valid_score:.4f} -> {valid_optimized_score:.4f}')

In [None]:
# Clip each target's test predictions with the thresholds averaged over the splits.
# NOTE: this overwrites `ema_test_preds` in place, so re-running the cell clips
# already-clipped values.
for i, target_name in enumerate(target_names):
    optimizer = OptimizedRounder()
    optimizer.threshold = np.mean(thresholds[target_name], axis=0)
    ema_test_preds[:, i] = optimizer.transform(ema_test_preds[:, i])

In [None]:
# Final submission: overwrites submission.csv with the threshold-clipped predictions.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission[target_names] = ema_test_preds
submission.to_csv('submission.csv', index=False)
print(f'all processes done in {(time.time() - start_time) / 60:.2f} min.')