In [1]:
import numpy as np
import pandas as pd
import warnings
import itertools
from tqdm import tqdm_notebook
from scipy.stats import spearmanr
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings("ignore")

In [2]:
# Load the training frame; every column from position 11 onward is used as a target.
train = pd.read_csv('data/train.csv')
TARGETS = train.columns[11:]

# Replace each target column by its average rank, then min-max scale the
# ranks column by column so that `y` lies in [0, 1].
train[TARGETS] = train[TARGETS].apply(lambda target: target.rank(method="average"))
train[TARGETS] = MinMaxScaler().fit_transform(train[TARGETS])
y = train[TARGETS].values

# Positional indices of rows whose `question_body` group has a non-null
# `host` count of exactly one.
is_single_question = train.groupby("question_body")["host"].transform("count") == 1
ix = np.flatnonzero(is_single_question.values)

# Per-model prediction matrices (presumably out-of-fold predictions, going by
# the 'oofs' directory name), one column per target.
# NOTE(review): use_oof.csv is read WITHOUT index_col=0, unlike the other
# files -- confirm that it really has no index column.
y_use = pd.read_csv('oofs/use_oof.csv').values
y_albert, y_bert, y_roberta, y_xlnet = (
    pd.read_csv(oof_path, index_col=0).values
    for oof_path in (
        'oofs/siamese_albert_1_comb_oofs.csv',
        'oofs/siamese_bert_6_oofs.csv',
        'oofs/siamese_roberta_1_comb_oofs.csv',
        # alternative XLNet run: 'oofs/siamese_xlnet_2_comb_oofs.csv'
        'oofs/siamese_xlnet_1_comb_oofs.csv',
    )
)

# Order matters: blend weights are matched to these models by position.
preds = [y_use, y_roberta, y_bert, y_xlnet, y_albert]
n_models = len(preds)

In [3]:
def bce(y, y_pred):
    """Mean binary cross-entropy between targets `y` and predictions `y_pred`.

    A constant 1e-5 is added inside both logarithms so that predictions of
    exactly 0 or 1 do not produce log(0).
    """
    eps = 1e-5
    positive_term = y * np.log(y_pred + eps)
    negative_term = (1 - y) * np.log(1 - y_pred + eps)
    return -np.mean(positive_term + negative_term)


def evaluate(y, y_pred, verbose=False, exclude=()):
    """Mean column-wise Spearman correlation between `y` and `y_pred`.

    Parameters
    ----------
    y : 2-D array
        Ground-truth values, one column per target.
    y_pred : 2-D array
        Predictions with the same column layout as `y`.
    verbose : bool
        If true, print the name of every scored column (looked up in the
        global `TARGETS`) with its correlation rounded to 3 decimals.
    exclude : iterable of int
        Column indices to leave out of the average.

    Returns
    -------
    The mean correlation over the scored columns, rounded to 4 decimals
    (0 when every column is excluded). A NaN correlation in any scored
    column propagates to the result.
    """
    # Average over the columns that are actually scored. Dividing by
    # `y.shape[1] - len(exclude)` is wrong as soon as `exclude` contains
    # duplicates or indices that are not real columns.
    excluded = set(exclude)
    scored_cols = [i for i in range(y.shape[1]) if i not in excluded]
    n_scored = len(scored_cols)

    score = 0
    for i in scored_cols:
        col_score = spearmanr(y[:, i], y_pred[:, i])[0]
        if verbose:
            print(TARGETS[i], np.round(col_score, 3))
        score += col_score / n_scored
    return np.round(score, 4)


# Candidate quantization levels passed as `d` to `scale`; None (falsy) means
# "leave the values unquantized".
ds = [4, 8, 16, 32, 64, None]
# Candidate per-model blend weights; a weight of 0 removes the model's
# contribution from the blend.
ws = [0, 1, 2, 4]


def scale(x, d):
    """Round `x` down to the nearest multiple of 1/d.

    A falsy `d` (None or 0) disables quantization and returns `x` itself.
    """
    if not d:
        return x
    bin_width = 1 / d
    return (x // bin_width) / d

# Full search grid: every tuple is (d_global, d_local, w_1, ..., w_n_models),
# i.e. two quantization levels from `ds` followed by one weight from `ws`
# for each model in `preds`.
dws5 = list(itertools.product(ds, ds, *(ws for _ in range(n_models))))


def transform(preds, params, c):
    """Blend column `c` of every model's predictions using one parameter tuple.

    Parameters
    ----------
    preds : sequence of 2-D arrays
        One prediction matrix per model.
    params : sequence
        (d_global, d_local, w_1, ..., w_n): quantization level applied to the
        blend, quantization level applied to each model before blending, and
        one weight per model (matched to `preds` by position).
    c : int
        Column to blend.

    Returns
    -------
    The weighted average of the locally quantized columns, quantized again
    with `d_global`. The weights must not sum to zero, because the blend is
    divided by their sum.
    """
    d_global, d_local, *weights = params
    blended = 0
    for weight, model_pred in zip(weights, preds):
        blended += weight * scale(model_pred[:, c], d_local)
    blended /= sum(weights)
    return scale(blended, d_global)


# Weight of the binary cross-entropy penalty that `f` subtracts from the
# Spearman correlation when scoring a candidate blend.
bce_weight = 0.25


def f(c):
    """Grid-search the best blend parameters for target column `c`.

    Every tuple in the global `dws5` (except those whose weights sum to zero)
    is scored as Spearman correlation minus `bce_weight` times the BCE, both
    measured against `y[:, c]`. The unblended `y_roberta[:, c]` serves as the
    baseline to beat.

    Returns the index into `dws5` of the first tuple with the strictly
    highest score, or -1 when no tuple beats the baseline.
    """
    target = y[:, c]

    def objective(candidate):
        return spearmanr(target, candidate)[0] - bce_weight * bce(target, candidate)

    best_score = objective(y_roberta[:, c])
    best_index = -1
    for idx, params in enumerate(dws5):
        # All-zero weights would make the blend divide by zero.
        if sum(params[2:]) == 0:
            continue
        candidate_score = objective(transform(preds, params, c))
        if candidate_score > best_score:
            best_score, best_index = candidate_score, idx

    return best_index

In [None]:
import multiprocessing

# Run the per-column grid search `f` in parallel, one task per target column.
# The context manager shuts the worker processes down once `map` has returned;
# without it the 15 workers stay alive until the kernel is restarted.
# NOTE(review): the workers must be able to see `f` and the notebook globals,
# which holds for the "fork" start method -- confirm on other platforms.
with multiprocessing.Pool(15) as pool:
    out = pool.map(f, range(y.shape[1]))

In [None]:
# Build the final blend column by column from the parameters chosen by `f`.
y_combined = np.zeros(y.shape)

for c, best_index in enumerate(out):
    if best_index < 0:
        # `f` returns -1 when no grid entry beat the plain RoBERTa column.
        # Indexing `dws5[-1]` would silently pick the LAST grid entry instead,
        # so fall back to the baseline that actually won.
        print(c, TARGETS[c], "baseline (y_roberta)")
        y_combined[:, c] = y_roberta[:, c]
    else:
        best_params = dws5[best_index]
        print(c, TARGETS[c], best_params)
        y_combined[:, c] = transform(preds, best_params, c)

In [None]:
# Per-column and mean Spearman correlation of the blend over all targets.
evaluate(y, y_combined, verbose=True)

In [None]:
# Same report, with target columns 9 and 19 left out of the average.
evaluate(y, y_combined, verbose=True, exclude=[9, 19])

In [None]:
# Chosen parameter tuple per target column. `f` returns -1 when the RoBERTa
# baseline won; show None for those columns instead of the misleading
# `dws5[-1]` (the last grid entry).
[dws5[best_index] if best_index >= 0 else None for best_index in out]