From: https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0

In [1]:
import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
import datetime
import pandas as pd
from time import time
# from autograd import grad
# import autograd.numpy as np
import numpy as np
from numba import njit
from scipy.optimize import minimize, fsolve

from sklearn.metrics import log_loss

# Helpers

In [3]:
# CPMP's logloss from https://www.kaggle.com/c/lish-moa/discussion/183010
def log_loss_numpy(y_pred):
    """Mean element-wise binary log loss of ``y_pred`` against the global ``y_true``.

    Both arrays are flattened, so the score is averaged over every
    (row, target) entry at once. Predictions are clipped to
    [1e-15, 1 - 1e-15] before taking logarithms to avoid ``log(0)``.

    Parameters
    ----------
    y_pred : array-like
        Predicted probabilities with the same number of elements as the
        module-level ``y_true``.

    Returns
    -------
    float
        The mean log loss over all elements.
    """
    eps = 1e-15
    targets = np.asarray(y_true).ravel()
    probs = np.clip(np.asarray(y_pred).ravel(), eps, 1 - eps)
    # Positive entries pay -log(p); everything else pays -log(1 - p).
    per_element = np.where(targets == 1, -np.log(probs), -np.log(1 - probs))
    return per_element.mean()

def func_numpy_metric(weights):
    """Objective for the optimiser: log loss of the weighted OOF blend.

    Contracts ``weights`` with the first axis of the global ``oof`` array
    (one slice per model) and scores the resulting blend with
    ``log_loss_numpy``.

    Parameters
    ----------
    weights : array-like
        One weight per slice along the first axis of ``oof``.

    Returns
    -------
    Whatever ``log_loss_numpy`` returns for the blended predictions.
    """
    blended = np.tensordot(weights, oof, axes=(0, 0))
    return log_loss_numpy(blended)

def grad_func(weights):
    """Analytic gradient of the blend log loss with respect to the weights.

    With ``p = sum_j weights[j] * oof_clip[j]`` and targets ``y``, the
    derivative of the mean binary log loss with respect to ``weights[i]`` is

        mean( oof_clip[i] * (p - y) / (p * (1 - p)) )

    This is algebraically the same quantity as the expanded expression
    ``-(-y*b + b**2*w + b*c) / (b**2*w**2 + 2*b*c*w - b*w + c**2 - c)``
    (with ``b = oof_clip[i]``, ``w = weights[i]``, ``c = p - w*b``), but:

    * the blend ``p`` is computed once instead of rebuilding the
      leave-one-out sum for every model, and
    * the denominator is formed as ``p * (1 - p)`` rather than the
      expanded ``p**2 - p``, which avoids cancellation when ``p`` is
      close to 1.

    Parameters
    ----------
    weights : array-like of shape (n_models,)
        Current blend weights, one per slice along the first axis of the
        global ``oof`` array.

    Returns
    -------
    numpy.ndarray of shape (n_models,)
        Partial derivatives of the mean log loss, in model order.
    """
    weights = np.asarray(weights, dtype=float)
    # Each model's predictions are clipped individually (as opposed to the
    # blend), matching the clipping bounds used by the loss.
    oof_clip = np.clip(oof, 1e-15, 1 - 1e-15)
    targets = np.asarray(y_true)

    blend = np.tensordot(weights, oof_clip, axes=(0, 0))
    # d(loss)/d(p) for binary cross-entropy, element-wise.
    dloss_dblend = (blend - targets) / (blend * (1.0 - blend))

    # Chain rule: d(p)/d(w_i) = oof_clip[i]; contract over (row, target)
    # and divide by the element count to obtain the mean.
    summed = np.tensordot(oof_clip, dloss_dblend, axes=([1, 2], [0, 1]))
    return summed / dloss_dblend.size

@njit
def grad_func_jit(weights):
    """Numba-compiled gradient of the blend log loss w.r.t. each weight.

    For model ``i`` the other models' weighted (clipped) predictions are
    accumulated into ``others``; the partial derivative is then the
    negated mean of ``numerator / denominator`` below, where the
    denominator is the expanded form of ``p**2 - p`` for the blend
    ``p = weights[i] * preds_i + others``.

    Reads the module-level ``oof`` and ``y_true`` arrays.
    NOTE(review): numba is documented to treat module globals as
    compile-time constants - confirm ``oof``/``y_true`` are defined before
    the first call and not reassigned afterwards.

    Parameters
    ----------
    weights : 1-D float array with one entry per model in ``oof``.

    Returns
    -------
    1-D array of partial derivatives, in model order.
    """
    eps = 1e-15
    oof_clip = np.minimum(1 - eps, np.maximum(oof, eps))
    n_models = oof.shape[0]
    gradients = np.zeros(n_models)
    for i in range(n_models):
        targets = y_true
        preds_i = oof_clip[i]
        w_i = weights[i]
        # Weighted sum of every model except the i-th.
        others = np.zeros((oof.shape[1], oof.shape[2]))
        for j in range(n_models):
            if j != i:
                others += weights[j] * oof_clip[j]
        numerator = -targets * preds_i + (preds_i ** 2) * w_i + preds_i * others
        denominator = ((preds_i ** 2) * (w_i ** 2) + 2 * preds_i * others * w_i
                       - preds_i * w_i + (others ** 2) - others)
        gradients[i] = -np.mean(numerator / denominator)
    return gradients

def log_loss_metric(y_true, y_pred):
    """Average of the per-column sklearn ``log_loss`` scores.

    Iterates over the columns of the module-level ``train_targets`` frame
    and scores each one separately with ``labels=[0, 1]``; the predicted
    column is cast to float first.

    Parameters
    ----------
    y_true, y_pred : objects supporting ``.loc[:, column]`` selection
        Must contain every column name found in ``train_targets``.

    Returns
    -------
    The mean of the per-column log losses.
    """
    per_target = [
        log_loss(y_true.loc[:, col], y_pred.loc[:, col].astype(float), labels = [0,1])
        for col in train_targets.columns
    ]
    return np.mean(per_target)

# Load data

In [4]:
# Load the training targets and features, discarding the 'sig_id' column from each.
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv').drop(columns=['sig_id'])

train_features = pd.read_csv('../input/lish-moa/train_features.csv').drop(columns=['sig_id'])

In [5]:
# Validation predictions of the two Keras models arrive as CSV files; a
# .npy copy of each is written to the working directory so they can be
# loaded by path later on.
keras_val_submission = pd.read_csv('../input/keras-neural-net/val-submission.csv')
marge_keras_val_submission = pd.read_csv('../input/marge-keras-v2-load-model/val-submission.csv')

np.save('keras-oof.npy', keras_val_submission.to_numpy())
np.save('marge-keras-oof.npy', marge_keras_val_submission.to_numpy())

# The XGBoost predictions are already stored as a .npy array.
xboost_val_submission = np.load('../input/xgboost-baseline-saved-model-kadri/xgboos-oof.npy')

# Logistic-regression model is currently left out of the blend:
#log_reg_val_submission = np.load('../input/nb-log-reg-3-multilabelskf-ver2-saved-pyinference/log-reg-oof.npy', allow_pickle=True)
#np.save('log-reg-oof.npy', log_reg_val_submission)
#val_submissions = [keras_val_submission, xboost_val_submission, log_reg_val_submission, marge_keras_val_submission]

In [6]:
# Obtain each base model's test-set predictions.
# The row count of test_features decides between reusing the submissions
# stored with each model's dataset and re-running the inference scripts.
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
if len(test_features) == 3982: # if public test set, we can use existing submissions
    keras_submission = pd.read_csv('../input/keras-neural-net/submission.csv')
    xboost_submission = pd.read_csv('../input/xgboost-baseline-saved-model-kadri/submission.csv')
    #logreg_submission = pd.read_csv('../input/nb-log-reg-3-multilabelskf-ver2-saved-pyinference/submission.csv')
    marge_keras_submission = pd.read_csv('../input/marge-keras-v2-load-model/submission.csv')
else: # if private test set, we have to rerun inference  
    # Every model's result is read from the same './submission.csv' path
    # right after its script is invoked, so the order of the statements
    # below matters.
    # NOTE(review): presumably each inference.py overwrites ./submission.csv;
    # if a script fails without writing it, the following read would pick up
    # whatever file is already there - consider checking the exit status.
    print("Reruning inference for keras nn")
    !python ../input/keras-neural-net/inference.py
    keras_submission = pd.read_csv('./submission.csv')
    
    print("Reruning inference for xgboost")
    ! python ../input/xgboost-baseline-saved-model-kadri/inference.py
    xboost_submission = pd.read_csv('./submission.csv')
    
    #print("Reruning inference for logistic regression")
    #! python ../input/nb-log-reg-3-multilabelskf-ver2-saved-pyinference/inference.py
    #logreg_submission = pd.read_csv('./submission.csv')
    
    print("Reruning inference for keras nn2")
    ! python ../input/marge-keras-v2-load-model/inference.py
    marge_keras_submission = pd.read_csv('./submission.csv')
    
# Drop the 'sig_id' column so only the prediction columns remain for blending.
del keras_submission['sig_id']
del xboost_submission['sig_id']
#del logreg_submission['sig_id']
del marge_keras_submission['sig_id']

# Calculate weights

In [7]:
y_true = train_targets.values

# Display name -> path of the saved out-of-fold predictions for each model.
# The insertion order of this dict fixes the model order along axis 0 of `oof`.
oof_dict = {
    'Keras NN (Romet)': 'keras-oof.npy',
    'XGBoost (Kadri)': '../input/xgboost-baseline-saved-model-kadri/xgboos-oof.npy',
    #'Log regression (Linda)': 'log-reg-oof.npy',
    "Keras NN (Marge)": 'marge-keras-oof.npy',
}

# One (rows, targets) slice per model, each shaped like y_true.
oof = np.zeros((len(oof_dict), y_true.shape[0], y_true.shape[1]))
for model_idx, oof_path in enumerate(oof_dict.values()):
    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only acceptable because these files come from our own datasets - confirm.
    oof[model_idx] = np.load(oof_path, allow_pickle=True)

In [8]:
%%time

# Score every model's out-of-fold predictions on its own, keyed by display name.
log_loss_scores = {}
for key, model_oof in zip(oof_dict, oof):
    score_oof = log_loss_numpy(model_oof)
    log_loss_scores[key] = score_oof
    print(f'{key} CV:\t', score_oof)
print('-' * 50)

Keras NN (Romet) CV:	 0.014628500036354452
XGBoost (Kadri) CV:	 0.016686917384286584
Keras NN (Marge) CV:	 0.014927192786466119
--------------------------------------------------
CPU times: user 621 ms, sys: 132 ms, total: 754 ms
Wall time: 757 ms


In [9]:
#test_weights = np.array([1 / oof.shape[0]] * oof.shape[0])
#%timeit -r 10 grad_func(test_weights)
#%timeit -r 10 grad_func_jit(test_weights)

In [10]:
# Find blend weights that minimise the OOF log loss with SLSQP, subject to
# every weight lying in [0, 1] and the weights summing to 1.
tol = 1e-10
n_models = oof.shape[0]
init_guess = [1 / n_models] * n_models  # start from the equal-weight blend
bnds = [(0, 1) for _ in range(n_models)]
cons = {'type': 'eq',
        'fun': lambda x: np.sum(x) - 1,   # equality constraint: sum(x) == 1
        'jac': lambda x: [1] * len(x)}    # its gradient is a vector of ones

print('Inital Blend OOF:', func_numpy_metric(init_guess))
start_time = time()
res_scipy = minimize(fun = func_numpy_metric,
                     x0 = init_guess,
                     method = 'SLSQP',
                     jac = grad_func_jit, # grad_func
                     bounds = bnds,
                     constraints = cons,
                     tol = tol)
# [2:7] keeps the "MM:SS" part of the "H:MM:SS.ffffff" timedelta string.
print(f'[{str(datetime.timedelta(seconds = time() - start_time))[2:7]}] Optimised Blend OOF:', res_scipy.fun)
if not res_scipy.success:
    # Do not silently trust weights from a run the optimiser reports as failed.
    print('WARNING: optimiser did not report success:', res_scipy.message)
weights = res_scipy.x
print('Optimised Weights:', weights)

Inital Blend OOF: 0.014911209252197482
[00:25] Optimised Blend OOF: 0.014596764317620416
Optimised Weights: [0.78402541 0.01172581 0.20424878]


In [11]:
# Sanity check: the optimised weights must sum to 1 within `tol`.
# The deviation is compared in absolute value so that a sum *below* 1
# (e.g. 0.3) is flagged too, not only a sum above 1.
weight_sum = np.sum(res_scipy.x)
print('Check the sum of all weights:', weight_sum)
if abs(weight_sum - 1) <= tol:
    print('Great! The sum of all weights equals to 1!')
else:
    print('Manual adjustion is needed to modify the weights.')

Check the sum of all weights: 1.0
Great! The sum of all weights equals to 1!


In [12]:
# Pair each model name with its optimised weight (same order as oof_dict).
for model_name, model_weight in zip(oof_dict, weights):
    print(f"{model_name}: {model_weight:.4f}")

Keras NN (Romet): 0.7840
XGBoost (Kadri): 0.0117
Keras NN (Marge): 0.2042


In [13]:
# Re-score the blend on the validation predictions with the per-column
# sklearn metric. The weight indices follow the model order of oof_dict:
# 0 = Keras (Romet), 1 = XGBoost, 2 = Keras (Marge).
validation_submission = (
    keras_val_submission * weights[0]
    + xboost_val_submission * weights[1]
    + marge_keras_val_submission * weights[2]
)
val_loss = log_loss_metric(train_targets, validation_submission)
print(f'Weighted val loss: {val_loss}')

Weighted val loss: 0.014596764317620421


# Submission

In [14]:
# Blend the test predictions with the optimised weights
# (index order as in oof_dict: Keras (Romet), XGBoost, Keras (Marge)).
test_submission = (
    keras_submission * weights[0]
    + xboost_submission * weights[1]
    + marge_keras_submission * weights[2]
)

# Use the sample submission as a template: everything after the first
# column (presumably sig_id) is reset to 0 and then filled with the blend.
submission = pd.read_csv('../input/lish-moa/sample_submission.csv')
prediction_cols = slice(1, None)
submission.iloc[:, prediction_cols] = 0
submission.iloc[:, prediction_cols] = test_submission
submission.to_csv('submission.csv', index=False)