## This notebook is forked from [Optimise Blending Weights with Bonus :0](https://www.kaggle.com/gogo827jz/optimise-blending-weights-with-bonus-0) by [Yirun Zhang](https://www.kaggle.com/gogo827jz).

In [None]:
import datetime
import pandas as pd
from time import time

import numpy as np
from scipy.optimize import minimize, fsolve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

### Define major metrics

In [None]:
def log_loss_numpy(y_pred):
    """Mean binary cross-entropy of ``y_pred`` against the global ``y_true``.

    The loss is evaluated separately for every column and the per-column
    losses are then averaged.

    Args:
        y_pred: 2-D array of predicted probabilities indexed ``[row, column]``,
            laid out like ``y_true``. Values are clipped to
            ``[1e-15, 1 - 1e-15]`` so both logarithms stay finite.

    Returns:
        The cross-entropy averaged over rows, then over columns.
    """
    n_columns = y_pred.shape[1]
    clipped = np.clip(y_pred, 1e-15, 1 - 1e-15)

    def column_loss(col):
        # Cross-entropy of a single column, averaged over its rows.
        target, prob = y_true[:, col], clipped[:, col]
        return - np.mean(target * np.log(prob) + (1 - target) * np.log(1 - prob))

    return sum(column_loss(col) for col in range(n_columns)) / n_columns

def calc_auc(y_pred):
    """Negated mean ROC AUC of ``y_pred`` against the global ``y_true``.

    The sign is flipped (the per-column AUCs are subtracted), so a smaller
    return value corresponds to a larger AUC; this lets the function be used
    directly as an objective to minimise.

    Args:
        y_pred: 2-D array of scores indexed ``[row, column]``, laid out like
            ``y_true``.

    Returns:
        ``-(mean of the per-column ROC AUC scores)``.
    """
    n_columns = y_pred.shape[1]
    per_column = [
        roc_auc_score(y_true=y_true[:, col], y_score=y_pred[:, col])
        for col in range(n_columns)
    ]
    return -sum(per_column) / n_columns

def func_numpy_metric2(weights):
    """AUC objective: score the weighted blend of the global ``oof`` stack.

    Args:
        weights: One weight per model; contracted against axis 0 of ``oof``.

    Returns:
        ``calc_auc`` of the blended predictions (i.e. the negated mean AUC).
    """
    blended = np.tensordot(weights, oof, axes=(0, 0))
    return calc_auc(blended)

def func_numpy_metric(weights):
    """Log-loss objective: score the weighted blend of the global ``oof`` stack.

    Args:
        weights: One weight per model; contracted against axis 0 of ``oof``.

    Returns:
        ``log_loss_numpy`` of the blended predictions.
    """
    blended = np.tensordot(weights, oof, axes=(0, 0))
    return log_loss_numpy(blended)

def grad_func(weights):
    """Analytic gradient of the blended log loss with respect to each weight.

    For model ``i`` let ``b`` be its clipped predictions, ``c`` the weighted
    sum of every *other* model's clipped predictions, and
    ``p = weights[i] * b + c`` the resulting blend.  The numerator below is
    ``b * (p - y_true)`` written out, and the denominator is ``p**2 - p``, so
    each entry equals ``mean(b * (p - y_true) / (p * (1 - p)))``.

    Reads the globals ``oof`` (stack of per-model predictions, model on
    axis 0) and ``y_true``.  Only the individual predictions are clipped to
    ``[1e-15, 1 - 1e-15]``; the blend itself is not clipped.

    Args:
        weights: Indexable sequence holding one weight per model.

    Returns:
        1-D array of length ``oof.shape[0]`` with one partial derivative per
        weight.
    """
    clipped = np.clip(oof, 1e-15, 1 - 1e-15)
    n_models = oof.shape[0]
    grads = np.zeros(n_models)
    for i in range(n_models):
        w_i, b = weights[i], clipped[i]
        # Blend contribution of all models except the one being differentiated.
        c = sum(weights[j] * clipped[j] for j in range(n_models) if j != i)
        numerator = -y_true*b + (b**2)*w_i + b*c
        denominator = (b**2)*(w_i**2) + 2*b*c*w_i - b*w_i + (c**2) - c
        grads[i] = -np.mean(numerator / denominator)
    return grads

### Open OOF CSV files

In [None]:
# Model 1's OOF file doubles as the reference frame: it supplies the row
# order (`id`) and the ground-truth `target` column.
train_targets_scored = pd.read_csv('../input/oof-weight-optimizer-public/model1.csv')
# NOTE: this value is superseded by the explicit ['preds'] assignment below.
target_columns = [col for col in train_targets_scored.columns if col != 'id']

# Labels as a column vector so they can be indexed as [row, column].
y_true = train_targets_scored['target'].values.reshape(-1, 1)

# Model name -> path of that model's out-of-fold prediction file.
oof_dict = {
    'model1': "../input/oof-weight-optimizer-public/model1.csv",
    'model2': "../input/oof-weight-optimizer-public/model2.csv",
    'model3': "../input/oof-weight-optimizer-public/model3.csv",
    'model4': "../input/oof-weight-optimizer-public/model4.csv",
    'model5': "../input/oof-weight-optimizer-public/model5.csv",
    'model6': "../input/oof-weight-optimizer-public/model6.csv",
}

target_columns = ['preds']
# One (n_rows, n_target_columns) slab of predictions per model.
oof = np.zeros((len(oof_dict), y_true.shape[0], len(target_columns)))

for i, oof_path in enumerate(oof_dict.values()):
    valid = pd.read_csv(oof_path)
    # Left-join on `id` so every model's rows follow the reference order;
    # ids missing from a model's file end up with a prediction of 0.
    valid = (
        train_targets_scored
        .drop(columns=target_columns)
        .merge(valid[['id'] + target_columns], on='id', how='left')
        .fillna(0)
    )
    oof[i] = valid[target_columns].values

In [None]:
%%time

# Score every model on its own.  Despite the dictionary's name, the stored
# values come from `calc_auc`, i.e. they are negated mean ROC AUC scores.
log_loss_scores = {}
for n, key in enumerate(oof_dict):
    score_oof = log_loss_scores[key] = calc_auc(oof[n])
    print(f'{key:40s} CV:', score_oof)

print(60 * '-')

### Observe correlation

In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('fivethirtyeight')

# NOTE(review): `submit` is loaded here but not used by any code visible in
# this notebook - confirm whether it is still needed.
submit = pd.read_csv("../input/g2net-gravitational-wave-detection/training_labels.csv")

# One slab of OOF predictions per model, rows aligned to the reference frame.
subs = np.zeros((len(oof_dict), y_true.shape[0], len(target_columns)))

for i, p in enumerate(oof_dict):
    print(i, p)
    tmp = pd.read_csv(oof_dict[p])
    # Left-join on `id`; ids absent from a model's file are filled with 0.
    valid = (
        train_targets_scored
        .drop(columns=target_columns)
        .merge(tmp[['id'] + target_columns], on='id', how='left')
        .fillna(0)
    )
    subs[i, :, :] = valid[target_columns].values

# Flatten each model's predictions to one row and correlate the models pairwise.
corr = np.corrcoef(subs.reshape(subs.shape[0], -1))

# Figure that will hold the model-by-model correlation matrix
f, ax = plt.subplots(figsize=(15, 12))

# Diverging colormap for the correlation values
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Annotated heatmap with square cells (no mask is applied)
sns.heatmap(
    corr,
    cmap=cmap,
    annot=True,
    fmt="g",
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)
# Explicit y-limits running from the row count down to 0 (first row on top);
# presumably a guard against rows being cut off in some matplotlib versions.
ax.set_ylim(corr.shape[0], 0)
plt.yticks(rotation=0)

### Blending Weights Optimize

In [None]:
# Optimise the blending weights by minimising the negated mean AUC of the
# weighted OOF blend (`func_numpy_metric2`), starting from equal weights.
tol = 1e-10
init_guess = [1 / oof.shape[0]] * oof.shape[0]
bnds = [(0, 1) for _ in range(oof.shape[0])]
# Sum-to-one equality constraint.  It is kept for constraint-aware solvers
# (e.g. SLSQP) but is NOT passed to Nelder-Mead below, which cannot handle
# constraints; the resulting weights therefore have to be normalised afterwards.
cons = {'type': 'eq',
        'fun': lambda x: np.sum(x) - 1,
        'jac': lambda x: [1] * len(x)}

print('Inital Blend OOF:', func_numpy_metric2(init_guess))
start_time = time()

# Nelder-Mead is derivative-free: SciPy ignores `jac` and `constraints` for it
# and emits a RuntimeWarning for each, so neither is passed.  `grad_func` is
# also the gradient of the log loss, not of the AUC objective used here, so it
# must not be paired with `func_numpy_metric2` in a gradient-based solver.
res_scipy = minimize(fun=func_numpy_metric2,
                     x0=init_guess,
                     method='Nelder-Mead',
                     bounds=bnds,
                     tol=tol)

# The reported objective values are negated AUCs (see `calc_auc`).
elapsed = str(datetime.timedelta(seconds=time() - start_time))[2:7]
print(f'[{elapsed}] Optimised Blend OOF:', res_scipy.fun)
print('Optimised Weights:', res_scipy.x)
print('-' * 70)

for n, key in enumerate(oof_dict.keys()):
    print(f'{key:40s} Optimised Weights:', res_scipy.x[n])

The Nelder-Mead method cannot enforce the sum-to-one equality constraint, so the optimised weights generally do not sum to 1 and must be normalised before they are used.

In [None]:
# Keep the optimised weights (one per model) in a plain list for later cells.
ws = list(res_scipy.x[:len(oof_dict)])
# Display the weights rescaled to sum to 1; `ws` itself stays un-normalised.
np.asarray(ws) / np.sum(ws)

### Rank Averaging from [tips: rank averaging](https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification/discussion/205564) by [Tawara](https://www.kaggle.com/ttahara).

In [None]:
# Rebuild `oof` from the prediction files, one slab per model, with rows
# aligned to the reference frame.
oof = np.zeros((len(oof_dict), y_true.shape[0], len(target_columns)))

for i, oof_path in enumerate(oof_dict.values()):
    valid = pd.read_csv(oof_path)
    # Left-join on `id`; ids absent from a model's file are filled with 0.
    valid = (
        train_targets_scored
        .drop(columns=target_columns)
        .merge(valid[['id'] + target_columns], on='id', how='left')
        .fillna(0)
    )
    oof[i] = valid[target_columns].values

In [None]:
from scipy.stats import rankdata
# Replace every model's predictions, column by column, with their ranks
# (ties receive the average rank).  `oof` is overwritten in place.
for i, j in np.ndindex(oof.shape[0], len(target_columns)):
    oof[i, :, j] = rankdata(oof[i, :, j], method='average')

In [None]:
# Average across models and divide by the largest value in `oof`
# (the largest rank, assuming `oof` currently holds ranks).
overall_oof = oof.mean(axis=0) / oof.max()
# `calc_auc` returns the negated mean AUC, so the printed value is negative.
print(calc_auc(overall_oof))

# Making submission

In [None]:
# Container for the per-model submission frames.
predictions = []
# From here on the column of interest is `target`; both names refer to the
# same list object.
label_cols = target_columns = ['target']

In [None]:
# Model name -> path of that model's submission file (insertion order is the
# model order used when blending).
preds_dict = dict(
    model1="../input/oof-weight-optimizer-public/submission1.csv",
    model2="../input/oof-weight-optimizer-public/submission2.csv",
    model3="../input/oof-weight-optimizer-public/submission3.csv",
    model4='../input/oof-weight-optimizer-public/submission4.csv',
    model5='../input/oof-weight-optimizer-public/submission5.csv',
    model6='../input/oof-weight-optimizer-public/submission6.csv',
)

In [None]:
# Load every model's submission file in `preds_dict` order.
# The list is rebuilt from scratch here so that re-running this cell cannot
# append a second copy of every frame to an already-populated `predictions`.
predictions = []
for sub_path in preds_dict.values():
    sub = pd.read_csv(sub_path)
    predictions.append(sub)

In [None]:
# The first model's submission becomes the reference frame; note that this
# rebinds `train_targets_scored`, which the rank-averaging cell merges against.
train_targets_scored = predictions[0]
y_pred = train_targets_scored['target'].values
# One slab of test predictions per model, initialised to zero.
preds = np.zeros((len(preds_dict), y_pred.shape[0], len(target_columns)))

### Using optimized weights

In [None]:
# Rescale the optimised weights so that they sum to 1 before blending.
weights = np.asarray(ws) / np.sum(ws)

In [None]:
# Weighted blend of the submissions: each model's column is scaled by its
# weight and the scaled columns are summed.
# NOTE(review): frames are combined row-by-row by position, which assumes every
# submission lists its ids in the same order as the first one - confirm.
weighted_y_pred = pd.DataFrame({'id': predictions[0]['id']})
for column in label_cols:
    column_data = [
        predictions[i][column] * weights[i] for i in range(len(preds_dict))
    ]
    weighted_y_pred[column] = np.sum(column_data, axis=0)

In [None]:
# Write the weighted blend to disk (without the index) and preview it.
submission = weighted_y_pred
submission.to_csv(path_or_buf='submission_optimized.csv', index=False)
submission.head()

### Simple average

In [None]:
# Unweighted blend: every model contributes 1 / (number of models).
# NOTE(review): frames are combined row-by-row by position, which assumes every
# submission lists its ids in the same order as the first one - confirm.
weighted_y_pred = pd.DataFrame({'id': predictions[0]['id']})
for column in label_cols:
    column_data = [
        predictions[i][column] / len(preds_dict) for i in range(len(preds_dict))
    ]
    weighted_y_pred[column] = np.sum(column_data, axis=0)

In [None]:
# Write the simple average to disk (without the index) and preview it.
submission = weighted_y_pred
submission.to_csv(path_or_buf='submission_mean.csv', index=False)
submission.head()

### Rank averaging

In [None]:
from scipy.stats import rankdata

# One slab of test predictions per model, rows aligned to the reference frame.
preds = np.zeros((len(preds_dict), y_pred.shape[0], len(target_columns)))
weighted_y_pred = pd.DataFrame({'id': predictions[0]['id']})

for i, sub_path in enumerate(preds_dict.values()):
    valid = pd.read_csv(sub_path)
    # Left-join on `id`; ids absent from a model's file are filled with 0.
    valid = (
        train_targets_scored
        .drop(columns=target_columns)
        .merge(valid[['id'] + target_columns], on='id', how='left')
        .fillna(0)
    )
    preds[i] = valid[target_columns].values
    # Convert this model's predictions to ranks (ties get the average rank).
    for j in range(len(target_columns)):
        preds[i, :, j] = rankdata(preds[i, :, j], method='average')

# Mean rank across models; the values are raw ranks and are not rescaled.
weighted_y_pred[label_cols] = preds.mean(axis=0)

In [None]:
# Write the rank average to disk (without the index) and preview it.
submission = weighted_y_pred
submission.to_csv(path_or_buf='submission_rank.csv', index=False)
submission.head()