In [None]:
import pandas as pd, numpy as np, os
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# Shell escape: print the kernel's current working directory.
!pwd

In [None]:
# Shell escape: list the entries under /kaggle/input/.
!ls /kaggle/input/

In [None]:
# Directory that holds the out-of-fold ("oof") and submission ("sub") CSV files
# that the later cells split apart by file name.
PATH = '../input/ranzcr-oof-and-subs/'
# Every file name found in that directory (unfiltered, in os.listdir order).
FILES = os.listdir(PATH)
FILES

In [None]:
# Remove the subs and oofs of models with potential data leakage.
#
# The model id is the integer between the first '_' and the following '.' of
# the file name -- the same convention the submission/oof matching cell uses.
# Comparing the parsed id (instead of testing `'4' not in fn`, `'9' not in fn`,
# ...) avoids accidentally discarding unrelated models such as 14, 19, 40 or 49
# whose file names merely contain one of those digits.
LEAKY_MODEL_IDS = {4, 9, 10}


def _model_id(fn):
    """Return the integer model id encoded in file name `fn`, or None if it has none."""
    try:
        return int(fn.split('_')[1].split('.')[0])
    except (IndexError, ValueError):
        # No '_' in the name, or the piece after it is not an integer.
        return None


# Files without a parsable id are kept; only the listed model ids are dropped.
FILES = [fn for fn in FILES if _model_id(fn) not in LEAKY_MODEL_IDS]
FILES

In [None]:
# Pick out the out-of-fold files, order them by name (the submission files are
# ordered the same way later so positions line up) and load each into a frame.
oof_names = [name for name in FILES if 'oof' in name]
OOF = np.sort(oof_names)

OOF_CSV = []
for oof_name in OOF:
    OOF_CSV.append(pd.read_csv(PATH + oof_name))

print('We have %i oof files...' % len(OOF))
print()
print(OOF)

In [None]:
# Preview the first rows of the first OOF file.
OOF_CSV[0].head()

In [None]:
# Column names of the first OOF file.
OOF_CSV[0].columns

In [None]:
# Names of the columns at positions 1 through 11 of the first OOF file.
OOF_CSV[0].iloc[:, 1:12].columns.tolist()

In [None]:
# Split the first OOF file's columns by position: positions 1-11 are used as
# the ground-truth label columns, everything from position 15 onward as the
# model's prediction columns.
oof_columns = OOF_CSV[0].columns
target_cols = oof_columns[1:12].tolist()
pred_cols = oof_columns[15:].tolist()

In [None]:
# Show the prediction column names selected above.
pred_cols

In [None]:
def macro_multilabel_auc(label, pred):
    """Return the macro-averaged ROC AUC across the columns of a multilabel task.

    Parameters
    ----------
    label : array-like, shape (n_samples, n_labels)
        Ground-truth values, one column per label.
    pred : array-like, shape (n_samples, >= n_labels)
        Predicted scores; column ``i`` is scored against column ``i`` of ``label``.

    Returns
    -------
    float
        Unweighted mean of the per-column ``roc_auc_score`` values.

    Notes
    -----
    The number of labels is taken from ``label`` itself rather than from the
    notebook-level ``target_cols`` list, so the function no longer depends on
    hidden notebook state. Any exception raised by ``roc_auc_score`` for a
    column propagates to the caller.
    """
    label = np.asarray(label)
    pred = np.asarray(pred)
    aucs = [roc_auc_score(label[:, i], pred[:, i]) for i in range(label.shape[1])]
    return np.mean(aucs)

In [None]:
# Pre-allocate the stacked OOF prediction matrix: one row per row of the first
# OOF file, and one block of len(pred_cols) columns for every OOF file.
n_rows = len(OOF_CSV[0])
n_cols = len(OOF) * len(pred_cols)
x = np.zeros((n_rows, n_cols))

In [None]:
# Expected: (rows of the first OOF file, number of OOF files * len(pred_cols)).
x.shape

In [None]:
# Copy each model's OOF predictions into its own column block of `x`
# (model k occupies columns k*len(pred_cols) .. (k+1)*len(pred_cols)-1).
n_pred = len(pred_cols)
for k in range(len(OOF)):
    start = k * n_pred
    stop = start + n_pred
    x[:, start:stop] = OOF_CSV[k][pred_cols].values

In [None]:
# Ground-truth label matrix, read from the first OOF file only.
# NOTE(review): the other OOF files' predictions are scored against these
# labels, which presumes every OOF file lists the same samples in the same row
# order -- confirm.
TRUE = OOF_CSV[0][target_cols].values
TRUE

In [None]:
# Score every model on its own OOF predictions, then seed the ensemble with the
# best single model.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because later cells read it.
labels = OOF_CSV[0][target_cols].values
n_pred = len(pred_cols)

all = []
for k in range(len(OOF)):
    model_preds = x[:, k * n_pred:(k + 1) * n_pred]
    auc = macro_multilabel_auc(labels, model_preds)
    all.append(auc)
    print('Model %i has OOF AUC = %.4f'%(k,auc))

# m: indices of the models in the ensemble, in the order they were added.
# w: blend weight of each model added after the first one.
m = [np.argmax(all)]
w = []

In [None]:
# Model indices currently in the ensemble (so far only the best single model).
m

In [None]:
# Blend weights of the models added after the first (still empty at this point).
w

# Build OOF Ensemble. Maximize CV Score

In [None]:
# Greedy forward selection of models, maximizing the macro OOF AUC.
# Starting from the best single model (m[0]), each round blends every candidate
# model into the current ensemble over a grid of weights and keeps the
# (model, weight) pair with the highest AUC; the search ends once the best gain
# is at most TOL.
# NOTE: this cell appends to the `m` and `w` lists created in an earlier cell
# while resetting `old`, so re-running it on its own continues from the
# previous run's selections instead of starting over.
old = np.max(all); 

# RES: number of grid steps; the weights tried are j/RES for j = 0 .. RES-1.
RES = 500;#200; 
# PATIENCE: how many non-improving weights a candidate may accumulate before
# its weight scan is abandoned.
PATIENCE = 30;#10; 
# TOL: an added model must raise the ensemble AUC by more than this.
TOL = 0.00005#0.0003
# The bare string literal below has no effect at runtime; it only records the
# previous, coarser settings.
'''
RES = 200;#200; 
PATIENCE = 10;#10; 
TOL = 0.0003#0.0003
'''
# DUPLICATES: when False, a model already in `m` is never added a second time.
DUPLICATES = False

print('Ensemble AUC = %.4f by beginning with model %i'%(old,m[0]))
print()

# At most one model is added per round, so len(OOF) rounds is an upper bound.
for kk in range(len(OOF)):
    
    # BUILD CURRENT ENSEMBLE
    # Start from the first model's column block, then fold in each later model
    # in the order it was added: md = w*model + (1-w)*md.
    md = x[:,int(m[0]*len(pred_cols)):int((m[0]+1)*len(pred_cols))]
    for i, k in enumerate(m[1:]):
        md = w[i]*x[:, int(k*len(pred_cols)):int((k+1)*len(pred_cols))] + (1-w[i])*md
        
    # FIND MODEL TO ADD
    # mx / mx_k / mx_w: best AUC seen this round, and the model index and
    # weight that produced it.
    mx = 0; mx_k = 0; mx_w = 0
    print('Searching for best model to add... ')
    
    # TRY ADDING EACH MODEL
    for k in range(len(OOF)):
        # The index is printed before the skip check, so skipped models are
        # listed in the progress output too.
        print(k,', ',end='')
        if not DUPLICATES and (k in m): continue
            
        # EVALUATE ADDING MODEL K WITH WEIGHTS W
        # bst / bst_j: best AUC and weight for this candidate; ct: count of
        # non-improving weights. ct is never reset after an improvement, so
        # PATIENCE limits the total (not consecutive) number of misses.
        bst_j = 0; bst = 0; ct = 0
        for j in range(RES):
            # j = 0 evaluates the current ensemble unchanged (weight 0).
            tmp = j/RES*x[:, int(k*len(pred_cols)):int((k+1)*len(pred_cols))] + (1-j/RES)*md
            auc = macro_multilabel_auc(TRUE,tmp)
            if auc>bst:
                bst = auc
                bst_j = j/RES
            else: ct += 1
            if ct>PATIENCE: break
        if bst>mx:
            mx = bst
            mx_k = k
            mx_w = bst_j
            
    # STOP IF INCREASE IS LESS THAN TOL
    # If every model was skipped, mx is still 0 and this also ends the search.
    inc = mx-old
    if inc<=TOL: 
        print(); print('No increase. Stopping.')
        break
        
    # DISPLAY RESULTS
    print(); #print(kk,mx,mx_k,mx_w,'%.5f'%inc)
    print('Ensemble AUC = %.4f after adding model %i with weight %.3f. Increase of %.4f'%(mx,mx_k,mx_w,inc))
    print()
    
    # Commit the winning model and weight; `old` becomes the new AUC baseline.
    old = mx; m.append(mx_k); w.append(mx_w)

In [None]:
# Summarize the search result: selected model indices (in the order added),
# the blend weight of each model added after the first, and the last accepted
# ensemble AUC.
print('We are using models',m)
print('with weights',w)
print('and achieve ensemble AUC = %.4f'%old)

In [None]:
# Rebuild the final blended OOF predictions from the selected models and their
# weights (same recurrence as in the search: md = w*model + (1-w)*md), then
# plot the distribution of the blended values.
n_pred = len(pred_cols)
first = m[0]
md = x[:, first * n_pred:(first + 1) * n_pred]
for weight, k in zip(w, m[1:]):
    block = x[:, k * n_pred:(k + 1) * n_pred]
    md = weight * block + (1 - weight) * md

plt.hist(md, bins=100)
plt.title('Ensemble OOF predictions')
plt.show()

In [None]:
# Save the blended OOF predictions alongside the other columns of the first
# OOF file.
df = OOF_CSV[0].copy()
# Assign by column label. `df.pred = md` would only set a Python attribute on
# the DataFrame object (pandas does not create columns through attribute
# assignment), leaving the prediction columns -- and therefore the written
# CSV -- equal to the first model's own predictions.
df[pred_cols] = md
df.to_csv('ensemble_oof.csv',index=False)
df.head()

# Load SUB Files

In [None]:
# Pick out the submission files, order them by name (the same ordering that is
# applied to the OOF files) and load each into a frame.
sub_names = [name for name in FILES if 'sub' in name]
SUB = np.sort(sub_names)

SUB_CSV = []
for sub_name in SUB:
    SUB_CSV.append(pd.read_csv(PATH + sub_name))

print('We have %i submission files...' % len(SUB))
print()
print(SUB)

In [None]:
# VERIFY THAT SUBMISSION FILES MATCH OOF FILES
def _file_ids(names):
    """Parse the integer between the first '_' and the next '.' of every file name."""
    return np.array([int(name.split('_')[1].split('.')[0]) for name in names])


a = _file_ids(SUB)
b = _file_ids(OOF)

# Position i of SUB must carry the same model id as position i of OOF, because
# the ensemble weights are applied by position. A message is printed for a
# length mismatch, or once per mismatching position.
if len(a) == len(b):
    for k in range(len(a)):
        if a[k] != b[k]:
            print('ERROR submission files dont match oof files')
else:
    print('ERROR submission files dont match oof files')

In [None]:
# Stack every submission file's predictions side by side, using the same
# column-block layout as the OOF matrix `x`. Submission files are read through
# the target column names, while the block width is len(pred_cols).
n_pred = len(pred_cols)
n_test_rows = len(SUB_CSV[0])
y = np.zeros((n_test_rows, len(SUB) * n_pred))
for k in range(len(SUB)):
    start = k * n_pred
    y[:, start:start + n_pred] = SUB_CSV[k][target_cols].values

# Build SUB Ensemble

In [None]:
# Apply the models and weights selected on the OOF data to the stacked
# submission predictions (same recurrence: md2 = w*model + (1-w)*md2), then
# plot the distribution of the blended test predictions.
n_pred = len(pred_cols)
first = m[0]
md2 = y[:, first * n_pred:(first + 1) * n_pred]
for weight, k in zip(w, m[1:]):
    block = y[:, k * n_pred:(k + 1) * n_pred]
    md2 = weight * block + (1 - weight) * md2

plt.hist(md2, bins=100)
plt.show()

In [None]:
# Build the ensemble submission: start from a copy of the first submission file
# (keeping its other columns as they are) and overwrite the target columns with
# the blended test predictions `md2`.
# NOTE: this rebinds `df`, which an earlier cell used for the OOF frame.
df = SUB_CSV[0].copy()
df[target_cols] = md2
df.to_csv('ensemble_sub.csv',index=False)
df.head()

In [None]:
# Write the final submission: take only the first column of the competition's
# sample submission as the index, left-join the ensemble predictions onto it by
# StudyInstanceUID, and fill anything left unmatched with 0.
sample_index = pd.read_csv(
    '../input/ranzcr-clip-catheter-line-classification/sample_submission.csv',
    usecols=[0],
    index_col=0,
)
ensemble_preds = pd.read_csv('ensemble_sub.csv').set_index('StudyInstanceUID')
sample_index.join(ensemble_preds).fillna(0).to_csv('submission.csv')