In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from scipy.stats import rankdata
import glob
from scipy.stats import describe
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Stacking Approach using Rank

In [None]:
# Show which entries are available under the input directory.
input_entries = os.listdir('../input')
print(input_entries)

# Name(s) of the prediction column(s) pulled out of every submission file.
LABELS = ["isFraud"]

In [None]:
# Submission files to blend, grouped by model family. The number embedded in
# each file name is presumably the score that submission achieved (TODO confirm).
# Entries that are commented out are kept for reference but are not loaded.
submission_paths = [
    # LGBM Models
    "../input/ieee-team/submission-.9485.csv",
    "../input/ieee-team/submission-.9483.csv",
    "../input/ieee-team/submission-.9480.csv",
    "../input/ieee-team/submission-.9469.csv",
    "../input/ieee-team/submission-.9467.csv",
    "../input/ieee-team/submission-.9466.csv",
    # XGBOOST
    "../input/ieee-team/submission_xgb-.9445.csv",
    # "../input/ieee-team/submission_xgb-9459.csv",
    # CATBOOST
    "../input/ieee-team/submissioncatb-.9407.csv",
    # "../input/ieee-team/submissioncatb-9454.csv",
]

# One 2-D array per file (rows x len(LABELS)), in the order listed above.
outs = [pd.read_csv(path)[LABELS].values for path in submission_paths]

In [None]:
# Rank averaging: every model's predictions are replaced by their rank
# divided by the number of rows, and these normalised ranks are averaged
# across all loaded models.
print("Rank averaging on ", len(outs), " files")

# Accumulate in float regardless of the dtype the first CSV was parsed as:
# an integer-typed array would silently truncate the fractional ranks and
# make the in-place division below fail.
predictions = np.zeros(outs[0].shape, dtype=float)
n_rows = predictions.shape[0]
for predict in outs:
    # Handle every label column instead of assuming exactly one.
    for i in range(predictions.shape[1]):
        predictions[:, i] += rankdata(predict[:, i]) / n_rows
predictions /= len(outs)

# Write the blend using the sample submission as the template; rows are
# matched by position, so every input file is assumed to share the sample
# submission's row order (TODO confirm).
submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
submission[LABELS] = predictions
submission.to_csv('submission_stacker.csv', index=False)

Stacking Approach using GMean

In [None]:
LABELS = ["isFraud"]


def parse_score(path):
    """Return the score encoded in a submission file name.

    The base name is expected to end in ``.<digits>.csv`` (for example
    ``submission-.9485.csv``); the digits are read as the fractional part of
    the score, so that example yields ``0.9485``.

    Only the base name is inspected, so dots in the directory part of the
    path (such as a leading ``..``) do not influence the result.

    Raises:
        ValueError: if the base name has no ``.<digits>`` suffix in front of
            the file extension.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    _, dot, digits = stem.rpartition(".")
    if not dot or not digits.isdecimal():
        raise ValueError("cannot parse a score from file name: " + path)
    return float("." + digits)


# Sorted so that the file order (and therefore tie-breaking between equal
# scores) does not depend on the order in which the filesystem lists files.
all_files = sorted(glob.glob("../input/ieee-team/*.csv"))
scores = np.array([parse_score(path) for path in all_files], dtype=float)

In [None]:
# Indices of the submission files ordered from highest to lowest score.
top = np.argsort(scores)[::-1]
# Print position, score and path for each file, best first.
for position, file_idx in enumerate(top):
    print(position, scores[file_idx], all_files[file_idx])

In [None]:
# Recompute the best-first ordering of the submission files and print it
# (position, score, path).
top = scores.argsort()[::-1]
for position in range(len(top)):
    file_idx = top[position]
    print(position, scores[file_idx], all_files[file_idx])

In [None]:
# Load every submission (first CSV column becomes the index), best score
# first, and place them side by side: one column per model.
# Note: this rebinds `outs`, which now holds DataFrames.
outs = [pd.read_csv(all_files[file_idx], index_col=0) for file_idx in top]
concat_sub = pd.concat(outs, axis=1)

# Rename the columns to m0, m1, ... following the score ranking.
cols = ["m" + str(position) for position in range(len(concat_sub.columns))]
concat_sub.columns = cols

In [None]:
# check correlation
# Pairwise correlation between the models' prediction columns.
corr = concat_sub.corr()

# Hide the upper triangle and the diagonal so each pair is shown only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure (square, scaled with the number of models)
f, ax = plt.subplots(figsize=(len(cols)+2, len(cols)+2))

# Draw the heatmap with the mask and correct aspect ratio
_ = sns.heatmap(corr, mask=mask, cmap='prism', center=0, linewidths=1,
                annot=True, fmt='.4f', cbar_kws={"shrink":.2}, ax=ax)

In [None]:
# Average correlation of each model with all models (its own column of the
# correlation matrix included), listed from least to most correlated.
mean_corr = corr.mean().sort_values(ascending=True)
mean_corr

GMEAN of all models, ordered from low to high average correlation

In [None]:
# Unweighted geometric mean over every model column listed in `mean_corr`:
# sum the log-predictions (in `mean_corr` order, starting from 0), divide by
# the number of models and exponentiate.
log_total = sum(np.log(concat_sub[model]) for model in mean_corr.index)
m_gmean1 = np.exp(log_total / len(mean_corr))

Weighted GMEAN by inverse correlation

In [None]:
# Weighted geometric mean over pairs of models, giving more weight to the
# least-correlated pairs.
#
# `rank` starts as the strict lower triangle of the correlation matrix
# (zeros on and above the diagonal), so each model pair appears once.
rank = np.tril(corr.values,-1)
#rank[rank<0.92] = 1  # 0.9523 with 0.92 cut
# Everything below the 0.91 cut -- including the zeros on and above the
# diagonal -- is overwritten with 1 so that argmin never selects it.
rank[rank<0.91] = 1
# After the line above every (non-NaN) entry is > 0, so this is the number of
# entries that are <= 0.95, i.e. the pairs whose correlation lies in
# [0.91, 0.95]. Those are the pairs the loop below visits.
m = (rank>0).sum() - (rank>0.95).sum()
# m_gmean2 accumulates the weighted sum of logs, s the sum of the weights.
m_gmean2, s = 0, 0
for n in range(m):
    # (row, col) of the smallest remaining entry = least-correlated unused pair.
    mx = np.unravel_index(rank.argmin(), rank.shape)
    # Weight decreases linearly from 1 (first pair) down to 1/m (last pair).
    w = (m-n)/m
    # Add the weighted mean of the two models' log-predictions.
    m_gmean2 += w*(np.log(concat_sub.iloc[:,mx[0]])+np.log(concat_sub.iloc[:,mx[1]]))/2
    s += w
    # Mark the pair as used so the next argmin moves on to the next pair.
    rank[mx] = 1
# NOTE(review): if no pair falls in [0.91, 0.95] then m == 0, the loop never
# runs and this line divides 0 by 0 (ZeroDivisionError) -- confirm the cuts
# suit the models being blended.
m_gmean2 = np.exp(m_gmean2/s)

In [None]:
# Score-weighted arithmetic mean of the first eight model columns: each
# column is weighted by the score parsed from its file name.
# (A previous experiment used only columns [0,1,2,6,7]: 0.9523 with this.)
blend_columns = range(8)

top_mean = 0
s = 0
for col_pos in blend_columns:
    model_score = scores[top[col_pos]]
    top_mean = top_mean + concat_sub.iloc[:, col_pos] * model_score
    s = s + model_score

# Normalise by the total weight.
top_mean = top_mean / s

In [None]:
# Earlier fixed-weight variant, kept for reference:
#m_gmean = np.exp(0.3*np.log(m_gmean1) + 0.15*np.log(m_gmean2) + 0.55*np.log(top_mean))

# This is tested just to see the influence of the weight choice.
# 0.9523 < 0.9525 in original. Why?
# Answer: I don't know
#
# Combine the three blends in log space: the squared logs are summed and
# divided by the log of the product of the three blends, then exponentiated.
# Since log(a*b*c) = log(a) + log(b) + log(c), each blend's log is in effect
# weighted by its own share of the total log. [Bek]
log_gmean1 = np.log(m_gmean1)
log_gmean2 = np.log(m_gmean2)
log_top_mean = np.log(top_mean)

wbek = np.log(m_gmean1*m_gmean2*top_mean)
squared_logs = log_gmean1*log_gmean1 + log_gmean2*log_gmean2 + log_top_mean*log_top_mean
m_gmean = np.exp(squared_logs/wbek)

# Summary statistics of the combined prediction (shown as the cell output).
describe(m_gmean)

In [None]:
# Store the geometric-mean blend as the `isFraud` column and write it out on
# its own, keeping the frame's index as the first CSV column.
concat_sub['isFraud'] = m_gmean
gmean_submission = concat_sub.loc[:, ['isFraud']]
gmean_submission.to_csv('stack_gmean.csv')

In [None]:
#Final Blend
# isFraud_1: rank-averaged predictions; isFraud_2: geometric-mean blend.
# `predictions` is a plain array, so it is attached by row position (assumes
# it shares concat_sub's row order -- TODO confirm).
concat_sub['isFraud_1'] = predictions
concat_sub['isFraud_2'] = m_gmean

# Fixed 45 % / 55 % linear mix of the two blends overwrites `isFraud`.
final_blend = .45 * concat_sub['isFraud_1'] + .55 * concat_sub['isFraud_2']
concat_sub['isFraud'] = final_blend

# Write only the blended column, with the frame's index as the first column.
final_submission = concat_sub.loc[:, ['isFraud']]
final_submission.to_csv('submission.csv')