In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Submission template indexed by row id; its `target` column is set by the stacking cells.
submission = pd.read_csv(
    '../input/cat-in-the-dat/sample_submission.csv',
    index_col='id',
)

In [None]:
# Table describing the kernels to blend, indexed by kernel id; its `score` column is used for weighting and ranking.
kernels = pd.read_csv(
    '../input/cat-in-dat-kernels/kernels.csv',
    index_col='id',
)

In [None]:
# Preview the first rows of the kernel table.
kernels.head(n=5)

In [None]:
import glob

def make_filename(idx):
    """Return the path of the submission CSV saved for kernel ``idx``.

    Files are looked up in ``../input/cat-in-dat-kernels/`` with the pattern
    ``<idx>__submission__*.csv``.

    Parameters
    ----------
    idx : object
        Kernel identifier; converted with ``str`` to build the file prefix.

    Returns
    -------
    str
        The first matching path in sorted order.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern.
    """
    # Escape the id so any glob metacharacters in it are matched literally.
    pattern = '../input/cat-in-dat-kernels/' + glob.escape(str(idx)) + '__submission__*.csv'
    # glob returns matches in arbitrary order; sort for a reproducible pick.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError('No submission file matches ' + pattern)
    return matches[0]

def read_predictions(idx):
    """Load the submission of kernel ``idx`` as a one-column DataFrame.

    The CSV located by ``make_filename(idx)`` is read with its ``id`` column
    as the index, and its data column is relabelled ``str(idx)``.
    """
    path = make_filename(idx)
    frame = pd.read_csv(path, index_col='id')
    # Label the column with the kernel id; this assignment fails unless
    # the file has exactly one data column.
    frame.columns = [str(idx)]
    return frame


# One column per kernel, joined side by side on the `id` index.
predictions = pd.concat(list(map(read_predictions, kernels.index)), axis=1)
predictions.shape

In [None]:
# Preview the first rows of the combined predictions.
predictions.head(n=5)

## Correlation matrix

In [None]:
# From https://seaborn.pydata.org/examples/many_pairwise_correlations.html

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the kernels' prediction columns.
corr = predictions.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric.
# The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24,
# so the builtin `bool` is used instead.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(12, 12))

# `center` is deliberately not set: with vmin=0.95, `center=0` made seaborn
# re-centre the colormap on 0, so every cell was drawn from the darkest
# sliver of 'Blues' and differences between kernels were invisible.
sns.heatmap(corr, mask=mask, cmap='Blues', vmin=0.95, linewidths=1, annot=True, fmt='.4f', ax=ax)

## Stack

In [None]:
# Simple blend: average all kernels' predictions for each row.
submission['target'] = predictions.mean(axis='columns')
submission.to_csv(path_or_buf='stack-mean.csv')

In [None]:
# Preview the first rows of the blended submission.
submission.head(n=5)

## Weighted sum

In [None]:
# Turn each kernel's score into its share of the total score.
scores = kernels['score']
sum_scores = sum(scores)
weights = list(map(lambda score: score / sum_scores, scores))

In [None]:
# Weighted blend: the weights are labelled with the prediction columns so
# the matrix product pairs each column with its own weight.
sum_predictions = predictions @ pd.Series(weights, index=predictions.columns)

In [None]:
# Preview the first weighted predictions.
sum_predictions.head(n=5)

In [None]:
# Store the weighted blend as the target and write it out.
submission['target'] = sum_predictions
submission.to_csv(path_or_buf='stack-weighted-sum.csv')

## Filter

In [None]:
# Number of kernels to keep in the filtered blend.
N = 3

# Order kernels by score, highest first, and keep the first N rows.
selected = kernels.sort_values(by='score', ascending=False).iloc[:N]

In [None]:
# Report the score range covered by the selected kernels.
selected_scores = selected['score']
print('Max selected score =', selected_scores.max())
print('Min selected score =', selected_scores.min())

In [None]:
# Prediction columns are labelled with the stringified kernel id, so
# convert the selected ids to strings before selecting columns.
filter_predictions = predictions.loc[:, [str(idx) for idx in selected.index]]

In [None]:
# Preview the predictions of the selected kernels.
filter_predictions.head(n=5)

In [None]:
# Blend only the selected kernels: average their predictions per row.
submission['target'] = filter_predictions.mean(axis='columns')
submission.to_csv(path_or_buf='stack-filtered.csv')