## Get 1st place (score `0.37353`) on the private leaderboard of the Predicting a Biological Response competition.
Based on [this great solution](https://github.com/emanuele/kaggle_pbr/blob/master/blend.py).

In [None]:
import numpy as np
import pandas as pd

In [None]:
def read_data(file_path):
    """Parse a headered, comma-separated file of numbers into rows of floats.

    The first line of the file is treated as a header and discarded.
    Blank or whitespace-only lines (for example a stray newline at the end
    of the file) are skipped instead of failing on ``float('')``.

    Args:
        file_path: Path of the comma-separated file to read.

    Returns:
        A list with one list of floats per non-blank data line, in file order.

    Raises:
        ValueError: If a field on a non-blank line cannot be parsed as a float.
    """
    with open(file_path) as f:
        next(f, None)    # skip header; the default keeps an empty file from raising
        return [
            [float(field) for field in line.split(",")]
            for line in map(str.strip, f)
            if line      # ignore blank / whitespace-only lines
        ]

def load(train_path, test_path):
    """Read the train and test files and return them as NumPy arrays.

    Each training row is split into its first value (the target) and the
    remaining values (the features). Test rows are used whole.

    Args:
        train_path: Path of the training file, parsed with ``read_data``.
        test_path: Path of the test file, parsed with ``read_data``.

    Returns:
        A tuple ``(X_train, y_train, X_test)`` of NumPy arrays.
    """
    targets, features = [], []
    for row in read_data(train_path):
        targets.append(row[0])      # first column is the target
        features.append(row[1:])    # everything after it is a feature
    X_test = np.array(read_data(test_path))
    return np.array(features), np.array(targets), X_test

# Locations of the competition's train and test files.
train_path = '/kaggle/input/bioresponse/train.csv'
test_path = '/kaggle/input/bioresponse/test.csv'

X, y, X_submission = load(train_path, test_path)

# Sanity check: print each array's shape followed by its first entry.
for _arr in (X, y, X_submission):
    print(_arr.shape)
    print(_arr[0])

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

In [None]:
np.random.seed(0)   # seed NumPy's global RNG, used by the optional shuffle below

n_folds = 10
shuffle = False

if shuffle:
    idx = np.random.permutation(y.size)
    X, y = X[idx], y[idx]

# Materialise the (train indices, held-out indices) pairs once so that every
# model is evaluated on exactly the same folds.
skf = list(StratifiedKFold(n_folds).split(X, y))

# Level-0 models: random forests and extra-trees with both split criteria,
# plus one gradient boosting model.
base_models = [
    forest_cls(n_estimators=1000, n_jobs=-1, criterion=criterion)
    for forest_cls in (RandomForestClassifier, ExtraTreesClassifier)
    for criterion in ('gini', 'entropy')
]
base_models.append(
    GradientBoostingClassifier(
        learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=600))

# Wrap every model in isotonic probability calibration (5 stratified folds).
clfs = [
    CalibratedClassifierCV(model, method='isotonic', cv=StratifiedKFold(5))
    for model in base_models
]

# Creating train and test sets for blending: one column per model.
n_train_rows, n_submission_rows = X.shape[0], X_submission.shape[0]
dataset_blend_train = np.zeros((n_train_rows, len(clfs)))        # (3751, 5)
dataset_blend_test = np.zeros((n_submission_rows, len(clfs)))    # (2501, 5)

for model_no, model in enumerate(clfs):
    print(model_no, model)
    # One column of submission-set predictions per fold; averaged afterwards.
    fold_predictions = np.zeros((n_submission_rows, len(skf)))
    for fold_no, (fit_rows, holdout_rows) in enumerate(skf):
        print('Fold', fold_no)
        model.fit(X[fit_rows], y[fit_rows])
        # Out-of-fold predictions: each training row is scored by the model
        # that was fitted without it.
        dataset_blend_train[holdout_rows, model_no] = (
            model.predict_proba(X[holdout_rows])[:, 1])
        fold_predictions[:, fold_no] = model.predict_proba(X_submission)[:, 1]
    dataset_blend_test[:, model_no] = fold_predictions.mean(axis=1)

In [None]:
# Blending: a logistic regression over the per-model prediction columns.
clf = LogisticRegression().fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

# Linear stretch of predictions to [0, 1]
lowest, highest = y_submission.min(), y_submission.max()
y_submission = (y_submission - lowest) / (highest - lowest)

# Saving Results: one "id,probability" row per test sample, ids starting at 1.
molecule_ids = np.arange(1, len(y_submission) + 1)
tmp = np.column_stack([molecule_ids, y_submission])
np.savetxt(
    fname='submission.csv',
    X=tmp,
    fmt='%d,%0.9f',
    header='MoleculeId,PredictedProbability',
    comments='',    # write the header without a leading comment prefix
)