In [None]:
def counts(df):
    """Count how often each distinct value occurs in every column of ``df``.

    Applies ``pd.Series.value_counts`` to each column (``axis=0``) and
    returns whatever ``DataFrame.apply`` assembles from the per-column
    results. When columns hold different sets of values, cells for values
    absent from a column come back as NaN.
    """
    return df.apply(pd.Series.value_counts, axis=0)

def get_errors(X, y, model):
    """Collect the observations that ``model`` predicts incorrectly.

    Args:
        X: Feature DataFrame; rows are selected by position.
        y: True labels, positionally aligned with ``X``. Assumed to be a
            pandas Series -- TODO confirm against callers.
        model: Fitted estimator exposing ``predict``.

    Returns:
        DataFrame with the mispredicted rows of ``X`` and their true label
        appended as the last column, sorted by that label.

    Side effects:
        Prints the number of mispredicted rows.
    """
    if X.shape[0]:
        # One batched predict call instead of one call per row.
        mismatches = model.predict(X) != y.values
        errors = [pos for pos, wrong in enumerate(mismatches) if wrong]
    else:
        # Nothing to score; skip predict, which may reject empty input.
        errors = []
    errs = pd.concat([X.iloc[errors], y.iloc[errors]], axis=1, join='outer')
    print('Errors:', errs.shape[0])
    # Sort by the label column itself (last after the concat) instead of a
    # hard-coded 'survived', so targets with any name are supported.
    return errs.sort_values(errs.columns[-1])

def _format_report(report):
    """Render a ``classification_report(output_dict=True)`` dict on one line.

    Scalar entries are shown as ``name=0.00``; entries whose value is itself
    a dict of metrics are shown as ``name[metric=0.00 ...]``.
    """
    parts = []
    for name, value in report.items():
        if isinstance(value, dict):
            inner = ' '.join(
                '{0}={1:.2f}'.format(metric, score)
                for metric, score in value.items()
            )
            parts.append('{0}[{1}]'.format(name, inner))
        else:
            parts.append('{0}={1:.2f}'.format(name, value))
    return ' '.join(parts)


def test_data(X, y, title, clf, sampler=None, splits=3):
    """Cross-validate ``clf`` with K-fold splits and print per-fold results.

    For every fold this fits ``clf`` on the training part (optionally
    resampled first), predicts the held-out part and prints:
    feature importances in descending order (when the fitted model exposes
    ``feature_importances_``), the accuracy, the positive count / size /
    positive ratio of the held-out labels, and a one-line classification
    report.

    Args:
        X: Feature DataFrame, split by position.
        y: Labels positionally aligned with ``X``. The positive count is
            computed by summing, so 0/1 labels are presumably expected --
            TODO confirm.
        title: Heading printed before the fold output.
        clf: Estimator with ``fit``/``predict``. The same instance is
            refitted for every fold.
        sampler: Optional object with ``fit_resample``; applied to the
            training part only.
        splits: Number of folds passed to ``KFold``.

    Returns:
        None; all results are printed.
    """
    print('\n', '='*10, title, '='*10)
    kfold = KFold(n_splits=splits)
    for train_i, test_i in kfold.split(X):
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        y_train, y_test = y.iloc[train_i], y.iloc[test_i]
        if sampler:
            # Resample only the training part so the held-out fold keeps
            # its original distribution.
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        model = clf.fit(X_train, y_train)
        preds = model.predict(X_test)
        # Not every estimator exposes importances; skip the line if absent.
        importances = getattr(model, 'feature_importances_', None)
        if importances is not None:
            ranked = sorted(zip(importances, X.columns), reverse=True)
            print(' '.join('{0: <7.7}{1}'.format(str(k), v) for v, k in ranked))
        print(metrics.accuracy_score(y_test, preds))
        test_survived, test_size = sum(y_test), y_test.shape[0]
        # Ratio rounded to two decimals (the 2 belongs inside round()).
        print(test_survived, test_size, round(test_survived / test_size, 2))
        report = metrics.classification_report(y_test, preds, output_dict=True)
        print(_format_report(report))

def sample_features(X, y, clf, population, repeat, fixed=None, size=6):
    """Evaluate ``clf`` on random subsets of the columns of ``X``.

    Each round draws ``size`` entries from ``population`` without
    replacement, optionally adds ``fixed``, selects those columns by
    position and hands the reduced frame to ``test_data``.

    Args:
        X: Feature DataFrame.
        y: Labels passed through to ``test_data``.
        clf: Estimator passed through to ``test_data``.
        population: Iterable of positional column indices to draw from
            (they are used with ``X.iloc[:, ...]``).
        repeat: Number of random subsets to evaluate.
        fixed: Optional positional column index included in every subset.
        size: Number of columns drawn per subset.

    Raises:
        ValueError: From ``random.sample`` when ``size`` exceeds the number
            of entries in ``population``.
    """
    # random.sample needs a sequence; materialise once, outside the loop.
    candidates = list(population)
    for run in range(1, repeat + 1):
        sample = random.sample(candidates, size)
        # Avoid selecting the same column twice when ``fixed`` was drawn.
        if fixed is not None and fixed not in sample:
            sample.append(fixed)
        X_sample = X.iloc[:, sample]
        # test_data expects (X, y, title, clf); supply the missing title.
        title = 'feature sample {0}/{1}'.format(run, repeat)
        test_data(X_sample, y, title, clf)