Pipeline for blending sklearn models without any hyperparameter optimization.

version 1


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.optimize import fmin # for blending
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn import linear_model


In [None]:
def set_seed(seed=42):
    """Seed NumPy's global random generator and export PYTHONHASHSEED.

    Args:
        seed: Integer seed; its string form is written to the
            ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed


# Apply the default seed once at start-up.
set_seed()

In [None]:
path = '../input/tabular-playground-series-mar-2021/'

# Read the train, test and sample-submission tables from the same directory.
train_df, test_df, sample_sub = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column (whatever its role) as continuous.
cat_cols = train_df.select_dtypes(include=['object']).columns
cont_cols = train_df.select_dtypes(exclude=['object']).columns

In [None]:
# Store the target column as float.
train_df['target'] = train_df.target.astype(float)

## Short EDA

In [None]:
# Histogram of the target values.
train_df['target'].hist()

In [None]:
# Number of rows per distinct target value.
train_df['target'].value_counts()

In [None]:
# One histogram per categorical column, stacked vertically in a single figure.
n_cat = len(cat_cols)
f, axes = plt.subplots(nrows=n_cat, ncols=1, figsize=(30, 4 * n_cat))

for col, ax in zip(cat_cols, axes):
    ax.set_title(col)
    ax.hist(train_df[col])

In [None]:
# One row of three plots per non-object column:
# KDE (legend shows the skew), probability plot, and horizontal box plot.
n_cont = len(cont_cols)
f, axes = plt.subplots(nrows=n_cont, ncols=3, figsize=(30, 4 * n_cont))

for col, ax in zip(cont_cols, axes):
    series = train_df[col]
    g = sns.kdeplot(series, shade=True, label="%.2f" % series.skew(), ax=ax[0])
    g = g.legend(loc="best")
    scipy.stats.probplot(series, plot=ax[1])
    sns.boxplot(x=col, data=train_df, orient='h', ax=ax[2])

In [None]:
# Absolute pairwise correlations between the non-object columns.
corr = train_df[cont_cols].corr().abs()

fig, ax = plt.subplots(figsize=(14, 14))

# Annotated heatmap on a fixed [0, 1] colour scale.
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=0,
    vmax=1,
    cbar_kws={"shrink": .8},
)

plt.yticks(rotation=0)
plt.show()

In [None]:
n_folds = 5


def trainFn(model_fn, data, target, cv=None, test=None, predict_proba=False):
    """Cross-validate a model and collect out-of-fold and test predictions.

    A fresh model is created with ``model_fn()`` for every fold, fitted on
    the fold's training rows and scored with ROC AUC on its held-out rows.

    Args:
        model_fn: Zero-argument callable returning an unfitted estimator
            with ``fit`` and ``predict`` (or ``predict_proba``) methods.
        data: Feature matrix supporting ``.shape`` and row indexing with an
            index array (dense array or sparse matrix).
        target: Array-like of labels aligned with the rows of ``data``.
        cv: Splitter exposing ``split`` and ``get_n_splits``. Defaults to
            ``KFold(n_folds)``.
        test: Optional feature matrix; when given, every fold's model
            predicts on it and the predictions are averaged over the folds.
        predict_proba: When true, use the last column of
            ``model.predict_proba`` instead of ``model.predict``.

    Returns:
        Tuple ``(OOF, oof_auc, test_preds)``: the out-of-fold predictions,
        their ROC AUC against ``target``, and the fold-averaged test
        predictions (``None`` when ``test`` is not supplied).
    """
    if cv is None:
        cv = KFold(n_folds)

    target = np.asarray(target)

    # Size the buffers from the arguments rather than from notebook globals,
    # so the function works for any data/test pair and for the default cv.
    OOF = np.zeros(data.shape[0], dtype=float)
    test_preds = None if test is None else np.zeros(test.shape[0], dtype=float)

    def _predict(model, features):
        # Last predict_proba column, or the plain predict output.
        if predict_proba:
            return model.predict_proba(features)[:, -1]
        return model.predict(features)

    for fold, (train_idx, val_idx) in enumerate(cv.split(data, target)):
        print(f'Fold: {fold + 1}')

        model = model_fn()
        model.fit(data[train_idx], target[train_idx])

        preds = _predict(model, data[val_idx])
        OOF[val_idx] += preds

        print(f'\tauc: {roc_auc_score(target[val_idx], preds)}')

        if test_preds is not None:
            test_preds += _predict(model, test)

    oof_auc = roc_auc_score(target.astype(int), OOF)
    print(f'OOF auc: {oof_auc}')

    if test_preds is not None:
        # Average over the number of folds actually run by this splitter.
        test_preds /= cv.get_n_splits(data, target)

    return OOF, oof_auc, test_preds

In [None]:
# one hot encoding
# One-hot encode the categorical columns (unknown categories are ignored
# instead of raising); all remaining columns are passed through untouched.
pipeline1 = make_pipeline(
    ColumnTransformer(
        transformers=[('oh', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
        remainder='passthrough',
    )
)

In [None]:
# Maps model name -> out-of-fold AUC.
results = {}

## Linear models

In [None]:
# Cross-validate LinearRegression on the one-hot encoded features.
OOF1, oof_auc1, test_preds1 = trainFn(
    model_fn=linear_model.LinearRegression,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df),
)

results['LinearRegression'] = oof_auc1

In [None]:
# Cross-validate Ridge on the one-hot encoded features.
OOF2, oof_auc2, test_preds2 = trainFn(
    model_fn=linear_model.Ridge,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df),
)

results['Ridge'] = oof_auc2

In [None]:
# Cross-validate BayesianRidge; both matrices are densified with toarray().
OOF3, oof_auc3, test_preds3 = trainFn(
    model_fn=linear_model.BayesianRidge,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])).toarray(),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
)

results['BayesianRidge'] = oof_auc3

In [None]:
# Cross-validate ARDRegression; both matrices are densified with toarray().
OOF4, oof_auc4, test_preds4 = trainFn(
    model_fn=linear_model.ARDRegression,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])).toarray(),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
)

results['ARDRegression'] = oof_auc4

### Blending

In [None]:
class Blender():
    """Weighted linear blend of several models' predictions.

    ``fit`` searches for the weights that maximise ROC AUC of the blended
    out-of-fold predictions; ``blend`` applies those weights to any stack of
    predictions laid out as one row per model.
    """

    def __init__(self):
        # Learned blend weights; stays None until fit() has run.
        self.best_weights = None

    def _blend(self, preds, weights):
        """Return ``weights @ preds`` as a 1-D array of length ``preds.shape[-1]``."""
        return np.matmul(weights, preds).reshape(preds.shape[-1])

    def blend(self, preds):
        """Blend ``preds`` (one row per model) with the fitted weights.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.best_weights is None:
            raise RuntimeError('Blender.fit must be called before Blender.blend')
        return self._blend(preds, self.best_weights)

    def fit(self, oof, target=None):
        """Optimise the blend weights on out-of-fold predictions.

        Args:
            oof: 2-D array with one row of out-of-fold predictions per model.
            target: True labels aligned with the columns of ``oof``. When
                omitted, ``train_df.target.values`` is read at call time
                (not frozen when the class is defined).
        """
        if target is None:
            target = train_df.target.values

        # Start from equal weights for every model.
        initial_weights = np.ones(len(oof), dtype=np.float32) / len(oof)

        print(f'Initial auc: {roc_auc_score(target, self._blend(oof, initial_weights))}')

        def objective(weights):
            # fmin minimises, so return the negated AUC.
            blend = self._blend(oof, weights)

            return -roc_auc_score(target, blend)

        self.best_weights = fmin(objective, initial_weights)

        print(f'After: {roc_auc_score(target, self.blend(oof))}')


blender  = Blender()

In [None]:
# Stack the linear models' predictions, one row per model, then fit weights.
OOF = np.vstack([OOF1, OOF2, OOF3, OOF4])
test_preds = np.vstack([test_preds1, test_preds2, test_preds3, test_preds4])

blender.fit(OOF)

In [None]:
# Write the blended linear-model predictions as a submission file.
blended = blender.blend(test_preds)
sample_sub['target'] = blended
sample_sub.to_csv('submission_lin.csv', index=False)

## Tree models

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn import ensemble

In [None]:
# Cross-validate AdaBoostClassifier using predicted probabilities.
# The training matrix is passed as returned by the pipeline, while the test
# matrix is densified with toarray().
OOF5, oof_auc5, test_preds5 = trainFn(
    model_fn=ensemble.AdaBoostClassifier,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
    predict_proba=True,
)

results['AdaBoostClassifier'] = oof_auc5

In [None]:
# Cross-validate BaggingClassifier using predicted probabilities.
OOF6, oof_auc6, test_preds6 = trainFn(
    model_fn=ensemble.BaggingClassifier,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])).toarray(),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
    predict_proba=True,
)

results['BaggingClassifier'] = oof_auc6

In [None]:
# Cross-validate ExtraTreesClassifier using predicted probabilities.
OOF7, oof_auc7, test_preds7 = trainFn(
    model_fn=ensemble.ExtraTreesClassifier,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])).toarray(),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
    predict_proba=True,
)

results['ExtraTreesClassifier'] = oof_auc7

In [None]:
# Cross-validate GradientBoostingClassifier using predicted probabilities.
OOF8, oof_auc8, test_preds8 = trainFn(
    model_fn=ensemble.GradientBoostingClassifier,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])).toarray(),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
    predict_proba=True,
)

results['GradientBoostingClassifier'] = oof_auc8

In [None]:
# Cross-validate RandomForestClassifier using predicted probabilities.
OOF9, oof_auc9, test_preds9 = trainFn(
    model_fn=ensemble.RandomForestClassifier,
    data=pipeline1.fit_transform(train_df.drop(columns=['target'])).toarray(),
    target=train_df['target'].values,
    cv=KFold(n_splits=n_folds),
    test=pipeline1.transform(test_df).toarray(),
    predict_proba=True,
)

results['RandomForestClassifier'] = oof_auc9

In [None]:
# Stack the tree models' predictions, one row per model, then fit weights.
OOF = np.vstack([OOF5, OOF6, OOF7, OOF8, OOF9])
test_preds = np.vstack([test_preds5, test_preds6, test_preds7, test_preds8, test_preds9])

blender.fit(OOF)

In [None]:
# Write the blended tree-model predictions as a submission file.
blended = blender.blend(test_preds)
sample_sub['target'] = blended
sample_sub.to_csv('submission_tree.csv', index=False)

In [None]:
# Stack the predictions of all nine models, one row per model, then fit weights.
OOF = np.vstack([
    OOF1, OOF2, OOF3, OOF4,
    OOF5, OOF6, OOF7, OOF8, OOF9,
])
test_preds = np.vstack([
    test_preds1, test_preds2, test_preds3, test_preds4,
    test_preds5, test_preds6, test_preds7, test_preds8, test_preds9,
])

blender.fit(OOF)

In [None]:
# Write the blend of all models as a submission file.
blended = blender.blend(test_preds)
sample_sub['target'] = blended
sample_sub.to_csv('submission_all.csv', index=False)