Hi Kagglers.
In this notebook I would like to improve my model's performance with blending. It seems to me that in most competitions Kagglers average their predictions, or blend them with weights, to get better results. I will train 3 models with their default settings, use their out-of-fold predictions to find optimal blending weights, and then blend the predictions with those weights.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the competition's training table, test table and sample-submission
# template from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/test.csv")
sample = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv")

In [None]:
train_df.head()

In [None]:
# Split the column names by prefix: "cat*" columns are categorical,
# "cont*" columns are continuous.
column_index = train_df.columns
cat_feats = column_index[column_index.str.startswith("cat")].tolist()
num_feats = column_index[column_index.str.startswith("cont")].tolist()

In [None]:
# Give test rows a sentinel target of -1 so that train and test can be stacked
# into one frame and separated again later by filtering on this value.
test_df['target'] = -1

In [None]:
all_df = pd.concat([train_df, test_df])

In [None]:
all_df.head()

In [None]:
# One-hot encode the categorical columns on the combined train+test frame so
# both parts get the same dummy columns; drop_first=True omits the first level
# of each categorical.
dummies_df = pd.get_dummies(all_df[cat_feats], drop_first=True)
# Reassemble the modelling frame: id, dummy columns, continuous columns, target.
new_df = pd.concat([all_df['id'],dummies_df, all_df[num_feats], all_df['target']], axis=1)

In [None]:
new_df.head()

In [None]:
# Undo the train/test stacking: rows carrying the -1 sentinel are test rows.
is_test_row = new_df["target"] == -1
train = new_df[~is_test_row]
# The sentinel target has no meaning for the test set, so drop the column.
test = new_df[is_test_row].drop("target", axis=1)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of every continuous feature, computed on the
# training rows only.
# NOTE(review): no intercept column is added to the design matrix here;
# statsmodels' VIF is usually computed with a constant included -- confirm
# this is the intended variant.
vif = pd.DataFrame()
vif["variables"] = num_feats
vif["VIF"] = [variance_inflation_factor(train[num_feats].values, i) for i in range(train[num_feats].shape[1])]

In [None]:
# Show the features with the largest VIF first, colour-coded for readability.
vif = vif.sort_values(by=["VIF"], ascending=False)
vif.style.background_gradient(cmap="magma")

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
# Shuffle the training rows before assigning folds. The seed is fixed so that
# the fold assignment (and therefore every out-of-fold score below) is
# reproducible between runs of the notebook.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
y = train['target'].values

In [None]:
skf = StratifiedKFold(n_splits=5)

# Create the fold column explicitly as an integer column. -1 means "not
# assigned", which keeps the dtype integral (instead of float with NaN) and
# makes any row missed by the splitter easy to spot.
train["kfold"] = -1

# Only the validation indices are needed: each row belongs to exactly one
# validation fold, and that fold number is what gets stored.
for fold, (_, valid_idx) in enumerate(skf.split(train, y)):
    train.loc[valid_idx, "kfold"] = fold

In [None]:
def run_training(df, algo, fold, model_name, test):
    """Fit ``algo`` on every fold except ``fold`` and predict the held-out fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing ``id``, ``kfold`` and ``target`` columns.
        All remaining columns are used as features.
    algo : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place; the same object may be passed again for the next fold.
    fold : int
        Fold number to hold out for validation.
    model_name : str
        Name of the column that receives the held-out predictions.
    test : pandas.DataFrame or array-like
        Test features without the ``id`` column, in the same column order as
        the training features.

    Returns
    -------
    tuple
        ``(valid_predictions, sub_preds)`` where ``valid_predictions`` is a
        DataFrame with columns ``id``, ``kfold``, ``target`` and
        ``model_name`` for the held-out rows, and ``sub_preds`` is the array
        of positive-class probabilities for ``test``.
    """
    non_feature_cols = ["id", "kfold", "target"]

    is_valid = df.kfold == fold
    df_train = df[~is_valid].reset_index(drop=True)
    df_valid = df[is_valid].reset_index(drop=True)

    xtrain = df_train.drop(non_feature_cols, axis=1).values
    xvalid = df_valid.drop(non_feature_cols, axis=1).values

    ytrain = df_train.target.values
    yvalid = df_valid.target.values

    model = algo
    model.fit(xtrain, ytrain)

    # Column 1 of predict_proba is the probability of the positive class.
    preds = model.predict_proba(xvalid)[:, 1]
    auc = roc_auc_score(yvalid, preds)
    print(f"fold={fold}, auc={auc}")

    df_valid[model_name] = preds

    # The model was fitted on a plain array, so predict on a plain array too.
    # Mixing a DataFrame in here triggers feature-name warnings/checks in the
    # libraries and hides the fact that features are matched by position.
    sub_preds = model.predict_proba(np.asarray(test))[:, 1]

    return df_valid[["id", "kfold", "target", model_name]], sub_preds

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=200, verbose=1)

In [None]:
test_df_sub = pd.DataFrame({"id": test["id"].values})

In [None]:
# Random forest: collect out-of-fold predictions for every training row and
# average the test-set predictions over the folds.
n_folds = 5
dfs = []
test_temp = np.zeros(len(test))
# The test features do not change between folds, so drop the id column once
# instead of copying the whole frame on every iteration.
test_features = test.drop("id", axis=1)

for fold in range(n_folds):
    temp_df, test_preds = run_training(train, clf, fold, "random_forest", test_features)
    dfs.append(temp_df)
    test_temp += test_preds

test_df_sub["forest_mean_fold"] = test_temp / n_folds
fin_valid_df_rfc = pd.concat(dfs)

In [None]:
fin_valid_df_rfc.head()

In [None]:
roc_auc_score(fin_valid_df_rfc["target"], fin_valid_df_rfc["random_forest"])

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(max_iter=100000,verbose=1)

In [None]:
# Logistic regression: collect out-of-fold predictions for every training row
# and average the test-set predictions over the folds.
n_folds = 5
dfs = []
test_temp = np.zeros(len(test))
# The test features do not change between folds, so drop the id column once
# instead of copying the whole frame on every iteration.
test_features = test.drop("id", axis=1)

for fold in range(n_folds):
    temp_df, test_preds = run_training(train, lr, fold, "logisticRegression", test_features)
    dfs.append(temp_df)
    test_temp += test_preds

test_df_sub["lr_mean_fold"] = test_temp / n_folds
fin_valid_df_lr = pd.concat(dfs)

In [None]:
fin_valid_df_lr.head()

In [None]:
roc_auc_score(fin_valid_df_lr["target"], fin_valid_df_lr["logisticRegression"])

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier(use_label_encoder=False)

In [None]:
# XGBoost: collect out-of-fold predictions for every training row and average
# the test-set predictions over the folds.
n_folds = 5
dfs = []
test_temp = np.zeros(len(test))
# The test features do not change between folds, so drop the id column once
# instead of copying the whole frame on every iteration.
test_features = test.drop("id", axis=1)

for fold in range(n_folds):
    temp_df, test_preds = run_training(train, xgb, fold, "xgboost", test_features)
    dfs.append(temp_df)
    test_temp += test_preds

test_df_sub["xgb_mean_fold"] = test_temp / n_folds
fin_valid_df_xgb = pd.concat(dfs)

In [None]:
fin_valid_df_xgb.head()

In [None]:
roc_auc_score(fin_valid_df_xgb["target"], fin_valid_df_xgb["xgboost"])

In [None]:
test_df_sub.head()

# Blending - optimal weights

In [None]:
from functools import partial
from scipy.optimize import fmin

In [None]:
class OptimizerAUC:
    """Search for linear blending weights that maximise ROC AUC.

    Each column of ``X`` holds the predictions of one model. ``fit`` looks for
    one weight per column such that the weighted sum of the columns has the
    highest AUC against ``y``; ``predict`` applies those weights.

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the random starting weights. ``None`` (the default) draws
        them from NumPy's global random state, as before.
    """

    def __init__(self, random_state=None):
        # Placeholder until fit() is called; predict() then returns zeros.
        self.coef_ = 0
        self.random_state = random_state

    @staticmethod
    def _blend(coef, X):
        """Return the weighted row-wise sum of the columns of ``X``."""
        return np.sum(X * coef, axis=1)

    def _auc(self, coef, X, y):
        """Return the negative AUC of the blend, so a minimiser maximises AUC."""
        auc_score = roc_auc_score(y, self._blend(coef, X))
        return -1.0 * auc_score

    def fit(self, X, y):
        """Optimise the weights with ``scipy.optimize.fmin`` and store them in ``coef_``.

        ``X`` must expose ``shape[1]`` (one column per model).
        """
        partial_loss = partial(self._auc, X=X, y=y)
        n_models = X.shape[1]
        # Start from a random point on the simplex (non-negative, sums to 1).
        if self.random_state is None:
            init_coef = np.random.dirichlet(np.ones(n_models))
        else:
            rng = np.random.RandomState(self.random_state)
            init_coef = rng.dirichlet(np.ones(n_models))
        self.coef_ = fmin(partial_loss, init_coef, disp=True)

    def predict(self, X):
        """Return the blend of the columns of ``X`` using the fitted weights."""
        return self._blend(self.coef_, X)


def run_training2(pred_df, fold, col_names):
    """Fit blending weights on all folds but ``fold`` and report held-out AUC.

    ``pred_df`` must contain ``kfold``, ``target`` and the prediction columns
    listed in ``col_names``. The weights are fitted with ``OptimizerAUC`` on
    the rows outside ``fold``, evaluated on the rows of ``fold`` (the AUC is
    printed), and returned.
    """
    held_out = pred_df.kfold == fold
    fit_part = pred_df[~held_out].reset_index(drop=True)
    eval_part = pred_df[held_out].reset_index(drop=True)

    optimizer = OptimizerAUC()
    optimizer.fit(fit_part[col_names].values, fit_part.target.values)

    blended = optimizer.predict(eval_part[col_names].values)
    auc = roc_auc_score(eval_part.target.values, blended)
    print(f"Fold={fold}, AUC={auc}")

    return optimizer.coef_

In [None]:
dfs_list = [fin_valid_df_lr, fin_valid_df_rfc, fin_valid_df_xgb]

# Every frame carries the same id / kfold / target columns plus its own
# prediction column. Keep the bookkeeping columns from the first frame only
# and attach just the prediction column of the others. Merging the full frames
# would duplicate kfold/target into suffixed "_x"/"_y" copies and leave plain
# "kfold"/"target" columns only by accident of the number of frames.
df = dfs_list[0]
for other in dfs_list[1:]:
    pred_cols = [c for c in other.columns if c not in ("kfold", "target")]
    df = df.merge(other[pred_cols], on="id", how="left")

In [None]:
df.head()

In [None]:
targets = df.target.values
col_names = ["logisticRegression", "random_forest", "xgboost"]

In [None]:
# One set of blending weights per fold (each fitted with that fold held out).
coefs = [run_training2(df, held_out_fold, col_names) for held_out_fold in range(5)]

In [None]:
coefs = np.array(coefs)
coefs_mean = np.mean(coefs, axis=0)
print(coefs_mean)

In [None]:
col_names

In [None]:
# Blend the out-of-fold predictions with the fold-averaged weights; the weight
# order follows the order in which the models were optimised.
model_order = ("logisticRegression", "random_forest", "xgboost")
wt_avg = sum(
    weight * df[model].values for weight, model in zip(coefs_mean, model_order)
)
print("Optimal auc after finding coefs")
wt_auc = roc_auc_score(targets, wt_avg)
print(f"Optimized weighted avg of auc: {wt_auc}")

## Submit blending predictions

In [None]:
test_df_sub

In [None]:
# Weighted blend of the fold-averaged test predictions. Every model's
# predictions are multiplied by that model's weight (the XGBoost term used to
# add its weight instead of multiplying by it).
wt_avg_blend = (
    coefs_mean[0] * test_df_sub["lr_mean_fold"].values
    + coefs_mean[1] * test_df_sub["forest_mean_fold"].values
    + coefs_mean[2] * test_df_sub["xgb_mean_fold"].values
)

In [None]:
sample['target'] = wt_avg_blend

In [None]:
sample.to_csv("blend_avg_weights_sub2.csv", index=False)

## Add more features

Some suggest that adding predictions as new features can improve the model; let's test that.

In [None]:
col_names

In [None]:
# "id" is needed as the merge key. Guard the append so that re-running this
# cell does not add the column name a second time.
if "id" not in col_names:
    col_names.append("id")

In [None]:
new_train = train.merge(df[col_names], on="id", how="left")

In [None]:
# The models are trained on positional arrays, so the stacked test features
# must appear under the same names and in the same order as the out-of-fold
# columns added to the training frame (logisticRegression, random_forest,
# xgboost). test_df_sub stores them as forest / lr / xgb, so rename and reorder
# before merging; otherwise the forest and logistic-regression columns would be
# swapped at prediction time.
stacked_test_feats = test_df_sub.rename(
    columns={
        "lr_mean_fold": "logisticRegression",
        "forest_mean_fold": "random_forest",
        "xgb_mean_fold": "xgboost",
    }
)[["id", "logisticRegression", "random_forest", "xgboost"]]
new_test = test.merge(stacked_test_feats, on="id", how="left")

In [None]:
new_train.head()

In [None]:
xgb = XGBClassifier(use_label_encoder=False)

In [None]:
test_df_sub2 = pd.DataFrame()

In [None]:
# Second-stage XGBoost (original features plus first-stage predictions):
# collect out-of-fold predictions and average the test predictions over folds.
n_folds = 5
dfs = []
test_temp = np.zeros(len(new_test))
# The test features do not change between folds, so drop the id column once
# instead of copying the whole frame on every iteration.
test_features = new_test.drop("id", axis=1)

for fold in range(n_folds):
    temp_df, test_preds = run_training(new_train, xgb, fold, "xgboost", test_features)
    dfs.append(temp_df)
    test_temp += test_preds

test_df_sub2["xgb_mean_fold"] = test_temp / n_folds
fin_valid_df_xgb = pd.concat(dfs)

In [None]:
roc_auc_score(fin_valid_df_xgb["target"], fin_valid_df_xgb["xgboost"])

In [None]:
test_df_sub2.head()

In [None]:
rfc = RandomForestClassifier(n_estimators=200, verbose=1)

In [None]:
# Second-stage random forest (original features plus first-stage predictions):
# collect out-of-fold predictions and average the test predictions over folds.
n_folds = 5
dfs = []
test_temp = np.zeros(len(new_test))
# The test features do not change between folds, so drop the id column once
# instead of copying the whole frame on every iteration.
test_features = new_test.drop("id", axis=1)

for fold in range(n_folds):
    temp_df, test_preds = run_training(new_train, rfc, fold, "random_forest", test_features)
    dfs.append(temp_df)
    test_temp += test_preds

test_df_sub2["rfc_mean_fold"] = test_temp / n_folds
fin_valid_df_rfc = pd.concat(dfs)

In [None]:
roc_auc_score(fin_valid_df_rfc["target"], fin_valid_df_rfc["random_forest"])

In [None]:
test_df_sub2.head()

In [None]:
lr = LogisticRegression(max_iter=100000,verbose=1)

In [None]:
# Second-stage logistic regression (original features plus first-stage
# predictions): collect out-of-fold predictions and average the test
# predictions over folds.
n_folds = 5
dfs = []
# Size the accumulator from the frame that is actually predicted on
# (new_test), consistent with the other second-stage models.
test_temp = np.zeros(len(new_test))
# The test features do not change between folds, so drop the id column once
# instead of copying the whole frame on every iteration.
test_features = new_test.drop("id", axis=1)

for fold in range(n_folds):
    temp_df, test_preds = run_training(new_train, lr, fold, "logisticRegression", test_features)
    dfs.append(temp_df)
    test_temp += test_preds

test_df_sub2["lr_mean_fold"] = test_temp / n_folds
fin_valid_df_lr = pd.concat(dfs)

# Overall out-of-fold AUC of the second-stage logistic regression.
roc_auc_score(fin_valid_df_lr["target"], fin_valid_df_lr["logisticRegression"])

In [None]:
dfs_list = [fin_valid_df_lr, fin_valid_df_rfc, fin_valid_df_xgb]

# Keep id / kfold / target from the first frame only and attach just the
# prediction column of the others. Merging the full frames would duplicate
# kfold/target into suffixed "_x"/"_y" copies and leave plain "kfold"/"target"
# columns only by accident of the number of frames.
df = dfs_list[0]
for other in dfs_list[1:]:
    pred_cols = [c for c in other.columns if c not in ("kfold", "target")]
    df = df.merge(other[pred_cols], on="id", how="left")

targets = df.target.values
col_names = ["logisticRegression", "random_forest", "xgboost"]

# One set of blending weights per fold (each fitted with that fold held out).
coefs = []
for j in range(5):
    coefs.append(run_training2(df, j, col_names))

In [None]:
coefs = np.array(coefs)
coefs_mean = np.mean(coefs, axis=0)
print(coefs_mean)

In [None]:
# Blend the second-stage out-of-fold predictions with the fold-averaged
# weights; the weight order follows the order in which the models were
# optimised.
model_order = ("logisticRegression", "random_forest", "xgboost")
wt_avg = sum(
    weight * df[model].values for weight, model in zip(coefs_mean, model_order)
)
print("Optimal auc after finding coefs")
wt_auc = roc_auc_score(targets, wt_avg)
print(f"Optimized weighted avg of auc: {wt_auc}")

In [None]:
test_df_sub2.head()

In [None]:
# Blend the SECOND-stage test predictions (test_df_sub2): the weights in
# coefs_mean were fitted on the second-stage out-of-fold predictions, so
# applying them to the first-stage test predictions would be inconsistent.
# Every model's predictions are multiplied by that model's weight (the XGBoost
# term used to add its weight instead of multiplying by it).
wt_avg_blend = (
    coefs_mean[0] * test_df_sub2["lr_mean_fold"].values
    + coefs_mean[1] * test_df_sub2["rfc_mean_fold"].values
    + coefs_mean[2] * test_df_sub2["xgb_mean_fold"].values
)

sample['target'] = wt_avg_blend
sample.to_csv("blend_avg_weights_sub3.csv", index=False)

Indeed, adding predictions as new features can improve the model, as we can see in this notebook; however, none of these steps has improved my score on the Kaggle competition leaderboard (LB). Can anyone explain that to me? Please leave feedback if you found it interesting.