## Find the optimal ensemble based on OOF (out-of-fold) predictions

In [None]:
import pandas as pd
import numpy as np
from functools import partial
import os
import random
import joblib
import json
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import accuracy_score
from scipy.optimize import differential_evolution

import gc
from functools import reduce
from itertools import combinations, chain
from tqdm import tqdm
from sklearn.model_selection import KFold
from itertools import chain
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the training set and turn the textual target into zero-based integer
# class ids: the first six characters of each label are dropped, the rest is
# parsed as an int and shifted down by one.
TARGET_NAME = 'target'
train_data = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train_data[TARGET_NAME] = train_data[TARGET_NAME].str[6:].astype(int) - 1

# Re-encode the ids with a LabelEncoder; the fitted encoder stays available
# as `le`.
le = LabelEncoder().fit(train_data[TARGET_NAME])
train_data[TARGET_NAME] = le.transform(train_data[TARGET_NAME])
train_data.head()

In [None]:
# Load each model's out-of-fold (OOF) predictions and discard the four
# per-class columns; the remaining columns are kept untouched.
oof_catboost = pd.read_csv('../input/tabular-may-2021-oof/oof_catboost.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])

oof_lightautoml = pd.read_csv('../input/tabular-may-2021-oof/oof_lightautoml.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])

oof_lightgbm = pd.read_csv('../input/tabular-may-2021-oof/oof_lightgbm.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])

oof_xgboost = pd.read_csv('../input/tabular-may-2021-oof/oof_xgboost.csv').drop(
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'])

In [None]:
# Rename the 'lightgbm' column of the XGBoost OOF frame to 'xgboost' so the
# frame can later be indexed by its own model name.
# NOTE(review): presumably the OOF file was written with a mislabeled column
# by the notebook that produced it — confirm against that notebook.
oof_xgboost.rename(columns={'lightgbm': 'xgboost'}, inplace=True)
oof_xgboost.head(5)

In [None]:
# Preview the first five rows of the LightGBM OOF frame.
oof_lightgbm.head(5)

In [None]:
# Rename the 'lightgbm' column of the LightAutoML OOF frame to 'lightautoml'
# so the frame can later be indexed by its own model name.
# NOTE(review): presumably the OOF file was written with a mislabeled column
# by the notebook that produced it — confirm against that notebook.
oof_lightautoml.rename(columns={'lightgbm': 'lightautoml'}, inplace=True)
oof_lightautoml.head(5)

In [None]:
# Preview the first five rows of the CatBoost OOF frame.
oof_catboost.head(5)

In [None]:
# Names of the model prediction columns, in the order they appear in the
# combined frame.
columns = ['catboost', 'lightautoml', 'lightgbm', 'xgboost']

# Place every model's OOF prediction column side by side, followed by the
# true label. `pd.concat(axis=1)` aligns on the index.
# NOTE(review): assumes every OOF file lists rows in the same order as
# train.csv — confirm.
oof_predictions_v3 = pd.concat(
    [
        *(
            frame[model_name]
            for model_name, frame in zip(
                columns,
                (oof_catboost, oof_lightautoml, oof_lightgbm, oof_xgboost),
            )
        ),
        train_data['target'],
    ],
    axis=1,
)

In [None]:
# Characters removed from each stringified array before it is parsed.
_BRACKETS_AND_NEWLINES = str.maketrans('', '', '\n[]')


def _parse_probability_string(text):
    """Turn a stringified array (brackets, newlines, spaces) into a float array."""
    cleaned = text.translate(_BRACKETS_AND_NEWLINES).replace('  ', ' ')
    return np.fromstring(cleaned, sep=' ')


# Each model column holds arrays serialised as text; convert every cell back
# into a NumPy array, one column at a time.
for model_column in ('catboost', 'lightautoml', 'lightgbm', 'xgboost'):
    oof_predictions_v3[model_column] = oof_predictions_v3[model_column].apply(
        _parse_probability_string)

In [None]:
# Display the combined frame after the prediction strings have been parsed.
oof_predictions_v3

Check different combinations of OOF predictions and find the optimal model ensemble

In [None]:
# combined[k] holds every (k + 1)-element subset of the model columns, so the
# list covers all non-empty subsets, grouped by size.
combined = [
    list(combinations(columns, subset_size))
    for subset_size in range(1, len(columns) + 1)
]

def evaluate_ensemble(df, columns):
    """Predict one class per row by summing the chosen models' score vectors.

    Args:
        df: DataFrame whose cells in ``columns`` are per-class score sequences.
        columns: Iterable of column names to include in the ensemble.

    Returns:
        Array with, for every row, the index of the class whose summed score
        across ``columns`` is largest.
    """
    selected = [*columns]

    def _predict_row(row):
        # zip(*...) regroups the per-model vectors into per-class tuples.
        per_class_scores = zip(*(row[name] for name in selected))
        class_totals = [np.sum(scores) for scores in per_class_scores]
        return np.argmax(class_totals)

    return df[selected].apply(_predict_row, axis=1).values

# Score every candidate subset of models: accuracy of the summed-score
# ensemble against the true labels, keyed by the tuple of model names.
results = {}
candidate_subsets = list(chain.from_iterable(combined))
with tqdm(total=len(candidate_subsets)) as process_bar:
    for c in candidate_subsets:
        process_bar.update(1)
        results[c] = accuracy_score(
            oof_predictions_v3['target'].values,
            evaluate_ensemble(oof_predictions_v3, c),
        )

Get top 50 combinations

In [None]:
# Show the (at most) 50 best-scoring combinations, highest accuracy first.
dict(sorted(results.items(), key=lambda item: item[1], reverse=True)[:50])

In [None]:
# Display the combined OOF prediction frame again.
oof_predictions_v3

In [None]:
# Keep the label plus the four model columns, label first, so that
# positional slicing from column 1 onward yields only model predictions.
considered_models = oof_predictions_v3.loc[
    :, ["target", "catboost", "lightautoml", "lightgbm", "xgboost"]
]

Calculate weights for each model

In [None]:
# Search for per-model blend weights with differential evolution, using
# 4-fold cross-validation over the OOF predictions: weights are fitted on the
# training part of each fold and scored on the held-out part.
kfold = KFold(n_splits=4)

yhats = considered_models.iloc[:,1:].values
y = considered_models.target.values
n_models = yhats.shape[1]

# Dense float copy of the predictions with shape (rows, models, classes).
# `yhats` is an object array whose cells are arrays; arithmetic on it falls
# back to slow per-cell Python operations, and the objective below is
# evaluated many times by the optimizer.
# NOTE(review): assumes every cell holds the same number of class scores —
# np.stack raises otherwise.
yhats_dense = np.stack(
    [np.stack(list(model_column)) for model_column in yhats.T], axis=1)

# Every model weight is searched in [0, 1].
bounds = [(0.0, 1.0)] * n_models


def calculate_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    return np.average(y_true == y_pred)


def predict_classes(weights, Yhat):
    """Return the argmax class of the weighted average over models.

    Args:
        weights: One weight per model.
        Yhat: Float array of shape (rows, models, classes).

    Returns:
        Integer array with one predicted class index per row.
    """
    weighted = Yhat * np.asarray(weights)[:, None]
    return np.argmax(np.mean(weighted, axis=1), axis=1)


def loss_func(weights, Yhat, Y):
    """Objective for the optimizer: one minus the accuracy of the blend."""
    return 1 - calculate_accuracy(Y, predict_classes(weights, Yhat))


# One (weights, held-out accuracy) tuple per fold.
accuracy = []
for fold, (train_idx, test_idx) in enumerate(kfold.split(yhats, y)):

    print(f"Iteration {fold+1}")

    # Extra positional arguments forwarded to loss_func after the weights.
    minimizeargs = (yhats_dense[train_idx], y[train_idx])

    # The objective only depends on the argmax, so it is unchanged when all
    # weights are multiplied by the same positive factor; the returned
    # weights are therefore meaningful only relative to each other.
    sol = differential_evolution(loss_func, bounds, minimizeargs, maxiter=20, tol=1e-5, disp=True, seed=8)

    # Calculate oof accuracy of optimized weights
    oof_accuracy = calculate_accuracy(
        y[test_idx], predict_classes(sol.x, yhats_dense[test_idx]))

    print(f"{oof_accuracy}")

    accuracy.append((sol.x, oof_accuracy))

In [None]:
# Display the per-fold (optimized weights, held-out accuracy) pairs.
accuracy

In [None]:
# Read each model's prediction file, in the same order as the names on the
# left-hand side.
preds_xgboost, preds_lightautoml, preds_lightgbm, preds_catboost = map(
    pd.read_csv,
    (
        '../input/tabular-may-2021-oof/preds_xgboost.csv',
        '../input/tabular-may-2021-oof/preds_lightautoml.csv',
        '../input/tabular-may-2021-oof/preds_lightgbm.csv',
        '../input/tabular-may-2021-oof/preds_catboost.csv',
    ),
)

In [None]:
# Load the sample submission; its class columns are overwritten with the
# blended predictions before the frame is written out.
submit = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
# Blend the four models' class probabilities into the submission frame.
# The hand-picked weights sum to 0.8, so they are divided by their total:
# the relative weighting is unchanged and each blended row keeps the same
# total as the (weighted) input rows instead of being scaled down by 0.8.
# NOTE(review): the assignment aligns on the index, so this assumes every
# prediction file lists rows in the same order as the sample submission.
_class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
_blend_components = [
    (0.25, preds_lightautoml),
    (0.25, preds_lightgbm),
    (0.2, preds_catboost),
    (0.1, preds_xgboost),
]
_weight_total = sum(weight for weight, _ in _blend_components)
submit[_class_columns] = sum(
    (weight / _weight_total) * preds[_class_columns]
    for weight, preds in _blend_components
)

In [None]:
# Write the blended submission to 'submit.csv' without the index column.
submit.to_csv('submit.csv', index=False)