### **Credits and References:**
* https://www.kaggle.com/code/adaubas/tps-jul22-lgbm-extratree-qda-soft-voting  
* https://www.kaggle.com/code/pourchot/simple-soft-voting  
* https://www.kaggle.com/code/ricopue/tps-jul22-clusters-and-lgb  
* https://www.kaggle.com/code/ambrosm/tpsjul22-gaussian-mixture-cluster-analysis  
* https://www.kaggle.com/code/thedevastator/how-to-ensemble-clustering-algorithms-updated  
* https://www.kaggle.com/code/eduus710/getting-cluster-ensembles-to-work  
* https://www.kaggle.com/code/plarmuseau/bruteforce-clustering  
* https://www.kaggle.com/code/thedevastator/bruteforce-clustering  

Thank you, everyone, for the great ideas.

# Libraries

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

from sklearn.mixture import BayesianGaussianMixture

import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from cuml.svm import SVC
from cuml.neighbors import KNeighborsClassifier

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Parameters

In [2]:
class CFG:
    """Notebook-wide tunables, read as plain class attributes (never instantiated)."""

    # Clustering stage
    n_clusters = 7    # number of mixture components / target classes
    seed_bgm = 1      # random seed used for the Bayesian Gaussian Mixture

    # Supervised stage
    threshold = 0.7   # minimum assigned-cluster probability for a "trusted" sample
    n_splits = 10     # number of stratified cross-validation folds
    seed = 42         # random seed for the classifiers and the fold split

In [3]:
def seed_everything(seed=42):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Load data

In [4]:
# Read the competition data; the `id` column carries no signal and is dropped.
df = pd.read_csv("../input/tabular-playground-series-jul-2022/data.csv")
df = df.drop(columns='id')
df.head()

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28
0,-0.38942,-0.912791,0.648951,0.589045,-0.830817,0.733624,2.25856,2,13,14,5,13,6,6,-0.469819,0.358126,1.068105,-0.55965,-0.366905,-0.478412,-0.757002,-0.763635,-1.090369,1.142641,-0.884274,1.137896,1.309073,1.463002,0.813527
1,-0.689249,-0.453954,0.654175,0.995248,-1.65302,0.86381,-0.090651,2,3,6,4,6,16,9,0.591035,-0.396915,0.145834,-0.030798,0.471167,-0.428791,-0.089908,-1.784204,-0.839474,0.459685,1.759412,-0.275422,-0.852168,0.562457,-2.680541
2,0.809079,0.324568,-1.170602,-0.624491,0.105448,0.783948,1.988301,5,11,5,8,9,3,11,-0.679875,0.469326,0.349843,-0.288042,0.29147,-0.413534,-1.602377,1.190984,3.267116,-0.088322,-2.168635,-0.974989,1.335763,-1.110655,-3.630723
3,-0.500923,0.229049,0.264109,0.23152,0.415012,-1.221269,0.13885,6,2,13,8,9,6,4,-0.389456,0.626762,-1.074543,-1.521753,-1.150806,0.619283,1.287801,0.532837,1.036631,-2.041828,1.44049,-1.900191,-0.630771,-0.050641,0.238333
4,-0.671268,-1.039533,-0.270155,-1.830264,-0.290108,-1.852809,0.781898,8,7,5,3,1,13,11,-0.120743,-0.615578,-1.064359,0.444142,0.428327,-1.62883,-0.434948,0.322505,0.284326,-2.438365,1.47393,-1.044684,1.602686,-0.405263,-1.987263


In [5]:
# Accumulates one (name, silhouette, Calinski-Harabasz, Davies-Bouldin) tuple per model.
all_scores = []
# Every column of the loaded frame.
features = df.columns
# Subset used for clustering and classification: f_07..f_13 and f_22..f_28.
best_features = [f"f_{i:02d}" for i in (*range(7, 14), *range(22, 29))]

def scores(preds, lib, df=df[best_features], verbose=True, compute_silhouette=True): 
    """Compute internal clustering metrics for the labelling `preds`.

    Parameters
    ----------
    preds : cluster label per row of `df`.
    lib : free-text name of the model, stored as the first tuple element.
    df : feature matrix the metrics are computed on; the default is bound
        once, at definition time, to the unscaled `best_features` columns.
    verbose : print a one-line summary when true.
    compute_silhouette : silhouette is very slow; when false it is reported as 0.

    Returns
    -------
    tuple ``(lib, silhouette, calinski_harabasz, davies_bouldin)``.
    """
    sil = silhouette_score(df, preds, metric='euclidean') if compute_silhouette else 0
    calinski = calinski_harabasz_score(df, preds)
    davies = davies_bouldin_score(df, preds)
    row = (lib, sil, calinski, davies)

    if verbose:
        print(f"{row[0]} : Silhouette : {row[1]:.1%} | Calinski Harabasz : {row[2]:.1f} | Davis Bouldin : {row[3]:.3f}")

    return row

# Bayesian Gaussian Mixture

In [6]:
seed_everything(CFG.seed_bgm)

# Power-transform every column (PowerTransformer defaults) and keep the column names.
df_scaled = pd.DataFrame(
    PowerTransformer().fit_transform(df[features]),
    columns=features,
)

# Fit the mixture on the selected features only.
BGM = BayesianGaussianMixture(
    n_components=CFG.n_clusters,
    covariance_type='full',
    tol=1e-3,
    max_iter=300,
    n_init=1,
    random_state=CFG.seed_bgm,
)
BGM.fit(df_scaled[best_features])

# Per-component membership probabilities and the hard (most probable) assignment.
BGM_predict_proba = BGM.predict_proba(df_scaled[best_features])
BGM_predict = BGM_predict_proba.argmax(axis=1)

all_scores.append(
    scores(BGM_predict, lib="BayesianGaussianMixture after powertransformer")
)

BayesianGaussianMixture after powertransformer : Silhouette : 5.0% | Calinski Harabasz : 8156.3 | Davis Bouldin : 2.656


# Trusted Data

In [7]:
# Select the "trusted" samples: rows whose assigned-cluster probability from the
# mixture model exceeds CFG.threshold. They become the training set (X, y) for
# the supervised classifiers fitted later in the notebook.
proba_threshold = CFG.threshold

df_scaled['predict'] = BGM_predict
# Probability of the assigned cluster. BGM_predict is the row-wise argmax of
# BGM_predict_proba, so this equals the row-wise maximum. Building the column
# directly as float avoids creating an int column and upcasting it via `.loc`.
df_scaled['predict_proba'] = BGM_predict_proba.max(axis=1)
for n in range(CFG.n_clusters):
    df_scaled[f'predict_proba_{n}'] = BGM_predict_proba[:, n]

# Collect trusted row labels cluster by cluster (this ordering determines the
# row order of X / y and therefore the later fold split).
trusted_per_cluster = []
for n in range(CFG.n_clusters):
    in_cluster = df_scaled['predict'] == n
    n_members = int(in_cluster.sum())
    median = df_scaled.loc[in_cluster, 'predict_proba'].median()
    idx = df_scaled.index[in_cluster & (df_scaled['predict_proba'] > proba_threshold)]
    trusted_per_cluster.append(idx)
    # Guard against a component that received no sample at all.
    share = len(idx) / n_members if n_members else 0.0
    print(f'Class n{n}  |  Median : {median:.4f}  |  Training data : {share:.1%}')

# Integer row labels (the previous float accumulator turned them into floats).
idxs = np.concatenate(trusted_per_cluster)

X = df_scaled.loc[idxs, best_features].reset_index(drop=True)
y = df_scaled.loc[idxs, 'predict'].reset_index(drop=True)

Class n0  |  Median : 0.9116  |  Training data : 75.2%
Class n1  |  Median : 0.8669  |  Training data : 72.0%
Class n2  |  Median : 0.8685  |  Training data : 71.8%
Class n3  |  Median : 0.9074  |  Training data : 77.1%
Class n4  |  Median : 0.7312  |  Training data : 54.3%
Class n5  |  Median : 0.9833  |  Training data : 87.9%
Class n6  |  Median : 0.9376  |  Training data : 78.6%


In [8]:
# Size and preview of the trusted training set.
display(X.shape, X)

(72688, 14)

Unnamed: 0,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_22,f_23,f_24,f_25,f_26,f_27,f_28
0,0.021718,1.203800,-0.635157,2.238616,0.517399,-0.928325,0.317221,0.611256,-0.308725,-1.252728,1.090145,-1.912390,0.438324,-0.602595
1,-0.598123,1.888883,-0.912475,1.054360,-0.171515,0.755755,-0.735370,-0.298416,-0.570828,-1.415415,1.050300,0.300649,-1.584098,0.515559
2,0.530261,-0.048856,-0.912475,1.531077,-0.682876,0.936024,-0.162832,0.027034,0.217398,-2.073375,0.830367,0.847741,0.142683,1.019868
3,0.530261,0.622133,-0.394246,2.104246,-0.682876,0.565556,0.533888,-0.026342,0.154460,-3.183273,2.213231,1.917207,0.043816,-0.954962
4,-0.270117,1.557045,-1.243439,1.054360,-0.682876,0.148356,0.086073,-0.096009,-0.718806,0.329544,-0.586925,-0.127960,-0.571388,-1.630053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72683,-1.440987,1.383372,0.356049,-0.090749,1.532430,-2.346865,0.317221,1.207564,-0.181893,-1.655839,1.440076,0.857999,-2.711254,-1.022097
72684,0.286548,1.383372,-1.663579,-0.837020,2.439447,0.565556,0.738456,-0.491050,0.675622,0.495494,0.462718,0.182702,-0.749529,0.710768
72685,-0.977987,1.383372,-0.635157,-0.567775,1.142180,-0.928325,-0.735370,-0.225471,0.597403,-0.406256,-0.212425,0.815487,0.891106,-1.406752
72686,-0.270117,1.017648,-0.394246,-0.320556,1.142180,-0.083596,1.295599,-0.056586,2.480347,1.429287,-0.485778,0.655260,-0.518800,1.525309


# Supervised learning
### XGBoost / ExtraTreesClassifier / QuadraticDiscriminantAnalysis / SVC / KNeighborsClassifier

In [9]:
def get_score(labels, preds, probas):
    """Return ``(balanced_accuracy, roc_auc)`` for one validation fold.

    `preds` are hard class predictions, `probas` the per-class probabilities;
    the AUC is the weighted one-vs-one multiclass variant.
    """
    balanced_acc = balanced_accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probas, multi_class="ovo", average="weighted")
    return (balanced_acc, auc)

In [10]:
# XGBoost training parameters: multiclass soft-probability objective with one
# class per mixture component, trained on GPU 0.
# NOTE(review): `gpu_id`, `gpu_hist` and `predictor` are spelled for the XGBoost
# release this notebook was written against - confirm before upgrading XGBoost.
params_xgb = dict(
    booster='gbtree',
    objective='multi:softprob',
    learning_rate=0.04,
    num_class=CFG.n_clusters,
    seed=CFG.seed,
    gpu_id=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

In [11]:
seed_everything(CFG.seed)


def _fit_score_predict(model, label, X_train, y_train, X_valid, y_valid, X_full):
    """Fit a scikit-learn-style classifier on one fold and predict the full data.

    The model is fitted on the trusted training fold, scored on the validation
    fold (a one-line summary is printed under `label`), and then used to
    predict class probabilities for every row of `X_full`.

    Returns
    -------
    (score, proba) where `score` is the tuple from `get_score` and `proba` is
    this fold's full-data probability matrix already divided by CFG.n_splits,
    ready to be summed into a fold average.
    """
    model.fit(X_train, y_train)  # on trusted data only

    y_pred = model.predict(X_valid)
    y_pred_proba = model.predict_proba(X_valid)

    s = get_score(y_valid, y_pred, y_pred_proba)
    print(f"{label:<10} AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")

    return s, model.predict_proba(X_full) / CFG.n_splits


# Fold-averaged class probabilities over the full data set, one per model.
xgb_predict_proba = 0
etc_predict_proba = 0
qda_predict_proba = 0
svc_predict_proba = 0
knc_predict_proba = 0
classif_scores = []

# Every row (trusted or not) is re-labelled by the fitted classifiers.
X_full = df_scaled[best_features]

skf = StratifiedKFold(CFG.n_splits, shuffle=True, random_state=CFG.seed)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"===== fold{fold} =====")
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    fold_data = (X_train, y_train, X_valid, y_valid, X_full)

    # XGBoost
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid)

    model = xgb.train(params_xgb,
                      dtrain=xgb_train,
                      evals=[(xgb_train, 'train'), (xgb_valid, 'eval')],
                      verbose_eval=False,
                      num_boost_round=20000,
                      early_stopping_rounds=500,
                     )

    # Predict with the rounds up to (and including) the best early-stopping
    # iteration. `best_iteration + 1` replaces `best_ntree_limit`, which newer
    # XGBoost releases no longer provide; both give the same range here.
    best_range = (0, model.best_iteration + 1)

    y_pred_proba = model.predict(xgb_valid, iteration_range=best_range)
    y_pred = np.argmax(y_pred_proba, axis=1)

    s = get_score(y_valid, y_pred, y_pred_proba)
    print(f"XGBoost    AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    xgb_predict_proba += model.predict(
        xgb.DMatrix(X_full),
        iteration_range=best_range
    ) / CFG.n_splits

    del xgb_train, xgb_valid, model, s, y_pred, y_pred_proba
    gc.collect()

    # ExtraTreesClassifier
    s, proba = _fit_score_predict(
        ExtraTreesClassifier(n_estimators=300, random_state=CFG.seed),
        "ExtraTree", *fold_data)
    classif_scores.append(s)
    etc_predict_proba += proba
    del s, proba
    gc.collect()

    # QuadraticDiscriminantAnalysis
    # `priors` must be an array with one entry per class. The scalar passed
    # before only shifted every class score by the same constant, i.e. it acted
    # as uniform priors, so uniform priors are now passed explicitly.
    n_classes = y_train.nunique()
    s, proba = _fit_score_predict(
        QuadraticDiscriminantAnalysis(priors=np.full(n_classes, 1.0 / n_classes)),
        "QDA", *fold_data)
    classif_scores.append(s)
    qda_predict_proba += proba
    del s, proba
    gc.collect()

    # SVC
    s, proba = _fit_score_predict(SVC(probability=True), "SVC", *fold_data)
    classif_scores.append(s)
    svc_predict_proba += proba
    del s, proba
    gc.collect()

    # KNeighborsClassifier
    s, proba = _fit_score_predict(
        KNeighborsClassifier(n_neighbors=20), "KNeighbors", *fold_data)
    classif_scores.append(s)
    knc_predict_proba += proba
    del s, proba
    gc.collect()

# Clustering metrics of each model's fold-averaged hard labels.
all_scores.append(scores(np.argmax(xgb_predict_proba, axis=1), lib="XGB after BayesianGaussianMixture"))
all_scores.append(scores(np.argmax(etc_predict_proba, axis=1), lib="ExtraTrees after BayesianGaussianMixture"))
all_scores.append(scores(np.argmax(qda_predict_proba, axis=1), lib="QuadraticDiscriminantAnalysis n1 after BayesianGaussianMixture"))
all_scores.append(scores(np.argmax(svc_predict_proba, axis=1), lib="SVC after BayesianGaussianMixture"))
all_scores.append(scores(np.argmax(knc_predict_proba, axis=1), lib="KNeighbors after BayesianGaussianMixture"))

# Mean validation scores over all folds and models.
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

===== fold0 =====
XGBoost    AUC : 1.000 | Accuracy : 99.1%
ExtraTree  AUC : 0.999 | Accuracy : 96.3%
QDA        AUC : 1.000 | Accuracy : 99.9%
SVC        AUC : 1.000 | Accuracy : 99.7%
KNeighbors AUC : 0.999 | Accuracy : 97.4%
===== fold1 =====
XGBoost    AUC : 1.000 | Accuracy : 99.2%
ExtraTree  AUC : 0.999 | Accuracy : 96.9%
QDA        AUC : 1.000 | Accuracy : 100.0%
SVC        AUC : 1.000 | Accuracy : 99.7%
KNeighbors AUC : 1.000 | Accuracy : 97.6%
===== fold2 =====
XGBoost    AUC : 1.000 | Accuracy : 99.2%
ExtraTree  AUC : 0.999 | Accuracy : 96.4%
QDA        AUC : 1.000 | Accuracy : 100.0%
SVC        AUC : 1.000 | Accuracy : 99.8%
KNeighbors AUC : 1.000 | Accuracy : 97.2%
===== fold3 =====
XGBoost    AUC : 1.000 | Accuracy : 99.2%
ExtraTree  AUC : 0.999 | Accuracy : 96.4%
QDA        AUC : 1.000 | Accuracy : 99.9%
SVC        AUC : 1.000 | Accuracy : 99.8%
KNeighbors AUC : 1.000 | Accuracy : 97.4%
===== fold4 =====
XGBoost    AUC : 1.000 | Accuracy : 99.1%
ExtraTree  AUC : 0.999 | A

balanced_accuracy_score    0.985180
roc_auc_score              0.999658
dtype: float64

# Soft voting

In [12]:
def soft_voting(preds_probas, weights):
    """Weighted soft vote over several class-probability matrices.

    Parameters
    ----------
    preds_probas : sequence of 2-D arrays, each (n_samples, n_classes), all of
        the same shape.
    weights : one numeric weight per matrix.

    Returns
    -------
    1-D integer array with the winning column per sample.

    Raises
    ------
    ValueError if `preds_probas` is empty.

    Notes
    -----
    Before summing, each matrix has its columns reordered by how often that
    model predicts each class (most frequent first). The returned labels are
    therefore frequency ranks, not the original class ids.
    NOTE(review): if two models rank class frequencies differently, their
    columns are aligned by rank rather than by class id - confirm this is the
    intended behaviour when all models share the same label space.
    """
    preds_probas = list(preds_probas)
    if not preds_probas:
        raise ValueError("soft_voting needs at least one probability matrix")

    # Shape comes from the inputs instead of the notebook globals.
    n_rows, n_classes = preds_probas[0].shape
    pred_test = np.zeros((n_rows, n_classes))

    for p, w in zip(preds_probas, weights):
        preds = np.argmax(p, axis=1)
        pred_idx = pd.Series(preds).value_counts().index.tolist()
        # A class this model never predicts is absent from value_counts();
        # append it so the reordered matrix keeps all n_classes columns.
        predicted = set(pred_idx)
        pred_idx += [c for c in range(n_classes) if c not in predicted]
        pred_test += p[:, pred_idx] * w

    return np.argmax(pred_test, axis=1)

In [13]:
# Fold-averaged probabilities of each classifier paired with its voting weight.
weighted_models = [
    (xgb_predict_proba, 0.75),
    (etc_predict_proba, 1.0),
    (qda_predict_proba, 1.0),
    (svc_predict_proba, 1.0),
    (knc_predict_proba, 1.0),
]
sv_predict = soft_voting(
    [proba for proba, _ in weighted_models],
    [weight for _, weight in weighted_models],
)
all_scores.append(
    scores(sv_predict, lib="Soft voting: XGB, ExtraTrees, QDA, SVC and KNeighbors")
)

Soft voting: XGB, ExtraTrees, QDA, SVC and KNeighbors : Silhouette : 5.4% | Calinski Harabasz : 8380.6 | Davis Bouldin : 2.570


In [14]:
# One row per model, in the order the scores were appended.
score_columns = ["Model", "silhouette", "Calinski_Harabasz", "Davis_Bouldin"]
pd.DataFrame(all_scores, columns=score_columns)

Unnamed: 0,Model,silhouette,Calinski_Harabasz,Davis_Bouldin
0,BayesianGaussianMixture after powertransformer,0.05047,8156.277251,2.656073
1,XGB after BayesianGaussianMixture,0.054219,8371.949575,2.582043
2,ExtraTrees after BayesianGaussianMixture,0.060124,8660.408553,2.493443
3,QuadraticDiscriminantAnalysis n1 after Bayesia...,0.055672,8424.110099,2.572165
4,SVC after BayesianGaussianMixture,0.053886,8359.172173,2.588035
5,KNeighbors after BayesianGaussianMixture,0.052781,8277.559998,2.55189
6,"Soft voting: XGB, ExtraTrees, QDA, SVC and KNe...",0.054334,8380.647444,2.569684


# Submissions

In [15]:
# Fill the sample submission with the soft-voting labels and write it out.
sub = pd.read_csv("../input/tabular-playground-series-jul-2022/sample_submission.csv")
sub = sub.assign(Predicted=sv_predict)
sub.to_csv("submission.csv", index=False)