# This notebook is the result of my research related to tps-05 competition data.



In this notebook, I conducted three data encoding experiments that I thought could lead to a better result in the competition. Finally, I built the models with two leading AutoML solutions - LightAutoML and MLJAR.

<div class="alert alert-success">
  <strong>This notebook provides three ways to encode TPS-05 data using:</strong>
    <ul>
        <li>NN (Embedding) - Categorical Encoding</li>
        <li>AutoEncoder</li>
        <li>Denoising Autoencoder</li>
    </ul>
</div>

#### I know that asking directly in notebook for votes is not the best idea but please appreciate my work. I put a lot of effort into it to get the best result. At the same time, I do not hide my research. I share with you.

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold


import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Flatten, Dropout, BatchNormalization, Embedding, Input
from keras.layers.merge import concatenate
from keras.utils import to_categorical

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition files; the `id` column becomes the index of both frames.
train_df = pd.read_csv(
    "../input/tabular-playground-series-may-2021/train.csv", index_col="id"
)
test_df = pd.read_csv(
    "../input/tabular-playground-series-may-2021/test.csv", index_col="id"
)

# Feature matrix without the label column.
X = train_df.drop(columns="target")

# Encode the class labels as integers.
lencoder = LabelEncoder()
y_df = pd.DataFrame({"target": lencoder.fit_transform(train_df["target"])})

# Stack train and test features and cast every column to the category dtype.
df_all = pd.concat([X, test_df], axis=0).astype("category")

# PART 1. CATEGORICAL ENCODING using Embedding

This part will be improved. The categorical encoding is not perfect yet (I am going to use cross-validation to build the embeddings).

In [None]:
class __LabelEncoder__(LabelEncoder):
    """LabelEncoder whose ``transform`` tolerates values not seen during ``fit``.

    Values that are not in ``classes_`` are mapped to one extra code,
    ``len(classes_)``. When at least one such value is encountered, the string
    ``'unseen'`` is appended to ``classes_``.
    """

    def transform(self, y):
        """Encode ``y`` as integer codes.

        Parameters
        ----------
        y : array-like
            Values to encode; reduced to 1-D with ``column_or_1d``.

        Returns
        -------
        numpy.ndarray
            For each value, its position in ``classes_`` or, if the value is
            unknown, ``len(classes_)`` as measured at the start of the call.
        """
        # The notebook's import cell does not bring this name into scope, so the
        # original method raised NameError as soon as it was called.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)

        unseen = len(self.classes_)

        e = np.array([
                     np.searchsorted(self.classes_, x)
                     if x in self.classes_ else unseen
                     for x in y
                     ])

        if unseen in e:
            # NOTE(review): this mutates the fitted state. ``classes_`` is
            # rebuilt from a mixed list, and a later call that meets unknown
            # values again will append another 'unseen' entry and use a
            # different code for them.
            self.classes_ = np.array(self.classes_.tolist() + ['unseen'])

        return e

def get_encoded_data(data, categorical_variables=None):
    """Label-encode the categorical columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    categorical_variables : list of str, optional
        Columns to encode. Defaults to every column whose dtype is ``category``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded copy with all columns cast to ``category``, and a mapping
        from column name to the encoder fitted on that column.
    """
    encoders = {}

    df = data.copy()

    if categorical_variables is None:
        categorical_variables = [col for col in df.columns if df[col].dtype == 'category']

    for var in categorical_variables:
        encoders[var] = __LabelEncoder__()
        # Replace the column outright. `df.loc[:, var] = ...` tries to write the
        # integer codes into the existing categorical column in place, which
        # fails or coerces when a code is not one of that column's categories.
        df[var] = encoders[var].fit_transform(df[var])

    return df.astype("category"), encoders

In [None]:
# Label-encode all columns, then split the stacked frame back into train and test rows.
df_all, encoders = get_encoded_data(df_all)

train = df_all.iloc[:len(train_df)].to_numpy()
test = df_all.iloc[len(train_df):].to_numpy()
y = y_df.to_numpy()

# Hold out 20% of the training rows for validating the embedding network.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=1
)

In [None]:
# The network takes one input per feature, so split each matrix into a list of columns.
X_train_enc = [X_train[:, col_idx] for col_idx in range(X_train.shape[1])]
X_test_enc = [X_test[:, col_idx] for col_idx in range(X_test.shape[1])]
test_enc = [test[:, col_idx] for col_idx in range(test.shape[1])]

# One-hot encode the integer labels.
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)

X_train_enc[:5]

In [None]:
categorical_variables = df_all.select_dtypes(include='category').columns
# Per column: (number of distinct values, suggested embedding size capped at 50).
info = {
    col: (int(n_unique), min(50, (int(n_unique) + 1) // 2))
    for col, n_unique in df_all[categorical_variables].nunique().items()
}

### For encoding we will use an Embedding layer (a 3-dimensional embedding for each categorical feature)

In [None]:
# Build one scalar input and one 3-dimensional embedding per categorical feature.
input_layers = list()
embedding_layers = list()

for feature in categorical_variables:
    # Number of distinct values of this feature = size of the embedding vocabulary.
    n_labels = df_all[feature].nunique()
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(n_labels, 3)(input_layer)
    input_layers.append(input_layer)
    embedding_layers.append(embedding_layer)
    
# Concatenate all per-feature embeddings into a single tensor.
merge = concatenate(embedding_layers)

# Two dense blocks, each followed by batch normalisation and dropout.
dense_1 = Dense(128, kernel_initializer='normal', activation='relu')(merge)
x = BatchNormalization()(dense_1)
x = Dropout(0.5)(x)
dense_2 = Dense(32, kernel_initializer='normal', activation='relu')(x)
x = BatchNormalization()(dense_2)
x = Dropout(0.25)(x)
flatten = Flatten()(x)
# Four-way softmax output.
output = Dense(4, activation='softmax')(flatten)
model = Model(inputs=input_layers, outputs=output)

In [None]:
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [None]:
model.fit(
    X_train_enc,
    y_train_enc,
    validation_data=(X_test_enc, y_test_enc),
    epochs=20,
    batch_size=64,
    verbose=2,
)

In [None]:
# Pick the embedding layers by type instead of searching for 'Embedding' in the
# layer's repr string, which depends on how the library prints layer objects.
embs = [layer.get_weights()[0] for layer in model.layers if isinstance(layer, Embedding)]
# Pair each weight matrix with its feature; `info` is keyed in the same column
# order in which the embedding layers were created.
embeddings = dict(zip(info.keys(), embs))

In [None]:
# One frame per feature: rows are the encoder's classes, columns the embedding dimensions.
embeddings_df = {}
for cat_var in tqdm(embeddings.keys()):
    df = pd.DataFrame(embeddings[cat_var], index=encoders[cat_var].classes_)
    df.columns = [f"{cat_var}_embedding_{num}" for num in df.columns]
    embeddings_df[cat_var] = df

### Let's look at the encoded feature_0

In [None]:
embeddings_df['feature_0'].head(n=5)

In [None]:
embeddings_df['feature_1'].head(n=5)

### Let's look into graphical representation of feature embedding

In [None]:
# 3-D scatter of the learned embedding, one point per class of feature_3.
fig = px.scatter_3d(
    embeddings_df['feature_3'],
    x='feature_3_embedding_0',
    y='feature_3_embedding_1',
    z='feature_3_embedding_2',
    color=embeddings_df['feature_3'].index,
)
fig.show()

In [None]:
# 3-D scatter of the learned embedding, one point per class of feature_35.
fig = px.scatter_3d(
    embeddings_df['feature_35'],
    x='feature_35_embedding_0',
    y='feature_35_embedding_1',
    z='feature_35_embedding_2',
    color=embeddings_df['feature_35'].index,
)
fig.show()

In [None]:
# 3-D scatter of the learned embedding, one point per class of feature_18.
fig = px.scatter_3d(
    embeddings_df['feature_18'],
    x='feature_18_embedding_0',
    y='feature_18_embedding_1',
    z='feature_18_embedding_2',
    color=embeddings_df['feature_18'].index,
)
fig.show()

### Let's encode input data into embedding values

In [None]:
def fit_transform(data, embeddings, encoders, drop_categorical_vars=False):
    """Append learned embedding vectors to ``data``.

    For every variable in ``embeddings`` the weight matrix is turned into a
    lookup table indexed by the matching encoder's ``classes_`` and left-joined
    onto ``data`` via that variable's column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding one column per key of ``embeddings``.
    embeddings : dict
        Maps variable name to a 2-D weight array (one row per class).
    encoders : dict
        Maps variable name to an object exposing ``classes_``.
    drop_categorical_vars : bool, default False
        When true, the original variable columns are removed from the result.

    Returns
    -------
    pandas.DataFrame
        ``data`` with ``<var>_embedding_<k>`` columns added.
    """
    for cat_var in tqdm(embeddings.keys()):
        df = pd.DataFrame(embeddings[cat_var])
        df.index = encoders[cat_var].classes_
        df.columns = [cat_var + '_embedding_' + str(num) for num in df.columns]
        # Left join keeps every row of `data`, in its original order.
        data = data.merge(df, how='left', left_on=cat_var, right_index=True)

    if drop_categorical_vars:
        return data.drop(columns=list(embeddings.keys()))
    return data

df_categorical_coded = fit_transform(df_all, embeddings, encoders, True)

# Copy the slices so that adding the target column writes to an independent
# frame rather than to a slice of `df_categorical_coded`.
train_categorical_coded = df_categorical_coded[:len(train_df)].copy()
test_categorical_coded = df_categorical_coded[len(train_df):].copy()
# Flatten the labels to 1-D and assign them by position.
train_categorical_coded['target'] = np.ravel(y)

In [None]:
train_categorical_coded.head(n=5)

In [None]:
test_categorical_coded.head(n=5)

In [None]:
# Persist the embedding-encoded datasets (the index is written as the first column).
train_categorical_coded.to_csv("tps-05-train_categorical_coded.csv", index=True)
test_categorical_coded.to_csv("tps-05-test_categorical_coded.csv", index=True)

# PART 2. Autoencoder on Tabular Data


From post in comments - Alexander Ryzhkov:
> What are the fix variants which can help:
> 1) Try not to use target while you create embeddings - you can use autoencoder for that
> 2) If you want to use the target, you can do it based on cross-validation, but in this situation you can use only OOF predictions instead of categorical embeddings because for 2 different runs on k-1 folds embeddings for sure do not have same columns to concat them vertically.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
scaler = MinMaxScaler()

X_train = scaler.fit_transform(df_all.iloc[:len(train_df)])
X_validation = scaler.transform(df_all.iloc[len(train_df):])

In [None]:
# Define and train a simple dense autoencoder on the scaled features.

# Width of the encoded representation.
encoding_dim = 40

# One input unit per column of df_all.
input_size = len(df_all.columns)

input_df = Input(shape=(input_size,))
x = Dense(32, kernel_initializer='normal', activation='relu')(input_df)
encoded = Dense(encoding_dim, activation='relu')(x)
x = Dense(32, kernel_initializer='normal', activation='relu')(encoded)
# Sigmoid output, trained to reproduce the input.
decoded = Dense(input_size, activation='sigmoid')(x)

autoencoder = Model(input_df, decoded)

autoencoder.compile(optimizer='adadelta', loss='mean_squared_error')

# Input and target are the same array; the scaled test rows are used for validation.
autoencoder.fit(X_train, X_train,
                epochs=250,
                batch_size=256,
                shuffle=True,
                validation_data=(X_validation, X_validation))

In [None]:
# Sub-model mapping the input to the encoded representation.
encoder = Model(input_df, encoded)

# The autoencoder was trained on MinMax-scaled inputs, so the encoder must be
# fed data scaled the same way rather than the raw categorical frame.
# The original `id` index is kept so the result aligns row-for-row with df_all.
auto_enc_df_all = pd.DataFrame(
    encoder.predict(scaler.transform(df_all)),
    index=df_all.index,
)
auto_enc_df_all.columns = ['f_' + str(num) for num in auto_enc_df_all.columns]

In [None]:
# concat(axis=1) aligns on index labels; give the encoded frame df_all's index
# so rows are paired by position regardless of what the `id` values are.
auto_df_all = pd.concat([df_all, auto_enc_df_all.set_index(df_all.index)], axis=1)

# Copy the slices so that adding the target does not write into a view of auto_df_all.
auto_enc_X_train = auto_df_all[:len(train_df)].copy()
# Flatten the labels to 1-D and assign them by position.
auto_enc_X_train['target'] = np.ravel(y)

auto_enc_test = auto_df_all[len(train_df):].copy()

In [None]:
auto_enc_X_train.head(n=5)

In [None]:
auto_enc_test.head(n=5)

In [None]:
# Persist the autoencoder-encoded datasets (the index is written as the first column).
auto_enc_X_train.to_csv("tps-05-train_autoencoder-40_coded.csv", index=True)
auto_enc_test.to_csv("tps-05-test_autoencoder-40_coded.csv", index=True)

# PART 3. Denoising AutoEncoder (DAE) on Tabular Data
I will use Kaggler API

In [None]:
!pip install -U kaggler -q

In [None]:
import kaggler
# NOTE(review): this import rebinds the name `LabelEncoder`, which the first
# import cell bound to scikit-learn's class. Any earlier cell re-run after this
# one will pick up kaggler's LabelEncoder instead.
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder
print(kaggler.__version__)

In [None]:
# Settings for the denoising-autoencoder and target-encoding cells.
encoding_dim, seed = 128, 42
n_fold, n_class = 5, 4

In [None]:
# Cast the label-encoded columns to integers and treat all of them as categorical inputs.
df_all = df_all.astype("int64")
dae = DAE(
    cat_cols=list(df_all.columns),
    num_cols=[],
    encoding_dim=encoding_dim,
    random_state=seed,
    swap_prob=.3,
    n_layer=3,
)
X = dae.fit_transform(df_all)
df_dae = pd.DataFrame(X, columns=[f'dae1_{idx}' for idx in range(X.shape[1])])
print(df_dae.shape)
df_dae.head(n=5)

In [None]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
te = TargetEncoder(cv=cv)

feature_cols = df_all.columns.tolist()
y = y_df.squeeze()

# Fit on the training rows of df_all, not on train_df. df_all holds the
# label-encoded values that transform() receives below, whereas train_df still
# holds the raw values, so fitting on train_df would learn statistics for a
# different set of category codes.
te.fit(df_all.iloc[:len(train_df)][feature_cols], y)
df_te = te.transform(df_all[feature_cols])
df_te.columns = [f'te_{x}' for x in df_all.columns]
df_te.head(5)

In [None]:
# concat(axis=1) aligns on index labels. All three frames hold the same rows in
# the same order, so give the derived frames df_all's index to pair rows by
# position regardless of what the `id` values are.
dae_df_all = pd.concat(
    [df_all, df_te.set_index(df_all.index), df_dae.set_index(df_all.index)],
    axis=1,
)
# Copy the slices so that adding the target does not write into a view of dae_df_all.
dae_train = dae_df_all[:len(train_df)].copy()
# Assign the labels by position; a Series would otherwise be matched on index labels.
dae_train['target'] = np.ravel(y)

dae_test = dae_df_all[len(train_df):].copy()

In [None]:
# Persist the DAE-encoded datasets (the index is written as the first column).
dae_train.to_csv("tps-05-train_dae_coded.csv", index=True)
dae_test.to_csv("tps-05-test_dae_coded.csv", index=True)

# AUTO ML PREDICTION

In [None]:
# Choose which encoded dataset the AutoML models below are trained on.

DATA = "DAE" # Denoising autoencoder features
#DATA = "EMB" # Categorical encoding (embedding) features
#DATA = "AUE" # Autoencoder features

In [None]:
# Bind the train/test frames that match the DATA switch.
if DATA == "EMB":
    train_AutoML, test_AutoML = train_categorical_coded, test_categorical_coded
elif DATA == "AUE":
    train_AutoML, test_AutoML = auto_enc_X_train, auto_enc_test
else:  # any other value, including "DAE"
    train_AutoML, test_AutoML = dae_train, dae_test

## 1. LightAutoML

In [None]:
pip install -U lightautoml -q

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

import pandas as pd

In [None]:
# LightAutoML run settings.
# NOTE(review): N_FOLDS, RANDOM_STATE and TEST_SIZE are not referenced by any
# cell visible in this notebook - confirm whether they are still needed.
N_THREADS = 4 # CPU limit and reader n_jobs passed to the AutoML preset
N_FOLDS = 5 # number of folds for AutoML
RANDOM_STATE = 42 # fixed random state for reproducibility
TEST_SIZE = 0.2 # share of data held out for a metric check
TIMEOUT = 3 * 3600 # time budget in seconds for the AutoML run
TARGET_NAME = 'target' # name of the label column

In [None]:
task = Task('multiclass')

# Column roles handed to LightAutoML: the label column and the columns to ignore.
roles = dict(target=TARGET_NAME, drop=['id'])

In [None]:
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={'n_jobs': N_THREADS},
)
# Fit on the selected training frame and collect the out-of-fold predictions.
oof_pred = automl.fit_predict(train_AutoML, roles=roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

In [None]:
laml_test_pred = automl.predict(test_AutoML)
print(
    'Prediction for test data:\n{}\nShape = {}'.format(
        laml_test_pred[:10], laml_test_pred.shape
    )
)

# Log-loss of the out-of-fold predictions against the training labels.
print('Check scores...')
oof_score = log_loss(train_AutoML[TARGET_NAME].values, oof_pred.data)
print('OOF score: {}'.format(oof_score))

In [None]:
# Template with the `id` column followed by one column per class.
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

laml_submission = submission.copy()

# Overwrite every column after the first with the LightAutoML test predictions.
laml_submission.iloc[:, 1:] = laml_test_pred.data

In [None]:
# Summary statistics of the predicted class probabilities, styled for quick scanning.
(
    laml_submission.drop("id", axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Write the LightAutoML submission without the frame index.
laml_submission.to_csv("sub-tps-05-laml_submission.csv", index=False)

## 2. MLJAR

In [None]:
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@dev -q

In [None]:
from supervised.automl import AutoML # mljar-supervised

In [None]:
# The target is the last column. `id` is the frame's index (the CSVs were read
# with index_col='id'), not column 0, so every other column is a feature.
# Slicing with [1:-1] silently dropped the first feature.
y_col = train_AutoML.columns[-1]
x_cols = [col for col in train_AutoML.columns if col != y_col]

automl = AutoML(
    mode="Compete", 
    total_time_limit=4*3600
)
automl.fit(train_AutoML[x_cols], train_AutoML[y_col])

In [None]:
# Predict with exactly the feature columns the model was fitted on.
mljar_preds = automl.predict_proba(test_AutoML[x_cols])

In [None]:
# Start from the sample submission and fill in the MLJAR probabilities.
mljar_submission = submission.copy()

# Overwrite every column after the first with the predicted class probabilities.
mljar_submission[mljar_submission.columns[1:]] = mljar_preds

In [None]:
# Summary statistics of the predicted class probabilities, styled for quick scanning.
(
    mljar_submission.drop("id", axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Write the MLJAR submission without the frame index.
mljar_submission.to_csv("sub-tps-05-mljar_submission.csv", index=False)

## 3. BLENDING

Dirty Kaggle code :) This is not the best ML practice ... but ... we can call this .... creativity :)

In [None]:
# Previously blended submission loaded from an attached dataset.
blended_submission = pd.read_csv(
    "../input/tps05blender-v2/tps05-remek-blender_v2.csv"
)

In [None]:
def ensemble(a, b, c = 0):
    """Blend two or three submission frames column by column.

    When ``c`` is not a DataFrame, each ``Class_*`` column is the equal-weight
    average of ``a`` and ``b``. When ``c`` is a DataFrame, the weights are
    0.6 for ``a`` and 0.2 each for ``b`` and ``c``.

    Returns a copy of ``a`` with the four ``Class_*`` columns replaced.
    """
    class_columns = ("Class_1", "Class_2", "Class_3", "Class_4")
    three_way = isinstance(c, pd.DataFrame)

    blended = a.copy()
    for column in class_columns:
        if three_way:
            blended[column] = a[column] * 0.6 + b[column] * 0.2 + c[column] * 0.2
        else:
            blended[column] = a[column] * 0.5 + b[column] * 0.5
    return blended
    
def generate(a, b, c):
    """Write the three pairwise blends and the three-way blend of a, b, c to CSV."""
    outputs = (
        ('sub-tps-05-blend-ab.csv', (a, b)),
        ('sub-tps-05-blend-ac.csv', (a, c)),
        ('sub-tps-05-blend-bc.csv', (b, c)),
        ('sub-tps-05-blend-abc.csv', (a, b, c)),
    )
    for path, frames in outputs:
        ensemble(*frames).to_csv(path, index=False)

In [None]:
# Produce all blend files from the external blend and the two AutoML submissions.
generate(
    blended_submission,
    laml_submission,
    mljar_submission,
)

In [None]:
# Reload the blend files written by generate().
bl_laml = pd.read_csv("sub-tps-05-blend-ab.csv")              # external blend + LightAutoML
bl_mljar = pd.read_csv("sub-tps-05-blend-ac.csv")             # external blend + MLJAR
laml_mljar = pd.read_csv("sub-tps-05-blend-bc.csv")           # LightAutoML + MLJAR
bl_laml_mljar = pd.read_csv("sub-tps-05-blend-abc.csv")       # all three

In [None]:
(
    bl_laml.drop("id", axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
(
    bl_mljar.drop("id", axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
(
    laml_mljar.drop("id", axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
(
    bl_laml_mljar.drop("id", axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)