# Hierarchy of binary classifiers
---

## Reference
* [Simple Keras embedding in 10 folds](https://www.kaggle.com/pourchot/simple-keras-embedding-in-10-folds) by [@pourchot](https://www.kaggle.com/pourchot)
* [Combining discrete and continuous features in neural networks](https://www.kaggle.com/hiro5299834/tps06-nn-w-discrete-and-continuous-features) by [@bizen](https://www.kaggle.com/hiro5299834)

The idea behind using a hierarchy of binary classifiers is that the classes in the training data fall into pairs with a similar number of examples. For example:

* Class_6    51811    &    Class_8    51763
* Class_9    25542    &    Class_2    24431
* Class_3    14798    &    Class_7    14769
* Class_1     9118
* Class_4     4704    &    Class_5     3064

So, I use a hierarchy of binary classifiers to classify the examples into the different classes:

                                68_vs_9273145
                                   /    \
                                  /      \
                              6_vs_8    92_vs_73145
                                          /     \
                                         /       \
                                     9_vs_2     73_vs_145
                                                 /     \
                                                /       \
                                              7_vs_3    1_vs_45
                                                         /    \
                                                        /      \
                                                        1     4_vs_5

# LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import datetime
import random
import time
import os
import gc

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.cluster import KMeans
from scipy.stats import mode, skew, kurtosis

from tensorflow.keras import backend as K
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import seaborn as sns

#----------
# Show at most 50 rows and 50 columns when a DataFrame is displayed.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 50)

# Suppress every warning for the rest of the notebook.
import warnings
warnings.simplefilter('ignore')

# Load Data

In [None]:
# Read the three competition files in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

# Stack train on top of test under a fresh 0..n-1 index.
all_df = pd.concat([train, test], ignore_index=True)

In [None]:
# Number of training rows per value of the 'target' column.
train.value_counts('target')

In [None]:
# Column names 'feature_0' .. 'feature_74'; all of them are fed to the embedding.
cat_features = [f'feature_{idx}' for idx in range(75)]
# No columns are treated as continuous.
cnt_features = list()

In [None]:
# Full model input: categorical columns first, then continuous ones.
all_features = [*cat_features, *cnt_features]

# Extract train data and corresponding targets for the hierarchy of models

In [None]:
# Training rows for the root classifier: all_df holds the train rows first,
# so slice by the actual train size instead of a hard-coded row count.
train_c5417329_and_c86 = all_df.iloc[:train.shape[0]].copy()

In [None]:
# Training subsets for the inner nodes of the hierarchy. Each subset is an
# explicit copy because its 'target' column is relabelled afterwards.
# Rows whose target is not one of the listed classes are excluded.
train_c54173_and_c29 = all_df[all_df['target'].isin(['Class_9', 'Class_2', 'Class_3', 'Class_7', 'Class_1', 'Class_4', 'Class_5'])].copy()
train_c541_and_c73 = all_df[all_df['target'].isin(['Class_7', 'Class_3', 'Class_5', 'Class_4', 'Class_1'])].copy()
train_c54_and_c1 = all_df[all_df['target'].isin(['Class_5', 'Class_4', 'Class_1'])].copy()

In [None]:
# Training subsets for the leaf-level (one class vs. one class) classifiers.
# Each subset is an explicit copy because its 'target' column is relabelled
# afterwards.
train_c8_and_c6 = all_df[all_df['target'].isin(['Class_6', 'Class_8'])].copy()
train_c2_and_c9 = all_df[all_df['target'].isin(['Class_9', 'Class_2'])].copy()
train_c7_and_c3 = all_df[all_df['target'].isin(['Class_3', 'Class_7'])].copy()
train_c5_and_c4 = all_df[all_df['target'].isin(['Class_4', 'Class_5'])].copy()

In [None]:
# Root split: Class_6 / Class_8 -> 1, every other class -> 0.
mapping541732986 = {'Class_5': 0, 'Class_4': 0, 'Class_1': 0, 'Class_7': 0, 'Class_3': 0, 'Class_2': 0, 'Class_9': 0, 'Class_8': 1, 'Class_6': 1}
# Map the labels and cast explicitly so the column is a real integer column
# rather than relying on replace() downcasting an object column.
train_c5417329_and_c86['target'] = train_c5417329_and_c86['target'].map(mapping541732986).astype('int64')

In [None]:
# Inner splits of the hierarchy. Labels are mapped and cast explicitly so each
# 'target' column is a real integer column rather than relying on replace()
# downcasting an object column.

# Class_2 / Class_9 -> 1, Class_1 / 3 / 4 / 5 / 7 -> 0.
mapping5417329 = {'Class_5': 0, 'Class_4': 0, 'Class_1': 0, 'Class_7': 0, 'Class_3': 0, 'Class_2': 1, 'Class_9': 1}
train_c54173_and_c29['target'] = train_c54173_and_c29['target'].map(mapping5417329).astype('int64')

# Class_3 / Class_7 -> 1, Class_1 / 4 / 5 -> 0.
mapping54173 = {'Class_5': 0, 'Class_4': 0, 'Class_1': 0, 'Class_7': 1, 'Class_3': 1}
train_c541_and_c73['target'] = train_c541_and_c73['target'].map(mapping54173).astype('int64')

# Class_1 -> 1, Class_4 / Class_5 -> 0.
mapping541 = {'Class_5': 0, 'Class_4': 0, 'Class_1': 1}
train_c54_and_c1['target'] = train_c54_and_c1['target'].map(mapping541).astype('int64')

In [None]:
# Leaf-level splits: in every pair the second class listed is the positive (1) label.
mapping86 = {'Class_8': 0, 'Class_6': 1}
mapping29 = {'Class_2': 0, 'Class_9': 1}
mapping73 = {'Class_7': 0, 'Class_3': 1}
mapping54 = {'Class_5': 0, 'Class_4': 1}

# Map the labels and cast explicitly so each 'target' column is a real integer
# column rather than relying on replace() downcasting an object column.
train_c8_and_c6['target'] = train_c8_and_c6['target'].map(mapping86).astype('int64')
train_c2_and_c9['target'] = train_c2_and_c9['target'].map(mapping29).astype('int64')
train_c7_and_c3['target'] = train_c7_and_c3['target'].map(mapping73).astype('int64')
train_c5_and_c4['target'] = train_c5_and_c4['target'].map(mapping54).astype('int64')

In [None]:
# Feature matrix and (n, 1) label column for each inner-node classifier.

# Class_6/8 vs. the rest
c5417329c86 = train_c5417329_and_c86.loc[:, all_features].to_numpy()
t541732986 = train_c5417329_and_c86.loc[:, ['target']].to_numpy()

# Class_2/9 vs. Class_1/3/4/5/7
c54173c29 = train_c54173_and_c29.loc[:, all_features].to_numpy()
t5417329 = train_c54173_and_c29.loc[:, ['target']].to_numpy()

# Class_3/7 vs. Class_1/4/5
c541c73 = train_c541_and_c73.loc[:, all_features].to_numpy()
t54173 = train_c541_and_c73.loc[:, ['target']].to_numpy()

# Class_1 vs. Class_4/5
c54c1 = train_c54_and_c1.loc[:, all_features].to_numpy()
t541 = train_c54_and_c1.loc[:, ['target']].to_numpy()

In [None]:
# Feature matrix and (n, 1) label column for each leaf-level classifier.

# Class_6 vs. Class_8
c8c6 = train_c8_and_c6.loc[:, all_features].to_numpy()
t86 = train_c8_and_c6.loc[:, ['target']].to_numpy()

# Class_9 vs. Class_2
c2c9 = train_c2_and_c9.loc[:, all_features].to_numpy()
t29 = train_c2_and_c9.loc[:, ['target']].to_numpy()

# Class_3 vs. Class_7
c7c3 = train_c7_and_c3.loc[:, all_features].to_numpy()
t73 = train_c7_and_c3.loc[:, ['target']].to_numpy()

# Class_4 vs. Class_5
c5c4 = train_c5_and_c4.loc[:, all_features].to_numpy()
t54 = train_c5_and_c4.loc[:, ['target']].to_numpy()

# Features of the rows that follow the training rows in all_df (the test set).
test_npy = all_df.iloc[train.shape[0]:].loc[:, all_features].to_numpy()

# Define the Model, Losses, Metrics and Callbacks

In [None]:
def model(cat_shape=(75,)):
    """Build and compile one binary classifier of the hierarchy.

    All input columns go through a single embedding table; the embeddings
    are flattened and passed through dropout, batch normalization and three
    ReLU dense layers before a one-unit sigmoid output.

    Args:
        cat_shape: Shape of one input sample, forwarded to
            ``tf.keras.layers.Input``.

    Returns:
        A ``tf.keras.Model`` compiled with binary cross-entropy loss, the
        AdamW optimizer, and accuracy / binary cross-entropy metrics.
    """
    layers = tf.keras.layers

    inputs = layers.Input(shape=cat_shape, name='cat_input')

    # One 512 x 8 embedding table applied to every input column.
    embedded = layers.Embedding(512, 8, name='Embedding')(inputs)
    embedded = layers.Flatten(name='Flatten')(embedded)

    hidden = layers.Dropout(0.4, name='dropout_concatenated')(embedded)
    hidden = layers.BatchNormalization()(hidden)
    # Dense stack 128 -> 64 -> 32, named dense1..dense3.
    for position, units in enumerate((128, 64, 32), start=1):
        hidden = layers.Dense(units, activation='relu', name=f'dense{position}')(hidden)
    probability = layers.Dense(1, activation='sigmoid', name='output')(hidden)

    net = tf.keras.Model(inputs, probability)

    bce_metric = tf.keras.metrics.BinaryCrossentropy(
        from_logits=False,
        label_smoothing=0,
        name='binary_crossentropy',
    )
    bce_loss = tf.keras.losses.BinaryCrossentropy(
        from_logits=False,
        label_smoothing=0,
        reduction='auto',
        name='binary_crossentropy',
    )
    adamw = tfa.optimizers.AdamW(
        weight_decay=1e-7,
        learning_rate=1e-4,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=True,
        name='AdamW',
    )

    net.compile(optimizer=adamw, loss=bce_loss, metrics=['accuracy', bce_metric])
    return net


# Print the architecture of a freshly built model.
model().summary()

In [None]:
_keras_callbacks = tf.keras.callbacks

# Halve the learning rate once val_loss has gone 2 epochs without improving
# by at least 1e-4.
scheduler_cb = _keras_callbacks.ReduceLROnPlateau(
    monitor='val_loss', mode='auto',
    factor=0.5, patience=2,
    min_delta=0.0001, cooldown=0, min_lr=0,
    verbose=0,
)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping_cb = _keras_callbacks.EarlyStopping(
    monitor='val_loss', mode='auto',
    min_delta=0, patience=5,
    baseline=None, restore_best_weights=True,
    verbose=1,
)

# Train the models

In [None]:
# Root classifier: Class_6 / Class_8 (label 1) vs. the other seven classes (label 0).
K.clear_session()
history541732986 = []
model541732986 = model(cat_shape=c5417329c86[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs541732986/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# NOTE(review): steps_per_epoch is fixed at 1000 regardless of how many batches
# the training split actually yields - confirm this is intended.
history541732986.append(
    model541732986.fit(
        x=c5417329c86[:, :len(cat_features)],
        y=t541732986,
        steps_per_epoch=1000,
        batch_size=256,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# Collects one probability column per node of the hierarchy.
dfr = pd.DataFrame()
# Feature matrix of every row in all_df; each classifier predicts on all of it.
all_df_all_features = all_df.loc[:, all_features]
all_df_numpy = all_df_all_features.to_numpy()

In [None]:
# Root classifier output for every row of all_df, as a 1-D Series.
pred541732986 = pd.Series(
    model541732986.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Probability of the {6, 8} branch and of its complement.
dfr['Class_86'], dfr['Class_5417329'] = pred541732986, 1 - pred541732986

In [None]:
# Second level: Class_2 / Class_9 (label 1) vs. Class_1 / 3 / 4 / 5 / 7 (label 0).
K.clear_session()
history5417329 = []
model5417329 = model(cat_shape=c54173c29[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs5417329/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history5417329.append(
    model5417329.fit(
        x=c54173c29[:, :len(cat_features)],
        y=t5417329,
        steps_per_epoch=1000,
        batch_size=128,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# {2, 9}-vs-{1, 3, 4, 5, 7} classifier output for every row of all_df.
pred5417329 = pd.Series(
    model5417329.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Probability of the {2, 9} branch and of its complement.
dfr['Class_29'], dfr['Class_54173'] = pred5417329, 1 - pred5417329

In [None]:
# Third level: Class_3 / Class_7 (label 1) vs. Class_1 / 4 / 5 (label 0).
K.clear_session()
history54173 = []
model54173 = model(cat_shape=c541c73[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs54173/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history54173.append(
    model54173.fit(
        x=c541c73[:, :len(cat_features)],
        y=t54173,
        steps_per_epoch=1000,
        batch_size=64,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# {3, 7}-vs-{1, 4, 5} classifier output for every row of all_df.
pred54173 = pd.Series(
    model54173.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Probability of the {3, 7} branch and of its complement.
dfr['Class_73'], dfr['Class_541'] = pred54173, 1 - pred54173

In [None]:
# Leaf classifier: Class_6 (label 1) vs. Class_8 (label 0).
K.clear_session()
history86 = []
model86 = model(cat_shape=c8c6[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs86/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history86.append(
    model86.fit(
        x=c8c6[:, :len(cat_features)],
        y=t86,
        steps_per_epoch=1000,
        batch_size=128,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# Class_6-vs-Class_8 classifier output for every row of all_df.
pred86 = pd.Series(
    model86.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Within the {6, 8} branch: probability of Class_6 and of Class_8.
dfr['Class_6'], dfr['Class_8'] = pred86, 1 - pred86

In [None]:
# Leaf classifier: Class_9 (label 1) vs. Class_2 (label 0).
history29 = []
model29 = model(cat_shape=c2c9[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs29/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history29.append(
    model29.fit(
        x=c2c9[:, :len(cat_features)],
        y=t29,
        steps_per_epoch=1000,
        batch_size=64,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# Class_9-vs-Class_2 classifier output for every row of all_df.
pred29 = pd.Series(
    model29.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Within the {2, 9} branch: probability of Class_9 and of Class_2.
dfr['Class_9'], dfr['Class_2'] = pred29, 1 - pred29

In [None]:
# Leaf classifier: Class_3 (label 1) vs. Class_7 (label 0).
history73 = []
model73 = model(cat_shape=c7c3[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs73/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history73.append(
    model73.fit(
        x=c7c3[:, :len(cat_features)],
        y=t73,
        steps_per_epoch=1000,
        batch_size=32,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# Class_3-vs-Class_7 classifier output for every row of all_df.
pred73 = pd.Series(
    model73.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Within the {3, 7} branch: probability of Class_3 and of Class_7.
dfr['Class_3'], dfr['Class_7'] = pred73, 1 - pred73

In [None]:
# Leaf classifier: Class_4 (label 1) vs. Class_5 (label 0).
history54 = []
model54 = model(cat_shape=c5c4[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs54/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history54.append(
    model54.fit(
        x=c5c4[:, :len(cat_features)],
        y=t54,
        steps_per_epoch=1000,
        batch_size=16,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# Class_4-vs-Class_5 classifier output for every row of all_df.
pred54 = pd.Series(
    model54.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Within the {4, 5} branch: probability of Class_4 and of Class_5.
dfr['Class_4'], dfr['Class_5'] = pred54, 1 - pred54

In [None]:
# Fourth level: Class_1 (label 1) vs. Class_4 / Class_5 (label 0).
history541 = []
model541 = model(cat_shape=c54c1[:, :len(cat_features)].shape[1])

# Timestamped directory so every run keeps its own TensorBoard logs.
log_dir = "logs541/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history541.append(
    model541.fit(
        x=c54c1[:, :len(cat_features)],
        y=t541,
        steps_per_epoch=1000,
        batch_size=32,
        epochs=50,
        validation_split=0.2,
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb]
    )
)

In [None]:
# Class_1-vs-{4, 5} classifier output for every row of all_df.
pred541 = pd.Series(
    model541.predict(all_df_numpy[:, :len(cat_features)]).ravel()
)

In [None]:
# Within the {1, 4, 5} branch: probability of Class_1 and of the {4, 5} branch.
dfr['Class_1'], dfr['Class_54'] = pred541, 1 - pred541

In [None]:
# Preview the first 30 rows of the per-node probabilities.
dfr.iloc[:30]

In [None]:
df = dfr.copy()
display(df)

# Columns of dfr on the path from the root of the hierarchy to each class.
leaf_paths = {
    'Class_1': ['Class_5417329', 'Class_54173', 'Class_541', 'Class_1'],
    'Class_2': ['Class_5417329', 'Class_29', 'Class_2'],
    'Class_3': ['Class_5417329', 'Class_54173', 'Class_73', 'Class_3'],
    'Class_4': ['Class_5417329', 'Class_54173', 'Class_541', 'Class_54', 'Class_4'],
    'Class_5': ['Class_5417329', 'Class_54173', 'Class_541', 'Class_54', 'Class_5'],
    'Class_6': ['Class_86', 'Class_6'],
    'Class_7': ['Class_5417329', 'Class_54173', 'Class_73', 'Class_7'],
    'Class_8': ['Class_86', 'Class_8'],
    'Class_9': ['Class_5417329', 'Class_29', 'Class_9'],
}

# The probability of a class is the product of the branch probabilities along
# its path, multiplied left to right.
result = pd.DataFrame()
for leaf, path in leaf_paths.items():
    prob = dfr[path[0]]
    for node in path[1:]:
        prob = prob * dfr[node]
    result[leaf] = prob
result

In [None]:
# Compare the first 10 combined predictions with the first 10 training targets.
display(result.iloc[:10])
train['target'].iloc[:10]

In [None]:
# Evaluate the combined probabilities on the training rows, which are the
# first train.shape[0] rows of `result`.
n_train = train.shape[0]
train_probs = result.iloc[:n_train].to_numpy()

# 0-based integer label per row ('Class_3' -> 2), matching the column order
# Class_1..Class_9 of `result`.
tmp = train['target'].str.split('_', expand=True)[1].astype(int).to_numpy() - 1

# Index of the most probable class, stored as a plain 1-D column.
train['predicted'] = np.argmax(train_probs, axis=1)

# One-hot encoding of the true labels, kept available under its original name.
ohe = OneHotEncoder()
train_true = ohe.fit_transform(tmp.reshape(-1, 1))

# Integer labels go straight into log_loss; `labels` pins the column order of
# train_probs so the score does not depend on which classes are present.
print(log_loss(tmp, train_probs, labels=np.arange(train_probs.shape[1])))

# Misclassified rows, counted per (true target, predicted index) pair.
display(train[train['predicted'] != tmp][['target', 'predicted']].value_counts())

In [None]:
# Inspect the row at position 2: combined probabilities first, then the raw training row.
for _frame in (result, train):
    display(_frame.iloc[2])

In [None]:
# Test-set predictions are the rows of `result` that follow the training rows.
sub = result.iloc[train.shape[0]:].copy()
# Use each row's position in the stacked train+test frame as its id.
# NOTE(review): assumes test ids continue directly after the training ids -
# confirm against the sample submission.
sub.insert(0, 'id', sub.index.to_numpy())
print(sub)
csv = 'submission_hierarchy_of_classifiers3.csv'
sub.to_csv(csv, index=False)

In [None]:
# Build a link to the submission file written above.
from IPython.display import FileLink
FileLink(csv)