# Basic model baseline

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.neighbors import KNeighborsClassifier
from tqdm.notebook import tqdm
from skimage import filters
import seaborn as sns

# Load dataset

In [None]:
# Training data, shuffled: sample(frac=1) draws every row once, in random order.
csv_all = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/train.csv')
csv_all = csv_all.sample(frac=1)

# Test features and the submission template.
csv_test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/test.csv')
csv_submission = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv')

# Replace the target column by its integer encoding; the fitted encoder stays
# available as `lbl_coder`.
lbl_coder = LabelEncoder()
csv_all['target'] = lbl_coder.fit_transform(csv_all['target'])

# Define feature columns

Predefined column names make it easy to slice the input features and the target. Moreover, because features are accessed by column name, we do not have to rely on column positions.

In [None]:
# Every column except the row identifier and the label is an input feature.
# Kept as a list in DataFrame column order: a set has no defined order, and
# recent pandas versions reject a set used as a column indexer.
feature_columns = [col for col in csv_all.columns if col not in ('row_id', 'target')]
target = 'target'

# Define model

In [None]:
def get_model(weights=None):
    """Build and compile the embedding network.

    The network stacks three 32-unit ReLU dense layers with batch
    normalisation and L2-normalises its output along axis 1. It is compiled
    with Adam (learning rate 0.005) and the semi-hard triplet loss from
    TensorFlow Addons.

    Args:
        weights: Optional path handed to ``load_weights``. When ``None`` the
            model is returned without loading anything.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    layers = tf.keras.layers

    model = tf.keras.models.Sequential()
    for layer in (
        layers.BatchNormalization(),
        layers.Dense(32, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        # Project every embedding onto the unit sphere.
        layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)),
    ):
        model.add(layer)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
    model.compile(optimizer=optimizer, loss=tfa.losses.TripletSemiHardLoss(), metrics=[])

    if weights is None:
        return model

    model.load_weights(weights)
    return model

def get_split(fold, with_csv=False):
    """Slice ``csv_all`` into train/validation inputs and targets for one fold.

    Args:
        fold: Pair ``(train_idx, val_idx)`` of positional row indices into
            ``csv_all`` (they are applied with ``iloc``).
        with_csv: When true, also return the two DataFrame slices.

    Returns:
        ``(x, y, x_val, y_val)`` where ``x``/``x_val`` are NumPy arrays of the
        feature columns and ``y``/``y_val`` are the target columns. With
        ``with_csv=True`` the train and validation DataFrames are appended.
    """
    train_idx, val_idx = fold

    _csv_train = csv_all.iloc[train_idx]
    _csv_val = csv_all.iloc[val_idx]

    # Index with a list: pandas does not reliably accept other collections
    # (such as a set) as a column indexer.
    columns = list(feature_columns)

    x = _csv_train[columns].to_numpy()
    y = _csv_train[target]

    x_val = _csv_val[columns].to_numpy()
    y_val = _csv_val[target]

    if with_csv:
        return x, y, x_val, y_val, _csv_train, _csv_val

    return x, y, x_val, y_val
    

# Train KFold

In [None]:
# Materialise the KFold splits (default settings) so a fold can be picked by index.
splitter = KFold()
folds = list(splitter.split(csv_all))

fold_i = 0
print(f'Fold #{fold_i}')

model = get_model()

x, y, x_val, y_val, csv_train, csv_val = get_split(folds[fold_i], with_csv=True)

# Sanity check: no row id may appear on both sides of the split.
leaked = csv_val.row_id.isin(csv_train.row_id)
assert not leaked.any()


class LearningRateReducerCb(tf.keras.callbacks.Callback):
    """Keras callback that multiplies the optimizer's learning rate by 0.99 after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None rather than a shared mutable dict; it is not
        # used here.
        old_lr = self.model.optimizer.lr.read_value()
        new_lr = old_lr * 0.99
        self.model.optimizer.lr.assign(new_lr)

# Checkpoint only the weights, and only when the validation loss improves.
save_cb = tf.keras.callbacks.ModelCheckpoint(f'./best_val_new_{fold_i}', save_best_only=True, monitor='val_loss', save_weights_only=True)
h = model.fit(x, y, validation_data=(x_val, y_val), epochs=69, batch_size=64, verbose=2, callbacks=[save_cb, LearningRateReducerCb()])

# Discard the last-epoch model and rebuild it from the best checkpoint.
del model
model = get_model(f'./best_val_new_{fold_i}')

train_emb = model.predict(x)
val_emb = model.predict(x_val)

# Classify in embedding space with a distance-weighted KNN.
knn = KNeighborsClassifier(n_neighbors=100, weights='distance', n_jobs=-1)
knn.fit(train_emb, y)

# NOTE: the train score is computed on the very embeddings the KNN was fitted
# on, so it is an optimistic figure; the validation score is the one to watch.
train_acc, val_acc = knn.score(train_emb, y), knn.score(val_emb, y_val)

print('#################################################################')
print(f'Train acc: {train_acc} Validation acc: {val_acc}')
print('#################################################################')


# Select the test features with a list indexer and convert to NumPy, exactly
# as the train/validation inputs are prepared in get_split.
test_emb = model.predict(csv_test[list(feature_columns)].to_numpy())
test_pred = knn.predict(test_emb)
test_probas = knn.predict_proba(test_emb)

# Visualize loss

In [None]:
# Per-epoch losses recorded by model.fit.
history = h.history
loss, val_loss = history['loss'], history['val_loss']

fig, ax = plt.subplots(figsize=(12, 4))

for curve, curve_label in ((loss, 'Train loss'), (val_loss, 'Validation loss')):
    ax.plot(curve, label=curve_label)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss value')
ax.grid()
ax.legend()
fig.suptitle('Loss visualization', fontsize=16);

# Calculate distances between embeddings

In [None]:
def embading_distances(embeddings, labels, samples):
    """Sample same-class and different-class embedding distances.

    For each of ``samples`` draws a class is picked uniformly from the unique
    labels. A positive distance is measured between two rows of that class; a
    negative distance between one row of that class and one row of any other
    class. The distance function is ``tf.keras.metrics.MSE``.

    Args:
        embeddings: Embedding matrix indexed by row position; row ``i`` is
            paired with ``labels[i]``.
        labels: Array-like of class labels, one per embedding row.
        samples: Number of positive pairs and of negative pairs to draw.

    Returns:
        ``(pos_dists, neg_dists)``: two lists of ``samples`` distances each,
        jointly shuffled with ``sklearn.utils.shuffle``.

    Raises:
        ValueError: Raised by ``np.random.choice`` when a negative pair is
            requested but no row carries a different label.
    """
    dist = tf.keras.metrics.MSE

    # Work on a plain array so the boolean masks below are positional whatever
    # container the labels arrive in.
    labels = np.asarray(labels)
    all_classes = np.unique(labels)

    # Row positions inside / outside every class, computed once instead of on
    # every draw.
    members_by_class = [np.where(labels == cls)[0] for cls in all_classes]
    others_by_class = [np.where(labels != cls)[0] for cls in all_classes]

    pos_dists = []
    for _ in tqdm(range(samples)):
        k = np.random.choice(len(all_classes))
        members = members_by_class[k]
        # Draw two distinct rows whenever the class has them: sampling with
        # replacement could pair a row with itself and record a zero distance.
        first, second = np.random.choice(members, size=2, replace=len(members) < 2)
        pos_dists.append(dist(embeddings[first], embeddings[second]))

    neg_dists = []
    for _ in tqdm(range(samples)):
        k = np.random.choice(len(all_classes))
        # The anchor has to be a row *of* the drawn class; the class label
        # itself is not a row index.
        anchor = np.random.choice(members_by_class[k])
        other = np.random.choice(others_by_class[k])
        neg_dists.append(dist(embeddings[anchor], embeddings[other]))

    pos_dists, neg_dists = sklearn.utils.shuffle(pos_dists, neg_dists)
    return pos_dists, neg_dists


# Same number of sampled pairs for both splits (10e3 is 1e4, so this is 20000).
n_pairs = int(2 * 10e3)

train_pos_dists, train_neg_dists = embading_distances(train_emb, y, samples=n_pairs)
val_pos_dists, val_neg_dists = embading_distances(val_emb, y_val, samples=n_pairs)

In [None]:
# Otsu threshold over the pooled positive + negative distances of each split.
val_thresh = filters.threshold_otsu(np.concatenate([val_pos_dists, val_neg_dists]))
train_thresh = filters.threshold_otsu(np.concatenate([train_pos_dists, train_neg_dists]))

# The second value is the validation threshold (not a test-set one) and the
# percentage is the gap between the two thresholds relative to the train one.
print(f'Threshold for train set: {train_thresh:0.5f} Threshold for validation set: {val_thresh:0.5f} \t Relative difference {(np.abs(train_thresh - val_thresh) / train_thresh)*100:0.2f}%')

In [None]:
# --- Train split: positive vs. negative distance distributions ---
# NOTE: the plotted values come from embading_distances, which measures
# distance with tf.keras.metrics.MSE.
fig, ax = plt.subplots(figsize=(24, 4))

sns.distplot(train_pos_dists, label='Positive distances (train)', ax=ax)
sns.distplot(train_neg_dists, label='Negative distances (train)', ax=ax)

ax.vlines(train_thresh, 0, 100, color='r', linestyle='--', label='Threshold')
# This figure shows the train split, so it must not be titled "Validation".
ax.set_title('Train distance distribution')

ax.set_xlabel('L2 distance', fontsize=16)
ax.legend();

plt.show()


# --- Validation split: same plot, with both thresholds drawn for comparison ---
fig, ax = plt.subplots(figsize=(24, 4))

sns.distplot(val_pos_dists, label='Positive distances (validation)', ax=ax)
sns.distplot(val_neg_dists, label='Negative distances (validation)', ax=ax)

# Red: threshold derived from the train split; green: threshold derived from
# the validation split itself.
ax.vlines(train_thresh, 0, 100, color='r', linestyle='--', label='Threshold')
ax.vlines(val_thresh, 0, 100, color='g', linestyle='--', label='Perfect threshold for validation set')

ax.set_xlabel('L2 distance', fontsize=16)
ax.legend()
ax.set_title('Validation distance distribution')

# Submission

In [None]:
# Decode the integer predictions back to the original class labels.
test_pred_str = lbl_coder.inverse_transform(test_pred)

# Fill the submission template and write it without the index column.
submission_path = './submission.csv'
csv_submission['target'] = test_pred_str
csv_submission.to_csv(submission_path, index=False)