# Basic model baseline

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold
import tensorflow as tf

# Load dataset

In [None]:
# Read the competition's training set, test set and sample submission.
csv_train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/train.csv')
csv_test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/test.csv')
csv_submission = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv')

# Fit the encoder on the raw training labels (fit() returns the encoder itself),
# then overwrite the column with the resulting integer codes.
lbl_coder = LabelEncoder().fit(csv_train['target'])
csv_train['target'] = lbl_coder.transform(csv_train['target'])

# Define feature columns

Predefined column names make it easy to slice the input features and targets. Moreover, by accessing features by column name we do not have to depend on column positions.

In [None]:
# Name of the label column.
target = 'target'

# Columns fed to the model: every training column except the row identifier
# and the label. This is a list rather than a set because pandas 2.0+ raises a
# TypeError when a set is used as a column indexer, and because a list keeps
# the columns in a fixed, reproducible order (the order of csv_train.columns).
feature_columns = [col for col in csv_train.columns if col not in ('row_id', target)]

In [None]:
class LearningRateReducerCb(tf.keras.callbacks.Callback):
    """Keras callback that multiplies the optimizer's learning rate by 0.99 after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None instead of a shared mutable dict; it is not used here.
        old_lr = self.model.optimizer.lr.read_value()
        new_lr = old_lr * 0.99
        self.model.optimizer.lr.assign(new_lr)


# One Keras History object per fold.
hists = []
for i, (train_idx, val_idx) in enumerate(KFold().split(csv_train)):
    print(f'Fold #{i}')
    _csv_train = csv_train.iloc[train_idx]
    _csv_val = csv_train.iloc[val_idx]

    # A fresh network is built for every fold so that no weights carry over.
    # The leading BatchNormalization layer is applied directly to the input features.
    model = tf.keras.models.Sequential([
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalAccuracy()],
    )

    # Labels are one-hot encoded with depth 10 to match the 10-unit softmax output.
    x = _csv_train[feature_columns].to_numpy()
    y = tf.one_hot(_csv_train[target], 10)

    x_val = _csv_val[feature_columns].to_numpy()
    y_val = tf.one_hot(_csv_val[target], 10)

    # Write this fold's weights to ./best_val_<fold index>, keeping only the
    # checkpoint with the best validation loss.
    save_cb = tf.keras.callbacks.ModelCheckpoint(f'./best_val_{i}', save_best_only=True, monitor='val_loss', save_weights_only=True)

    h = model.fit(x, y, validation_data=(x_val, y_val), epochs=20, batch_size=256, verbose=0, callbacks=[save_cb, LearningRateReducerCb()])
    hists.append(h)

In [None]:
# Validation accuracy of the best checkpoint of each fold.
scores = []

# Recompute the folds; KFold() is called with the same default arguments as in
# training, so (with shuffling off by default) the splits are expected to match.
splits = list(KFold().split(csv_train))

# Iterate over the splits themselves rather than a hard-coded range(5), so the
# loop always agrees with the number of folds KFold actually produced.
for i, (_, val_idx) in enumerate(splits):
    # `model` is the network left over from the last training fold; only its
    # weights are swapped for the checkpoint saved for fold i.
    model.load_weights(f'./best_val_{i}')

    _csv_val = csv_train.iloc[val_idx]

    x_val = _csv_val[feature_columns].to_numpy()
    y_val = tf.one_hot(_csv_val[target], 10)

    # evaluate() returns [loss, categorical accuracy] for the compiled model.
    acc = model.evaluate(x_val, y_val)[1]
    scores.append(acc)

print(f'CV score: {np.mean(scores)}')

In [None]:
# One vector of predicted class indices for the test set per fold checkpoint.
predictions = []
for i in range(5):
    # Swap in the weights saved for fold i before predicting.
    model.load_weights(f'./best_val_{i}')
    class_probs = model.predict(csv_test[feature_columns])
    # The predicted class is the index of the largest softmax output.
    predictions.append(np.argmax(class_probs, axis=-1))

In [None]:
from collections import Counter
 
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    Raises IndexError when ``List`` is empty.
    """
    top = Counter(List).most_common(1)
    value, _count = top[0]
    return value

In [None]:
# Stack the per-fold prediction vectors along a new leading axis:
# one row per fold, one column per test sample.
pred_matrix = np.stack(predictions, axis=0)

In [None]:
# Majority vote: each row of the transposed matrix holds every fold's
# prediction for a single test sample.
pred_maj = [most_frequent(sample_votes) for sample_votes in pred_matrix.T]

In [None]:
# Decode the voted class indices back to the original labels and write the
# submission file without the DataFrame index.
csv_submission['target'] = lbl_coder.inverse_transform(pred_maj)
csv_submission.to_csv('./submission.csv', index=False)