### Scoreboard

- normalized (model_1): loss: 1.1068 - accuracy: 0.5749 - score: 2.89593

In [None]:
## Imports
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
## Constants
# Directory that holds the competition CSV files. File names are appended to it
# by plain string concatenation, so the trailing slash is required.
# NOTE(review): the name `dir` shadows Python's built-in dir(); renaming it means
# updating the data-loading cell that reads it as well.
dir = '../input/tabular-playground-series-may-2021/'

In [None]:
## Util functions
def plot_history(history):
    """Draw every metric stored in a training history as a line over epochs.

    Args:
        history: object whose ``history`` attribute maps a metric name to its
            per-epoch values (e.g. the return value of ``model.fit``).
    """
    metrics_per_epoch = pd.DataFrame(history.history)
    ax = metrics_per_epoch.plot(title='Loss/accuracy vs epochs')
    # Label the axes that pandas just drew on.
    ax.set_ylabel('loss / accuracy')
    ax.set_xlabel('epoch')
    
def plot_lr(history):
    """Plot training loss against learning rate on a log-scaled x axis.

    Intended for a learning-rate sweep, i.e. a fit that ran with a
    ``LearningRateScheduler`` callback so the rate is recorded every epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (e.g. the return value of ``model.fit``).

    Raises:
        KeyError: if the history holds no learning-rate entry or no 'loss'.
    """
    logs = history.history
    # tf.keras 2.x records the rate under 'lr'; Keras 3 records it under
    # 'learning_rate'. Accept either so the helper works on both.
    lr_key = next((key for key in ('lr', 'learning_rate') if key in logs), None)
    if lr_key is None:
        raise KeyError(
            "history has no 'lr' or 'learning_rate' entry; "
            "fit the model with a LearningRateScheduler callback"
        )
    plt.semilogx(logs[lr_key], logs['loss'])
    plt.xlabel('lr')
    plt.ylabel('loss')
    plt.title('Lr vs loss')

In [None]:
## Read Data
# Load the three competition files from `dir`, in the same order as the names below.
train, test, sample_submission = (
    pd.read_csv(dir + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Report the (rows, columns) shape of both frames, then preview the first
# training rows as the cell output.
print("Train: ", train.shape)
print("Test: ", test.shape)
train.head()

In [None]:
## Preprocessing
# Feature columns only: the row id and the label are not model inputs.
feature_frame = train.drop(columns=['id', 'target'])

## Remove duplicate rows in training data
# Flag every repeat of a feature row that has already been seen; the first
# occurrence is kept. The same mask filters features and labels so they stay aligned.
is_repeat = feature_frame.duplicated(keep='first')
duplicated_rows = train[is_repeat]
X = feature_frame[~is_repeat].values
y = train.target[~is_repeat].values

# Turn the target labels into integer class ids.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# Normalize data
# scaler = preprocessing.MinMaxScaler()
# X_norm = scaler.fit_transform(X)
# test_norm = scaler.transform(test.drop('id', axis=1))

print(X.shape, y.shape)

In [None]:
## Split the data
# Stratified 80/20 hold-out split. random_state pins the shuffle so the split,
# and every score computed on it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
# Shapes of all four arrays; the last entry is the hold-out labels (y_test).
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

## Split the normalized data
# X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Display the training features and labels produced by the split.
X_train, y_train

In [None]:
# set seed
tf.random.set_seed(42)

# model: three ReLU hidden layers followed by a 4-unit softmax output
model_1 = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(4, activation="softmax")
])

# compile
# The optimizer takes `learning_rate`; the old `lr` alias is deprecated in
# tf.keras 2.x and rejected by newer Keras releases.
model_1.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                metrics=["accuracy"])

# fit model (silent); the hold-out split is passed as validation data so
# validation metrics are recorded per epoch in history_1
history_1 = model_1.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))

In [None]:
# Plot the per-epoch metrics recorded while fitting model_1.
plot_history(history_1)

In [None]:
# Evaluate model_1 on the hold-out split; the result is shown as the cell output.
model_1.evaluate(X_test, y_test)

Let's find the optimal learning rate (LR)

In [None]:
# set seed
tf.random.set_seed(42)

# model -- assembled one layer at a time
model_2 = tf.keras.Sequential()
model_2.add(tf.keras.layers.Dense(10, activation="relu"))
model_2.add(tf.keras.layers.Dense(100, activation="relu"))
model_2.add(tf.keras.layers.Dense(100, activation="relu"))
model_2.add(tf.keras.layers.Dense(4, activation="softmax"))

# lr scheduler: start at 1e-4 and multiply the rate by 10 every 20 epochs
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-4 * 10**(epoch/20)
)

# compile
model_2.compile(
    optimizer="Adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

# fit the model (no validation data: only the training loss is needed for the sweep)
history_2 = model_2.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=0,
    callbacks=[lr_scheduler],
)

In [None]:
# Plot loss against learning rate for the model_2 sweep.
plot_lr(history_2)

In [None]:
# set seed
tf.random.set_seed(42)

# model -- ReLU hidden layers built from their widths, then the softmax output
model_3 = tf.keras.Sequential([
    *[tf.keras.layers.Dense(width, activation="relu") for width in (10, 100, 100)],
    tf.keras.layers.Dense(4, activation="softmax"),
])

# compile (plain SGD with the optimizer's default settings)
model_3.compile(
    optimizer="sgd",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

# fit model
history_3 = model_3.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    verbose=0,
)

In [None]:
# Plot model_3's per-epoch metrics, then evaluate it on the hold-out split
# (the evaluation result is the cell output).
plot_history(history_3)
model_3.evaluate(X_test, y_test)

In [None]:
# set seed
tf.random.set_seed(42)

# model -- each layer described as (units, activation), in stacking order
model_4 = tf.keras.Sequential(layers=[
    tf.keras.layers.Dense(units, activation=activation)
    for units, activation in (
        (10, "relu"),
        (100, "relu"),
        (100, "relu"),
        (4, "softmax"),
    )
])

# compile
model_4.compile(
    metrics=["accuracy"],
    optimizer="sgd",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
)

# fit model -- same setup as the 50-epoch SGD run, trained for 100 epochs
history_4 = model_4.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    verbose=0,
    epochs=100,
)

# Plot the training curves, then evaluate on the hold-out split (cell output).
plot_history(history_4)
model_4.evaluate(X_test, y_test)

Let's make predictions

In [None]:
# Predict class probabilities for the competition test rows with model_1
# (the id column is not a feature) and label one column per class.
preds = pd.DataFrame(
    model_1.predict(test.drop(columns='id')),
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'],
)
# Put the row identifier in the first column, then write the submission file.
preds.insert(0, 'id', test.id)
preds.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame.
preds