In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

## Data

In [None]:
# Load the competition training set into memory.
df = pd.read_csv('/kaggle/input/tabular-playground-series-may-2022/train.csv')

In [None]:
# Show the first rows as the cell output to sanity-check the loaded columns.
df.head()

In [None]:
# Column names f_00 .. f_30 fed to the scaler; f_27 is skipped because it is
# a string column that create_X encodes separately.
features = [f'f_{idx:02d}' for idx in range(31) if idx != 27]

In [None]:
def create_X(df, scaler, pf, fit=False, feature_cols=None):
    """Build the two model inputs (and the labels, when present) from a raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string column ``f_27`` and every column listed in
        ``feature_cols``. A ``target`` column is optional.
    scaler, pf : objects exposing ``fit(X)`` and ``transform(X)``
        Applied in that order to the numeric columns.
    fit : bool, default False
        When True, ``scaler`` and then ``pf`` are fitted on ``df`` before
        transforming. Pass False for validation / test data so the statistics
        learned on the training split are reused.
    feature_cols : sequence of str, optional
        Numeric columns to use. Defaults to the notebook-level ``features`` list.

    Returns
    -------
    tuple
        ``((X_f, X_str), y_true, scaler, pf)`` where ``X_f`` is the transformed
        numeric matrix, ``X_str`` holds one integer per letter of ``f_27`` and
        ``y_true`` is the ``target`` column as an array, or ``None`` when ``df``
        has no ``target`` column.
    """
    if feature_cols is None:
        feature_cols = features

    # Encode every letter of f_27 as its offset from 'A':
    # A -> 0, B -> 1, C -> 2, D -> 3, ...
    base = ord('A')
    X_str = np.array([[ord(let) - base for let in word] for word in df['f_27']])

    # Scale the numerical features (extract the raw matrix only once).
    X_num = df[list(feature_cols)].values
    if fit:
        scaler.fit(X_num)
    X_f = scaler.transform(X_num)

    # Expand the scaled features with pf.
    if fit:
        pf.fit(X_f)
    X_f = pf.transform(X_f)

    # Unlabeled frames (e.g. the test set) simply yield no labels.
    y_true = df['target'].values if 'target' in df.columns else None

    return (X_f, X_str), y_true, scaler, pf

In [None]:
# Unfitted transformers created with library defaults; create_X fits them
# when it is called with fit=True and reuses them afterwards.
scaler = StandardScaler()
pf = PolynomialFeatures()

In [None]:
# Randomly keep 86% of the rows for training; the remaining rows (matched by
# 'id') form the validation set. A fixed seed makes the split, and therefore
# the validation scores, reproducible across kernel restarts.
SPLIT_SEED = 42
df_train = df.sample(frac=0.86, random_state=SPLIT_SEED)
df_val = df[~df['id'].isin(df_train['id'])]
# The full frame is no longer needed once both splits exist.
del df

In [None]:
# Training split: fit the scaler and pf here, then drop the frame.
X_train, y_train, scaler, pf = create_X(df=df_train, scaler=scaler, pf=pf, fit=True)
del df_train

# Validation split: only transform with the already-fitted objects.
X_val, y_val, _, _ = create_X(df=df_val, scaler=scaler, pf=pf, fit=False)
del df_val

## Model

In [None]:
# f_27 encoder: 10 letter indices -> embedding -> bidirectional LSTM.
inp_str = tf.keras.Input(shape=(10,), name='f_27')
emb = tf.keras.layers.Embedding(input_dim=20, output_dim=10, input_length=10)(inp_str)

# Only the final state of each direction is returned (return_sequences=False).
lstm_layer = tf.keras.layers.LSTM(
    units=24,
    activation="tanh",
    recurrent_activation="sigmoid",
    use_bias=True,
    recurrent_dropout=0,
    return_sequences=False,
    unroll=True,
)
rnn = tf.keras.layers.Bidirectional(lstm_layer)(emb)
rnn = tf.keras.layers.BatchNormalization()(rnn)

# Second, denser view of the string encoding; it is concatenated late in the model.
x_rnn = tf.keras.layers.Dense(units=64, activation='relu')(rnn)
x_rnn = tf.keras.layers.BatchNormalization()(x_rnn)

In [None]:
n_units = 126
dropout_rate = 0.1
reg_amount = 5e-6


def _dense_bn(tensor, units):
    """Apply an L2-regularised ReLU Dense layer followed by batch normalisation."""
    dense = tf.keras.layers.Dense(
        units,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(reg_amount),
    )
    return tf.keras.layers.BatchNormalization()(dense(tensor))


# Numerical branch.
inp = tf.keras.Input(shape=(X_train[0].shape[1],), name='numerical_features')
x1 = _dense_bn(inp, n_units*2)
x1 = tf.keras.layers.Dropout(0.5)(x1)
x1 = _dense_bn(x1, n_units)

# First fusion with the raw recurrent encoding of f_27.
x2 = tf.keras.layers.concatenate([x1, rnn])

x3 = _dense_bn(x2, n_units+32)
x3 = tf.keras.layers.Dropout(dropout_rate)(x3)
x3 = _dense_bn(x3, n_units)

# Residual connection back to the numerical branch (both are n_units wide).
x3 = tf.keras.layers.add([x1, x3])
x3 = _dense_bn(x3, n_units)

# Second fusion, this time with the dense projection of the f_27 encoding.
x4 = tf.keras.layers.concatenate([x3, x_rnn])

x5 = _dense_bn(x4, 128)
x5 = tf.keras.layers.Dropout(dropout_rate)(x5)
x5 = _dense_bn(x5, 64)

# Single sigmoid unit for the binary target.
x = tf.keras.layers.Dense(1, activation='sigmoid')(x5)

model = tf.keras.Model([inp, inp_str], x)
model.summary()

In [None]:
# Draw the layer graph, annotated with tensor shapes, as this cell's output.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
optimizer = tf.keras.optimizers.Adam(
        learning_rate=2e-3
)

# The AUC metric is named explicitly: an unnamed AUC() is auto-named and can
# receive a numeric suffix ("auc_1", ...) when this cell is re-run in the same
# kernel, in which case the 'val_auc' monitor of the checkpoint callback below
# would no longer match any logged metric and no checkpoint would be written.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

## callbacks
# Halve the learning rate after 4 epochs without validation-loss improvement.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=4,
    min_lr=1e-7,
    verbose=1)

# Stop training after 10 epochs without validation-loss improvement.
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10, verbose=1)

# Keep only the weights of the epoch with the highest validation AUC.
checkpoint_filepath = './checkpoint'
chkp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    verbose=1)


In [None]:
# Train with LR scheduling, early stopping and best-weights checkpointing;
# 500 epochs is only an upper bound because early stopping is active.
training_callbacks = [lr_callback, es_callback, chkp_callback]
hist = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=300,
    epochs=500,
    shuffle=True,
    callbacks=training_callbacks,
    verbose=2,
)

In [None]:
# Load the weights saved by the checkpoint callback (save_best_only=True,
# monitored on val_auc) in place of the last-epoch weights.
model.load_weights(checkpoint_filepath)

In [None]:
# Save the model (with the weights loaded above) to model.h5 in the working directory.
model.save('model.h5')

### Save predictions

In [None]:
# Load the test set and attach an all-zero placeholder 'target' column so the
# frame has the same columns as the training data passed to create_X.
df_test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-may-2022/test.csv'
).assign(target=0)

In [None]:
# Release the training and validation arrays before building the test inputs.
del X_train, X_val

In [None]:
# Transform the test frame with the already-fitted scaler / pf (fit=False);
# the labels and transformer objects returned by create_X are discarded.
X_test, _, _, _ = create_X(df_test, scaler, pf, fit=False)

In [None]:
# Output of the model's sigmoid head for the test inputs.
y_test = model.predict(X_test, verbose=True)

In [None]:
# Put the predictions into the sample-submission template and write it out.
# Rows are matched by position, so this relies on the template listing ids in
# the same order as test.csv.
sub = pd.read_csv(
    '/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv'
).assign(target=y_test)
sub.to_csv(path_or_buf='submission.csv', index=False, float_format='%.7f')