In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Read the competition's train / test / sample-submission CSVs in one pass;
# the generator yields the frames in the same order as the names on the left.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

In [None]:
# Keep every training column whose name contains the letter 'f'.
features = [name for name in df_train.columns if 'f' in name]

### Observing the training data

In [None]:
import seaborn as sns

# Bar chart of how many training rows fall into each target class.
sns.countplot(data=df_train, x='target')

### Conclusion

The numbers of samples with target = 0 and target = 1 are roughly the same, so class imbalance is one less thing to worry about :)

In [None]:
# Total number of missing feature values: first for train, then for test.
for frame in (df_train, df_test):
    print(frame[features].isna().sum().sum())

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Add two row-wise aggregate features: the median of the highly skewed raw
# features ('median_h') and the median of the remaining raw features ('median_l').

# Skewness cut-off separating the two groups; 2 is just an empirical threshold.
SKEW_THRESHOLD = 2
derived_cols = ('median_h', 'median_l')

# Measure skewness on the raw feature columns only. Running skew() over the
# whole frame would also rank non-feature columns (e.g. the row id, or the
# derived medians when this cell is re-run) and let them leak into the medians.
base_features = [col for col in features if col not in derived_cols]
feature_skew = df_train[base_features].skew()

h_skew = list(feature_skew[feature_skew >= SKEW_THRESHOLD].index)
l_skew = list(feature_skew[feature_skew < SKEW_THRESHOLD].index)

# The grouping is decided on the training data and applied unchanged to test.
df_train['median_h'] = df_train[h_skew].median(axis=1)
df_test['median_h'] = df_test[h_skew].median(axis=1)

df_train['median_l'] = df_train[l_skew].median(axis=1)
df_test['median_l'] = df_test[l_skew].median(axis=1)

# Register the new columns exactly once so re-running the cell stays idempotent.
for derived in derived_cols:
    if derived not in features:
        features.append(derived)

In [None]:
# Independent copies of the model inputs, so later in-place scaling does not
# write back into df_train / df_test.
x_test = df_test.loc[:, features].copy()

y = df_train.loc[:, 'target'].copy()
X = df_train.loc[:, features].copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature using statistics learned from the training matrix,
# then apply that same fitted transform to both train and test.
scaler = StandardScaler()
scaler.fit(X[features])

X[features] = scaler.transform(X[features])
x_test[features] = scaler.transform(x_test[features])

### Let's build a simple Neural Network

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks

import tensorflow as tf
import random
import os

my_seed = 42

# Actually apply the seed: it was declared but never used, so weight
# initialisation, dropout masks and batch shuffling differed on every run.
# (GPU kernels may still introduce small run-to-run differences.)
random.seed(my_seed)
np.random.seed(my_seed)
tf.random.set_seed(my_seed)

input_shape = [X.shape[1]]   # one input unit per feature column
# Early-stopping settings. PATIENCE now holds the value the callback really
# uses (15); previously it was declared as 10 but the callback hard-coded 15.
PATIENCE = 15
MIN_DELTA = 0.0005

# Small fully connected binary classifier: two hidden Dense blocks, each followed
# by batch normalisation and 50% dropout, ending in a single sigmoid unit.
model = keras.Sequential([
    layers.BatchNormalization(input_shape=input_shape),
    layers.Dense(units=128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(units=64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['AUC'])

# Stop once validation loss has not improved by at least MIN_DELTA for PATIENCE
# consecutive epochs, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',   # Keras default, spelled out for clarity
    patience=PATIENCE,
    min_delta=MIN_DELTA,
    restore_best_weights=True,
)

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

### Training model 

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Stratified K-fold training: fit a freshly initialised network on every fold,
# score it on the held-out part and collect its test-set predictions.

EPOCHS = 700          # upper bound; early stopping normally ends training sooner
BATCH_SIZE = 2048
# NOTE(review): ACTIVATION and LEARNING_RATE are not referenced anywhere in the
# visible notebook (the network uses 'relu' and the default 'adam' settings);
# kept in case another cell relies on them.
ACTIVATION = 'swish'
LEARNING_RATE = 0.0007
RANDOM_SEED = 42
n_splits = 5

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

preds_test_nn = []    # one 1-D array of test predictions per fold
mean_auc = 0          # running sum of fold AUCs; divided by n_splits when printed

best_nn_model = None
best_roc_score_nn = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, so select rows with .iloc (not .loc).
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Start every fold from new weights. Re-fitting the single shared `model`
    # would carry weights over from earlier folds, whose training rows include
    # this fold's validation rows, which inflates the validation AUC.
    # Compile settings mirror the ones used for the template `model`.
    fold_model = keras.models.clone_model(model)
    fold_model.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['AUC'])

    fold_model.fit(X_train, y_train,
                   verbose=0,
                   validation_data=(X_val, y_val),
                   batch_size=BATCH_SIZE,
                   epochs=EPOCHS,
                   callbacks=[early_stopping],
                   shuffle=True)

    y_pred = fold_model.predict(X_val).ravel()
    score = roc_auc_score(y_val, y_pred)
    mean_auc += score

    # Each fold owns its model, so this keeps the best-scoring one.
    if score > best_roc_score_nn:
        best_roc_score_nn = score
        best_nn_model = fold_model

    print(f"Fold {fold}'s score: {score}")

    preds_test_nn.append(fold_model.predict(x_test).ravel())

print("==========================================")
print(f"Mean auc of all folds: {mean_auc / n_splits}")

In [None]:
# Final prediction per test row = average of the per-fold predictions.
submission = pd.DataFrame({
    'id': df_test['id'],
    'target': np.mean(preds_test_nn, axis=0),
})

In [None]:
# Write the predictions without the index column. The file needs the .csv
# extension to be picked up as a submission; it was previously saved as the
# extension-less 'submission'.
submission.to_csv('submission.csv', index=False)