In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import Pipeline
import gc

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Embedding,  Activation, Flatten, Conv1D
from tensorflow.keras.models import Model
from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import RMSprop
from tensorflow.keras import regularizers

from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from tensorflow import keras
from sklearn import metrics
from sklearn.impute import SimpleImputer

import math
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

import keras_tuner as kt
from tensorflow import keras

# Load Dataset

In [None]:
%%time
# Read the train and test CSV files into DataFrames; the paths are relative
# to the notebook's working directory. %%time reports how long the load takes.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test  = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# Split and Analyse Dataset

In [None]:
# Hold out 30% of the rows as a validation set and keep the remaining 70% for
# training. random_state pins the sample so the split (and every validation
# metric computed from it) is the same on each Restart & Run All.
# NOTE: this cell rebinds `train`; re-running it without reloading the data
# would carve another 30% out of the already-reduced frame.
validation = train.sample(frac=0.3, random_state=42)
train = train.drop(validation.index)

In [None]:
# Report the size and the `claim` class balance of each split, train first.
for _frame in (train, validation):
    print(_frame.shape)
    print(_frame.claim.value_counts())

# Preprocessing

In [None]:
# Add a per-row count of NaN cells as an extra column to every frame.
for _frame in (train, validation, test):
    _frame['missing'] = _frame.isna().sum(axis=1)

# Model inputs: every train column except the target and the row id
# (computed after 'missing' is added, so that column is included).
features = [name for name in train.columns if name not in ['claim', 'id']]

In [None]:
def preprocessor():
    """Impute, quantile-transform and bin the feature columns in place.

    Builds a three-step pipeline (median imputation -> QuantileTransformer
    with a uniform output distribution -> KBinsDiscretizer with 128
    uniform-width ordinal bins), fits it on ``train[features]`` only, and
    applies the fitted pipeline to ``test[features]`` and
    ``validation[features]``.

    Side effects:
        Overwrites the ``features`` columns of the module-level ``train``,
        ``test`` and ``validation`` DataFrames. Because the inputs are
        overwritten, calling this twice re-fits the pipeline on already
        binned values; call it exactly once per data load.

    Returns:
        None.
    """
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
        # random_state makes the transformer's internal random subsampling
        # (used when estimating quantiles on large inputs) reproducible.
        ("scaler", QuantileTransformer(n_quantiles=128,
                                       output_distribution='uniform',
                                       random_state=42)),
        ('bin', KBinsDiscretizer(n_bins=128, encode='ordinal', strategy='uniform')),
    ])

    # Fit on the training split only so no statistics leak from the
    # validation or test rows into the transform.
    train[features] = pipe.fit_transform(train[features])
    test[features] = pipe.transform(test[features])
    validation[features] = pipe.transform(validation[features])

# Modeling

In [None]:
def model(hp):
    """Build and compile a tunable embedding + MLP binary classifier.

    Architecture: integer-coded features -> shared Embedding (128 ids, 4
    dimensions each) -> Flatten -> ``num_layers`` stacked Dense(relu) + Dropout
    blocks -> single sigmoid output.

    Args:
        hp: Keras Tuner ``HyperParameters`` object. The search space is
            ``num_layers`` (1-10), and per layer ``units_<i>`` (16/32/64) and
            ``drop_<i>`` (0.2-0.5 in steps of 0.1), plus ``learning_rate``
            (1e-2/1e-3/1e-4) for Adam.

    Returns:
        A compiled ``Model`` using binary cross-entropy loss and an AUC metric
        named ``aucroc``.

    Note:
        The input width is read from the module-level ``train[features]``.
    """
    inputs = Input(train[features].shape[1:])

    # input_dim=128 must cover every integer id fed to the network; it matches
    # the 128 ordinal bins produced by preprocessor().
    embedded = Embedding(input_dim=128, output_dim=4)(inputs)
    hidden = Flatten()(embedded)

    # Feed each block the output of the previous one. Connecting every Dense
    # to the flattened embedding instead would leave only the last block in
    # the graph, making `num_layers` a no-op.
    for i in range(hp.Int('num_layers', 1, 10)):
        hidden = Dense(hp.Choice('units_' + str(i), [16, 32, 64]), activation='relu')(hidden)
        hidden = Dropout(hp.Float('drop_' + str(i), min_value=0.2, max_value=0.5, step=0.1))(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    net = Model(inputs=inputs, outputs=output)

    auc = tf.keras.metrics.AUC(name='aucroc')
    net.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(hp.Choice("learning_rate", [1e-2, 1e-3, 1e-4])),
        metrics=[auc],
    )

    return net


In [None]:
# Fit the preprocessing pipeline on train and overwrite the feature columns of
# train, test and validation in place. Run this cell only once per data load:
# a second run would re-fit on the already transformed values.
preprocessor()

In [None]:
# Feature matrix / target pairs for the training and validation splits.
x, y = train[features], train['claim']
xval, yval = validation[features], validation['claim']

# Reclaim memory before the hyper-parameter search starts.
gc.collect()

In [None]:
# Random search over the space defined in model().
# - objective: select trials on validation AUC (the metric compiled under the
#   name 'aucroc'), the same quantity the EarlyStopping callback monitors,
#   instead of validation loss.
# - seed: makes the sampled hyper-parameter combinations reproducible.
# - overwrite: start from a clean tuner directory so a re-run does not silently
#   reload trials left over from a previous session.
tuner = kt.RandomSearch(
    model,
    objective=kt.Objective('val_aucroc', direction='max'),
    max_trials=5,
    seed=42,
    overwrite=True,
)
tuner.search(x, y, epochs=5, validation_data=(xval, yval))

# get_best_models() returns models ordered best-first; keep the top one.
best_model = tuner.get_best_models()[0]

In [None]:
# Print the layer-by-layer architecture of the model selected by the tuner.
best_model.summary()

In [None]:
# Stop training after 5 epochs without a new best validation AUC and roll the
# weights back to the best epoch seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_aucroc',
    mode='max',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Continue training the tuned model with large batches, validating on the
# hold-out split each epoch; the early-stopping callback may end it before
# the 30-epoch limit.
history = best_model.fit(
    x,
    y,
    validation_data=(xval, yval),
    epochs=30,
    batch_size=1024,
    shuffle=True,
    callbacks=[callback],
)

In [None]:
# Draw the per-epoch loss and AUC curves for train and validation on one axis;
# each curve is labelled with its key in the Keras history dict.
for _curve in ('loss', 'val_loss', 'aucroc', 'val_aucroc'):
    plt.plot(history.history[_curve], label=_curve)
plt.legend()
plt.show()


In [None]:
# Score the preprocessed test features with the trained model.
predictions = best_model.predict(test[features])

# Assemble the submission frame (id -> predicted claim probability) and write
# it with `id` as the index column.
sub = pd.DataFrame({'id': test['id']})
sub['claim'] = predictions
sub = sub.set_index('id')
sub.to_csv('submission.csv')

In [None]:
# Preview the first rows of the submission as a sanity check.
sub.head()