In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from skopt.space import Categorical, Integer, Real

seed = 123
# Feature flags read by later cells.
use_pretrained = False  # when True, train/test/y are re-read from the /kaggle/input/jobathon dataset
use_pca = False         # when True, the PCA sweep cell is executed

# Seed NumPy and TensorFlow with the same value that is passed to
# train_test_split, so weight initialisation and dropout are repeatable
# between runs instead of only the data split.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:

# Print the full path of every file found below /kaggle/input.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Read the competition files, give the health-indicator column an
# underscore name, and drop the ID column from both feature frames.
column_renames = {'Health Indicator': 'Health_Indicator'}

train = (
    pd.read_csv("../input/jobathon-analytics-vidhya/train.csv")
    .rename(columns=column_renames)
    .drop(columns=["ID"])
)
test = (
    pd.read_csv("../input/jobathon-analytics-vidhya/test.csv")
    .rename(columns=column_renames)
    .drop(columns=["ID"])
)
sample = pd.read_csv("../input/jobathon-analytics-vidhya/sample_submission.csv")
train.head()

In [None]:
# Optional path: when use_pretrained is set, the train/test frames loaded
# above are replaced by CSVs from the /kaggle/input/jobathon dataset.
if use_pretrained:
    train = pd.read_csv('/kaggle/input/jobathon/x.csv')
    test = pd.read_csv('/kaggle/input/jobathon/test.csv')
    # NOTE(review): y is not referenced by any other cell visible in this notebook.
    y = pd.read_csv('/kaggle/input/jobathon/y.csv')

In [None]:
# Preview the first rows of the training frame that the following cells use.
train.head()

In [None]:
# Columns that are fed to the network through embedding layers
# (create_mlp builds one embedding per entry, preproc integer-encodes them).
cat_cols = ['City_Code', 'Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type', 'Upper_Age', 'Lower_Age', 'Is_Spouse', 'Health_Indicator', 
            'Holding_Policy_Duration',  'Holding_Policy_Type', 'Reco_Policy_Cat']
# The single column that is standardised and fed in as a plain numeric input.
numeric_col = 'Reco_Policy_Premium'
# Name of the label column in the training frame.
target = 'Response'

In [None]:
# Standardise the numeric feature.
# The scaler is fitted on the training data only; the test data is then
# transformed with those same fitted statistics. Re-fitting on the test set
# would standardise it with its own mean/std, putting train and test on
# different scales.
scalar = StandardScaler()
train[numeric_col] = scalar.fit_transform(train[numeric_col].values.reshape((-1, 1)))
test[numeric_col] = scalar.transform(test[numeric_col].values.reshape((-1, 1)))

In [None]:
# Show how many distinct values each categorical column has in train.
for col_name, distinct_count in train[cat_cols].nunique().items():
    print(f' {col_name} --> {distinct_count}')

In [None]:
# preparing embedding inputs layers
# Module-level views of the input tensors and per-feature branch outputs of
# the most recently built model. create_mlp replaces their contents on every
# call; the model itself is built from function-local lists.
inputs = []
models = []

def create_mlp(train, cat_cols):
    """Build and compile the embedding + dense binary classifier.

    One branch is created per column in ``cat_cols``: a single-value input,
    an Embedding with ``nunique + 1`` rows and ``min(ceil(nunique / 2), 50)``
    columns, spatial dropout, and a reshape to a flat vector. A further
    branch takes the single numeric input named 'Reco_Policy_Premium'
    through two dense layers. All branches are concatenated and passed
    through three dense/dropout stages to a one-unit sigmoid output.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame used only to count the distinct values of each categorical
        column.
    cat_cols : iterable of str
        Categorical column names; the model expects its inputs in this
        order, followed by the numeric input.

    Returns
    -------
    tf.keras.Model
        Model compiled with binary cross-entropy, Adam and an AUC metric
        explicitly named 'auc'.
    """
    # Local lists: building into the shared module-level lists made every
    # call after the first stack its layers on top of the previous model's.
    branch_inputs = []
    branch_outputs = []

    for col in cat_cols:
        num_of_unique = int(train[col].nunique())
        embedding_size = int(min(np.ceil(num_of_unique/2), 50))
        print(f'{col} unique_value --> {num_of_unique}')
        print(f'{col} embedding size {embedding_size}')

        cat_in = tf.keras.layers.Input(shape=(1,))
        x = tf.keras.layers.Embedding(num_of_unique + 1, embedding_size, name=col)(cat_in)
        x = tf.keras.layers.SpatialDropout1D(0.3)(x)
        out = tf.keras.layers.Reshape(target_shape=(embedding_size,))(x)
        branch_inputs.append(cat_in)
        branch_outputs.append(out)

    numeric_in = tf.keras.layers.Input(shape=(1,), name='Reco_Policy_Premium')
    out = tf.keras.layers.Dense(2048, activation='relu')(numeric_in)
    out = tf.keras.layers.Dense(1024, activation='relu')(out)
    branch_inputs.append(numeric_in)
    branch_outputs.append(out)

    # dense layers
    merged = tf.keras.layers.Concatenate()(branch_outputs)

    x = tf.keras.layers.Dense(1024, activation='relu')(merged)
    x = tf.keras.layers.Dropout(.35)(x)
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = tf.keras.layers.Dropout(.15)(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(.15)(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(branch_inputs, output)
    # The metric name is pinned so that logs are always keyed 'auc' /
    # 'val_auc', which is what the training callbacks and the history plot
    # look up, regardless of how many models were built before this one.
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    print('model compiled')

    # Publish this model's tensors through the module-level lists.
    inputs[:] = branch_inputs
    models[:] = branch_outputs
    return model

model = create_mlp(train, cat_cols)

In [None]:
# Stratified 80/20 split of the training frame into fit and validation parts.
train_x, test_x, train_y, test_y = train_test_split(
    train.drop(columns=[target]),
    train[target],
    test_size=0.2,
    random_state=seed,
    stratify=train[target],
)
train_x.shape, test.shape

In [None]:
# Draw the layer graph of the model built by create_mlp.
tf.keras.utils.plot_model(model)

In [None]:
# Optional PCA exploration, executed only when use_pca is set.
if use_pca:
    # Report the cumulative explained variance for 20, 25, ..., 75 components.
    for n_components in range(20, 76, 5):
        pca = PCA(n_components=n_components, svd_solver='full')
        pca.fit_transform(train)
        explained_pct = pca.explained_variance_ratio_.sum() * 100
        print('components: ', n_components, 'explained variance: ', explained_pct)

    # Final projection onto 75 components.
    pca = PCA(n_components=75, svd_solver='full')
    transformed = pca.fit_transform(train)

In [None]:
#converting data to list format to match the network structure
def preproc(X_train, X_val, X_test, categorical_cols=None, numeric_column=None):
    """Convert three frames into per-feature array lists for a multi-input model.

    For every categorical column the distinct values of ``X_train`` are
    numbered 0, 1, 2, ... in order of first appearance, and that numbering
    is applied to all three frames. The numeric column is appended last,
    unchanged.

    Values of ``X_val`` / ``X_test`` that do not occur in ``X_train`` are
    encoded as 0, i.e. the same code as the first training category.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Frames containing the categorical columns and the numeric column.
    categorical_cols : sequence of str, optional
        Columns to integer-encode. Defaults to the module-level ``cat_cols``.
    numeric_column : str, optional
        Column passed through as-is. Defaults to the module-level
        ``numeric_col``.

    Returns
    -------
    tuple of (list, list, list)
        One list per input frame, each holding one array per categorical
        column (in the given order) followed by the numeric column's array.
    """
    # Fall back to the notebook-level configuration when not given explicitly.
    if categorical_cols is None:
        categorical_cols = cat_cols
    if numeric_column is None:
        numeric_column = numeric_col

    input_list_train = []
    input_list_val = []
    input_list_test = []

    #the cols to be embedded: rescaling to range [0, # values)
    for c in categorical_cols:
        val_map = {raw: code for code, raw in enumerate(X_train[c].unique())}
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)

    #the numeric column goes last
    input_list_train.append(X_train[numeric_column].values)
    input_list_val.append(X_val[numeric_column].values)
    input_list_test.append(X_test[numeric_column].values)

    return input_list_train, input_list_val, input_list_test



In [None]:
# Replace the three frames with the per-feature array lists returned by preproc.
# NOTE: train_x, test_x and test are rebound from DataFrames to lists here, so
# running this cell a second time without re-running the load and split cells
# hands those lists back to preproc instead of DataFrames.
train_x, test_x, test = preproc(train_x, test_x, test)

In [None]:
# Training configuration for the hold-out run.
# `epochs` now holds the value that is actually passed to fit (500); the
# variable was previously set to 200 and then ignored.
epochs = 500      # upper bound; the EarlyStopping callback can end training sooner
batch_size = 512

# All three callbacks watch the validation AUC, where higher is better.
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=20,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint('./model.h5', monitor='val_auc', verbose=1, save_best_only=True, mode='max')
reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=20, min_lr=1e-6, mode='max', verbose=1)

history = model.fit(train_x, train_y.values,
                    validation_data=(test_x, test_y.values),
                    callbacks=[earlyStopping, reduce_lr_loss, checkpoint],
                    batch_size=batch_size,
                    epochs=epochs)

In [None]:
# Reload the model from the file written by the ModelCheckpoint callback
# (save_best_only=True, monitored on val_auc) during training.
model = tf.keras.models.load_model('./model.h5')


In [None]:
# Training loss and training AUC per epoch, drawn on one set of axes.
fig, ax = plt.subplots()
for curve in ('loss', 'auc'):
    ax.plot(history.history[curve])
ax.set_title('model auc')
ax.set_ylabel('auc')
ax.set_xlabel('epoch')
ax.legend(['loss', 'auc'], loc='upper left')
plt.show()

In [None]:
# Stratified K-fold cross-validation of the embedding network.
#
# The folds are drawn from the arrays that preproc already produced: train_x
# and test_x are joined back together, so every fold and the test arrays
# share the single category-to-index mapping built by preproc. The hold-out
# `model` and the `test` arrays used by the submission cell are not rebound.
n_folds = 50

cv_inputs = [np.concatenate([fit_part, held_part]) for fit_part, held_part in zip(train_x, test_x)]
cv_labels = np.concatenate([train_y.values, test_y.values])
# Row labels of `train` in the order used by cv_inputs and oof_preds
# (rows of train_x first, then rows of test_x).
oof_index = train_y.index.append(test_y.index)

oof_preds = np.zeros(len(cv_labels))
# `test` is the list of per-feature arrays from preproc, so its row count is
# the length of any one of those arrays.
test_preds = np.zeros(len(test[0]))

skf = StratifiedKFold(n_splits=n_folds)
# Only the labels determine the stratified folds; X is a placeholder of the right length.
for fit_idx, valid_idx in skf.split(np.zeros(len(cv_labels)), cv_labels):
    # Reset Keras' global state so layer/metric naming starts fresh for each
    # fold; the callbacks below look the metric up as 'val_auc'.
    tf.keras.backend.clear_session()
    # Empty the module-level layer lists used by create_mlp so this fold's
    # model is built with exactly one input per feature.
    inputs.clear()
    models.clear()
    fold_model = create_mlp(train, cat_cols)

    fit_x = [column[fit_idx] for column in cv_inputs]
    valid_x = [column[valid_idx] for column in cv_inputs]
    fit_y, valid_y = cv_labels[fit_idx], cv_labels[valid_idx]

    earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=20,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)
    # Separate file so the checkpoint of the hold-out run (./model.h5) is not overwritten.
    checkpoint = tf.keras.callbacks.ModelCheckpoint('./model_cv.h5', monitor='val_auc', verbose=1, save_best_only=True, mode='max')
    reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=13, min_lr=1e-6, mode='max', verbose=1)

    # The network ends in a single sigmoid unit, so the labels are passed as
    # a plain 0/1 vector rather than one-hot encoded.
    fold_model.fit(fit_x,
                   fit_y,
                   validation_data=(valid_x, valid_y),
                   verbose=1,
                   batch_size=1024,
                   callbacks=[earlyStopping, checkpoint, reduce_lr_loss],
                   epochs=100
                  )

    # predict returns one column of probabilities; flatten it to 1-D.
    valid_fold_preds = fold_model.predict(valid_x).ravel()
    oof_preds[valid_idx] = valid_fold_preds
    # Accumulate the mean over folds instead of a running sum.
    test_preds += fold_model.predict(test).ravel() / n_folds
    print(roc_auc_score(valid_y, valid_fold_preds))

In [None]:
# Assemble the submission: IDs are taken from an earlier submission file,
# responses are the current model's predictions flattened to one dimension.
test_ = pd.read_csv('../input/jobathon/tabnet4_2021-02-27_0.703923581379631.csv')
sub = pd.DataFrame({
    'ID': test_.ID,
    'Response': model.predict(test).reshape(-1,),
})
sub.head()

In [None]:
# Write the submission frame to nn_sub5.csv without the index column.
sub.to_csv('nn_sub5.csv', index=False)