In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def auc(y_true, y_pred):
    """ROC AUC metric usable in ``model.compile(metrics=[...])``.

    The score is computed by ``sklearn.metrics.roc_auc_score`` inside a
    ``tf.py_function`` wrapper.

    Parameters
    ----------
    y_true : tensor
        Ground-truth targets for the current batch.
    y_pred : tensor
        Model outputs for the current batch.

    Returns
    -------
    tensor of dtype ``tf.double``
        The AUC for the batch, or 0.5 when the score cannot be computed.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return metrics.roc_auc_score(y_true, y_pred)
        except ValueError:
            # roc_auc_score rejects batches it cannot score (for example a
            # batch containing a single class) with ValueError; report the
            # chance-level value instead. Only ValueError is caught so that
            # interrupts and genuine programming errors are not hidden.
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [None]:
def create_model(data, catcols):
    """Build an entity-embedding classifier with one input per categorical column.

    For every column in ``catcols`` the model gets a scalar input feeding an
    ``Embedding`` whose vocabulary size is the column's number of unique values
    plus one and whose width is ``min(ceil(nunique / 2), 50)``. The embeddings
    are concatenated and passed through two 300-unit ReLU blocks (each followed
    by dropout and batch normalisation) into a 2-unit softmax output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame used only to count the unique values of each column.
    catcols : sequence of str
        Column names; each name is also used as the embedding layer's name.
        NOTE(review): values are presumably integer codes in
        ``[0, nunique]`` so they fit the embedding table - confirm at caller.

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model expecting a list of inputs ordered like ``catcols``.

    Raises
    ------
    ValueError
        If ``catcols`` is empty.
    """
    if len(catcols) == 0:
        raise ValueError("catcols must contain at least one column")

    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil(num_unique_values / 2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        # Drop the length-1 sequence axis so the embeddings can be concatenated.
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)

    # With a single column there is nothing to concatenate; use the lone
    # embedding directly instead of calling Concatenate on one tensor.
    x = layers.Concatenate()(outputs) if len(outputs) > 1 else outputs[0]
    x = layers.BatchNormalization()(x)

    # Two identical hidden blocks: Dense -> Dropout -> BatchNormalization.
    for _ in range(2):
        x = layers.Dense(300, activation="relu")(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BatchNormalization()(x)

    y = layers.Dense(2, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=y)
    return model

In [None]:
# Read the train, test and sample-submission CSV files from the relative
# ../input directory. `train` and `test` are re-bound by later cells.
train = pd.read_csv("../input/jobathon-analytics-vidhya/train.csv")
test = pd.read_csv("../input/jobathon-analytics-vidhya/test.csv")
sample = pd.read_csv("../input/jobathon-analytics-vidhya/sample_submission.csv")

In [None]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorator that prints a one-line report after each pipeline step.

    The report contains the wrapped function's name, the ``shape`` attribute
    of its return value (``None`` when the value has no ``shape``) and the
    wall-clock time of the call in seconds.

    Parameters
    ----------
    func : callable
        The step to wrap.

    Returns
    -------
    callable
        A wrapper with the same signature and metadata as ``func`` that
        returns ``func``'s result unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        elapsed = (dt.datetime.now() - tic).total_seconds()
        # A step is not required to return a DataFrame/array; never fail the
        # pipeline just because the result cannot report a shape.
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape={shape} took {elapsed:.3f}s")
        return result
    return wrapper

In [None]:
@log_step
def encoding(data):
    """
    Label-encode and then one-hot encode the low-cardinality categorical columns.

    Each listed column is first mapped to integer codes with a
    ``LabelEncoder`` and then expanded by ``pd.get_dummies``, so the indicator
    columns are named ``<column>_<code>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four columns listed in ``var_mod``. The columns
        are overwritten in place before the one-hot expansion.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pd.get_dummies`` with the four columns
        replaced by their indicator columns.

    Notes
    -----
    NOTE(review): the encoders are fitted on whatever frame is passed in, so
    codes produced by separate calls agree only when the frames contain the
    same set of values in each column - confirm for train vs. test.
    """
    # Single source of truth for the columns handled by both encoding passes.
    var_mod = ['Accomodation_Type', 'Reco_Insurance_Type', 'Is_Spouse', 'Holding_Policy_Duration']

    for col in var_mod:
        # One fit per column; every column is encoded exactly once.
        data[col] = LabelEncoder().fit_transform(data[col])

    # One Hot Encoding of the integer codes produced above.
    data = pd.get_dummies(data, columns=var_mod)

    return data

In [None]:
@log_step
def preprocess(data):
    """Cast the policy-type, policy-category and region-code columns to ``str``.

    The three columns are overwritten on ``data`` itself, which is then
    returned so the function can be chained with ``DataFrame.pipe``.
    """
    string_columns = ('Holding_Policy_Type', 'Reco_Policy_Cat', 'Region_Code')
    for column in string_columns:
        data[column] = data[column].astype(str)
    return data

In [None]:
@log_step
def impute(data):
    """Replace missing holding-policy values with explicit placeholders.

    ``Holding_Policy_Duration`` gaps become the string form of ``0.0`` and
    ``Holding_Policy_Type`` gaps become ``'no_policies'``. Both columns are
    overwritten on ``data`` itself, which is then returned.

    'Health Indicator' is not imputed here (a mode-based fill was tried and
    left disabled).
    """
    placeholders = {
        'Holding_Policy_Duration': str(0.0),
        'Holding_Policy_Type': 'no_policies',
    }
    for column, placeholder in placeholders.items():
        data[column] = data[column].fillna(placeholder)
    return data

In [None]:
@log_step
def start_pipeline(dataf):
    """Return a copy of ``dataf`` for the following steps to work on."""
    working_copy = dataf.copy()
    return working_copy

In [None]:
# Run the training frame through the steps in order:
# copy -> impute -> cast to str -> label / one-hot encode.
train = encoding(preprocess(impute(start_pipeline(train))))

In [None]:
# Run the test frame through the steps in order:
# copy -> impute -> cast to str -> label / one-hot encode.
test = encoding(preprocess(impute(start_pipeline(test))))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder on the values of both frames so that a given category gets
# the same integer code in train and test. Re-fitting on test would assign
# codes from test's own set of values, which need not match train's.
le = LabelEncoder()
# Missing entries become an explicit category; casting to str keeps the
# encoder's input homogeneous.
health_train = train['Health Indicator'].fillna('missing').astype(str)
health_test = test['Health Indicator'].fillna('missing').astype(str)
le.fit(pd.concat([health_train, health_test], ignore_index=True))
train['Health Indicator'] = le.transform(health_train)
test['Health Indicator'] = le.transform(health_test)


In [None]:
# Replace the space in the column name with an underscore in both frames.
health_rename = {'Health Indicator': 'Health_Indicator'}
train = train.rename(columns=health_rename)
test = test.rename(columns=health_rename)

In [None]:
# Tag the test rows with a sentinel target so they can be separated again
# after the joint encoding below.
test["Response"] = -1
data = pd.concat([train, test]).reset_index(drop=True)

# Every training column except the row id and the target is a feature.
non_feature_cols = ("ID", "Response")
features = [col for col in train.columns if col not in non_feature_cols]

# Encode each feature over train + test together: missing values become the
# string "-1", everything is cast to str, then mapped to integer codes.
for feat in features:
    as_text = data[feat].fillna("-1").astype(str).values
    data[feat] = preprocessing.LabelEncoder().fit_transform(as_text)

In [None]:
# Separate the jointly encoded frame again using the -1 sentinel in Response.
train = data[data.Response != -1].reset_index(drop=True)
test = data[data.Response == -1].reset_index(drop=True)
# The model takes one array per feature column. Materialise the feature
# matrix once and slice it, instead of rebuilding it for every column.
test_matrix = test.loc[:, features].values
test_data = [test_matrix[:, k] for k in range(test_matrix.shape[1])]

In [None]:
# Number of stratified cross-validation folds (one model is trained per fold).
N_SPLITS = 50

# Out-of-fold predictions for the training rows, and the running SUM of the
# per-fold predictions for the test rows (it still has to be divided by the
# number of folds to obtain the average).
oof_preds = np.zeros((len(train)))
test_preds = np.zeros((len(test)))

skf = StratifiedKFold(n_splits=N_SPLITS)
for train_index, test_index in skf.split(train, train.Response.values):
    # Inside this loop the "*_test" names refer to the held-out validation
    # fold of `train`, not to the competition test set.
    X_train = train.iloc[train_index, :].reset_index(drop=True)
    X_test = train.iloc[test_index, :].reset_index(drop=True)
    y_train, y_test = X_train.Response.values, X_test.Response.values

    model = create_model(data, features)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

    # The model takes one array per feature column. Materialise each feature
    # matrix once and slice it, instead of rebuilding it for every column.
    train_matrix = X_train.loc[:, features].values
    valid_matrix = X_test.loc[:, features].values
    X_train = [train_matrix[:, k] for k in range(train_matrix.shape[1])]
    X_test = [valid_matrix[:, k] for k in range(valid_matrix.shape[1])]

    # Both callbacks watch the validation value of the `auc` metric.
    es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                                 verbose=1, mode='max', baseline=None, restore_best_weights=True)

    rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                      patience=3, min_lr=1e-6, mode='max', verbose=1)

    # Targets are one-hot encoded to match the 2-unit softmax output.
    history = model.fit(X_train,
                  utils.to_categorical(y_train),
                  validation_data=(X_test, utils.to_categorical(y_test)),
                  verbose=1,
                  batch_size=2048,
                  callbacks=[es, rlr],
                  epochs=20
                 )

    # Column 1 of the softmax output is the score for class 1.
    valid_fold_preds = model.predict(X_test)[:, 1]
    test_fold_preds = model.predict(test_data)[:, 1]

    oof_preds[test_index] = valid_fold_preds.ravel()
    test_preds += test_fold_preds.ravel()

    print(metrics.roc_auc_score(y_test, valid_fold_preds))
    # Release the Keras graph/state of this fold before building the next model.
    K.clear_session()

In [None]:
# Score the out-of-fold predictions against the true training targets.
overall_auc = metrics.roc_auc_score(train.Response.values, oof_preds)
print("Overall AUC={}".format(overall_auc))

In [None]:
# `test_preds` holds the SUM of the per-fold predictions. Average it into a new
# array (not in place) so that re-running this cell does not divide twice, and
# take the fold count from the splitter instead of repeating the literal.
mean_test_preds = test_preds / skf.get_n_splits()
test_ids = test.ID.values
print("Saving submission file")
submission = pd.DataFrame.from_dict({
    'ID': test_ids,
    'Response': mean_test_preds
})
submission.to_csv("submission.csv", index=False)

In [None]:
pip install hiplot

In [None]:
import hiplot as hip

In [None]:



# One row per epoch actually run by the most recent `model.fit` call. The
# epoch count is read from the history because early stopping can end
# training after a different number of epochs each time.
# NOTE: this re-binds `data`, which earlier cells use for the combined
# DataFrame; re-run those cells before training again.
n_epochs = len(history.history['loss'])
data = [{'epoch': idx,
         'loss': np.float64(history.history['loss'][idx]),
         'val_loss': np.float64(history.history['val_loss'][idx]),
         'AUC': np.float64(history.history['val_auc'][idx]),
         }
        for idx in range(n_epochs)]



In [None]:
# Display the per-epoch rows built in the previous cell.
data

In [None]:
# Build a HiPlot experiment from the per-epoch rows and show it in the notebook.
hip.Experiment.from_iterable(data).display()