In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras import backend as K


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

from tensorflow.python.client import device_lib
# Show which compute devices TensorFlow reports (a list of DeviceAttributes).
local_devices = device_lib.list_local_devices()
print(local_devices)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training table and the unlabelled table to predict on.
train, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/health-insurance-cross-sell-prediction/train.csv',
        '/kaggle/input/health-insurance-cross-sell-prediction/test.csv',
    )
)





In [None]:
# Data-quality report: total missing cells and number of repeated ids per table.
print('# of missing values in train: %s' % train.isnull().sum().sum())
print('# of missing values in submit: %s' % submit.isnull().sum().sum())

# Duplicates = total rows minus distinct ids. (The operands used to be the
# other way round, which reported duplicates as a negative number.)
print('# of duplicate values in train: %s' % (train.id.shape[0] - train.id.unique().shape[0]))
print('# of duplicate values in submit: %s' % (submit.id.shape[0] - submit.id.unique().shape[0]))





In [None]:
def print_mean_response(dimension):
    """Print the mean of `Response` for each distinct value of `dimension`.

    Reads the notebook-level `train` DataFrame; `dimension` is the column
    name (or list of names) to group by. Returns None.
    """
    grouped = train.groupby(dimension)
    print(grouped[['Response']].mean())

def plot_mean_response(dimension):
    """Scatter-plot the mean of `Response` for each value of `dimension`.

    Reads the notebook-level `train` DataFrame and draws on a new figure.

    Parameters
    ----------
    dimension : str
        Name of the `train` column to group by.

    Returns
    -------
    None
    """
    response = train.groupby(dimension)[['Response']].mean()

    # Explicit figure/axes so the plot never lands on a previously active axes.
    fig, ax = plt.subplots()
    ax.plot(response.index, response['Response'], 'o')

    # Label the figure so it can be read on its own.
    ax.set(
        xlabel=dimension,
        ylabel='Mean Response',
        title='Mean Response by %s' % dimension,
    )
    return

def plot_distribution(dimension):
    """Overlay kernel-density curves of `dimension` for Response=0 and Response=1.

    Reads the notebook-level `train` DataFrame and draws both curves on one
    new figure (dark blue for Response=0, red for Response=1).

    Parameters
    ----------
    dimension : str
        Name of a numeric `train` column.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots()

    # `sns.distplot` is deprecated; with hist=False it only drew a KDE curve,
    # so `sns.kdeplot` is the direct replacement. The former `bins` and
    # `hist_kws` arguments had no effect without a histogram and are dropped.
    for response_value, curve_color in ((0, 'darkblue'), (1, 'red')):
        values = train.loc[train.Response == response_value, dimension]
        sns.kdeplot(
            values,
            color=curve_color,
            linewidth=4,
            label='%s when Response=%s' % (dimension, response_value),
            ax=ax,
        )

    ax.set(xlabel=dimension, ylabel='Density')
    # The curves carry labels; draw the legend explicitly so they are visible.
    ax.legend()
    
def _category_percentages(frame, dimension):
    """Return a frame with one row per `dimension` value and its share (in %) of `frame`."""
    counts = frame.groupby(dimension).size()
    return (counts / counts.sum() * 100).reset_index(name='frequencies')


def plot_hist(dimension):
    """Draw side-by-side bar charts of `dimension` frequencies per Response class.

    The left panel shows, for rows of the notebook-level `train` frame with
    Response == 0, the percentage of rows taking each value of `dimension`;
    the right panel shows the same for Response == 1.

    Parameters
    ----------
    dimension : str
        Name of a categorical `train` column.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(1, 2)

    neg = train[train.Response == 0][dimension].to_frame()
    pos = train[train.Response == 1][dimension].to_frame()

    neg_percentages = _category_percentages(neg, dimension)
    pos_percentages = _category_percentages(pos, dimension)

    sns.barplot(x=neg_percentages[dimension], y=neg_percentages['frequencies'], ax=ax[0])
    sns.barplot(x=pos_percentages[dimension], y=pos_percentages['frequencies'], ax=ax[1])

    ax[0].set(xlabel='%s (Response = 0)' % dimension, ylabel='Frequencies (%)')
    ax[1].set(xlabel='%s (Response = 1)' % dimension, ylabel='Frequencies (%)')

    # Lay out only after bars and labels exist, so the spacing accounts for them.
    fig.tight_layout()
    # plt.show() instead of fig.show(): the latter warns on non-interactive
    # (inline notebook) backends.
    plt.show()

    return
    
    
# plot_distribution('Age')
# plot_distribution('Vintage')
# plot_distribution('Annual_Premium')

# plot_hist('Gender')
# plot_hist('Driving_License')
# plot_hist('Previously_Insured')
# plot_hist('Vehicle_Age')
# plot_hist('Vehicle_Damage')


# Share of training rows in each Response class (class balance).
rows_per_response = train.groupby('Response')['id'].count()
print(rows_per_response / train.shape[0])






In [None]:
def preprocess(X):
    """Turn a raw feature frame into a model-ready, one-hot encoded frame.

    Drops the `Vintage` and `Annual_Premium` columns, then one-hot encodes
    the categorical columns listed below with `pd.get_dummies`. Columns not
    listed (e.g. `Age`) are passed through unchanged. The input frame is not
    modified; a new DataFrame is returned.
    """
    unused_columns = ['Vintage', 'Annual_Premium']
    categorical_columns = [
        'Gender',
        'Driving_License',
        'Region_Code',
        'Previously_Insured',
        'Vehicle_Age',
        'Vehicle_Damage',
        'Policy_Sales_Channel',
    ]
    return pd.get_dummies(X.drop(columns=unused_columns), columns=categorical_columns)
    
# construct X_train




# Fixed seed so the train/dev/test partition is identical on every
# "Restart & Run All".
SPLIT_SEED = 42

# Feature columns are taken by position (columns 1..10, skipping column 0).
# NOTE(review): assumes column 0 is `id` and the label is not among columns
# 1..10 — confirm against the CSV schema.
X_raw = train.iloc[:, 1:11]
train_rows = train.shape[0]
y = train[['Response']]
X_submit_raw = submit.iloc[:, 1:11]

# Encode train and submit together so both end up with the same dummy columns,
# then cast to float32. Depending on the pandas version, get_dummies may emit
# bool columns; next to the integer `Age` column that gives a mixed-dtype frame,
# so one homogeneous numeric dtype is enforced here before modelling.
preprocessed = preprocess(pd.concat([X_raw, X_submit_raw])).astype('float32')

# Split the combined frame back by position: train rows first, then submit rows.
X = preprocessed.iloc[:train_rows, :]
X_submit = preprocessed.iloc[train_rows:, :]
assert X.shape[0] == y.shape[0], 'feature/label row counts diverged'
assert X_submit.shape[0] == submit.shape[0], 'submit rows lost during preprocessing'

# 95% train, 2.5% dev, 2.5% test. Stratify on the label so each small
# partition keeps the overall Response ratio.
X_train, X_dev_test, y_train, y_dev_test = train_test_split(
    X, y, test_size=0.05, random_state=SPLIT_SEED, stratify=y)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_dev_test, y_dev_test, test_size=0.5, random_state=SPLIT_SEED, stratify=y_dev_test)


print('Train size: %s' % X_train.shape[0])
print('Dev size: %s' % X_dev.shape[0])
print('Test size: %s' % X_test.shape[0])


In [None]:
from keras.callbacks import Callback

#from s

class Metrics(tf.keras.callbacks.Callback):
    """Keras callback that reports F1, precision and recall on a held-out set.

    After every epoch the model predicts on the validation features, the
    probabilities are thresholded at 0.5, and the three scores are printed
    and appended to `val_f1s`, `val_precisions` and `val_recalls`.

    The base class is taken from `tf.keras` so that it comes from the same
    Keras implementation as the `tf.keras` model it is attached to.

    Parameters
    ----------
    validation : tuple
        `(features, targets)` pair used for scoring.
    """

    def __init__(self, validation):
        super(Metrics, self).__init__()
        self.validation = validation

        print('validation shape', len(self.validation[0]))

    def on_train_begin(self, logs=None):
        # Reset the per-epoch history at the start of every fit() call.
        # (`logs=None` replaces the shared mutable default `logs={}`.)
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        val_features, val_targ = self.validation

        # verbose=0 avoids a prediction progress bar between training epochs.
        val_prob = np.asarray(self.model.predict(val_features, verbose=0))
        # Same 0.5 decision threshold as the rest of the notebook.
        val_predict = np.greater(val_prob, 0.5).astype(int).ravel()
        val_targ = np.asarray(val_targ).ravel()

        val_f1 = f1_score(val_targ, val_predict)
        val_recall = recall_score(val_targ, val_predict)
        val_precision = precision_score(val_targ, val_predict)

        self.val_f1s.append(round(val_f1, 6))
        self.val_recalls.append(round(val_recall, 6))
        self.val_precisions.append(round(val_precision, 6))

        # Also expose the scores through the epoch `logs` dict when one is given.
        if logs is not None:
            logs['val_f1'] = val_f1
            logs['val_recall'] = val_recall
            logs['val_precision'] = val_precision

        print(f' — val_f1: {val_f1} — val_precision: {val_precision}, — val_recall: {val_recall}')



In [None]:


def plot_confusion_matrix(y_real, y_pred):
    """Render the confusion matrix of `y_real` vs `y_pred` as an annotated heatmap.

    Rows are true labels and columns are predicted labels, as produced by
    `confusion_matrix`. Returns None.
    """
    matrix = confusion_matrix(y_real, y_pred)

    axes = plt.subplot()
    # fmt='g' prints the counts without scientific notation.
    sns.heatmap(matrix, annot=True, fmt='g', ax=axes)
    axes.set(xlabel='Predicted labels', ylabel='True labels')
    
    
# build neural network

# One hidden layer with dropout, followed by a single sigmoid output unit.
hidden_layers = [
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
]
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
model = tf.keras.models.Sequential(hidden_layers + [output_layer])

# Weight errors on class 1 eight times as heavily as errors on class 0.
class_weight = {0: 1, 1: 8}

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss='binary_crossentropy',
    metrics=[],
)

# The dev split serves both as Keras validation data and as the scoring set
# for the Metrics callback.
dev_data = (X_dev, y_dev)
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=30,
    class_weight=class_weight,
    validation_data=dev_data,
    callbacks=[Metrics(validation=dev_data)],
)

# Threshold dev-set probabilities at 0.5 and show the confusion matrix.
y_dev_predict = (model.predict(X_dev) > 0.5).astype(int)
plot_confusion_matrix(y_dev, y_dev_predict)






In [None]:
print('Evaluate model on test set')
model.evaluate(X_test, y_test, verbose=1)

# Hard 0/1 predictions at the 0.5 threshold.
y_test_pred = (model.predict(X_test) > 0.5).astype(int)

# Scores are computed in the order F1, recall, precision on the test split.
val_f1, val_recall, val_precision = (
    score_fn(y_test, y_test_pred)
    for score_fn in (f1_score, recall_score, precision_score)
)

print('F1 %s recall %s precision %s' % (val_f1, val_recall, val_precision))



In [None]:
# Predict on the submission rows and attach the 0/1 prediction to each id.
predictions = (model.predict(X_submit) > 0.5).astype(int)

pred_df = pd.DataFrame({'prediction': predictions.ravel()})
ids = submit[['id']]
result = pd.concat([ids, pred_df], axis=1)

# Sanity check: how many ids fall into each predicted class.
prediction_counts = result.groupby('prediction').count()
print(prediction_counts)


In [None]:
# index=False: without it pandas writes the row index as an extra unnamed
# first column, so the file would not contain just `id` and `prediction`.
result.to_csv('submission.csv', index=False)