In [None]:
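# Kaggle problem: https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction
# NOTE: `np` and `hist` used in later cells are never imported explicitly; they come from the `%pylab inline` magic below.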
%pylab inline

In [None]:
import time
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Dense, Activation, Layer, Flatten

In [None]:
# config
# Feature columns, in the order they appear in the samples once "id" has been
# removed. The integer values are not read anywhere in this notebook; only the
# key order (FEATURE_NAMES) and the number of keys are used.
FEATURES = {
    "Gender": 0,
    "Age": 1,
    "Driving_License": 2,
    "Region_Code": 3,
    "Previously_Insured": 4,
    "Vehicle_Age": 5,
    "Vehicle_Damage": 6,
    "Annual_Premium": 7,
    "Policy_Sales_Channel": 8,
    "Vintage": 9
}
DIM = 128  # embedding size of every feature
FEATURE_NAMES = list(FEATURES.keys())
FEATURE_SIZES = [] # need compute after features encoding
DEEP_DENSE_OUT_DIM = [128, 64, 32, 1]  # units of each dense layer of the deep part
BATCH_SIZE = 32

LR = 0.001
optimizer = tf.keras.optimizers.Adam(LR)
# The model outputs probabilities, so plain Accuracy (exact equality between
# prediction and label) would be meaningless; BinaryAccuracy thresholds the
# prediction before comparing it with the label.
metric = tf.keras.metrics.BinaryAccuracy()

In [None]:
# DeepFM model
class SingleBiasLayer(Layer):
    """Layer that adds one trainable scalar bias to every element of its input."""

    def __init__(self, *args, **kwargs):
        super(SingleBiasLayer, self).__init__(*args, **kwargs)

    def build(self, units):
        """
        Create the bias weight.
            @units: whatever Keras hands to `build` (the input shape); unused,
                    because the bias always has shape (1, )
        """
        # one single bias value add to batch so shape is (1, )
        # `name` is passed by keyword: the first positional parameter of
        # `add_weight` is `name` in older Keras but `shape` in Keras 3, so a
        # positional 'bias' would clash with `shape=` there.
        self.bias = self.add_weight(name='bias',
                                    shape=(1, ),
                                    initializer='random_normal',
                                    trainable=True)

    def call(self, X):
        """
        Add the same single bias value to each x
            @X: (batch_size, )
        """
        return X + self.bias


class DeepFM(keras.Model):
    """DeepFM model over integer-encoded categorical features.

    The output is sigmoid(fm_part + deep_part). Both parts read the same
    per-feature embedding tables (`self.features_embeds`).
    """

    def __init__(self, dim, feature_names, feature_sizes, batch_size, deep_dense_out_dim):
        """
            @dim: size of every feature embedding
            @feature_names: one name per feature column, used to name the lookup layers
            @feature_sizes: number of encoded values of each feature column
            @batch_size: only stored on the instance, the forward pass never reads it
            @deep_dense_out_dim: units of each dense layer of the deep part;
                                 the last entry has to be 1 so that the deep
                                 output can be added to the FM output
        """
        super(DeepFM, self).__init__()
        self.dim = dim
        self.feature_names = feature_names
        self.feature_sizes = feature_sizes
        self.batch_size = batch_size
        self.init_fm_part()
        self.deep_dense_out_dim = deep_dense_out_dim
        self.init_deep_part()

    def init_features_embeds(self):
        """Create one embedding table of shape (feature_size+1, dim) per feature."""
        # one row more than feature_size, so that the code reserved for values
        # unseen in train is always a valid row index
        self.features_embeds = [
            Embedding(feature_size+1, self.dim, name="embed"+feature_name)
            for feature_name, feature_size in zip(self.feature_names, self.feature_sizes)
        ]

    def init_fm_dense(self):
        """
            Using 1-dim value embed lookup to represent one-hot vector dot weight vector
        """
        # same sizing rule as the embedding tables: feature_size+1 rows
        self.fm_dense = [
            Embedding(feature_size+1, 1, name="dense"+feature_name)
            for feature_name, feature_size in zip(self.feature_names, self.feature_sizes)
        ]
        self.fm_bias = SingleBiasLayer()

    def init_fm_part(self):
        """Create every layer used by the FM part (embeddings, 1-order weights, bias)."""
        self.init_features_embeds()
        self.init_fm_dense()

    def init_deep_part(self):
        """Create the flatten layer and the dense stack of the deep part."""
        self.flatten = Flatten()
        # first layer's input size is dim*n_fea, last layer's output size is 1
        # NOTE(review): the last layer also uses relu, so the deep part can only
        # add a non-negative value to the logit - confirm this is intended.
        self.deep_dense_layers = [
            Dense(units=output_dim,
                  activation="relu",
                  use_bias=True,
                  name="deep_dense_"+str(i))
            for i, output_dim in enumerate(self.deep_dense_out_dim)
        ]

    def _lookup_embeds(self, X):
        """Look up the embedding of every feature column: (batch_size, n_fea) -> (batch_size, n_fea, dim)."""
        embeds = [feature_embed(X[:, i]) for i, feature_embed in enumerate(self.features_embeds)]
        return tf.stack(embeds, axis=1)

    def fm_part(self, X, embeds=None):
        """
            @X: (batch_size, n_fea)
            @embeds: optional precomputed (batch_size, n_fea, dim) embeddings of X;
                     looked up from X when omitted
            returns the FM logit of shape (batch_size, )
        """
        # 1-order part (dense and bias):
        one_order = [feature_w_lookup(X[:, i]) # (batch_size, 1)
                     for i, feature_w_lookup in enumerate(self.fm_dense)]
        # (n_fea, batch_size, 1) --stack--> (batch_size, n_fea, 1) --sum--> (batch_size, 1)
        one_order = tf.reduce_sum(tf.stack(one_order, axis=1), axis=1)
        # squeeze only the trailing axis, so a batch of one sample keeps its batch axis
        one_order = tf.squeeze(one_order, axis=-1)

        # 2-order part:
        if embeds is None:
            embeds = self._lookup_embeds(X)
        # sum over all pairs i<j of <V_i, V_j> equals
        # 0.5 * sum_over_dim((sum_i V_i)^2 - sum_i V_i^2),
        # which needs no pairwise loop and no tensor sized by the batch
        square_of_sum = tf.square(tf.reduce_sum(embeds, axis=1)) # (batch_size, dim)
        sum_of_square = tf.reduce_sum(tf.square(embeds), axis=1) # (batch_size, dim)
        two_order = 0.5 * tf.reduce_sum(square_of_sum - sum_of_square, axis=-1)

        return self.fm_bias(one_order + two_order) # (batch_size, )

    def deep_part(self, X, embeds=None):
        """
            @X: (batch_size, n_fea)
            @embeds: optional precomputed (batch_size, n_fea, dim) embeddings of X;
                     looked up from X when omitted
            returns the deep logit of shape (batch_size, )
        """
        if embeds is None:
            embeds = self._lookup_embeds(X)
        # (batch_size, n_fea, dim) --flatten--> (batch_size, n_fea*dim)
        layer_out = self.flatten(embeds)
        # feed into dense layers
        for layer in self.deep_dense_layers:
            layer_out = layer(layer_out)

        # (batch_size, 1) -> (batch_size, ), keeping the batch axis for a batch of one
        return tf.squeeze(layer_out, axis=-1)

    def call(self, X):
        """
        Forward function for training
            X: (batch_size, n_fea)
        """
        # the embeddings are shared by both parts, so look them up once
        embeds = self._lookup_embeds(X)
        # (batch_size, )
        return tf.sigmoid(self.fm_part(X, embeds) + self.deep_part(X, embeds))


In [None]:
# training functions
@tf.function
def train_step(X, Y, model):
    """
    Run one optimization step on one batch with the module-level `optimizer`.
        @X: batch of encoded features fed to `model`
        @Y: batch of labels
        @model: model to update
    Returns the mean binary cross-entropy of the batch (a scalar tensor).
    """
    with tf.GradientTape() as tape:
        batch_pred = model(X)
        # binary_crossentropy already averages over the last axis, so for 1-D
        # predictions it is the batch mean; reduce_mean keeps `loss` a scalar
        # whatever the rank. Dividing by the batch size again, as was done
        # before, under-reported the loss by a factor of the batch size.
        loss = tf.reduce_mean(tf.losses.binary_crossentropy(Y, batch_pred))

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss


def train_fm_model(model, dataset, epochs):
    """
    Train `model` by calling `train_step` on every batch of `dataset`.
        @model: model passed through to `train_step`
        @dataset: iterable of (X, Y) batches
        @epochs: number of passes over `dataset`
    Prints the batch loss every 1000 steps and the mean loss of each epoch,
    then returns the same `model` object.
    """
    for epoch in range(epochs):
        total_loss, steps = 0.0, 0

        for batch_index, (X, Y) in enumerate(dataset):
            # convert once to a Python float so accumulating and formatting
            # do not create extra tensors
            batch_loss = float(train_step(X, Y, model))
            total_loss += batch_loss
            steps += 1

            if steps % 1000 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.\
                    format(epoch + 1,
                           batch_index,
                           batch_loss))

        # an empty dataset yields no step: report nan instead of dividing by zero
        epoch_loss = total_loss / steps if steps else float('nan')
        print('====== Epoch {} Loss {:.4f} ======'.format(epoch + 1, epoch_loss))

    return model

In [None]:
# bucketize Annual_Premium
train_samples = pd.read_csv("../input/health-insurance-cross-sell-prediction/train.csv")
annual_premium = train_samples["Annual_Premium"].values
# Bucket edges: the 0th, 0.25th, ..., 99.75th percentiles of the training
# premiums, computed in one vectorized call instead of one pass per percentile.
percentiles = list(np.percentile(annual_premium, np.arange(0, 100, 0.25)))

def bucketize(x, percentiles):
    """
    Map `x` to the index of its bucket.
        @x: value (or array of values) to bucketize
        @percentiles: ascending bucket edges
    The result is the leftmost position at which `x` could be inserted into
    `percentiles` while keeping it sorted.
    """
    bucket_index = np.searchsorted(percentiles, x, side="left")
    return bucket_index

In [None]:
# Number of distinct raw premium values and their histogram, before bucketizing.
print("raw Annual_Premium")
print(len(pd.unique(train_samples["Annual_Premium"])))
hist(train_samples["Annual_Premium"])

In [None]:
# Replace the raw premium by its bucket index. searchsorted is vectorized, so
# the whole column is bucketized in one call instead of one Python call per row.
# NOTE: the column is overwritten in place; re-running this cell without
# reloading the data would bucketize the bucket indices a second time.
train_samples["Annual_Premium"] = bucketize(train_samples["Annual_Premium"].values, percentiles)
print("after bucketize")
print(len(pd.unique(train_samples["Annual_Premium"])))
hist(train_samples["Annual_Premium"])

# The test set is bucketized with the edges computed on the training set.
test_samples = pd.read_csv("../input/health-insurance-cross-sell-prediction/test.csv")
test_samples["Annual_Premium"] = bucketize(test_samples["Annual_Premium"].values, percentiles)

In [None]:
# feature encoding functions
def prepare_encoder(train_samples, n_features=None):
    """
    Build one `value -> integer code` dict per feature column.
        @train_samples: DataFrame whose first `n_features` columns are the features
        @n_features: number of leading columns to encode; defaults to len(FEATURES)
    Codes are assigned from 0 in order of first appearance in the column. Each
    dict also gets an "unseen" key holding the next free code, to be used for
    values that never occur in `train_samples`.
    Returns the list of dicts, one per feature column, in column order.
    """
    if n_features is None:
        n_features = len(FEATURES)

    features_value_encoder = []
    for i in range(n_features):
        _field_value_encoder = {}
        # iterate over this column only, instead of rebuilding every full row
        # once per feature
        for field_value in train_samples.iloc[:, i]:
            if field_value not in _field_value_encoder:
                _field_value_encoder[field_value] = len(_field_value_encoder)

        # set unseen feature value's encode value; len() is the next free code
        # and, unlike max(), also works when the column is empty
        _field_value_encoder["unseen"] = len(_field_value_encoder)
        features_value_encoder.append(_field_value_encoder)

    return features_value_encoder

def encode_train_samples(train_samples, encoder):
    """
    Encode every row of `train_samples` with the per-column dicts of `encoder`.
        @train_samples: DataFrame; its last column is skipped, all the other
                        columns are encoded
        @encoder: list of `value -> code` dicts, indexed by column position
    Returns a list of rows, each row being the list of codes of its columns.
    Raises KeyError when a value is missing from its column's dict.
    """
    rows = []
    for row in train_samples.itertuples(index=False):
        feature_values = row[:-1]  # everything but the last column
        rows.append([encoder[column][value]
                     for column, value in enumerate(feature_values)])
    return rows

def encode_test_samples(test_samples, encoder):
    """
    Encode every column of every row of `test_samples`.
        @test_samples: DataFrame holding feature columns only
        @encoder: list of `value -> code` dicts, indexed by column position
    A value missing from its column's dict is reported with a printed message
    and encoded with that dict's "unseen" code.
    Returns a list of rows, each row being the list of codes of its columns.
    """
    def encode_value(column, value):
        # code of `value` in column `column`, falling back to the "unseen" code
        codes = encoder[column]
        if value in codes:
            return codes[value]
        # encode feature value not seen in train set as max feature encode value + 1
        print("feature {} value {} not seen in training set".format(column, value))
        return codes["unseen"]

    return [
        [encode_value(column, value) for column, value in enumerate(row)]
        for row in test_samples.itertuples(index=False)
    ]


In [None]:
# encode train samples
# drop() with errors="ignore" instead of `del`, so re-running this cell does
# not fail once the "id" column is already gone
train_samples = train_samples.drop(columns=["id"], errors="ignore")
encoder = prepare_encoder(train_samples)
X_encoded = encode_train_samples(train_samples, encoder)
Y = train_samples["Response"].tolist()
# compute feature_sizes
# slice assignment refills the config list in place, so re-running this cell
# replaces the sizes instead of appending a second copy of them
FEATURE_SIZES[:] = [len(feature_encoder) for feature_encoder in encoder]
# prepare train dataset
dataset = tf.data.Dataset.from_tensor_slices((X_encoded, Y))
# shuffle the samples before batching: shuffling after batch() only reorders
# whole batches and leaves every batch with the same samples in each epoch.
# A buffer as large as the dataset gives a full shuffle.
dataset = dataset.shuffle(len(X_encoded)).batch(BATCH_SIZE)

In [None]:
# encode test samples
# drop() with errors="ignore" instead of `del`, so re-running this cell does
# not fail once the "id" column is already gone
test_samples = test_samples.drop(columns=["id"], errors="ignore")
X_encoded_test = encode_test_samples(test_samples, encoder)
# prepare test dataset
test_dataset = tf.data.Dataset.from_tensor_slices(X_encoded_test)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
# train model: build DeepFM from the config values and run 30 epochs over `dataset`
model = DeepFM(DIM, FEATURE_NAMES, FEATURE_SIZES, BATCH_SIZE, DEEP_DENSE_OUT_DIM)
model = train_fm_model(model, dataset, 30)

In [None]:
# Predict the test set batch by batch; `preds` ends up with one numpy scalar
# per test sample, in dataset order.
preds = []
for X in test_dataset:
    # convert each batch to numpy once instead of slicing it into one tensor
    # per sample; reshape(-1) guarantees a 1-D array to iterate over
    preds.extend(model(X).numpy().reshape(-1))

In [None]:
# write submission file
# The "id" column was removed from `test_samples`, so the ids are read again
# from the first field of every data line of the test file.
IDs = []
with open("../input/health-insurance-cross-sell-prediction/test.csv", "r") as f:
    next(f, None)  # skip the header line
    for line in f:  # stream the file instead of loading every line at once
        if line.strip():  # a blank line carries no id
            IDs.append(line.split(",")[0])

assert len(IDs) == len(preds), "number of ids and number of predictions differ"
lines = ["id,Response\n"]
lines.extend(ID + "," + str(pred) + "\n" for ID, pred in zip(IDs, preds))
with open("submission.csv", "w") as f:
    f.writelines(lines)