The original data and related data preprocessing cannot be disclosed due to corporate secrecy compliance.

However, the network structure of the model is open.
In the code:
* Input features are preprocessed by the function `FeaturePreprocessing()` before being passed to the input layer of the network.
* The training data is held in a variable named `dataset`, which is passed to `model.fit` during model estimation.
* Bid amounts are used in our analysis under the column name `adjusted_bid`.

### Naive model

In [None]:
# Naive model: a feed-forward click predictor built on the base features only.
feature = FeaturePreprocessing()
keras_layers = tf.keras.layers

# Merge the per-feature input tensors; only the numeric part is batch-normalised
# before being joined with the categorical part.
input_num = keras_layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_cat = keras_layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_num = keras_layers.BatchNormalization()(input_num)
input_net = keras_layers.Concatenate(axis=-1)([input_num, input_cat])

# Click tower: three 256-unit Dense layers (swish, relu, relu), each followed by
# batch normalisation, then a single sigmoid unit named "click".
pCTR = input_net
for hidden_activation in ("swish", "relu", "relu"):
    pCTR = keras_layers.Dense(256, activation=hidden_activation)(pCTR)
    pCTR = keras_layers.BatchNormalization()(pCTR)
pCTR = keras_layers.Dense(1, activation="sigmoid", name="click")(pCTR)
pCTR = tf.keras.Model(inputs=feature.inputs, outputs=pCTR)

# Placeholder output: first column of input_net multiplied by zero.
# Presumably it pads the model to the four-element label structure yielded by
# `dataset` -- TODO confirm against the data pipeline.
dummy = keras_layers.Lambda(lambda x: x[:, 0:1] * 0, name="dummy")(input_net)
dummy = tf.keras.Model(inputs=feature.inputs, outputs=dummy)

# Four outputs; only the first one (click) carries a loss.
model_outputs = [pCTR.output] + [dummy.output] * 3
model = tf.keras.Model(inputs=feature.inputs, outputs=model_outputs)
model.compile(
    loss=["binary_crossentropy", None, None, None],
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="loss", save_best_only=True, verbose=1
)

# Train for 15 epochs of 1000 steps each, drawing batches from `dataset`.
model.fit(
    dataset,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)

### IPS Estimator

In [None]:
### pIMP
# Impression model used by the IPS estimator: predicts "impression_iv" from the
# IV features together with the base features.
feature = FeaturePreprocessing()
# Extend the IV-side collections so that they also contain every base feature.
feature.inputs_iv = feature.inputs_iv | feature.inputs
feature.inputs_numeric_iv += feature.inputs_numeric
keras_layers = tf.keras.layers

# Base feature vector: batch-normalised numerics joined with the categoricals.
input_num = keras_layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_cat = keras_layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_num = keras_layers.BatchNormalization()(input_num)
input_net = keras_layers.Concatenate(axis=-1)([input_num, input_cat])

# IV feature vector: batch-normalised IV numerics placed in front of input_net.
input_num_iv = keras_layers.Concatenate(axis=-1)(feature.inputs_numeric_iv)
input_num_iv = keras_layers.BatchNormalization()(input_num_iv)
input_net_iv = keras_layers.Concatenate(axis=-1)([input_num_iv, input_net])

# Impression tower: Dense(128, swish) -> BatchNorm -> sigmoid unit.
pIMP_iv = keras_layers.Dense(128, activation="swish")(input_net_iv)
pIMP_iv = keras_layers.BatchNormalization()(pIMP_iv)
pIMP_iv = keras_layers.Dense(1, activation="sigmoid", name="impression_iv")(pIMP_iv)
pIMP_iv = tf.keras.Model(inputs=feature.inputs_iv, outputs=pIMP_iv)

# Placeholder output: first column of input_net_iv multiplied by zero.
dummy_output = keras_layers.Lambda(lambda x: x[:, 0:1] * 0, name="dummy")(input_net_iv)
dummy_output = tf.keras.Model(inputs=feature.inputs_iv, outputs=dummy_output)

# Four outputs; only the second one (impression) carries a loss.
model_outputs = [
    dummy_output.output,
    pIMP_iv.output,
    dummy_output.output,
    dummy_output.output,
]
model = tf.keras.Model(inputs=feature.inputs_iv, outputs=model_outputs)
model.compile(
    loss=[None, "binary_crossentropy", None, None],
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="loss", save_best_only=True, verbose=1
)

# Train the impression model for 15 epochs of 1000 steps each on `dataset`.
model.fit(
    dataset,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)

In [None]:
### IPS-pCTR
# Click model for the IPS estimator: same tower as the naive model, but with a
# single "click" output (no dummy outputs).
feature = FeaturePreprocessing()

# Base feature vector: batch-normalised numerics joined with the categoricals.
input_num = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_cat = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_num = tf.keras.layers.BatchNormalization()(input_num)
input_net = tf.keras.layers.Concatenate(axis=-1)([input_num, input_cat])

# Click tower: three Dense(256) + BatchNorm stages, then a sigmoid unit.
pCTR = tf.keras.layers.Dense(256, activation="swish")(input_net)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(1, activation="sigmoid", name="click")(pCTR)
pCTR = tf.keras.Model(inputs=feature.inputs, outputs=pCTR)

model = tf.keras.Model(
    inputs=feature.inputs, outputs=pCTR.output,
)
model.compile(
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
    # The model has exactly one output, so exactly one loss is given. A
    # four-entry loss list does not match the output structure.
    loss="binary_crossentropy",
)

In [None]:
# Load the previously saved impression model.
# NOTE(review): the path is an empty string here -- presumably redacted.
ipsimp_model = tf.keras.models.load_model("")

# Pull only the first element that `dataset` yields and unpack it into three
# parts (presumably inputs, labels and weights -- verify against the pipeline).
dataset_iterator = iter(dataset)
features, y, w = next(dataset_iterator)

# Score that element with the impression model.
pimp = ipsimp_model.predict(x=features, verbose=1, batch_size=200000)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="loss", save_best_only=True, verbose=1
)

# Fit the click model on the in-memory `features`, using the first label entry
# as the target and entry 1 of the impression-model prediction as the weight.
# NOTE(review): if pimp[1] is the predicted impression probability, an
# inverse-propensity estimator would normally weight by its reciprocal --
# confirm that passing it unchanged is intended.
click_labels = y[0]
impression_weights = pimp[1]
model.fit(
    x=features,
    y=click_labels,
    sample_weight=impression_weights,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)

### baseline-IV

In [None]:
# baseline-IV: a first-stage impression model whose prediction is appended to
# the base features of the second-stage click model; both are trained jointly.
feature = FeaturePreprocessing()
# Extend the IV-side collections so that they also contain every base feature.
feature.inputs_iv = feature.inputs_iv | feature.inputs
feature.inputs_numeric_iv += feature.inputs_numeric
keras_layers = tf.keras.layers

# Base feature vector: batch-normalised numerics joined with the categoricals.
input_num = keras_layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_cat = keras_layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_num = keras_layers.BatchNormalization()(input_num)
input_net = keras_layers.Concatenate(axis=-1)([input_num, input_cat])

# IV feature vector: batch-normalised IV numerics placed in front of input_net.
input_num_iv = keras_layers.Concatenate(axis=-1)(feature.inputs_numeric_iv)
input_num_iv = keras_layers.BatchNormalization()(input_num_iv)
input_net_iv = keras_layers.Concatenate(axis=-1)([input_num_iv, input_net])

# First stage: Dense(128, swish) -> BatchNorm -> sigmoid impression unit.
pIMP_iv = keras_layers.Dense(128, activation="swish")(input_net_iv)
pIMP_iv = keras_layers.BatchNormalization()(pIMP_iv)
pIMP_iv = keras_layers.Dense(1, activation="sigmoid", name="impression_iv")(pIMP_iv)
pIMP_iv = tf.keras.Model(inputs=feature.inputs_iv, outputs=pIMP_iv)

# Second stage: the predicted impression is appended to the base features, then
# three Dense(256) + BatchNorm stages (swish, relu, relu) feed a sigmoid unit.
input_net = keras_layers.Concatenate(axis=-1)([input_net, pIMP_iv.output])
pCTR = input_net
for hidden_activation in ("swish", "relu", "relu"):
    pCTR = keras_layers.Dense(256, activation=hidden_activation)(pCTR)
    pCTR = keras_layers.BatchNormalization()(pCTR)
pCTR = keras_layers.Dense(1, activation="sigmoid", name="click")(pCTR)
pCTR = tf.keras.Model(inputs=feature.inputs_iv, outputs=pCTR)

# Placeholder output: first column of input_net_iv multiplied by zero.
dummy_output = keras_layers.Lambda(lambda x: x[:, 0:1] * 0, name="dummy")(input_net_iv)
dummy_output = tf.keras.Model(inputs=feature.inputs_iv, outputs=dummy_output)

# Four outputs; click and impression carry a loss, the two placeholders do not.
model_outputs = [pCTR.output, pIMP_iv.output] + [dummy_output.output] * 2
model = tf.keras.Model(inputs=feature.inputs_iv, outputs=model_outputs)
model.compile(
    loss=["binary_crossentropy", "binary_crossentropy", None, None],
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="loss", save_best_only=True, verbose=1
)

# Train the baseline-IV model for 15 epochs of 1000 steps each on `dataset`.
model.fit(
    dataset,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)

### afs-iv: First stage pIMP with an additional layer

In [None]:
# afs-iv: like baseline-IV, but the first-stage impression tower has an extra
# Dense(256) + BatchNorm stage in front of the Dense(128) stage.
feature = FeaturePreprocessing()
# Extend the IV-side collections so that they also contain every base feature.
feature.inputs_iv = feature.inputs_iv | feature.inputs
feature.inputs_numeric_iv += feature.inputs_numeric

# Base feature vector: batch-normalised numerics joined with the categoricals.
input_num = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_cat = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_num = tf.keras.layers.BatchNormalization()(input_num)
input_net = tf.keras.layers.Concatenate(axis=-1)([input_num, input_cat])

# IV feature vector: batch-normalised IV numerics placed in front of input_net.
input_num_iv = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_numeric_iv)
input_num_iv = tf.keras.layers.BatchNormalization()(input_num_iv)
input_net_iv = tf.keras.layers.Concatenate(axis=-1)([input_num_iv, input_net])

# First stage: Dense(256) -> BN -> Dense(128) -> BN -> sigmoid impression unit.
pIMP_iv = tf.keras.layers.Dense(256, activation="swish")(input_net_iv)
pIMP_iv = tf.keras.layers.BatchNormalization()(pIMP_iv)
pIMP_iv = tf.keras.layers.Dense(128, activation="swish")(pIMP_iv)
pIMP_iv = tf.keras.layers.BatchNormalization()(pIMP_iv)
pIMP_iv = tf.keras.layers.Dense(1, activation="sigmoid", name="impression_iv")(pIMP_iv)
pIMP_iv = tf.keras.Model(inputs=feature.inputs_iv, outputs=pIMP_iv)

# Second stage: the predicted impression is appended to the base features.
input_net = tf.keras.layers.Concatenate(axis=-1)([input_net, pIMP_iv.output])
pCTR = tf.keras.layers.Dense(256, activation="swish")(input_net)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(1, activation="sigmoid", name="click")(pCTR)
pCTR = tf.keras.Model(inputs=feature.inputs_iv, outputs=pCTR)

# Placeholder output: first column of input_net_iv multiplied by zero.
# The source referenced an undefined name (`input_net_bias`) here and built the
# sub-model on `feature.inputs`; it now uses `input_net_iv` and
# `feature.inputs_iv`, matching the baseline-IV cell.
dummy = tf.keras.layers.Lambda(lambda x: x[:, 0:1] * 0, name="dummy")(input_net_iv)
dummy = tf.keras.Model(inputs=feature.inputs_iv, outputs=dummy)

# Four outputs; click and impression carry a loss, the two placeholders do not.
model = tf.keras.Model(
    inputs=feature.inputs_iv, outputs=[pCTR.output, pIMP_iv.output, dummy.output,dummy.output]
)
model.compile(
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
    loss=["binary_crossentropy","binary_crossentropy", None,None],
)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="loss", save_best_only=True, verbose=1
)

# Train the afs-iv model for 15 epochs of 1000 steps each on `dataset`.
model.fit(
    dataset,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)

### iv-fsatt: Explicitly Incorporating First-Stage IV Interactions Using Attention Networks

In [None]:
# iv-fsatt: the first-stage impression model sees the base features plus an
# attention-style context built from the "adjusted_bid" IV and those features.
feature = FeaturePreprocessing()
# Model inputs: the IV inputs merged with the base inputs (feature is not mutated).
all_inputs_iv = feature.inputs_iv | feature.inputs

# Base feature vector: batch-normalised numerics joined with the categoricals.
input_num = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_num = tf.keras.layers.BatchNormalization()(input_num)
input_cat = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_net = tf.keras.layers.Concatenate(axis=-1)([input_num, input_cat])

# Width of the base feature vector. Derived from the tensor instead of the
# hard-coded 297 so the attention block follows the feature set.
feature_dim = input_net.shape[-1]

# Attention network
query_iv = tf.keras.layers.BatchNormalization()(feature.inputs_iv["adjusted_bid"])
key = input_net
# Repeat the query along the last axis so it can be multiplied element-wise
# with the key (assumes the bid input has last dimension 1 -- TODO confirm).
query_iv_tiled = tf.tile(query_iv, [1, feature_dim])
att_iv = tf.keras.layers.Multiply()([query_iv_tiled, key])
W = tf.keras.layers.Dense(feature_dim)
att_iv = W(att_iv)
att_iv = tf.keras.layers.Activation('softmax')(att_iv)
# Re-weight the base features by the attention scores and append the result.
context_iv = tf.keras.layers.Multiply()([att_iv, input_net])
context_input = tf.keras.layers.Concatenate(axis=-1)([input_net, context_iv])

# First stage: Dense(128, swish) -> BatchNorm -> sigmoid impression unit.
pIMP_iv = tf.keras.layers.Dense(128, activation="swish")(context_input)
pIMP_iv = tf.keras.layers.BatchNormalization()(pIMP_iv)
pIMP_iv = tf.keras.layers.Dense(1, activation="sigmoid", name="impression_iv")(pIMP_iv)
pIMP_iv = tf.keras.Model(inputs=all_inputs_iv, outputs=pIMP_iv)


# Second stage: the predicted impression is appended to the base features.
input_net = tf.keras.layers.Concatenate(axis=-1)([input_net, pIMP_iv.output])
pCTR = tf.keras.layers.Dense(256, activation="swish")(input_net)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(1, activation="sigmoid", name="click")(pCTR)
pCTR = tf.keras.Model(inputs=all_inputs_iv, outputs=pCTR)

# Placeholder output: first column of the second-stage input multiplied by zero.
dummy_output = tf.keras.layers.Lambda(lambda x: x[:, 0:1] * 0, name="dummy")(input_net)
dummy_output = tf.keras.Model(inputs=all_inputs_iv, outputs=dummy_output)

# Four outputs; click and impression carry a loss, the two placeholders do not.
model = tf.keras.Model(
    inputs=all_inputs_iv, outputs=[pCTR.output, pIMP_iv.output, dummy_output.output, dummy_output.output]
)
model.compile(
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
    loss=["binary_crossentropy","binary_crossentropy", None, None]
)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "click_loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="click_loss", save_best_only=True, verbose=1
)

# Train the iv-fsatt model for 15 epochs of 1000 steps each on `dataset`.
model.fit(
    dataset,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)

### iv-ssfsatt: Explicitly Incorporating Interactions in First Stage and Second Stage with Attention network

In [None]:
# iv-ssfsatt: attention-style feature re-weighting in both stages. The first
# stage is driven by the "adjusted_bid" IV, the second by the predicted impression.
feature = FeaturePreprocessing()
# Model inputs: the IV inputs merged with the base inputs (feature is not mutated).
all_inputs_iv = feature.inputs_iv | feature.inputs

# Base feature vector: batch-normalised numerics joined with the categoricals.
input_num = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_numeric)
input_num = tf.keras.layers.BatchNormalization()(input_num)
input_cat = tf.keras.layers.Concatenate(axis=-1)(feature.inputs_categorical)
input_net = tf.keras.layers.Concatenate(axis=-1)([input_num, input_cat])

# Width of the base feature vector. Derived from the tensor instead of the
# hard-coded 297 so both attention blocks follow the feature set.
feature_dim = input_net.shape[-1]

# First Stage Attention network
query_iv = tf.keras.layers.BatchNormalization()(feature.inputs_iv["adjusted_bid"])
key = input_net
# Repeat the query along the last axis so it can be multiplied element-wise
# with the key (assumes the bid input has last dimension 1 -- TODO confirm).
query_iv_tiled = tf.tile(query_iv, [1, feature_dim])
att_iv = tf.keras.layers.Multiply()([query_iv_tiled, key])
W = tf.keras.layers.Dense(feature_dim, activation="swish")
att_iv = W(att_iv)
att_iv = tf.keras.layers.Activation('softmax')(att_iv)
# Re-weight the base features by the attention scores and append the result.
context_input = tf.keras.layers.Multiply()([att_iv, input_net])
context_input = tf.keras.layers.Concatenate(axis=-1)([input_net, context_input])

# First stage: two Dense(128, swish) + BatchNorm stages, then a sigmoid unit.
pIMP_iv = tf.keras.layers.Dense(128, activation="swish")(context_input) #add
pIMP_iv = tf.keras.layers.BatchNormalization()(pIMP_iv)
pIMP_iv = tf.keras.layers.Dense(128, activation="swish")(pIMP_iv)
pIMP_iv = tf.keras.layers.BatchNormalization()(pIMP_iv)
pIMP_iv = tf.keras.layers.Dense(1, activation="sigmoid", name="impression_iv")(pIMP_iv)
pIMP_iv = tf.keras.Model(inputs=all_inputs_iv, outputs=pIMP_iv)

# Second Stage Attention network
# The single-unit impression prediction is the query, repeated to feature width.
query_imp = pIMP_iv.output
key_imp = input_net
query_imp = tf.tile(query_imp, [1, feature_dim])
att_imp = tf.keras.layers.Multiply()([query_imp, key_imp])
W_imp = tf.keras.layers.Dense(feature_dim, activation="swish")
att_imp = W_imp(att_imp)
att_imp = tf.keras.layers.Activation('softmax')(att_imp)
# Only the re-weighted features feed the click tower (no concatenation here).
context_imp = tf.keras.layers.Multiply()([att_imp, input_net])

# Second stage: three Dense(256) + BatchNorm stages, then a sigmoid click unit.
pCTR = tf.keras.layers.Dense(256, activation="swish")(context_imp)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(256, activation="relu")(pCTR)
pCTR = tf.keras.layers.BatchNormalization()(pCTR)
pCTR = tf.keras.layers.Dense(1, activation="sigmoid", name="click")(pCTR)
pCTR = tf.keras.Model(inputs=all_inputs_iv, outputs=pCTR)

# Placeholder output: first column of input_net multiplied by zero.
dummy_output = tf.keras.layers.Lambda(lambda x: x[:, 0:1] * 0, name="dummy")(input_net)
dummy_output = tf.keras.Model(inputs=all_inputs_iv, outputs=dummy_output)

# Four outputs; click and impression carry a loss, the two placeholders do not.
model = tf.keras.Model(
    inputs=all_inputs_iv, outputs=[pCTR.output, pIMP_iv.output, dummy_output.output, dummy_output.output]
)
model.compile(
    optimizer=tf.keras.optimizers.Adamax(learning_rate=0.001),
    loss=["binary_crossentropy","binary_crossentropy", None, None],
)

In [None]:
# Checkpoint callback: with save_best_only=True a checkpoint is written only when
# the monitored "click_loss" value improves.
# NOTE(review): filepath is an empty string here -- presumably redacted; set a
# real path before running.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="", monitor="click_loss", save_best_only=True, verbose=1
)

# Train the iv-ssfsatt model for 15 epochs of 1000 steps each on `dataset`.
model.fit(
    dataset,
    callbacks=[cp_callback],
    steps_per_epoch=1000,
    epochs=15,
)