# Model Analysis: Floor Price Optimisation

- Author: Reshad Dernjani

## Dataset Overview

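Columns in the dataset: `inventory_id`, `request_type`, `ex_floor_price`, `ex_bid_price`, `state_code`, `country_code`, `city_code`, `device_os`, `device_os_version`, `device_type`.

Note: the feature lists defined below additionally include `hour_of_day` and omit `device_type`.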

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import pprint
import tempfile
import time

try:
    import tensorflow_transform as tft
    import apache_beam as beam
    import tensorflow_model_analysis as tfma
except ImportError:
    # This will take a minute, ignore the warnings.
    !pip install -q tensorflow-transform
    !pip install -q apache_beam
    !pip install -q tensorflow-model-analysis
    import tensorflow_transform as tft
    import apache_beam as beam
    import tensorflow_model_analysis as tfma  
    
import tensorflow as tf
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_transform.saved import saved_transform_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

## Name our columns

In [None]:
# Categorical (string-valued) columns, grouped by what they describe.
_REQUEST_FEATURE_KEYS = ['inventory_id', 'request_type']
_GEO_FEATURE_KEYS = ['state_code', 'country_code', 'city_code']
_DEVICE_FEATURE_KEYS = ['device_os', 'device_os_version']
_TIME_FEATURE_KEYS = ['hour_of_day']

# The concatenation order defines the column order of the categorical features.
CATEGORICAL_FEATURE_KEYS = (
    _REQUEST_FEATURE_KEYS
    + _GEO_FEATURE_KEYS
    + _DEVICE_FEATURE_KEYS
    + _TIME_FEATURE_KEYS
)

# Numeric (float-valued) columns.
NUMERIC_FEATURE_KEYS = ['ex_floor_price']

# Numeric columns that may be missing. Intentionally empty: optionals were
# already handled in the data query (at least for research).
OPTIONAL_NUMERIC_FEATURE_KEYS = []

# Column holding the label, i.e. the value the model predicts.
LABEL_KEY = 'ex_bid_price'

## Define our features and schema

In [None]:
# Parsing spec for the raw examples, assembled group by group so the key
# order follows the column lists: categorical, numeric, optional numeric, label.
RAW_DATA_FEATURE_SPEC = {
    name: tf.FixedLenFeature([], tf.string)
    for name in CATEGORICAL_FEATURE_KEYS
}
RAW_DATA_FEATURE_SPEC.update({
    name: tf.FixedLenFeature([], tf.float32)
    for name in NUMERIC_FEATURE_KEYS
})
# Optional numeric columns are variable-length, so they may be absent.
RAW_DATA_FEATURE_SPEC.update({
    name: tf.VarLenFeature(tf.float32)
    for name in OPTIONAL_NUMERIC_FEATURE_KEYS
})
RAW_DATA_FEATURE_SPEC[LABEL_KEY] = tf.FixedLenFeature([], tf.float32)

# Metadata wrapping the schema derived from the feature spec above.
_RAW_DATA_SCHEMA = dataset_schema.from_feature_spec(RAW_DATA_FEATURE_SPEC)
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(_RAW_DATA_SCHEMA)

## Basic housekeeping

In [None]:
# Names of temp files and directories.
# NOTE(review): none of these constants is referenced in the visible part of
# this notebook; the analysis loop spells out 'test_transformed*' and
# 'exported_model_dir' literally -- confirm whether they are still needed.
# File-name stems of the transformed train / test data.
TRANSFORMED_TRAIN_DATA_FILEBASE = 'train_transformed'
TRANSFORMED_TEST_DATA_FILEBASE = 'test_transformed'
# Directory names for the exported model and the exported eval model.
EXPORTED_MODEL_DIR = 'exported_model_dir'
EXPORTED_EVAL_MODEL_DIR = 'eval_dir'

## Predict with transformed data

In [None]:
def sort_results(predictions, labels):
    """Order predictions and labels by ascending label, keeping pairs aligned.

        Args:
            predictions: Sequence of predictions, indexable like ``labels``.
            labels: Sequence of labels; these alone define the sort order.
        Returns:
            Tuple ``(sorted_predictions, sorted_labels)`` of two new lists.
            Entries with equal labels keep their original relative order.
    """
    # Sort positions instead of values so that only labels are ever compared.
    order = sorted(range(len(labels)), key=lambda idx: labels[idx])
    sorted_predictions = [predictions[idx] for idx in order]
    sorted_labels = [labels[idx] for idx in order]
    return sorted_predictions, sorted_labels

In [None]:
def get_prediction_results(iterations, path):
    """Run the loaded model over the test records and collect the outcomes.

    Uses the module-level ``predict_fn`` for inference.

    Args:
        iterations: Number of predictions to make on the test data. Reading
            stops once this many records were processed; if the count is
            never reached the whole file is read.
        path: Path to the transformed test data file.
    Returns:
        List of predictions, list of labels (dsp bid prices) and list of
        floor prices (historical), all in file order.
    """
    predictions, labels, floor_prices = [], [], []
    records = tf.python_io.tf_record_iterator(path=path)
    for seen, string_record in enumerate(records, start=1):
        # Parse the record first so the label and floor price can be read from it.
        example = tf.train.Example()
        example.ParseFromString(string_record)
        features = example.features.feature
        labels.append(features['ex_bid_price'].float_list.value[0])
        floor_prices.append(features['ex_floor_price'].float_list.value[0])

        # The predictor takes a batch of serialized examples; use a batch of one.
        pred = predict_fn({'examples': [example.SerializeToString()]})
        predictions.append(pred['predictions'][0])

        if seen == iterations:
            break
    return predictions, labels, floor_prices

In [None]:
def calc_revenue(new_floor_prices, dsp_bids, old_floor_prices):
    """Simulate the auction on historical data with the predicted floor prices.

    The bid price predictions are used as the new floor price. The labels are
    the original bids. For every instance the new floor price is credited as
    revenue when it is positive and below the bid; otherwise the old floor
    price is credited. The old revenue credits the old floor price only when
    it is below the bid.

    Args:
        new_floor_prices: Model predictions.
        dsp_bids: Labels (the original bids).
        old_floor_prices: Floor prices from the historical test data.

    Returns:
        Tuple ``(abs_rev_new, abs_rev_old, over_the_bid_pct, below_zero_pct)``:
        the simulated new and old revenue (each offset by 1e-10 so a later
        ratio never divides by zero), the percentage of predictions above the
        bid and the percentage of predictions below zero. Both percentages
        are 0.0 for empty input.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    # Size everything from the arguments, not from notebook-level state.
    iterations = len(new_floor_prices)
    if len(dsp_bids) != iterations or len(old_floor_prices) != iterations:
        raise ValueError(
            'new_floor_prices, dsp_bids and old_floor_prices must have the same length')

    abs_rev_new = 1e-10
    abs_rev_old = 1e-10
    over_the_bid_counter = 0
    below_zero_counter = 0
    for new_floor, bid, old_floor in zip(new_floor_prices, dsp_bids, old_floor_prices):
        # NOTE(review): when the prediction is unusable the old floor price is
        # credited even if it is not below the bid, unlike the old-revenue
        # rule right below -- confirm this fallback is intended.
        if new_floor > 0 and new_floor < bid:
            abs_rev_new += new_floor
        else:
            abs_rev_new += old_floor
        if old_floor < bid:
            abs_rev_old += old_floor
        if new_floor > bid:
            over_the_bid_counter += 1
        if new_floor < 0:
            below_zero_counter += 1

    if iterations == 0:
        # Nothing to average over; avoid dividing by zero.
        return abs_rev_new, abs_rev_old, 0.0, 0.0

    over_the_bid_pct = 100.0 * over_the_bid_counter / iterations
    below_zero_pct = 100.0 * below_zero_counter / iterations
    return abs_rev_new, abs_rev_old, over_the_bid_pct, below_zero_pct

In [None]:
def get_prediction_speed(iterations, path):
    """Measure and print the average prediction latency on the test data.

    Uses the module-level ``predict_fn`` for inference. The average is taken
    over the number of records that were actually predicted, which can be
    lower than ``iterations`` when the file holds fewer records.

    Args:
        iterations: Number of data instances to use.
        path: Path of the transformed test data file.
    """
    record_iterator = tf.python_io.tf_record_iterator(path=path)
    processed = 0
    start = time.time()
    for string_record in record_iterator:
        predict_fn({'examples': [string_record]})
        processed += 1
        if processed == iterations:
            break
    elapsed_sec = time.time() - start
    # Guard against an empty file (or iterations == 0 on an empty file).
    millisec_per_req = (elapsed_sec * 1000) / processed if processed else 0.0
    print('\n\nPrediction for {} requests took {:.2f} seconds. Resulting in {:.2f} milliseconds per prediction.\n\n'
          .format(processed, elapsed_sec, millisec_per_req))

In [None]:
def count_instances(tf_records_filenames):
    """Count the records stored across a list of TFRecord files.

    Args:
        tf_records_filenames: List of data files

    Returns:
        Total number of data instances over all files (0 for an empty list)
    """
    return sum(
        1
        for filename in tf_records_filenames
        for _record in tf.python_io.tf_record_iterator(filename)
    )

## Put it all together

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import decimal
import matplotlib.pyplot as plt
tf.logging.set_verbosity(tf.logging.ERROR)

# Running totals aggregated over every analysed test file.
total_rev_new = total_rev_old = total_potential_revenue = 0.
total_num_of_predictions = total_over_the_bid_count = total_below_zero_count = 0

# Base directory of the exported models.
directory = 'tmp'

# Upper bound on the number of test data instances evaluated per file.
# If a test file has fewer data instances, all of them are used.
iterations_of_datapoints = 1000

# List of models to analyse
dsp_list = ['47']

# For every model: run it over each transformed test file, simulate the
# auction revenue, accumulate the overall totals and draw the diagnostic plots.
for dsp in dsp_list:
    print("\n\n\nAnalysis for dsp: {}\n".format(dsp))
    # Change the transformed_dir initialisation if you want to run multiple models on the same test data
    transformed_dir = '/notebooks/transformed/' + dsp
    model_path = tf.gfile.Glob('/notebooks/'+directory+'/' +dsp+'/exported_model_dir/1*')[0]
    tf_records_filenames = tf.gfile.Glob(transformed_dir + '/test_transformed*')

    # Load the model once per dsp; it does not depend on the test file.
    # predict_fn is read as a global by the prediction helper functions.
    predict_fn = tf.contrib.predictor.from_saved_model(model_path)

    for test_path in tf_records_filenames:
        print(test_path)
        instances_count = count_instances([test_path])
        # If the test file has fewer data instances than requested, use all of them
        num_of_predictions = min(iterations_of_datapoints, instances_count)
        total_num_of_predictions += num_of_predictions

        # Collect results
        raw_pred, raw_labels, old_floor_prices = get_prediction_results(num_of_predictions, test_path)
        # Sorted by label, for the curve and scatter plots
        predictions, labels = sort_results(raw_pred, raw_labels)

        # Calculate uplift and revenue
        abs_rev_new, abs_rev_old, over_the_bid_pct, below_zero_pct = calc_revenue(raw_pred, raw_labels, old_floor_prices)
        uplift = (abs_rev_new / abs_rev_old) - 1
        # Upper bound: every auction pays exactly the bid
        abs_potential_revenue = sum(raw_labels)
        print("New revenue: {:.2f}, old revenue: {:.2f}, uplift: {:.2f}%, potential revenue: {:.2f}."
            .format(abs_rev_new, abs_rev_old, uplift*100, abs_potential_revenue))
        print("{:.2f}% of the predictions were higher than the winning bid!".format(over_the_bid_pct))
        print("{:.2f}% of the predictions were below zero!".format(below_zero_pct))

        # Collect overall summary
        total_rev_new += abs_rev_new
        total_rev_old += abs_rev_old
        total_potential_revenue += abs_potential_revenue
        # Convert the per-file percentages back to counts and accumulate them,
        # so the overall percentages cover every file and not only the last one.
        total_over_the_bid_count += num_of_predictions * (over_the_bid_pct/100)
        total_below_zero_count += num_of_predictions * (below_zero_pct/100)

        # Calculate prediction speed
        get_prediction_speed(num_of_predictions, test_path)

        # Plot: Prediction vs Label
        plt.figure(figsize=(16,10))
        plt.plot(predictions, label='Predictions')
        plt.plot(labels, label='Labels')
        plt.legend()
        plt.ylim(-2.0,10.0)
        plt.yticks(np.arange(-2, 11, 1.0))
        plt.grid(True, which='both')
        plt.axhline(y=0, color='k')
        plt.axvline(x=0, color='k')
        plt.xlabel('Count')
        plt.ylabel('Bid Price')
        plt.title('Predictions vs Labels Curve',fontsize=16)
        plt.show()

        # Order by prediction: old_floor holds the historical floor prices in
        # that order, new_floor the sorted predictions.
        old_floor, new_floor = sort_results(old_floor_prices, raw_pred)

        # Plot: Predictions vs Floor Prices
        plt.figure(figsize=(16,10))
        plt.plot(new_floor, label='Predictions')
        plt.plot(old_floor, label='Floor Price')
        plt.legend()
        plt.ylim(-2.0,10.0)
        plt.yticks(np.arange(-2, 11, 1.0))
        plt.grid(True, which='both')
        plt.axhline(y=0, color='k')
        plt.axvline(x=0, color='k')
        plt.xlabel('Count')
        plt.ylabel('Price')
        plt.title('Predictions vs Floor Prices',fontsize=16)
        plt.show()

        # Distribution plot
        plt.figure(figsize=(16,10))
        plt.hist(
            [raw_pred, raw_labels],
            np.arange(-1.5, 10.5, 0.5),
            label=['Distribution of predictions', 'Distribution of labels']
        )
        plt.ylabel('Bid Count')
        plt.xlabel('Bid Price Categories')
        plt.xlim(-1.5,10.0)
        plt.xticks(np.arange(-1.5, 10.5, 0.5))
        plt.legend()
        plt.title('Distribution Plot',fontsize=16)
        plt.show()

        # Plot: Scatterplot
        plt.figure(figsize=(16,10))
        plt.scatter(labels, predictions, c='red', alpha=0.5)
        # Diagonal: prediction equals label; horizontal: prediction equals zero
        plt.plot([-2, 10], [-2, 10], ls="--", c="black")
        plt.plot([-2, 10], [0, 0], ls="--", c="black")
        plt.ylim(-2.0,10.0)
        plt.xlim(-2.0,10.0)
        plt.yticks(np.arange(-2, 11, 1.0))
        plt.xticks(np.arange(-2, 11, 1.0))
        plt.ylabel("Predicted Bid Price")
        plt.xlabel("Label Bid Price")
        plt.title('Predictions vs Labels',fontsize=16)
        plt.show()

# Print the summary aggregated over all analysed models and test files.
print("\n\nOverall analysis")
# The totals stay at zero when no test file was found (or all were empty),
# so guard every ratio instead of failing with a ZeroDivisionError.
total_uplift = (total_rev_new / total_rev_old) - 1 if total_rev_old else 0.
if total_num_of_predictions:
    total_over_the_bid_pct = (total_over_the_bid_count  / total_num_of_predictions) * 100
    total_below_zero_pct = (total_below_zero_count / total_num_of_predictions) * 100
else:
    total_over_the_bid_pct = 0.
    total_below_zero_pct = 0.
print("New revenue: {:.2f}, old revenue: {:.2f}, uplift: {:.2f}%, potential revenue: {:.2f}."
      .format(total_rev_new, total_rev_old, total_uplift*100, total_potential_revenue))
print("{:.2f}% of the predictions were higher than the winning bid!".format(total_over_the_bid_pct))
print("{:.2f}% of the predictions were below zero!".format(total_below_zero_pct))

## Predict with raw data

In [None]:
def _bytes_feature(value):
    """Returns a bytes_list Feature from a string / byte.

    Text (unicode) strings are encoded as UTF-8 first, because a BytesList
    only accepts byte strings; byte strings are passed through unchanged.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    if isinstance(value, type(u'')):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Wraps a single float / double in a Feature holding a float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def serialize_example(inventory_id, request_type, ex_floor_price,
                      state_code, country_code, city_code, device_os, device_os_version, hour_of_day):
    """Builds a tf.train.Example from raw feature values and serializes it.

    ``ex_floor_price`` is stored as a float feature, every other argument as
    a bytes feature under the key of the same name.

    Returns:
        The serialized Example as a byte string.
    """
    # Map each feature name to its tf.Example-compatible representation.
    feature = {
        'inventory_id': _bytes_feature(inventory_id),
        'request_type': _bytes_feature(request_type),
    }
    feature['ex_floor_price'] = _float_feature(ex_floor_price)
    remaining_categoricals = (
        ('state_code', state_code),
        ('country_code', country_code),
        ('city_code', city_code),
        ('device_os', device_os),
        ('device_os_version', device_os_version),
        ('hour_of_day', hour_of_day),
    )
    for key, raw_value in remaining_categoricals:
        feature[key] = _bytes_feature(raw_value)

    # Wrap the mapping in a Features message and the message in an Example.
    features = tf.train.Features(feature=feature)
    example_proto = tf.train.Example(features=features)
    return example_proto.SerializeToString()

### Use this for custom predictions with raw data input

In [None]:
#model_path = tf.gfile.Glob('/notebooks/tmp/47/exported_model_dir/*')[0]
#predict_fn = tf.contrib.predictor.from_saved_model(model_path)

#example = serialize_example("249621", "banner", 1.6590000000000014, "TEXAS_ST_US",
#                            "US", "DALLAS_TX_US", "Android", "8.1", "02")

#prediction = predict_fn({'examples': [example]})
#prediction = prediction['predictions'][0]

#print('Prediction: {}'.format(prediction))

## Demonstration of the behaviour of the asymmetric loss used

In [None]:
import random

# Definition of the asymmetric loss function
def asymmetric_loss(pred, label, shift):
    """Squared error weighted by the squared, shifted sign of the error.

    The weight is ``(1 + shift)**2`` when ``pred`` is above ``label`` and
    ``(-1 + shift)**2`` when it is below, so a non-zero ``shift`` penalises
    over- and underestimation differently.
    """
    residual = pred - label
    return (residual**2) * (np.sign(residual)+shift)**2

# Per-point losses for the three shift settings, plus the points themselves.
loss_no_shift, loss_50_shift, loss_neg_50_shift = [], [], []
preds, labels = [], []

# The first 10 data points overestimate their label, the next 10 underestimate it.
for direction in (1, -1):
    for step in range(10):
        truth = random.uniform(1.5, 1.9)
        guess = truth + direction * random.uniform(0.2, 0.5)
        preds.append(guess)
        labels.append(truth)
        loss_no_shift.append(asymmetric_loss(guess, truth, 0))
        loss_50_shift.append(asymmetric_loss(guess, truth, 0.50))
        loss_neg_50_shift.append(asymmetric_loss(guess, truth, -0.50))

# Plot the asymmetric loss next to the prediction and label curves
plt.figure(figsize=(16,10))
curves = (
    (loss_no_shift, 'Shift 0'),
    (loss_50_shift, 'Shift 0.50'),
    (loss_neg_50_shift, 'Shift -0.50'),
    (preds, 'Random predictions'),
    (labels, 'Random labels'),
)
for series, caption in curves:
    plt.plot(series, label=caption)
plt.legend()
plt.grid(True, which='both')
# Draw both axes through the origin
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.xlabel('Count')
plt.ylabel('Loss')
plt.title('Asymmetric Loss',fontsize=16)
plt.show()