### First, let's define the feature configuration for our data

### ... brace yourselves!

In [1]:
# Set up the feature configurations
from ml4ir.features.feature_config import parse_config
from ml4ir.features.feature_config import ExampleFeatureConfig
from ml4ir.config.keys import TFRecordTypeKey
import json

import logging
# Configure the root logger to emit DEBUG and above; `logger` is handed to the
# ml4ir components created later in this notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# NOTE(review): this goes through the module-level logging.debug() rather than
# logger.debug(). The module-level helper presumably installs a default handler
# (basicConfig) when none is configured, which would explain the
# "DEBUG:root:..." formatted output — confirm before switching to logger.debug().
logging.debug("Logger is initialized...")

# Feature configuration, written as a YAML document and parsed further down by
# parse_config(). It declares three top-level sections:
#   * query_key - the int64 query identifier (not trainable, logged at inference)
#   * label     - the int64 label (not trainable, logged at inference)
#   * features  - feat_0 / feat_1 / feat_2 (trainable floats), query_str
#                 (trainable string), group (trainable int64) and pos
#                 (non-trainable int64)
# Two entries refer to custom functions by name; they are registered later in
# the notebook:
#   * query_str.preprocessing_info -> "strip_numbers"
#   * group.feature_layer_info.fn  -> "custom_categorical_embedding"
# The text inside the literal is runtime data, so it is left exactly as is.
feature_config_yaml = '''
query_key: 
  name: query_key
  node_name: query_key
  trainable: false
  dtype: int64
  log_at_inference: true
  feature_layer_info:
    type: numeric
    shape: null
  serving_info:
    required: false
    default_value: 0
  tfrecord_type: context
label:
  name: label
  node_name: label
  trainable: false
  dtype: int64
  log_at_inference: true
  feature_layer_info:
    type: numeric
    shape: null
  serving_info:
    required: false
    default_value: 0
  tfrecord_type: sequence
features:
  - name: feat_0
    node_name: feat_0
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
  - name: feat_1
    node_name: feat_1
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
  - name: feat_2
    node_name: feat_2
    trainable: true
    dtype: float
    log_at_inference: false
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0.0
    tfrecord_type: sequence
  - name: query_str
    node_name: query_str
    trainable: true
    dtype: string
    log_at_inference: true
    feature_layer_info:
      type: numeric
      shape: null
      fn: get_sequence_encoding
      args:
        encoding_type: bilstm
        encoding_size: 128
        embedding_size: 128
        max_length: 20
    preprocessing_info:
      - fn: preprocess_text
        args:
          remove_punctuation: true
          to_lower: true
      - fn: strip_numbers
    serving_info:
      required: true
      default_value: ""
    tfrecord_type: context
  - name: group
    node_name: group
    trainable: true
    dtype: int64
    log_at_inference: false
    is_group_metric_key: true
    feature_layer_info:
      type: numeric
      shape: null
      fn: custom_categorical_embedding
      args:
        vocabulary_size: 16
        embedding_size: 128
    serving_info:
      required: false
      default_value: 0
    tfrecord_type: context
  - name: pos
    node_name: pos
    trainable: false
    dtype: int64
    log_at_inference: true
    feature_layer_info:
      type: numeric
      shape: null
    serving_info:
      required: true
      default_value: 0
    tfrecord_type: sequence
'''
# Parse the YAML string into a feature config object for Example-type TFRecords
feature_config: ExampleFeatureConfig = parse_config(
    TFRecordTypeKey.EXAMPLE,
    feature_config_yaml,
    logger=logger,
)

# Log every configured feature as pretty-printed JSON
all_features_json = json.dumps(feature_config.get_all_features(), indent=4)
logging.info(all_features_json)

DEBUG:root:Logger is initialized...
INFO:root:Reading feature config from YAML string
INFO:root:Feature config loaded successfully
INFO:root:Trainable Features : 
feat_0
feat_1
feat_2
query_str
group
INFO:root:Label : label
INFO:root:Metadata Features : 
query_key
label
pos
INFO:root:[
    {
        "name": "query_key",
        "node_name": "query_key",
        "trainable": false,
        "dtype": "int64",
        "log_at_inference": true,
        "feature_layer_info": {
            "type": "numeric",
            "shape": null
        },
        "serving_info": {
            "required": false,
            "default_value": 0
        },
        "tfrecord_type": "context"
    },
    {
        "name": "label",
        "node_name": "label",
        "trainable": false,
        "dtype": "int64",
        "log_at_inference": true,
        "feature_layer_info": {
            "type": "numeric",
            "shape": null
        },
        "serving_info": {
            "required": false,
         

### Time to load the data and save awesome TFRecords

In [8]:
from ml4ir.io import file_io
from ml4ir.data import tfrecord_writer
import glob

# Resolve the training CSV path(s) and read them into `df`
csv_paths = glob.glob('/Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/train/file_0.csv')
df = file_io.read_df_list(csv_paths)

# Write the rows out as a TFRecord file (SequenceExample/Example) using the feature config
tfrecord_path = '/Users/ashish.srinivasa/search_relevance/data/demo/tfrecords/file_0.tfrecord'
tfrecord_writer.write_from_df(
    df,
    tfrecord_file=tfrecord_path,
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.EXAMPLE,
)

# Show the first rows of the loaded data
df.head()

Unnamed: 0,query_key,query_str,pos,feat_0,feat_1,feat_2,label,group
0,2,MHS7A7RJB1Y4BJT,2,0.47373,0.0,0.0,0,2
1,2,MHS7A7RJB1Y4BJT,1,1.06319,0.205381,0.30103,1,2
2,5,KNJNWV,6,1.368108,0.030636,0.0,0,0
3,5,KNJNWV,3,1.370628,0.041261,0.30103,0,0
4,5,KNJNWV,4,1.3667,0.082535,0.30103,0,0


### Load TFRecords and add custom preprocessing functions

In [9]:
from ml4ir.data import tfrecord_reader
from tensorflow import print as tfprint
import tensorflow as tf

@tf.function
def strip_numbers(feature_tensor):
    """Remove ASCII digits from a string tensor.

    Args:
        feature_tensor: String tensor to clean.

    Returns:
        The tensor with matches of ``[0-9]`` replaced by the empty string.
    """
    digit_pattern = "[0-9]"
    return tf.strings.regex_replace(
        input=feature_tensor,
        pattern=digit_pattern,
        rewrite="",
    )

# Registry of per-instance preprocessing functions, keyed by the `fn` name
# used in the feature config's `preprocessing_info` entries
preprocessing_fns = dict(strip_numbers=strip_numbers)

# Build a dataset from the TFRecord files in the demo directory
tfrecord_dir = '/Users/ashish.srinivasa/search_relevance/data/demo/tfrecords/'
dataset = tfrecord_reader.read(
    data_dir=tfrecord_dir,
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.EXAMPLE,
    preprocessing_keys_to_fns=preprocessing_fns,
)

# Peek at the first batch of 5 records
preview_batches = iter(dataset.batch(5))
first_batch = next(preview_batches)
tfprint(first_batch)

{'query_key': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'label': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'feat_0': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_1': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_2': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'query_str': FixedLenFeature(shape=[], dtype='string', default_value=''), 'group': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'pos': FixedLenFeature(shape=[], dtype='int64', default_value=0)}
({'feat_0': [[0.473729521]
 [1.06319]
 [1.36810815]
 [1.37062836]
 [1.36669993]],
  'feat_1': [[0]
 [0.205380633]
 [0.0306360275]
 [0.0412614979]
 [0.0825348869]],
  'feat_2': [[0]
 [0.30103]
 [0]
 [0.30103]
 [0.30103]],
  'group': [[2]
 [2]
 [0]
 [0]
 [0]],
  'pos': [[2]
 [1]
 [6]
 [3]
 [4]],
  'query_key': [[2]
 [2]
 [5]
 [5]
 [5]],
  'query_str': [["mhsarjbybjt"]
 ["mhsarjbybjt"]
 ["knjnwv"]
 ["knjnwv"]
 ["knjnwv"]]},
 [[0]
 [1]
 [

### Map, Filter, Batch the Dataset

In [4]:
# Variety of map, reduce, filter, shuffle operations can be used here
# dataset = dataset.<map, filter, reduce>(tf_preprocess_fn)

# NOTE: This is lazy batching
# NOTE(review): this rebinds `dataset` to its batched form, so re-running the
# cell batches an already-batched dataset. Run it once per freshly created
# `dataset`.
dataset = dataset.batch(batch_size=128, drop_remainder=True)

### Or... you can do all of that for train, val and test in _one_ step!

In [5]:
from ml4ir.data.relevance_dataset import RelevanceDataset
from ml4ir.config.keys import DataFormatKey

# Build the train/validation/test datasets from the CSV directory in one call
relevance_dataset_args = dict(
    data_dir='/Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv',
    data_format=DataFormatKey.CSV,
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.EXAMPLE,
    batch_size=128,
    preprocessing_keys_to_fns=preprocessing_fns,
    logger=logger,
)
relevance_dataset = RelevanceDataset(**relevance_dataset_args)

# Print each split, looking the attribute up just before printing it
for split_name in ("train", "validation", "test"):
    tfprint(getattr(relevance_dataset, split_name))

INFO:root:Reading 1 files from [/Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/train/file_0.csv, ..
INFO:root:Writing SequenceExample protobufs to : /Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/tfrecord/train/file_0.tfrecord
INFO:root:Created TFRecordDataset from SequenceExample protobufs from 1 files : ['/Users/ashish.srinivasa/search_relevance/ml4ir/p
INFO:root:Reading 1 files from [/Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/validation/file_0.csv, ..
INFO:root:Writing SequenceExample protobufs to : /Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/tfrecord/validation/file_0.tfrecord


{'query_key': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'label': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'feat_0': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_1': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_2': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'query_str': FixedLenFeature(shape=[], dtype='string', default_value=''), 'group': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'pos': FixedLenFeature(shape=[], dtype='int64', default_value=0)}


INFO:root:Created TFRecordDataset from SequenceExample protobufs from 1 files : ['/Users/ashish.srinivasa/search_relevance/ml4ir/p
INFO:root:Reading 1 files from [/Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/test/file_0.csv, ..
INFO:root:Writing SequenceExample protobufs to : /Users/ashish.srinivasa/search_relevance/ml4ir/python/applications/ranking/tests/data/csv/tfrecord/test/file_0.tfrecord


{'query_key': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'label': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'feat_0': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_1': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_2': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'query_str': FixedLenFeature(shape=[], dtype='string', default_value=''), 'group': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'pos': FixedLenFeature(shape=[], dtype='int64', default_value=0)}


INFO:root:Created TFRecordDataset from SequenceExample protobufs from 1 files : ['/Users/ashish.srinivasa/search_relevance/ml4ir/p


{'query_key': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'label': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'feat_0': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_1': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_2': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'query_str': FixedLenFeature(shape=[], dtype='string', default_value=''), 'group': FixedLenFeature(shape=[], dtype='int64', default_value=0), 'pos': FixedLenFeature(shape=[], dtype='int64', default_value=0)}
<BatchDataset shapes: ({query_key: (128, 1), feat_0: (128, 1), feat_1: (128, 1), feat_2: (128, 1), query_str: (128, 1), group: (128, 1), pos: (128, 1)}, (128, 1)), types: ({query_key: tf.int64, feat_0: tf.float32, feat_1: tf.float32, feat_2: tf.float32, query_str: tf.string, group: tf.int64, pos: tf.int64}, tf.int64)>
<BatchDataset shapes: ({query_key: (128, 1), feat_0: (128, 1), feat_1: (128, 1), feat_2: (128, 1), query_str: (128, 1)

## Let's define a model, already!

### Model Framework

In [6]:
# TODO : insert architecture diagram
#
# RelevanceModel(
#     Scorer(
#         InteractionModel(
#             Inputs
#         ),
#         Loss),
#     Metrics,
#     Optimizer,
#     Callbacks
# )

### Step 0: Define the Interaction Model

In [2]:
from ml4ir.model.scoring.interaction_model import InteractionModel, UnivariateInteractionModel
from ml4ir.config.keys import TFRecordTypeKey

# Define custom feature layer ops
def get_categorical_embedding(input_feature, feature_info):
    """Embedding lookup for categorical features.

    Args:
        input_feature: Input tensor holding the category ids to embed
            (assumed integer-valued — TODO confirm against the caller).
        feature_info: Feature config entry. Must provide ``name`` and a
            ``feature_layer_info`` mapping whose ``args`` contain
            ``vocabulary_size`` and ``embedding_size``.

    Returns:
        The output of a Keras ``Embedding`` layer named
        ``categorical_embedding_<name>`` applied to ``input_feature``.
    """
    # Imported locally so this function does not rely on a later cell having
    # imported `layers` before it is first called.
    from tensorflow.keras import layers

    embedding_args = feature_info.get("feature_layer_info")["args"]
    embedding_layer = layers.Embedding(
        input_dim=embedding_args["vocabulary_size"],
        output_dim=embedding_args["embedding_size"],
        name="categorical_embedding_{}".format(feature_info.get("name")),
    )
    return embedding_layer(input_feature)

# Registry of custom feature-layer functions, keyed by the `fn` name used in
# the feature config's `feature_layer_info`
feature_layer_fns = dict(custom_categorical_embedding=get_categorical_embedding)

# Interaction model built from the feature config and the custom feature layers
interaction_model: InteractionModel = UnivariateInteractionModel(
    feature_config=feature_config,
    feature_layer_keys_to_fns=feature_layer_fns,
    tfrecord_type=TFRecordTypeKey.EXAMPLE,
)

### Step 1: Define the Scorer

In [3]:
from ml4ir.model.scoring.scoring_model import ScorerBase, RelevanceScorer
from ml4ir.model.losses.loss_base import RelevanceLossBase
from tensorflow.keras import layers
from tensorflow.keras import losses

class MyCustomLoss(RelevanceLossBase):
    """Example custom loss: binary cross entropy on sigmoid-activated scores."""

    def get_loss_fn(self, **kwargs):
        """
        Build a binary cross entropy loss function.

        Args:
            **kwargs: Extra values supplied by the caller (may carry entries
                such as ``mask``); this example loss does not use any of them.

        Returns:
            Callable ``(y_true, y_pred)`` returning the binary cross entropy,
            reduced with ``SUM_OVER_BATCH_SIZE``.
        """
        bce = losses.BinaryCrossentropy(reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)

        def _loss_fn(y_true, y_pred):
            # NOTE: Can use any of the metadata features to qualify your loss here
            return bce(y_true, y_pred)

        return _loss_fn

    def get_final_activation_op(self, output_name):
        """
        Build the final activation applied to the model's logits.

        Args:
            output_name: Name given to the Keras activation layer.

        Returns:
            Callable ``(logits, mask)`` that applies a sigmoid activation
            layer named ``output_name`` to ``logits``.
        """
        def _sigmoid_activation(logits, mask):
            # `mask` is part of the call signature but is not used here.
            return layers.Activation("sigmoid", name=output_name)(logits)

        return _sigmoid_activation

# Build the scorer from the default model config, wiring in the interaction
# model and the custom loss
model_config_path = '/Users/ashish.srinivasa/search_relevance/ml4ir/python/ml4ir/config/default_model_config.yaml'
scorer: ScorerBase = RelevanceScorer.from_model_config_file(
    model_config_file=model_config_path,
    interaction_model=interaction_model,
    loss=MyCustomLoss(),
    output_name="relevance_score",
)

# Log the loaded model config as pretty-printed JSON
model_config_json = json.dumps(scorer.model_config, indent=4)
logger.info(model_config_json)

INFO:root:{
    "architecture_key": "dnn",
    "layers": [
        {
            "type": "dense",
            "name": "first_dense",
            "units": 256,
            "activation": "relu"
        },
        {
            "type": "dropout",
            "name": "first_dropout",
            "rate": 0.0
        },
        {
            "type": "dense",
            "name": "second_dense",
            "units": 64,
            "activation": "relu"
        },
        {
            "type": "dropout",
            "name": "second_dropout",
            "rate": 0.0
        },
        {
            "type": "dense",
            "name": "final_dense",
            "units": 1,
            "activation": null
        }
    ]
}


### Step 2: Define Metrics

In [4]:
from tensorflow.keras import metrics as kmetrics

# Track binary accuracy (by Keras metric name) and precision (as a metric object)
precision_metric = kmetrics.Precision(name='precision')
metrics = ['binary_accuracy', precision_metric]

### Step 3: Define Optimizer

In [5]:
from tensorflow.keras.optimizers import Optimizer
from ml4ir.model.optimizer import get_optimizer
from ml4ir.config.keys import OptimizerKey

# Adam optimizer settings: learning rate, its decay schedule, and gradient clipping
optimizer_settings = dict(
    optimizer_key=OptimizerKey.ADAM,
    learning_rate=0.01,
    learning_rate_decay=0.94,
    learning_rate_decay_steps=1000,
    gradient_clip_value=50,
)
optimizer: Optimizer = get_optimizer(**optimizer_settings)

### Now... let's put it all together (shhh!)

In [6]:
from ml4ir.model.relevance_model import RelevanceModel
from ml4ir.config.keys import OptimizerKey

# Assemble the full model from the pieces defined above:
# feature config, scorer, metrics and optimizer
relevance_model_args = dict(
    feature_config=feature_config,
    scorer=scorer,
    metrics=metrics,
    optimizer=optimizer,
    tfrecord_type=TFRecordTypeKey.EXAMPLE,
    output_name="relevance_score",
    logger=logger,
)
relevance_model = RelevanceModel(**relevance_model_args)

INFO:root:Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
query_str (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_DecodePaddedRaw (Te [(None, 1, 20)]      0           query_str[0][0]                  
__________________________________________________________________________________________________
tf_op_layer_Reshape (TensorFlow [(None, 20)]         0           tf_op_layer_DecodePaddedRaw[0][0]
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 128)      32768       tf_op_layer_Reshape[0][0]        
____________________________________________________________________________________

In [12]:
# Train for up to 5 epochs, monitoring validation binary accuracy (higher is better)
training_models_dir = '/Users/ashish.srinivasa/search_relevance/model_training/test/models'
training_logs_dir = '/Users/ashish.srinivasa/search_relevance/model_training/test/logs'
relevance_model.fit(
    relevance_dataset,
    num_epochs=5,
    models_dir=training_models_dir,
    logs_dir=training_logs_dir,
    monitor_metric='val_binary_accuracy',
    monitor_mode='max',
)

INFO:root:Training Model
INFO:root:Starting Epoch : 1
INFO:root:{}


Epoch 1/5


INFO:root:[epoch: 1 | batch: 0] {'batch': 0, 'size': 128, 'loss': 0.6951301, 'binary_accuracy': 0.4296875, 'precision': 0.2795699}






     25/Unknown - 5s 187ms/step - loss: 0.5846 - binary_accuracy: 0.7237 - precision: 0.2796

INFO:root:[epoch: 1 | batch: 25] {'batch': 25, 'size': 128, 'loss': 0.5673238, 'binary_accuracy': 0.72415864, 'precision': 0.2795699}


     44/Unknown - 6s 131ms/step - loss: 0.5744 - binary_accuracy: 0.7282 - precision: 0.2796

INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00001: val_binary_accuracy improved from -inf to 0.73899, saving model to /Users/ashish.srinivasa/search_relevance/model_training/test/models/checkpoint.tf
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: /Users/ashish.srinivasa/search_relevance/model_training/test/models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: /Users/ashish.srinivasa/search_relevance/model_training/test/models/checkpoint.tf/assets
INFO:root:End of Epoch 1
INFO:root:{'loss': 0.5744227672165091, 'binary_accuracy': 0.7281605, 'precision': 0.2795699, 'val_loss': 0.5471044494347139, 'val_binary_accuracy': 0.7389915, 'val_precision': 0.0}




INFO:root:Starting Epoch : 2
INFO:root:{}
INFO:root:[epoch: 2 | batch: 0] {'batch': 0, 'size': 128, 'loss': 0.53769475, 'binary_accuracy': 0.75, 'precision': 0.0}


Epoch 2/5

INFO:root:[epoch: 2 | batch: 25] {'batch': 25, 'size': 128, 'loss': 0.5610144, 'binary_accuracy': 0.7364784, 'precision': 0.0}




INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None
INFO:root:End of Epoch 2
INFO:root:{'loss': 0.551779412410476, 'binary_accuracy': 0.7354403, 'precision': 0.0, 'val_loss': 0.5538304055278952, 'val_binary_accuracy': 0.7389915, 'val_precision': 0.0}



Epoch 00002: val_binary_accuracy did not improve from 0.73899


INFO:root:Starting Epoch : 3
INFO:root:{}
INFO:root:[epoch: 3 | batch: 0] {'batch': 0, 'size': 128, 'loss': 0.5520214, 'binary_accuracy': 0.75, 'precision': 0.0}


Epoch 3/5

INFO:root:[epoch: 3 | batch: 25] {'batch': 25, 'size': 128, 'loss': 0.56047297, 'binary_accuracy': 0.7364784, 'precision': 0.0}




INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None
INFO:root:End of Epoch 3
INFO:root:{'loss': 0.5519916056232019, 'binary_accuracy': 0.7354403, 'precision': 0.0, 'val_loss': 0.5560686249624599, 'val_binary_accuracy': 0.7389915, 'val_precision': 0.0}



Epoch 00003: val_binary_accuracy did not improve from 0.73899
Restoring model weights from the end of the best epoch.


INFO:root:Completed training model
INFO:root:None


Epoch 00003: early stopping


### Let's save the model (... and don't forget about serving signatures)

In [10]:
# Save the trained model, passing the custom preprocessing functions and
# restricting the serving signature inputs via `required_fields_only`
saved_models_dir = '/Users/ashish.srinivasa/search_relevance/model_training/test/models'
relevance_model.save(
    models_dir=saved_models_dir,
    preprocessing_keys_to_fns=preprocessing_fns,
    required_fields_only=True,
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: /Users/ashish.srinivasa/search_relevance/model_training/test/models/final/default/assets


INFO:tensorflow:Assets written to: /Users/ashish.srinivasa/search_relevance/model_training/test/models/final/default/assets


{'feat_0': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_1': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'feat_2': FixedLenFeature(shape=[], dtype='float', default_value=0.0), 'query_str': FixedLenFeature(shape=[], dtype='string', default_value=''), 'pos': FixedLenFeature(shape=[], dtype='int64', default_value=0)}
INFO:tensorflow:Assets written to: /Users/ashish.srinivasa/search_relevance/model_training/test/models/final/tfrecord/assets


INFO:tensorflow:Assets written to: /Users/ashish.srinivasa/search_relevance/model_training/test/models/final/tfrecord/assets
INFO:root:Final model saved to : /Users/ashish.srinivasa/search_relevance/model_training/test/models/final


### Reload the model for some fun

### Make some predictions