# Training a Learning to Rank Model using ml4ir
-----

### First, let's install ml4ir

In [1]:
!pip install ml4ir

Looking in indexes: https://pypi.python.org/simple






Collecting pyarrow<0.15.0,>=0.14.0
  Using cached pyarrow-0.14.1-cp37-cp37m-macosx_10_6_intel.whl (34.4 MB)
[31mERROR: tfx-bsl 0.15.3 has requirement absl-py<0.9,>=0.7, but you'll have absl-py 0.9.0 which is incompatible.[0m
[31mERROR: tfx-bsl 0.15.3 has requirement apache-beam[gcp]<2.17,>=2.16, but you'll have apache-beam 2.22.0 which is incompatible.[0m
[31mERROR: tensorflow-transform 0.15.0 has requirement absl-py<0.9,>=0.7, but you'll have absl-py 0.9.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement dill<0.3.2,>=0.3.1.1, but you'll have dill 0.3.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement httplib2<0.18.0,>=0.8, but you'll have httplib2 0.18.1 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement pyarrow<0.18.0,>=0.15.1; python_version >= "3.0" or platform_system != "Windows", but you'll have pyarrow 0.14.1 which is incompatible.[0m
Installing collected packages: pyarrow
  Attempting uninstall: pyar

### Load/Define a FeatureConfig

In [2]:
FEATURE_CONFIG_PATH = "configs/activate_2020/feature_config.yaml"
MODEL_CONFIG_PATH = "configs/activate_2020/model_config.yaml"
DATA_DIR = "../ml4ir/applications/ranking/tests/data/tfrecord"
MODELS_DIR = '../models/activate_2020'
LOGS_DIR = '../logs/activate_2020'
MAX_SEQUENCE_SIZE = 25

In [3]:
import logging
import tensorflow as tf

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
tf.get_logger().setLevel('INFO')
logging.debug("Logger is initialized...")

from ml4ir.base.io.local_io import LocalIO
from ml4ir.base.io.file_io import FileIO
file_io: FileIO = LocalIO()
file_io.make_directory(LOGS_DIR)
file_io.make_directory(MODELS_DIR)

from ml4ir.base.features.feature_config import FeatureConfig, SequenceExampleFeatureConfig
from ml4ir.base.config.keys import TFRecordTypeKey


feature_config: SequenceExampleFeatureConfig = FeatureConfig.get_instance(
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
    logger=logger)

DEBUG:root:Logger is initialized...
DEBUG:root:{
    "query_key": {
        "name": "query_id",
        "node_name": "query_id",
        "trainable": false,
        "dtype": "string",
        "log_at_inference": true,
        "feature_layer_info": {
            "type": "numeric",
            "shape": null
        },
        "serving_info": {
            "name": "queryId",
            "required": false,
            "default_value": ""
        },
        "tfrecord_type": "context"
    },
    "rank": {
        "name": "rank",
        "node_name": "rank",
        "trainable": false,
        "dtype": "int64",
        "log_at_inference": true,
        "feature_layer_info": {
            "type": "numeric",
            "shape": null
        },
        "serving_info": {
            "name": "originalRank",
            "required": true,
            "default_value": 0
        },
        "tfrecord_type": "sequence"
    },
    "label": {
        "name": "clicked",
        "node_name": "clicked",
   

### Create a RelevanceDataset from TFRecords

In [4]:
from ml4ir.base.data.relevance_dataset import RelevanceDataset
from ml4ir.base.config.keys import DataFormatKey
from ml4ir.base.config.keys import TFRecordTypeKey

ranking_dataset = RelevanceDataset(data_dir=DATA_DIR,
                                   data_format=DataFormatKey.TFRECORD,
                                   feature_config=feature_config,
                                   tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
                                   max_sequence_size=MAX_SEQUENCE_SIZE,
                                   batch_size=128,
                                   preprocessing_keys_to_fns={},
                                   file_io=file_io,
                                   logger=logger)

INFO:root:Found in ../ml4ir/applications/ranking/tests/data/tfrecord directory : ['../ml4ir/applications/ranking/tests/data/tfrecord/test', '../ml4ir/applications/ranking/tests/data/tfrecord/train', '../ml4ir/applications/ranking/tests/data/tfrecord/validation']
INFO:root:Created TFRecordDataset from SequenceExample protobufs from 1 files : ['../ml4ir/applications/ranking/tests/data/tfrecor
INFO:root:Created TFRecordDataset from SequenceExample protobufs from 1 files : ['../ml4ir/applications/ranking/tests/data/tfrecor
INFO:root:Created TFRecordDataset from SequenceExample protobufs from 1 files : ['../ml4ir/applications/ranking/tests/data/tfrecor


### Define an InteractionModel

In [5]:
from ml4ir.base.model.scoring.interaction_model import InteractionModel, UnivariateInteractionModel

# Define interaction model
interaction_model: InteractionModel = UnivariateInteractionModel(
                                            feature_config=feature_config,
                                            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
                                            max_sequence_size=MAX_SEQUENCE_SIZE,
                                            file_io=file_io,
                                        )

###  ... now with custom feature transforms

In [6]:
from tensorflow.keras import layers
from tensorflow import io

# Define custom feature transforms
def bytes_sequence_encoding_bigru(feature_tensor, feature_info, file_io: FileIO):
    args = feature_info["feature_layer_info"]["args"]

    # Decode string tensor to bytes
    feature_tensor = io.decode_raw(
        feature_tensor,
        out_type=tf.uint8,
        fixed_length=args.get("max_length", None),
    )

    feature_tensor = tf.squeeze(feature_tensor, axis=1)
    if "embedding_size" in args:
        char_embedding = layers.Embedding(
            name="{}_bytes_embedding".format(feature_info.get("node_name", feature_info.get("name"))),
            input_dim=256, # bytes vocabulary size is fixed at 256
            output_dim=args["embedding_size"],
            mask_zero=True,
            input_length=args.get("max_length", None),
        )(feature_tensor)
    else:
        char_embedding = tf.one_hot(feature_tensor, depth=256)

    #############################################################################
    # Compute sequence encoding using GRU
    encoding = layers.Bidirectional(
        layers.GRU(units=int(args["encoding_size"] / 2), return_sequences=False),
        merge_mode="concat",
    )(char_embedding)
    #############################################################################
    
    encoding = tf.expand_dims(encoding, axis=1)
    return encoding

feature_layer_keys_to_fns = {
    "bytes_sequence_encoding_bigru" : bytes_sequence_encoding_bigru
}

# Define interaction model
interaction_model: InteractionModel = UnivariateInteractionModel(
                                            feature_config=feature_config,
                                            feature_layer_keys_to_fns=feature_layer_keys_to_fns,
                                            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
                                            max_sequence_size=MAX_SEQUENCE_SIZE,
                                            file_io=file_io,
                                        )

### Use predefined losses, metrics and optimizers or create your own!

In [7]:
from ml4ir.applications.ranking.model.losses import loss_factory
from ml4ir.applications.ranking.model.metrics import metric_factory
from ml4ir.base.model.losses.loss_base import RelevanceLossBase
from ml4ir.base.model.optimizer import get_optimizer

from ml4ir.applications.ranking.config.keys import LossKey, MetricKey, ScoringTypeKey
from ml4ir.base.config.keys import OptimizerKey

from tensorflow.keras.metrics import Metric
from tensorflow.keras.optimizers import Optimizer
from typing import List, Union, Type


# Define loss object from loss key
loss: RelevanceLossBase = loss_factory.get_loss(
                                loss_key=LossKey.RANK_ONE_LISTNET,
                                scoring_type=ScoringTypeKey.POINTWISE)
    
# Define metrics objects from metrics keys
metrics: List[Union[Type[Metric], str]] = [metric_factory.get_metric(metric_key="MRR")]
    
# Define optimizer
optimizer: Optimizer = get_optimizer(
                            optimizer_key=OptimizerKey.ADAM,
                            learning_rate=0.001
                        )

### Define a scoring function, or the Scorer

In [8]:
from ml4ir.base.model.scoring.scoring_model import RelevanceScorer

# Define scorer
print(open(MODEL_CONFIG_PATH).read())
scorer: RelevanceScorer = RelevanceScorer.from_model_config_file(
    model_config_file=MODEL_CONFIG_PATH,
    interaction_model=interaction_model,
    loss=loss,
    logger=logger,
    file_io=file_io,
)

architecture_key: dnn
layers:
  - type: dense
    name: first_dense
    units: 256
    activation: relu
  - type: dropout
    name: first_dropout
    rate: 0.3
  - type: dense
    name: second_dense
    units: 64
    activation: relu
  - type: dense
    name: final_dense
    units: 1
    activation: null



### Combine it all to create a RankingModel

In [9]:
from ml4ir.applications.ranking.model.ranking_model import RankingModel
from ml4ir.base.model.relevance_model import RelevanceModel


# Combine the above to define a RelevanceModel
ranking_model: RelevanceModel = RankingModel(
                                    feature_config=feature_config,
                                    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
                                    scorer=scorer,
                                    metrics=metrics,
                                    optimizer=optimizer,
                                    file_io=file_io,
                                    logger=logger,
                                )

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/Users/ashish.srinivasa/search_relevance/ml4ir/python/notebooks, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/Users/ashish.srinivasa/search_relevance/ml4ir/python/notebooks, universal_newlines=False, shell=None, istream=None)
INFO:root:Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
query_text (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mask (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
tf_op_layer_DecodePaddedRaw (Te [(None, 1, 20)]      0     

### Finally, time to train the model

In [10]:
ranking_model.fit(dataset=ranking_dataset,
                  num_epochs=3, 
                  models_dir='../models',
                  logs_dir='../logs',
                  monitor_metric="new_MRR",
                  monitor_mode="max")

INFO:root:Training Model
INFO:root:Starting Epoch : 1
INFO:root:{}


Epoch 1/3


INFO:root:[epoch: 1 | batch: 0] {'batch': 0, 'size': 128, 'loss': 2.0685291, 'old_MRR': 0.8084635, 'new_MRR': 0.58808935}






     11/Unknown - 9s 860ms/step - loss: 1.9660 - old_MRR: 0.7875 - new_MRR: 0.6473

INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00001: val_new_MRR improved from -inf to 0.70512, saving model to ../models/checkpoint.tf
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets
INFO:root:End of Epoch 1
INFO:root:{'loss': 1.9660225022922864, 'old_MRR': 0.7874729, 'new_MRR': 0.6473302, 'val_loss': 1.9555319222536953, 'val_old_MRR': 0.7827933, 'val_new_MRR': 0.7051184}




INFO:root:Starting Epoch : 2
INFO:root:{}


Epoch 2/3


INFO:root:[epoch: 2 | batch: 0] {'batch': 0, 'size': 128, 'loss': 1.9890631, 'old_MRR': 0.8084635, 'new_MRR': 0.6610025}




INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00002: val_new_MRR improved from 0.70512 to 0.70698, saving model to ../models/checkpoint.tf
INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets
INFO:root:End of Epoch 2
INFO:root:{'loss': 1.890663201158697, 'old_MRR': 0.7874729, 'new_MRR': 0.6824102, 'val_loss': 1.895951877940785, 'val_old_MRR': 0.7827933, 'val_new_MRR': 0.70697683}




INFO:root:Starting Epoch : 3
INFO:root:{}


Epoch 3/3


INFO:root:[epoch: 3 | batch: 0] {'batch': 0, 'size': 128, 'loss': 1.9199574, 'old_MRR': 0.8084635, 'new_MRR': 0.6828125}




INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00003: val_new_MRR improved from 0.70698 to 0.70732, saving model to ../models/checkpoint.tf
INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets
INFO:root:End of Epoch 3
INFO:root:{'loss': 1.8445354158228093, 'old_MRR': 0.7874729, 'new_MRR': 0.6921806, 'val_loss': 1.8489787470210681, 'val_old_MRR': 0.7827933, 'val_new_MRR': 0.7073162}




INFO:root:Completed training model
INFO:root:None


### Let's try some Transfer Learning
##### Using bytes embeddings from a query classification model

In [11]:
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[ 0.04817661, -0.02310021, -0.0158019 , ...,  0.04689885,
          0.04715741, -0.02395461],
        [ 0.01107422, -0.04068863,  0.03669446, ..., -0.04298381,
          0.04932303, -0.03682303],
        [ 0.03654481,  0.01255837, -0.00286949, ...,  0.04108845,
         -0.02460071, -0.02626768],
        ...,
        [-0.01108004,  0.00978049,  0.04506851, ..., -0.03921436,
         -0.04932214,  0.02432754],
        [-0.02197567,  0.04634149, -0.01837877, ..., -0.04804767,
         -0.0298071 ,  0.00523918],
        [-0.00237392, -0.02369238,  0.01442983, ..., -0.0254912 ,
         -0.00172541, -0.0236601 ]], dtype=float32)]

In [14]:
initialize_layers_dict = {
    "query_text_bytes_embedding" : "../models/test_wandb2/final/layers/query_text_bytes_embedding.npy"
}
freeze_layers_list = ["query_text_bytes_embedding"]
ranking_model: RelevanceModel = RankingModel(
                                    feature_config=feature_config,
                                    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
                                    scorer=scorer,
                                    metrics=metrics,
                                    optimizer=optimizer,
                                    initialize_layers_dict=initialize_layers_dict,
                                    freeze_layers_list=freeze_layers_list,
                                    file_io=file_io,
                                    logger=logger,
                                )

INFO:root:Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
query_text (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mask (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
tf_op_layer_DecodePaddedRaw_3 ( [(None, 1, 20)]      0           query_text[0][0]                 
__________________________________________________________________________________________________
tf_op_layer_GatherV2_6 (TensorF [(None,)]            0           mask[0][0]                       
__________________________________________________________________________________

INFO:root:Setting query_text_bytes_embedding weights from ../models/test_wandb2/final/layers/query_text_bytes_embedding.npy
INFO:root:Freezing query_text_bytes_embedding layer


In [15]:
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[-0.03738469,  0.00727513, -0.02006867, ...,  0.01078511,
         -0.028496  , -0.04102874],
        [ 0.00512887,  0.0062821 , -0.0010671 , ...,  0.04945388,
         -0.0132054 , -0.01177131],
        [-0.00937623, -0.03438247,  0.00176773, ...,  0.046078  ,
         -0.0310035 , -0.04288797],
        ...,
        [-0.04410168,  0.0383402 ,  0.03348425, ...,  0.02123589,
          0.02240864, -0.04049417],
        [ 0.03601265,  0.04585798,  0.00272902, ..., -0.00353998,
         -0.04783431,  0.02852656],
        [ 0.04903785, -0.03518286,  0.00195389, ...,  0.03783921,
         -0.01398294,  0.0107099 ]], dtype=float32)]

In [16]:
ranking_model.fit(dataset=ranking_dataset,
                  num_epochs=3, 
                  models_dir='../models',
                  logs_dir='../logs',
                  monitor_metric="new_MRR",
                  monitor_mode="max")

INFO:root:Training Model
INFO:root:Starting Epoch : 1
INFO:root:{}


Epoch 1/3


INFO:root:[epoch: 1 | batch: 0] {'batch': 0, 'size': 128, 'loss': 2.0706391, 'old_MRR': 0.8084635, 'new_MRR': 0.5596571}






     11/Unknown - 21s 2s/step - loss: 1.9583 - old_MRR: 0.7875 - new_MRR: 0.6669

INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00001: val_new_MRR improved from -inf to 0.70563, saving model to ../models/checkpoint.tf
INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets
INFO:root:End of Epoch 1
INFO:root:{'loss': 1.958261793309992, 'old_MRR': 0.7874729, 'new_MRR': 0.6668506, 'val_loss': 1.935523585839705, 'val_old_MRR': 0.7827933, 'val_new_MRR': 0.7056252}




INFO:root:Starting Epoch : 2
INFO:root:{}


Epoch 2/3


INFO:root:[epoch: 2 | batch: 0] {'batch': 0, 'size': 128, 'loss': 1.9837611, 'old_MRR': 0.8084635, 'new_MRR': 0.6734375}




INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00002: val_new_MRR improved from 0.70563 to 0.70739, saving model to ../models/checkpoint.tf
INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets
INFO:root:End of Epoch 2
INFO:root:{'loss': 1.8647452592849731, 'old_MRR': 0.7874729, 'new_MRR': 0.69406474, 'val_loss': 1.851927789774808, 'val_old_MRR': 0.7827933, 'val_new_MRR': 0.70739114}




INFO:root:Starting Epoch : 3
INFO:root:{}


Epoch 3/3


INFO:root:[epoch: 3 | batch: 0] {'batch': 0, 'size': 128, 'loss': 1.9621739, 'old_MRR': 0.8084635, 'new_MRR': 0.6690104}




INFO:root:Evaluating Model
INFO:root:Completed evaluating model
INFO:root:None



Epoch 00003: val_new_MRR improved from 0.70739 to 0.70956, saving model to ../models/checkpoint.tf
INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets


INFO:tensorflow:Assets written to: ../models/checkpoint.tf/assets
INFO:root:End of Epoch 3
INFO:root:{'loss': 1.8506084463813088, 'old_MRR': 0.7874729, 'new_MRR': 0.67757916, 'val_loss': 1.8340057134628296, 'val_old_MRR': 0.7827933, 'val_new_MRR': 0.70955735}




INFO:root:Completed training model
INFO:root:None


In [17]:
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[-0.03738469,  0.00727513, -0.02006867, ...,  0.01078511,
         -0.028496  , -0.04102874],
        [ 0.00512887,  0.0062821 , -0.0010671 , ...,  0.04945388,
         -0.0132054 , -0.01177131],
        [-0.00937623, -0.03438247,  0.00176773, ...,  0.046078  ,
         -0.0310035 , -0.04288797],
        ...,
        [-0.04410168,  0.0383402 ,  0.03348425, ...,  0.02123589,
          0.02240864, -0.04049417],
        [ 0.03601265,  0.04585798,  0.00272902, ..., -0.00353998,
         -0.04783431,  0.02852656],
        [ 0.04903785, -0.03518286,  0.00195389, ...,  0.03783921,
         -0.01398294,  0.0107099 ]], dtype=float32)]