# Training a Learning to Rank Model using ml4ir
-----

### First, let's install ml4ir

In [1]:
!pip install ml4ir

Looking in indexes: https://pypi.python.org/simple






Collecting pyarrow<0.15.0,>=0.14.0
  Using cached pyarrow-0.14.1-cp37-cp37m-macosx_10_6_intel.whl (34.4 MB)
[31mERROR: tfx-bsl 0.15.3 has requirement absl-py<0.9,>=0.7, but you'll have absl-py 0.9.0 which is incompatible.[0m
[31mERROR: tfx-bsl 0.15.3 has requirement apache-beam[gcp]<2.17,>=2.16, but you'll have apache-beam 2.22.0 which is incompatible.[0m
[31mERROR: tensorflow-transform 0.15.0 has requirement absl-py<0.9,>=0.7, but you'll have absl-py 0.9.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement dill<0.3.2,>=0.3.1.1, but you'll have dill 0.3.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement httplib2<0.18.0,>=0.8, but you'll have httplib2 0.18.1 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement pyarrow<0.18.0,>=0.15.1; python_version >= "3.0" or platform_system != "Windows", but you'll have pyarrow 0.14.1 which is incompatible.[0m
Installing collected packages: pyarrow
  Attempting uninstall: pyar

### Load/Define a FeatureConfig

In [1]:
# Value passed as max_sequence_size to the dataset and the interaction model.
MAX_SEQUENCE_SIZE = 25

# YAML files describing the features and the model architecture.
FEATURE_CONFIG_PATH = "configs/activate_2020/feature_config.yaml"
MODEL_CONFIG_PATH = "configs/activate_2020/model_config.yaml"

# Directory with the TFRecord input data.
DATA_DIR = "../ml4ir/applications/ranking/tests/data/tfrecord"

# Output directories for this run; both are created with clear_dir=True further down.
MODELS_DIR = "../models/activate_2020"
LOGS_DIR = "../logs/activate_2020"

# Load the TensorBoard notebook extension; the %tensorboard magic is used after training.
%load_ext tensorboard

In [4]:
import logging
import tensorflow as tf

# Root logger (getLogger() called without a name); it is handed to the ml4ir
# objects created in the cells below via their `logger` argument.
logger = logging.getLogger()
# Uncomment the following lines to get more verbose logging while debugging:
# logger.setLevel(logging.DEBUG)
# tf.get_logger().setLevel('INFO')
# logging.debug("Logger is initialized...")

from ml4ir.base.io.local_io import LocalIO
from ml4ir.base.io.file_io import FileIO
# File IO helper backed by LocalIO; it is passed to every ml4ir object below.
file_io: FileIO = LocalIO()

# Create the output directories: logs first, then models.
# NOTE(review): clear_dir=True presumably wipes existing contents of these
# directories on every run — confirm before pointing them at anything valuable.
for output_dir in (LOGS_DIR, MODELS_DIR):
    file_io.make_directory(output_dir, clear_dir=True)

from ml4ir.base.features.feature_config import FeatureConfig, SequenceExampleFeatureConfig
from ml4ir.base.config.keys import TFRecordTypeKey


# Read the feature configuration YAML first ...
feature_config_dict = file_io.read_yaml(FEATURE_CONFIG_PATH)

# ... then build the feature config object for SequenceExample TFRecords from it.
feature_config: SequenceExampleFeatureConfig = FeatureConfig.get_instance(
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    feature_config_dict=feature_config_dict,
    logger=logger,
)

# Print a header followed by the names of the training features, one per line.
print("Training features\n-----------------")
train_feature_names = feature_config.get_train_features(key="name")
print("\n".join(train_feature_names))

Training features
-----------------
text_match_score
page_views_score
quality_score
query_text
domain_id
domain_name


### Create a RelevanceDataset from TFRecords

In [5]:
from ml4ir.base.data.relevance_dataset import RelevanceDataset
from ml4ir.base.config.keys import DataFormatKey
from ml4ir.base.config.keys import TFRecordTypeKey

# Build the dataset from the TFRecord files under DATA_DIR, using the feature
# config and the shared file_io / logger objects defined earlier.
ranking_dataset = RelevanceDataset(
    data_dir=DATA_DIR,
    data_format=DataFormatKey.TFRECORD,
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    max_sequence_size=MAX_SEQUENCE_SIZE,
    batch_size=128,
    preprocessing_keys_to_fns={},  # no custom preprocessing functions registered
    file_io=file_io,
    logger=logger,
)

### Define an InteractionModel

In [6]:
from ml4ir.base.model.scoring.interaction_model import InteractionModel, UnivariateInteractionModel

# Interaction model built with the feature config only (no custom feature transforms).
interaction_model: InteractionModel = UnivariateInteractionModel(
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    max_sequence_size=MAX_SEQUENCE_SIZE,
    file_io=file_io,
)

### ... now with custom feature transforms

In [7]:
from tensorflow.keras import layers
from tensorflow import io

# Define custom feature transforms
def bytes_sequence_encoding_bigru(feature_tensor, feature_info, file_io: FileIO):
    """
    Encode a string feature with a bidirectional GRU over its raw bytes.

    The string tensor is decoded into uint8 byte values, each byte is turned
    into a vector (Embedding layer or one-hot), and the resulting sequence is
    reduced to a single vector by a bidirectional GRU.

    Parameters
    ----------
    feature_tensor : Tensor
        String tensor to encode.
        NOTE(review): axis 1 is squeezed after decoding, so it is presumably a
        singleton dimension — confirm against the caller.
    feature_info : dict
        Feature definition. The arguments are read from
        feature_info["feature_layer_info"]["args"]:
        - "encoding_size" (required): the GRU uses int(encoding_size / 2) units
          per direction and both directions are concatenated
        - "max_length" (optional): passed as fixed_length to decode_raw and as
          input_length to the Embedding layer
        - "embedding_size" (optional): when present, bytes are embedded with an
          Embedding layer of this output size; otherwise they are one-hot
          encoded with depth 256
        The Embedding layer is named "<node_name>_bytes_embedding", falling
        back to the feature's "name" when "node_name" is absent.
    file_io : FileIO
        Not used by this function.

    Returns
    -------
    Tensor
        The BiGRU output with a new axis inserted at position 1.
    """
    layer_args = feature_info["feature_layer_info"]["args"]
    max_length = layer_args.get("max_length", None)

    # Decode string tensor to bytes, then drop axis 1
    byte_ids = io.decode_raw(feature_tensor, out_type=tf.uint8, fixed_length=max_length)
    byte_ids = tf.squeeze(byte_ids, axis=1)

    # Turn every byte into a vector: one-hot by default, embedding when requested
    if "embedding_size" not in layer_args:
        byte_vectors = tf.one_hot(byte_ids, depth=256)
    else:
        embedding_layer = layers.Embedding(
            name="{}_bytes_embedding".format(feature_info.get("node_name", feature_info.get("name"))),
            input_dim=256,  # bytes vocabulary size is fixed at 256
            output_dim=layer_args["embedding_size"],
            mask_zero=True,
            input_length=max_length,
        )
        byte_vectors = embedding_layer(byte_ids)

    # Compute sequence encoding using a bidirectional GRU; each direction gets
    # half of the requested encoding size and the two outputs are concatenated
    units_per_direction = int(layer_args["encoding_size"] / 2)
    bigru = layers.Bidirectional(
        layers.GRU(units=units_per_direction, return_sequences=False),
        merge_mode="concat",
    )
    sequence_encoding = bigru(byte_vectors)

    # Re-insert the axis that was squeezed out above
    return tf.expand_dims(sequence_encoding, axis=1)

# Registry of custom feature transforms: key -> function implementing it.
feature_layer_keys_to_fns = dict(
    bytes_sequence_encoding_bigru=bytes_sequence_encoding_bigru,
)

# Interaction model again, this time with the custom feature transforms registered.
interaction_model: InteractionModel = UnivariateInteractionModel(
    feature_config=feature_config,
    feature_layer_keys_to_fns=feature_layer_keys_to_fns,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    max_sequence_size=MAX_SEQUENCE_SIZE,
    file_io=file_io,
)

### Use predefined losses, metrics and optimizers or create your own!

In [8]:
from ml4ir.applications.ranking.model.losses import loss_factory
from ml4ir.applications.ranking.model.metrics import metric_factory
from ml4ir.base.model.losses.loss_base import RelevanceLossBase
from ml4ir.base.model.optimizer import get_optimizer

from ml4ir.applications.ranking.config.keys import LossKey, MetricKey, ScoringTypeKey
from ml4ir.base.config.keys import OptimizerKey

from tensorflow.keras.metrics import Metric
from tensorflow.keras.optimizers import Optimizer
from typing import List, Union, Type


# Loss: looked up from a loss key together with the scoring type.
loss: RelevanceLossBase = loss_factory.get_loss(
    loss_key=LossKey.RANK_ONE_LISTNET,
    scoring_type=ScoringTypeKey.POINTWISE,
)

# Metrics: a list holding the single metric returned for the "MRR" key.
mrr_metric = metric_factory.get_metric(metric_key="MRR")
metrics: List[Union[Type[Metric], str]] = [mrr_metric]

# Optimizer: requested by key (ADAM) with a learning rate of 0.001.
optimizer: Optimizer = get_optimizer(
    optimizer_key=OptimizerKey.ADAM,
    learning_rate=0.001,
)

### Define a scoring function, or the Scorer

In [9]:
from ml4ir.base.model.scoring.scoring_model import RelevanceScorer

# Show the model configuration that the scorer is built from.
# The context manager closes the file handle as soon as the content is read,
# instead of leaving it open until garbage collection.
with open(MODEL_CONFIG_PATH) as model_config_fp:
    print(model_config_fp.read())

# Define scorer from the model config file, the interaction model and the loss.
scorer: RelevanceScorer = RelevanceScorer.from_model_config_file(
    model_config_file=MODEL_CONFIG_PATH,
    interaction_model=interaction_model,
    loss=loss,
    logger=logger,
    file_io=file_io,
)

architecture_key: dnn
layers:
  - type: dense
    name: first_dense
    units: 256
    activation: relu
  - type: dropout
    name: first_dropout
    rate: 0.3
  - type: dense
    name: second_dense
    units: 64
    activation: relu
  - type: dense
    name: final_dense
    units: 1
    activation: null



### Combine it all to create a RankingModel

In [10]:
from ml4ir.applications.ranking.model.ranking_model import RankingModel
from ml4ir.base.model.relevance_model import RelevanceModel


# Tie the feature config, scorer, metrics and optimizer together in one RelevanceModel.
ranking_model: RelevanceModel = RankingModel(
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    scorer=scorer,
    metrics=metrics,
    optimizer=optimizer,
    file_io=file_io,
    logger=logger,
)

### Finally, time to train the model

In [11]:
# Train for 3 epochs, passing MODELS_DIR and LOGS_DIR as the output locations.
# "new_MRR" is the monitored metric and monitor_mode="max" means higher is better.
ranking_model.fit(
    dataset=ranking_dataset,
    num_epochs=3,
    models_dir=MODELS_DIR,
    logs_dir=LOGS_DIR,
    monitor_metric="new_MRR",
    monitor_mode="max",
)

Epoch 1/3
     11/Unknown - 9s 816ms/step - loss: 1.9723 - old_MRR: 0.7875 - new_MRR: 0.6603
Epoch 00001: val_new_MRR improved from -inf to 0.70766, saving model to ../models/activate_2020/checkpoint.tf
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../models/activate_2020/checkpoint.tf/assets
Epoch 2/3
Epoch 00002: val_new_MRR did not improve from 0.70766
Epoch 3/3
Epoch 00003: val_new_MRR did not improve from 0.70766
Restoring model weights from the end of the best epoch.
Epoch 00003: early stopping


In [12]:
%tensorboard --logdir ../logs/activate_2020 --host localhost

Reusing TensorBoard on port 6006 (pid 43897), started 0:03:31 ago. (Use '!kill 43897' to kill it.)

### Let's try some Transfer Learning
##### Using bytes embeddings from a query classification model

In [13]:
# Display the current weights of the "query_text_bytes_embedding" layer of the trained model.
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[ 0.0343091 , -0.01302702,  0.04351665, ...,  0.00049679,
          0.00265286, -0.02234181],
        [ 0.00524911,  0.04285176,  0.01042024, ..., -0.02495209,
          0.04603917,  0.00322132],
        [ 0.0233542 , -0.03867441,  0.04775677, ..., -0.0047126 ,
         -0.03055434,  0.00582387],
        ...,
        [-0.00038807, -0.04099176, -0.0269978 , ...,  0.0316273 ,
          0.02206803,  0.00144542],
        [ 0.01976344, -0.02634192,  0.01420883, ...,  0.0106725 ,
         -0.04455068,  0.02625582],
        [ 0.00829865,  0.00498176,  0.0074448 , ..., -0.04165881,
         -0.00186209, -0.02414169]], dtype=float32)]

In [14]:
# Layer name -> path of a saved .npy file for that layer.
# NOTE(review): presumably the file content is loaded as the layer's weights,
# and the path points at the output of a separate training run — confirm it exists.
initialize_layers_dict = dict(
    query_text_bytes_embedding="../models/test_wandb2/final/layers/query_text_bytes_embedding.npy",
)

# Names of the layers passed as freeze_layers_list (presumably kept non-trainable).
freeze_layers_list = ["query_text_bytes_embedding"]

# Build the ranking model again with the same scorer, metrics and optimizer,
# now additionally passing the layer initialization and freeze settings.
ranking_model: RelevanceModel = RankingModel(
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    scorer=scorer,
    metrics=metrics,
    optimizer=optimizer,
    initialize_layers_dict=initialize_layers_dict,
    freeze_layers_list=freeze_layers_list,
    file_io=file_io,
    logger=logger,
)

In [15]:
# Display the weights of the "query_text_bytes_embedding" layer of the re-created model.
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[-0.03738469,  0.00727513, -0.02006867, ...,  0.01078511,
         -0.028496  , -0.04102874],
        [ 0.00512887,  0.0062821 , -0.0010671 , ...,  0.04945388,
         -0.0132054 , -0.01177131],
        [-0.00937623, -0.03438247,  0.00176773, ...,  0.046078  ,
         -0.0310035 , -0.04288797],
        ...,
        [-0.04410168,  0.0383402 ,  0.03348425, ...,  0.02123589,
          0.02240864, -0.04049417],
        [ 0.03601265,  0.04585798,  0.00272902, ..., -0.00353998,
         -0.04783431,  0.02852656],
        [ 0.04903785, -0.03518286,  0.00195389, ...,  0.03783921,
         -0.01398294,  0.0107099 ]], dtype=float32)]