### First, let's install ml4ir

In [1]:
!pip install ml4ir

Looking in indexes: https://pypi.python.org/simple






Collecting pyarrow<0.15.0,>=0.14.0
  Using cached pyarrow-0.14.1-cp37-cp37m-macosx_10_6_intel.whl (34.4 MB)
[31mERROR: tfx-bsl 0.15.3 has requirement absl-py<0.9,>=0.7, but you'll have absl-py 0.9.0 which is incompatible.[0m
[31mERROR: tfx-bsl 0.15.3 has requirement apache-beam[gcp]<2.17,>=2.16, but you'll have apache-beam 2.22.0 which is incompatible.[0m
[31mERROR: tensorflow-transform 0.15.0 has requirement absl-py<0.9,>=0.7, but you'll have absl-py 0.9.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement dill<0.3.2,>=0.3.1.1, but you'll have dill 0.3.0 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement httplib2<0.18.0,>=0.8, but you'll have httplib2 0.18.1 which is incompatible.[0m
[31mERROR: apache-beam 2.22.0 has requirement pyarrow<0.18.0,>=0.15.1; python_version >= "3.0" or platform_system != "Windows", but you'll have pyarrow 0.14.1 which is incompatible.[0m
Installing collected packages: pyarrow
  Attempting uninstall: pyar

### Load/Define a FeatureConfig

In [1]:
# --- Configuration files -------------------------------------------------
MODEL_CONFIG_PATH = "configs/activate_2020/model_config.yaml"
FEATURE_CONFIG_PATH = "configs/activate_2020/feature_config.yaml"

# --- Input data ----------------------------------------------------------
DATA_DIR = "../ml4ir/applications/ranking/tests/data/tfrecord"
VOCAB_FILE = "../ml4ir/applications/ranking/tests/data/config/domain_name_vocab.csv"

# --- Output locations (models and logs of this run) ----------------------
MODELS_DIR = "../models/activate_2020"
LOGS_DIR = "../logs/activate_2020"

# --- Data shape ----------------------------------------------------------
# Passed as `max_sequence_size` to the dataset and the interaction model.
MAX_SEQUENCE_SIZE = 25

# Load the TensorBoard notebook extension so that the `%tensorboard` magic
# used after training is available.
%load_ext tensorboard

In [2]:
import logging
import tensorflow as tf

# Root logger; it is handed to the ml4ir components created in this notebook.
logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)
# tf.get_logger().setLevel('INFO')
# logging.debug("Logger is initialized...")

from ml4ir.base.io.local_io import LocalIO
from ml4ir.base.io.file_io import FileIO
# File-system helper shared by every component below.
file_io: FileIO = LocalIO()
# NOTE(review): clear_dir=True presumably wipes any existing contents of these
# directories, so re-running this cell would discard earlier logs and models - confirm.
file_io.make_directory(LOGS_DIR, clear_dir=True)
file_io.make_directory(MODELS_DIR, clear_dir=True)

from ml4ir.base.features.feature_config import FeatureConfig, SequenceExampleFeatureConfig
from ml4ir.base.config.keys import TFRecordTypeKey


# Build the feature configuration from the YAML file at FEATURE_CONFIG_PATH,
# for SequenceExample-style TFRecords.
feature_config: SequenceExampleFeatureConfig = FeatureConfig.get_instance(
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
    logger=logger)
# Print the names returned by get_train_features, one per line, under a header.
print("Training features\n-----------------")
print("\n".join(feature_config.get_train_features(key="name")))

Training features
-----------------
text_match_score
page_views_score
quality_score
query_text
domain_id
domain_name


### Create a RelevanceDataset from TFRecords

In [4]:
from ml4ir.base.data.relevance_dataset import RelevanceDataset
from ml4ir.base.config.keys import DataFormatKey
from ml4ir.base.config.keys import TFRecordTypeKey

def get_relevance_dataset(preprocessing_keys_to_fns=None):
    """Build a RelevanceDataset over the TFRecord files in DATA_DIR.

    Args:
        preprocessing_keys_to_fns: Optional dict forwarded unchanged to
            ``RelevanceDataset`` as ``preprocessing_keys_to_fns``. When
            omitted (or ``None``) a fresh empty dict is used.

    Returns:
        The RelevanceDataset created from the notebook-level configuration
        (DATA_DIR, feature_config, MAX_SEQUENCE_SIZE, file_io, logger) with a
        batch size of 128.
    """
    # A literal `{}` default is created once at definition time and would be
    # shared by every call (and by every dataset it is handed to); create a
    # new dict per call instead.
    if preprocessing_keys_to_fns is None:
        preprocessing_keys_to_fns = {}

    return RelevanceDataset(data_dir=DATA_DIR,
                            data_format=DataFormatKey.TFRECORD,
                            feature_config=feature_config,
                            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
                            max_sequence_size=MAX_SEQUENCE_SIZE,
                            batch_size=128,
                            preprocessing_keys_to_fns=preprocessing_keys_to_fns,
                            file_io=file_io,
                            logger=logger)

# Dataset used for training below; built without any custom preprocessing functions.
ranking_dataset = get_relevance_dataset()

### Define an InteractionModel

In [5]:
from ml4ir.base.model.scoring.interaction_model import InteractionModel, UnivariateInteractionModel

# Interaction model over the features described by `feature_config`.
# No custom feature-layer functions are passed at this point.
interaction_model: InteractionModel = UnivariateInteractionModel(
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    max_sequence_size=MAX_SEQUENCE_SIZE,
    file_io=file_io,
)

###  ... now with custom feature transforms

In [10]:
from tensorflow.keras import layers
from tensorflow import io

# Define custom feature transforms
def bytes_sequence_encoding_bigru(feature_tensor, feature_info, file_io: FileIO):
    """Encode a string feature by running its bytes through a bidirectional GRU.

    Steps, in order: decode the string tensor into uint8 byte values, squeeze
    axis 1, turn every byte into a vector (a learned embedding when
    `embedding_size` is configured, a one-hot vector of depth 256 otherwise),
    feed the sequence to a bidirectional GRU and re-insert axis 1.

    Args:
        feature_tensor: String tensor holding the raw feature values.
            NOTE(review): assumed to carry a singleton dimension at axis 1,
            since that axis is squeezed right after decoding - confirm.
        feature_info: Feature definition. Reads `node_name` (falling back to
            `name`) and `feature_layer_info["args"]`, which must contain
            `encoding_size` and may contain `embedding_size` and `max_length`.
        file_io: Not used by this function; kept so the signature matches the
            other custom feature transform in this notebook.

    Returns:
        The output of the bidirectional GRU with a new dimension inserted at
        axis 1.
    """
    node_name = feature_info.get("node_name", feature_info.get("name"))
    layer_args = feature_info["feature_layer_info"]["args"]
    max_length = layer_args.get("max_length", None)

    # Strings -> sequences of uint8 byte values, then drop axis 1
    byte_ids = tf.squeeze(
        io.decode_raw(feature_tensor, out_type=tf.uint8, fixed_length=max_length),
        axis=1,
    )

    # Bytes -> vectors: one-hot unless an embedding size was configured
    if "embedding_size" not in layer_args:
        byte_vectors = tf.one_hot(byte_ids, depth=256)
    else:
        embedding_layer = layers.Embedding(
            name="{}_bytes_embedding".format(node_name),
            input_dim=256,  # bytes vocabulary size is fixed at 256
            output_dim=layer_args["embedding_size"],
            mask_zero=True,
            input_length=max_length,
        )
        byte_vectors = embedding_layer(byte_ids)

    # Each GRU direction gets half of `encoding_size`; the two directions are
    # concatenated by the Bidirectional wrapper.
    gru = layers.GRU(units=int(layer_args["encoding_size"] / 2), return_sequences=False)
    bigru = layers.Bidirectional(gru, merge_mode="concat", name="{}_bigru".format(node_name))

    return tf.expand_dims(bigru(byte_vectors), axis=1)

from transformers import TFAutoModelForSequenceClassification
from transformers import DistilBertTokenizer

def distilbert_encode(feature_tensor, feature_info, file_io: FileIO):
    """Encode a feature with a pretrained DistilBERT sequence-classification model.

    Args:
        feature_tensor: Input passed directly to the DistilBERT model.
            NOTE(review): presumably already tokenized - confirm against the
            preprocessing configured for the feature.
        feature_info: Feature definition. Reads `node_name` (falling back to
            `name`) and `feature_layer_info["args"]`; the optional
            `encoding_size` argument is used as the model's `num_labels`.
        file_io: Not used by this function; kept so the signature matches the
            other custom feature transform in this notebook.

    Returns:
        Whatever the DistilBERT model returns when called on `feature_tensor`.
        NOTE(review): this may be a tuple/output object rather than a bare
        tensor, and no axis is inserted here (unlike the BiGRU transform) -
        confirm what the feature layer expects.
    """
    feature_name = feature_info.get("node_name", feature_info.get("name"))
    args = feature_info["feature_layer_info"]["args"]

    distilbert_encoder = TFAutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=args.get("encoding_size"),
        name="{}_distilbert".format(feature_name),
    )

    # Return the model output; without the return the transform yields None.
    return distilbert_encoder(feature_tensor)

# Maps the `fn` names usable in a feature's `feature_layer_info` to the custom
# transforms defined in this cell.
feature_layer_keys_to_fns = {
    "bytes_sequence_encoding_bigru": bytes_sequence_encoding_bigru,
    # Key matches the function name and the "distilbert_encode" value used
    # when switching `query_text` to DistilBERT.
    "distilbert_encode": distilbert_encode,
    # Earlier spelling, kept as an alias for configs that still reference it.
    "distil_bert_encode": distilbert_encode,
}

# Update the FeatureConfig: route `query_text` through the custom BiGRU transform
f = feature_config.get_feature("query_text")
f["feature_layer_info"]["fn"] = "bytes_sequence_encoding_bigru"
# f["feature_layer_info"]["fn"] = "distilbert_encode"
# Add the tokenizer preprocessing step only if it is not there yet, so that
# re-running this cell does not stack duplicate entries on the feature.
# NOTE(review): "distilbert_tokenize" is only handed to the dataset in the
# commented-out `get_relevance_dataset` call further down - confirm this step
# is wanted while the BiGRU transform is selected.
if {"fn": "distilbert_tokenize"} not in f["preprocessing_info"]:
    f["preprocessing_info"].append({"fn": "distilbert_tokenize"})
feature_config.set_feature("query_text", f)

# Tokenizer belonging to the pretrained "distilbert-base-uncased" checkpoint
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Misspelled name kept as an alias for anything that still refers to it
distillbert_tokenizer = distilbert_tokenizer

def distilbert_tokenize(feature_tensor):
    """Tokenize `feature_tensor` with the DistilBERT tokenizer.

    Args:
        feature_tensor: The text to tokenize.
            NOTE(review): the tokenizer presumably expects Python strings (or
            lists of them) rather than a TensorFlow string tensor - confirm
            what the preprocessing pipeline passes in.

    Returns:
        The tokenizer output, requested as TensorFlow tensors
        (`return_tensors="tf"`).
    """
    return distilbert_tokenizer(feature_tensor, return_tensors="tf")

# ranking_dataset = get_relevance_dataset({
#     "distilbert_tokenize": distilbert_tokenize
# })

# Re-create the interaction model, this time registering the custom feature
# transforms so that `fn` names from the feature config resolve to them.
interaction_model: InteractionModel = UnivariateInteractionModel(
    feature_config=feature_config,
    feature_layer_keys_to_fns=feature_layer_keys_to_fns,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    max_sequence_size=MAX_SEQUENCE_SIZE,
    file_io=file_io,
)

### Use predefined losses, metrics and optimizers or create your own!

In [11]:
from ml4ir.applications.ranking.model.losses import loss_factory
from ml4ir.applications.ranking.model.metrics import metric_factory
from ml4ir.base.model.losses.loss_base import RelevanceLossBase
from ml4ir.base.model.optimizer import get_optimizer

from ml4ir.applications.ranking.config.keys import LossKey, MetricKey, ScoringTypeKey
from ml4ir.base.config.keys import OptimizerKey

from tensorflow.keras.metrics import Metric
from tensorflow.keras.optimizers import Optimizer
from typing import List, Union, Type


# Loss: resolved from its key by the loss factory.
# NOTE(review): the training log further down reports `loss: 0.0000e+00` -
# confirm RANK_ONE_LISTNET with POINTWISE scoring is the intended pairing.
loss: RelevanceLossBase = loss_factory.get_loss(
    loss_key=LossKey.RANK_ONE_LISTNET,
    scoring_type=ScoringTypeKey.POINTWISE,
)

# Metrics: resolved from their keys by the metric factory.
metrics: List[Union[Type[Metric], str]] = [
    metric_factory.get_metric(metric_key="MRR"),
]

# Optimizer: Adam with a learning rate of 0.001.
optimizer: Optimizer = get_optimizer(
    optimizer_key=OptimizerKey.ADAM,
    learning_rate=0.001,
)

### Define a scoring function, or the Scorer

In [12]:
from ml4ir.base.model.scoring.scoring_model import RelevanceScorer

# Show the model configuration that the scorer is built from.
# The `with` block closes the file handle as soon as it has been read.
with open(MODEL_CONFIG_PATH) as model_config_fp:
    print(model_config_fp.read())

# Define scorer: combines the interaction model and the loss with the
# architecture described in the model config file.
scorer: RelevanceScorer = RelevanceScorer.from_model_config_file(
    model_config_file=MODEL_CONFIG_PATH,
    interaction_model=interaction_model,
    loss=loss,
    logger=logger,
    file_io=file_io,
)

architecture_key: dnn
layers:
  - type: dense
    name: first_dense
    units: 256
    activation: relu
  - type: dropout
    name: first_dropout
    rate: 0.3
  - type: dense
    name: second_dense
    units: 64
    activation: relu
  - type: dense
    name: final_dense
    units: 1
    activation: null



### Combine it all to create a RankingModel

In [13]:
from ml4ir.applications.ranking.model.ranking_model import RankingModel
from ml4ir.base.model.relevance_model import RelevanceModel


# Assemble the RelevanceModel from the pieces defined above:
# feature config, scorer, metrics and optimizer.
ranking_model: RelevanceModel = RankingModel(
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    scorer=scorer,
    metrics=metrics,
    optimizer=optimizer,
    file_io=file_io,
    logger=logger,
)

### Finally, time to train the model

In [14]:
# Train for up to 3 epochs, writing models to MODELS_DIR and logs to LOGS_DIR.
# `new_MRR` is the monitored metric and higher values count as better ("max").
ranking_model.fit(
    dataset=ranking_dataset,
    num_epochs=3,
    models_dir=MODELS_DIR,
    logs_dir=LOGS_DIR,
    monitor_metric="new_MRR",
    monitor_mode="max",
)

Epoch 1/3
     11/Unknown - 7s 596ms/step - loss: 0.0000e+00 - old_MRR: 0.7875 - new_MRR: 0.5852
Epoch 00001: val_new_MRR improved from -inf to 0.58153, saving model to ../models/activate_2020/checkpoint.tf
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../models/activate_2020/checkpoint.tf/assets
Epoch 2/3
Epoch 00002: val_new_MRR did not improve from 0.58153
Epoch 3/3
Epoch 00003: val_new_MRR did not improve from 0.58153
Restoring model weights from the end of the best epoch.
Epoch 00003: early stopping


In [30]:
%tensorboard --logdir ../logs/activate_2020 --host localhost --port 8088

Reusing TensorBoard on port 8088 (pid 74855), started 5:11:26 ago. (Use '!kill 74855' to kill it.)

### Let's try some Transfer Learning
##### Using bytes embeddings from a query classification model

In [27]:
# Inspect the current weights of the bytes embedding layer of `query_text`.
# That layer is only created by the BiGRU transform when `embedding_size` is configured.
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[ 0.0082345 , -0.01132863, -0.02455961, ..., -0.00426633,
          0.00965168, -0.04957014],
        [ 0.04529722, -0.00861281, -0.03258212, ..., -0.03981649,
          0.00670507, -0.01975964],
        [-0.00396828, -0.01097615,  0.02423097, ...,  0.02656201,
          0.04708388, -0.01636691],
        ...,
        [-0.00624301, -0.04872387,  0.02553919, ...,  0.03933351,
         -0.04143144, -0.0293048 ],
        [-0.02145042,  0.04962123, -0.0130963 , ..., -0.02183086,
         -0.01356266, -0.04104717],
        [ 0.03428017, -0.02764865, -0.03456013, ..., -0.0031287 ,
          0.04662568, -0.04645002]], dtype=float32)]

In [28]:
# Layer name -> `.npy` file whose contents are used to initialize that layer.
# NOTE(review): the path points into `../models/test_wandb2`, a directory this
# notebook does not create - confirm it exists before running.
initialize_layers_dict = {
    "query_text_bytes_embedding": "../models/test_wandb2/final/layers/query_text_bytes_embedding.npy",
}
# Layers listed here are passed to the model as `freeze_layers_list`.
freeze_layers_list = ["query_text_bytes_embedding"]

# Re-create the ranking model with the initialization and freeze settings above.
ranking_model: RelevanceModel = RankingModel(
    feature_config=feature_config,
    tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
    scorer=scorer,
    metrics=metrics,
    optimizer=optimizer,
    initialize_layers_dict=initialize_layers_dict,
    freeze_layers_list=freeze_layers_list,
    file_io=file_io,
    logger=logger,
)

In [29]:
# Inspect the bytes embedding weights again, now on the model that was
# re-created with `initialize_layers_dict`, to compare with the earlier values.
ranking_model.model.get_layer("query_text_bytes_embedding").get_weights()

[array([[-0.03738469,  0.00727513, -0.02006867, ...,  0.01078511,
         -0.028496  , -0.04102874],
        [ 0.00512887,  0.0062821 , -0.0010671 , ...,  0.04945388,
         -0.0132054 , -0.01177131],
        [-0.00937623, -0.03438247,  0.00176773, ...,  0.046078  ,
         -0.0310035 , -0.04288797],
        ...,
        [-0.04410168,  0.0383402 ,  0.03348425, ...,  0.02123589,
          0.02240864, -0.04049417],
        [ 0.03601265,  0.04585798,  0.00272902, ..., -0.00353998,
         -0.04783431,  0.02852656],
        [ 0.04903785, -0.03518286,  0.00195389, ...,  0.03783921,
         -0.01398294,  0.0107099 ]], dtype=float32)]

In [5]:
import xmltodict

# Parse a Maven-style <dependencies> XML snippet into nested dicts.
# The loop after this statement reads groupId / artifactId / version from each entry.
# NOTE(review): scala-library version "1.8" does not look like a Scala
# release number - confirm the value.
deps = xmltodict.parse("""  <dependencies>
     <dependency>
         <groupId>org.scala-lang</groupId>
         <artifactId>scala-library</artifactId>
         <version>1.8</version>
     </dependency>
    <dependency>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest_2.11</artifactId>
        <version>3.0.7</version>
        <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_2.11</artifactId>
      <version>1.14.3</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.tensorflow</groupId>
        <artifactId>tensorflow</artifactId>
        <version>1.15.0</version>
    </dependency>
    <dependency>
      <groupId>org.tensorflow</groupId>
      <artifactId>proto</artifactId>
      <version>1.15.0</version>
    </dependency>
      <dependency>
          <groupId>com.fasterxml.jackson.dataformat</groupId>
          <artifactId>jackson-dataformat-yaml</artifactId>
          <version>2.10.0</version>
      </dependency>
      <dependency>
          <groupId>com.fasterxml.jackson.core</groupId>
          <artifactId>jackson-annotations</artifactId>
          <version>2.10.0</version>
      </dependency>
      <dependency>
          <groupId>com.fasterxml.jackson.module</groupId>
          <artifactId>jackson-module-scala_2.11</artifactId>
          <version>2.10.0</version>
      </dependency>
      <dependency>
          <groupId>com.fasterxml.jackson.core</groupId>
          <artifactId>jackson-databind</artifactId>
          <version>2.10.0</version>
      </dependency>
      <dependency>
          <groupId>com.fasterxml.jackson.core</groupId>
          <artifactId>jackson-core</artifactId>
          <version>2.10.0</version>
      </dependency>
  </dependencies>""")

# Emit one `import $ivy` line per parsed dependency, built from its
# groupId, artifactId and version.
for entry in deps["dependencies"]["dependency"]:
    coordinates = [entry[tag] for tag in ("groupId", "artifactId", "version")]
    print("import $ivy.`{}:{}:{}`".format(*coordinates))

import $ivy.`org.scala-lang:scala-library:1.8`
import $ivy.`org.scalatest:scalatest_2.11:3.0.7`
import $ivy.`org.scalacheck:scalacheck_2.11:1.14.3`
import $ivy.`junit:junit:4.12`
import $ivy.`org.tensorflow:tensorflow:1.15.0`
import $ivy.`org.tensorflow:proto:1.15.0`
import $ivy.`com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.10.0`
import $ivy.`com.fasterxml.jackson.core:jackson-annotations:2.10.0`
import $ivy.`com.fasterxml.jackson.module:jackson-module-scala_2.11:2.10.0`
import $ivy.`com.fasterxml.jackson.core:jackson-databind:2.10.0`
import $ivy.`com.fasterxml.jackson.core:jackson-core:2.10.0`
