In [1]:
import time

notebook_start_time = time.time()

# Set up environment

In [2]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether the notebook is executing inside Google Colab.

    Detection is based on the text form of the active IPython shell object:
    the function returns True when that text contains ``google.colab`` and
    False otherwise.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description


def clone_repository() -> None:
    """Clone the project repository from GitHub and change into its directory.

    The body uses IPython shell (``!``) and line-magic (``%``) syntax, so it
    only works when executed inside an IPython/Jupyter kernel.
    """
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    %cd hands-on-recommender-system/


def install_dependencies() -> None:
    """Install the project's dependencies using ``uv``.

    First installs/upgrades ``uv`` through pip, then runs ``uv pip install``
    with ``--all-extras`` and ``--system`` against the ``pyproject.toml`` found
    in the current working directory.

    The body uses IPython shell (``!``) syntax, so it only works when executed
    inside an IPython/Jupyter kernel.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


if not is_google_colab():
    # Local run: the repository root is taken to be the parent of the current working directory.
    root_dir = str(Path.cwd().parent)
    print("⛳️ Local environment")
else:
    # Colab run: fetch the repository (which also switches into it) and install its dependencies,
    # after which the current working directory is the repository root.
    clone_repository()
    install_dependencies()

    root_dir = str(Path.cwd())
    print("⛳️ Google Colab environment")

# Expose the root directory on `sys.path` (once) so the `recsys` Python module can be imported from the notebook.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Local environment
Adding the following directory to the PYTHONPATH: /Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system


# 🧬 Training pipeline: Training ranking model

In this notebook, you will train a ranking model using gradient boosted trees. 

## 📝 Imports

In [3]:
%load_ext autoreload
%autoreload 2

import warnings

warnings.filterwarnings("ignore")

from loguru import logger

from recsys import hopsworks_integration, training
from recsys.config import settings

## Constants

In [4]:
from pprint import pprint

pprint(dict(settings))

{'CUSTOMER_DATA_SIZE': <CustomerDatasetSize.SMALL: 'SMALL'>,
 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2',
 'HOPSWORKS_API_KEY': SecretStr('**********'),
 'RECSYS_DIR': PosixPath('/Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system/recsys'),
 'TWO_TOWER_DATASET_TEST_SPLIT_SIZE': 0.1,
 'TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'TWO_TOWER_LEARNING_RATE': 0.01,
 'TWO_TOWER_MODEL_BATCH_SIZE': 2048,
 'TWO_TOWER_MODEL_EMBEDDING_SIZE': 16,
 'TWO_TOWER_NUM_EPOCHS': 10,
 'TWO_TOWER_WEIGHT_DECAY': 0.001}


## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [5]:
project, fs = hopsworks_integration.get_feature_store()

[32m2024-11-21 13:22:48.327[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m12[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/15551
Connected. Call `.close()` to terminate connection gracefully.


# Getting the training data

In [6]:
feature_view_ranking = hopsworks_integration.feature_store.create_ranking_feature_views(
    fs
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/15551/fs/15471/fv/customers/version/1
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/15551/fs/15471/fv/articles/version/1
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/15551/fs/15471/fv/ranking/version/1


In [7]:
# Split the ranking feature view's data into training and validation sets; the
# validation fraction is taken from the project settings.
X_train, X_val, y_train, y_val = feature_view_ranking.train_test_split(
    test_size=settings.RANKING_DATASET_VALIDATON_SPLIT_SIZE,
    description="Ranking training dataset",
)
# Preview the first three rows of the training features.
X_train.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (10.83s) 



Unnamed: 0,age,month_sin,month_cos,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
1,21.0,-0.866025,-0.5,Trousers,Garment Lower body,Solid,Black,Dark,Black,Trousers,Divided,Divided,Divided Collection,Trousers
2,23.0,-0.5,-0.866025,Leggings/Tights,Garment Lower body,Check,Grey,Medium Dusty,Grey,Jersey fancy,Ladieswear,Ladieswear,Womens Everyday Collection,Jersey Fancy
3,40.0,0.5,-0.866025,Top,Garment Upper body,Front print,Light Beige,Dusty Light,Beige,Jersey Fancy DS,Divided,Divided,Divided Selected,Jersey Fancy


In [8]:
y_train.head(3)

Unnamed: 0,label
1,0
2,1
3,0


# Training the ranking model

Let's train the ranking model:

In [9]:
# Build the ranking model through the project's factory, then hand it to a
# trainer together with the training and validation (features, labels) pairs.
model = training.ranking.RankingModelFactory.build()
trainer = training.ranking.RankingModelTrainer(
    model=model, train_dataset=(X_train, y_train), eval_dataset=(X_val, y_val)
)

In [10]:
trainer.fit()

0:	learn: 0.6860959	test: 0.6878611	best: 0.6878611 (0)	total: 86.5ms	remaining: 8.56s
1:	learn: 0.6798111	test: 0.6831127	best: 0.6831127 (1)	total: 111ms	remaining: 5.46s
2:	learn: 0.6774529	test: 0.6812453	best: 0.6812453 (2)	total: 136ms	remaining: 4.4s
3:	learn: 0.6771734	test: 0.6809682	best: 0.6809682 (3)	total: 152ms	remaining: 3.64s
4:	learn: 0.6743181	test: 0.6792478	best: 0.6792478 (4)	total: 173ms	remaining: 3.28s
5:	learn: 0.6700901	test: 0.6756502	best: 0.6756502 (5)	total: 193ms	remaining: 3.03s
6:	learn: 0.6679408	test: 0.6741762	best: 0.6741762 (6)	total: 213ms	remaining: 2.83s
7:	learn: 0.6667781	test: 0.6731474	best: 0.6731474 (7)	total: 234ms	remaining: 2.69s
8:	learn: 0.6630759	test: 0.6717030	best: 0.6717030 (8)	total: 255ms	remaining: 2.58s
9:	learn: 0.6601952	test: 0.6695397	best: 0.6695397 (9)	total: 278ms	remaining: 2.5s
10:	learn: 0.6575911	test: 0.6684752	best: 0.6684752 (10)	total: 299ms	remaining: 2.42s
11:	learn: 0.6563756	test: 0.6673738	best: 0.6673738 

<catboost.core.CatBoostClassifier at 0x354fb4a50>

## Evaluating the ranking model

Next, you'll evaluate how well the model performs on the validation data using classification metrics such as precision, recall, and F1-score:

In [11]:
metrics = trainer.evaluate(log=True)

[32m2024-11-21 13:23:15.435[0m | [1mINFO    [0m | [36mrecsys.training.ranking[0m:[36mevaluate[0m:[36m60[0m - [1m              precision    recall  f1-score   support

           0       0.94      0.69      0.79     20845
           1       0.14      0.54      0.22      1964

    accuracy                           0.68     22809
   macro avg       0.54      0.62      0.51     22809
weighted avg       0.87      0.68      0.75     22809
[0m


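The model has a low F1-score on the positive class (higher is better): in the run shown above it recovers roughly half of the positive examples (recall), but only a small fraction of its positive predictions are correct (precision). Note that the validation set is heavily imbalanced, with far fewer positive than negative examples. The performance could potentially be improved by adding more features to the dataset, e.g. image embeddings.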

Let's see which features your model considers important.

In [12]:
trainer.get_feature_importance()

{'month_cos': 14.187857671116273,
 'age': 13.866672119234732,
 'section_name': 8.949230425986132,
 'product_group_name': 8.852948790953244,
 'month_sin': 7.883541209776112,
 'product_type_name': 7.674231304684104,
 'garment_group_name': 7.5875973795538485,
 'department_name': 6.204410874615057,
 'index_name': 6.1405538852885515,
 'graphical_appearance_name': 4.8978315805567405,
 'perceived_colour_value_name': 4.490918891695863,
 'index_group_name': 3.4249671993906516,
 'perceived_colour_master_name': 3.206848308176551,
 'colour_group_name': 2.6323903589721476}

## <span style="color:#ff5f27">  Uploading the model to Hopsworks model registry </span>

In [13]:
mr = project.get_model_registry()

Connected. Call `.close()` to terminate connection gracefully.


In [14]:
# Wrap the trained model in the project's Hopsworks helper and register it in
# the model registry, passing the training data and the evaluation metrics.
# NOTE(review): presumably X_train / y_train are used to describe the model's
# input/output schema — confirm in `HopsworksRankingModel.register`.
ranking_module = hopsworks_integration.ranking_serving.HopsworksRankingModel(
    model=model
)
ranking_module.register(mr, X_train, y_train, metrics)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1679218 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/468 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1274 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/15551/models/ranking_model/1


## <span style="color:#ff5f27"> Inspecting the model in the Hopsworks model registry </span>

View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry**

---

In [15]:
# Report the total wall-clock time elapsed since `notebook_start_time` was
# recorded in the first cell, in both seconds and minutes.
notebook_end_time = time.time()
notebook_execution_time = notebook_end_time - notebook_start_time

logger.info(
    f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes"
)

[32m2024-11-21 13:23:35.183[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m⌛️ Notebook Execution time: 51.66 seconds ~ 0.86 minutes[0m


# <span style="color:#ff5f27">→ Next Steps </span>

In the next notebook, you will compute embeddings for all the items, populate a vector index with them (as a feature group), and create an online feature view that lets you retrieve candidates for each user with very low latency.