In [1]:
import time

# Wall-clock timestamp taken before anything else runs; the final cell of the
# notebook subtracts it from the end time to report the total execution time.
notebook_start_time = time.time()

# Set up environment

In [2]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether the notebook is executing inside Google Colab.

    The check looks for the substring ``"google.colab"`` in the string form of
    the object returned by ``get_ipython()``.

    Returns:
        ``True`` when the substring is present, ``False`` otherwise.
    """
    shell_repr = str(get_ipython())
    return "google.colab" in shell_repr


def clone_repository() -> None:
    """Clone the course repository from GitHub and change into its directory.

    Relies on the IPython ``!`` shell escape and the ``%cd`` magic, so it can
    only run inside an IPython/Jupyter session. After the call, the kernel's
    working directory is ``hands-on-recommender-system/``.
    """
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    %cd hands-on-recommender-system/


def install_dependencies() -> None:
    """Install the project's dependencies using ``uv``.

    First installs/upgrades ``uv`` itself through pip, then runs
    ``uv pip install`` with ``--all-extras`` and ``--system`` against
    ``pyproject.toml``. The path is relative, so ``pyproject.toml`` is looked
    up in the current working directory.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


# Resolve the repository root for the current environment.
# Local run: the root is taken to be the parent of the working directory.
# Colab run: the repository is cloned and its dependencies installed first;
# the working directory at that point is used as the root.
if not is_google_colab():
    root_dir = str(Path.cwd().parent)
    print("⛳️ Local environment")
else:
    clone_repository()
    install_dependencies()

    root_dir = str(Path.cwd())
    print("⛳️ Google Colab environment")

# Add the root directory to the `PYTHONPATH` so that the `recsys` Python module
# can be imported from the notebook. Skipped when the entry already exists.
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Local environment
Adding the following directory to the PYTHONPATH: /Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system


# 🧬 Training pipeline: Training ranking model

In this notebook, you will train a ranking model using gradient boosted trees. 

## 📝 Imports

In [3]:
%load_ext autoreload
%autoreload 2

import warnings

# Suppress every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

from loguru import logger

from recsys import hopsworks_integration, training
from recsys.config import settings

## Constants

In [4]:
from pprint import pprint

# Show the active configuration; the API keys appear masked (SecretStr) in the output.
pprint(dict(settings))

{'CUSTOMER_DATA_SIZE': <CustomerDatasetSize.SMALL: 'SMALL'>,
 'CUSTOM_HOPSWORKS_INFERENCE_ENV': 'custom_env_name',
 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2',
 'HOPSWORKS_API_KEY': SecretStr('**********'),
 'OPENAI_API_KEY': SecretStr('**********'),
 'OPENAI_MODEL_ID': 'gpt-4o-mini',
 'RANKING_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'RANKING_EARLY_STOPPING_ROUNDS': 5,
 'RANKING_ITERATIONS': 100,
 'RANKING_LEARNING_RATE': 0.2,
 'RANKING_MODEL_TYPE': 'ranking',
 'RANKING_SCALE_POS_WEIGHT': 10,
 'RECSYS_DIR': PosixPath('/Users/pauliusztin/Documents/01_projects/hopsworks_recsys/hands-on-recommender-system/recsys'),
 'TWO_TOWER_DATASET_TEST_SPLIT_SIZE': 0.1,
 'TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'TWO_TOWER_LEARNING_RATE': 0.01,
 'TWO_TOWER_MODEL_BATCH_SIZE': 2048,
 'TWO_TOWER_MODEL_EMBEDDING_SIZE': 16,
 'TWO_TOWER_NUM_EPOCHS': 10,
 'TWO_TOWER_WEIGHT_DECAY': 0.001}


## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [5]:
# Log in to Hopsworks (the log below reports use of the HOPSWORKS_API_KEY env var)
# and keep both returned handles: `project` and the feature store `fs`.
project, fs = hopsworks_integration.get_feature_store()

[32m2024-12-24 13:02:45.188[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m13[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


2024-12-24 13:02:45,189 INFO: Initializing external client
2024-12-24 13:02:45,189 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-24 13:02:46,594 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1192098


# Getting the training data

In [6]:
# Obtain the feature view used for ranking from the feature store.
# NOTE(review): presumably creates the view if it does not exist yet — confirm in
# `create_ranking_feature_views`.
feature_view_ranking = hopsworks_integration.feature_store.create_ranking_feature_views(
    fs
)

In [7]:
# Read the ranking data from Hopsworks and split it into training and validation
# features/labels; the held-out fraction is taken from the project settings.
X_train, X_val, y_train, y_val = feature_view_ranking.train_test_split(
    test_size=settings.RANKING_DATASET_VALIDATON_SPLIT_SIZE,
    description="Ranking training dataset",
)
# Preview the first three feature rows.
X_train.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.54s) 




Unnamed: 0,age,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,month_sin,month_cos
0,23.0,Leggings/Tights,Garment Lower body,Check,Grey,Medium Dusty,Grey,Jersey fancy,Ladieswear,Ladieswear,Womens Everyday Collection,Jersey Fancy,-0.5,-0.8660254
1,50.0,Dress,Garment Full body,Solid,Dark Beige,Dark,Beige,Knitwear,Ladieswear,Ladieswear,Womens Everyday Collection,Knitwear,-0.5,0.8660254
2,26.0,Top,Garment Upper body,Solid,White,Light,White,Jersey,Ladieswear,Ladieswear,Womens Tailoring,Jersey Fancy,-1.0,-1.83697e-16


In [8]:
# Preview the first three training labels.
y_train.head(3)

Unnamed: 0,label
0,1
1,1
2,1


# Training the ranking model

Let's train the ranking model:

In [9]:
# Build the ranking model (a CatBoostClassifier, as shown by the output of the
# fit cell) and hand it to the trainer together with the training and
# validation splits.
model = training.ranking.RankingModelFactory.build()
trainer = training.ranking.RankingModelTrainer(
    model=model, train_dataset=(X_train, y_train), eval_dataset=(X_val, y_val)
)

In [10]:
# Train the model; per-iteration learn/test metrics are printed as it runs.
trainer.fit()

0:	learn: 0.5142740	test: 0.5142148	best: 0.5142148 (0)	total: 93.3ms	remaining: 9.23s
1:	learn: 0.3941888	test: 0.3940829	best: 0.3940829 (1)	total: 134ms	remaining: 6.55s
2:	learn: 0.3081133	test: 0.3079641	best: 0.3079641 (2)	total: 148ms	remaining: 4.8s
3:	learn: 0.2441452	test: 0.2439562	best: 0.2439562 (3)	total: 190ms	remaining: 4.56s
4:	learn: 0.1954138	test: 0.1951864	best: 0.1951864 (4)	total: 218ms	remaining: 4.13s
5:	learn: 0.1576688	test: 0.1574069	best: 0.1574069 (5)	total: 232ms	remaining: 3.63s
6:	learn: 0.1281129	test: 0.1278174	best: 0.1278174 (6)	total: 259ms	remaining: 3.45s
7:	learn: 0.1047526	test: 0.1044253	best: 0.1044253 (7)	total: 273ms	remaining: 3.14s
8:	learn: 0.0861819	test: 0.0858238	best: 0.0858238 (8)	total: 288ms	remaining: 2.91s
9:	learn: 0.0714014	test: 0.0710115	best: 0.0710115 (9)	total: 336ms	remaining: 3.03s
10:	learn: 0.0595149	test: 0.0590966	best: 0.0590966 (10)	total: 365ms	remaining: 2.95s
11:	learn: 0.0499656	test: 0.0495185	best: 0.0495185

<catboost.core.CatBoostClassifier at 0x354cf7150>

## Evaluating the ranking model

Next, you'll evaluate how well the model performs on the validation data using metrics for classification such as precision, recall and f1-score:

In [11]:
# Evaluate the trained model; with log=True the classification report is logged.
# The returned `metrics` are passed on when the model is registered.
metrics = trainer.evaluate(log=True)

[32m2024-12-24 13:03:06.983[0m | [1mINFO    [0m | [36mrecsys.training.ranking[0m:[36mevaluate[0m:[36m62[0m - [1m              precision    recall  f1-score   support

           0       1.00      1.00      1.00     38778
           1       0.96      1.00      0.98      1942

    accuracy                           1.00     40720
   macro avg       0.98      1.00      0.99     40720
weighted avg       1.00      1.00      1.00     40720
[0m


Look at the F1-score on the positive class (higher is better): in the run shown above it is 0.98. If it turns out low in your run, the performance could potentially be improved by adding more features to the dataset, e.g. image embeddings.

Let's see which features your model considers important.

In [12]:
# Display a mapping of feature name -> importance score.
trainer.get_feature_importance()

{'month_cos': 58.88502448246417,
 'month_sin': 33.568428970468496,
 'product_type_name': 1.5060291283122975,
 'age': 1.4510136344766673,
 'perceived_colour_value_name': 0.9090543083951164,
 'index_group_name': 0.905068938543685,
 'product_group_name': 0.8481770590684912,
 'graphical_appearance_name': 0.47203813718737087,
 'garment_group_name': 0.45948435511105373,
 'department_name': 0.40798195143449595,
 'index_name': 0.3400126867120175,
 'perceived_colour_master_name': 0.15406431986826075,
 'section_name': 0.09209849465645363,
 'colour_group_name': 0.001523533301427642}

## <span style="color:#ff5f27">  Uploading the model to Hopsworks model registry </span>

In [13]:
# Handle to the project's model registry, used to register the model.
mr = project.get_model_registry()

In [14]:
# Wrap the trained model and register it in the Hopsworks model registry,
# passing along the feature view, the training features and the evaluation metrics.
ranking_module = hopsworks_integration.ranking_serving.HopsworksRankingModel(
    model=model
)
ranking_module.register(mr, feature_view_ranking, X_train, metrics)

Uploading: 100.000%|██████████| 618847/618847 elapsed<00:02 remaining<00:00  1.18it/s]
Uploading: 100.000%|██████████| 448/448 elapsed<00:02 remaining<00:00<00:07,  1.95s/it]
Model export complete: 100%|██████████| 6/6 [00:12<00:00,  2.04s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1192098/models/ranking_model/2





## <span style="color:#ff5f27"> Inspecting the model in the Hopsworks model registry </span>

View results in [Hopsworks Serverless](https://rebrand.ly/serverless-github): **Data Science → Model Registry**

---

In [15]:
# Total wall-clock duration since `notebook_start_time` was recorded in the
# first cell.
notebook_end_time = time.time()
notebook_execution_time = notebook_end_time - notebook_start_time

# Report the duration in both seconds and minutes.
logger.info(
    f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds ~ {notebook_execution_time / 60:.2f} minutes"
)

[32m2024-12-24 13:03:20.163[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m⌛️ Notebook Execution time: 39.05 seconds ~ 0.65 minutes[0m


# <span style="color:#ff5f27">→ Next Steps </span>

In the next notebook, you will compute embeddings for all the items, populate a vector index with them (as a feature group) and create an online feature view which will allow you to retrieve candidates, for each user, with very low latency.