In [1]:
import os
import re
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
# Load variables from a local .env file into the process environment.
load_dotenv()

POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")

# Fail early with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text "None".
_missing_vars = [
    var_name
    for var_name, var_value in (
        ("POSTGRES_USER", POSTGRES_USER),
        ("POSTGRES_PASSWORD", POSTGRES_PASSWORD),
        ("POSTGRES_DB", POSTGRES_DB),
    )
    if var_value is None
]
if _missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(_missing_vars)}"
    )

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot be misread as URL delimiters.
DB_URL = (
    f"postgresql://{quote_plus(POSTGRES_USER)}:{quote_plus(POSTGRES_PASSWORD)}"
    f"@localhost:5432/{POSTGRES_DB}"
)
EXPERIMENT_NAME = "xgboost_reranker_experiment"

engine = create_engine(DB_URL)
# NOTE(review): the saved output of this cell is an UndefinedTable error for
# this relation — confirm the table name against the database schema.
query = "SELECT * FROM rerank_training_datas"
df = pd.read_sql(query, engine)
# Sorting by query text places all rows of one query next to each other.
df = df.sort_values(by=['query_text'])
df.head(10)

ProgrammingError: (psycopg2.errors.UndefinedTable) relation "rerank_training_datas" does not exist
LINE 1: SELECT * FROM rerank_training_datas
                      ^

[SQL: SELECT * FROM rerank_training_datas]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
# Feature frame for the model. Sharing df's index keeps every column that is
# assigned later aligned row-by-row with df.
features = pd.DataFrame(index=df.index)

# Lower-cased helper columns used by the text features.
# Missing values are replaced with "" first: astype(str) alone would turn them
# into the literal tokens "nan" / "None", which would then be measured and
# matched as if they were real text.
# NOTE(review): any inference-time feature code must normalise missing text the
# same way — confirm against the serving pipeline.
df['q_lower'] = df['query_text'].fillna('').astype(str).str.lower()
df['doc_lower'] = df['full_text'].fillna('').astype(str).str.lower()
df['h1_lower'] = df['h1'].fillna('').astype(str).str.lower()
df.head()

Unnamed: 0,id,query_text,doc_id,full_text,h1,h2,qdrant_score,label,created_at,q_lower,doc_lower,h1_lower
284,285,Apa penyebab dan jenis kerontokan rambut?,98f9c796-107a-4a77-9b71-d83f7bbd45da,DERMIAS MAX\n\nRambut rontok adalah lepasnya r...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.626284,1,2025-12-01 16:05:41.921429,apa penyebab dan jenis kerontokan rambut?,dermias max\n\nrambut rontok adalah lepasnya r...,661627558-katalog-dermies-max-by-erha-hiress.pdf
285,286,Apa penyebab dan jenis kerontokan rambut?,68f2f6d6-f877-404a-b3dc-cb45a8c73e33,Alopecia Traksi\n\nRambut rontok tipe ini akib...,661627558-Katalog-Dermies-Max-by-Erha-Hiress.pdf,,0.557933,1,2025-12-01 16:05:41.921429,apa penyebab dan jenis kerontokan rambut?,alopecia traksi\n\nrambut rontok tipe ini akib...,661627558-katalog-dermies-max-by-erha-hiress.pdf
286,287,Apa penyebab dan jenis kerontokan rambut?,c32e0729-1c22-48e5-8c7f-c7a31a456491,Mulai dari Rp1.125.000\n\n&)\n\nMANFAAT\n\nMen...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.494095,1,2025-12-01 16:05:41.921429,apa penyebab dan jenis kerontokan rambut?,mulai dari rp1.125.000\n\n&)\n\nmanfaat\n\nmen...,724126666-erha-ultimate-pricelist-24.pdf
287,288,Apa penyebab dan jenis kerontokan rambut?,042e7526-f4b4-45da-a22c-b1ca117dc0b5,MANFAAT\n\nMerangsang dan mempercepat pertumbu...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.446724,1,2025-12-01 16:05:41.921429,apa penyebab dan jenis kerontokan rambut?,manfaat\n\nmerangsang dan mempercepat pertumbu...,724126666-erha-ultimate-pricelist-24.pdf
479,480,Berapa harga Age Reversal Filler Aesthefill di...,e7bc5395-e66e-4243-9c44-b337f77b35b3,MAKE OVA CENTER\n\nSINGLE TREATMENT\n\n‚ÄúL) 40 ...,724126666-ERHA-Ultimate-Pricelist-24.pdf,,0.521353,0,2025-12-01 16:05:41.921429,berapa harga age reversal filler aesthefill di...,make ova center\n\nsingle treatment\n\n‚Äúl) 40 ...,724126666-erha-ultimate-pricelist-24.pdf


In [None]:
# Feature 1: the qdrant_score column, copied through unchanged
features['qdrant_score'] = df['qdrant_score']

# Feature 2: Length Features
# Character counts of the lower-cased document and query strings.
features['doc_len'] = df['doc_lower'].str.len()
features['query_len'] = df['q_lower'].str.len()
features.head()

Unnamed: 0,qdrant_score,doc_len,query_len
284,0.626284,869,41
285,0.557933,1110,41
286,0.494095,944,41
287,0.446724,873,41
479,0.521353,820,62


In [None]:
def word_overlap(row):
    """Return the share of distinct query tokens that also occur in the document.

    Tokens come from whitespace-splitting ``row['q_lower']`` and
    ``row['doc_lower']``; punctuation stays attached to its token.
    A query without any tokens yields 0.0.
    """
    query_terms = set(row['q_lower'].split())
    doc_terms = set(row['doc_lower'].split())
    if len(query_terms) == 0:
        return 0.0
    shared_terms = query_terms & doc_terms
    return len(shared_terms) / len(query_terms)

# Feature 3: token overlap, computed once per row of df
features = features.assign(word_overlap=df.apply(word_overlap, axis=1))
features.head()

Unnamed: 0,qdrant_score,doc_len,query_len,word_overlap
284,0.626284,869,41,0.333333
285,0.557933,1110,41,0.166667
286,0.494095,944,41,0.5
287,0.446724,873,41,0.333333
479,0.521353,820,62,0.4


In [None]:
from rapidfuzz import fuzz

# Feature 4: partial fuzzy match between the query and the h1 field
features['match_in_h1'] = [
    fuzz.partial_ratio(query_text, heading_text)
    for query_text, heading_text in zip(df['q_lower'], df['h1_lower'])
]

# Feature 5: Fuzzy Matching (Handling Typos/Variations)
# Only the first 500 characters of each document take part in the comparison.
features['fuzzy_ratio'] = [
    fuzz.ratio(query_text, doc_text[:500])
    for query_text, doc_text in zip(df['q_lower'], df['doc_lower'])
]

features.head()

Unnamed: 0,qdrant_score,doc_len,query_len,word_overlap,match_in_h1,fuzzy_ratio
284,0.626284,869,41,0.333333,25.641026,14.417745
285,0.557933,1110,41,0.166667,25.641026,13.678373
286,0.494095,944,41,0.5,21.052632,14.417745
287,0.446724,873,41,0.333333,21.052632,14.048059
479,0.521353,820,62,0.4,32.5,18.149466


In [None]:
# Matches "rp" only as a standalone marker or amount prefix ("rp", "rp.",
# "rp 500", "rp1.125.000"): it must not follow a word character and must not be
# followed by a letter, so words such as "terpercaya" no longer count.
_RP_MARKER = re.compile(r"(?<!\w)rp(?![^\W\d_])")

# Query keywords matched as substrings, so suffixed forms ("harganya",
# "pricelist") still count.
_PRICE_KEYWORDS = ('harga', 'biaya', 'price')


def price_relevance(row):
    """Return 1 when a price-seeking query meets a document that shows a price.

    Args:
        row: mapping with lower-cased strings under ``'q_lower'`` (query) and
            ``'doc_lower'`` (document).

    Returns:
        1 if the query contains a price keyword or a standalone "rp" marker AND
        the document contains a standalone "rp" marker; otherwise 0.
    """
    query_text = row['q_lower']
    is_price_query = (
        any(keyword in query_text for keyword in _PRICE_KEYWORDS)
        or _RP_MARKER.search(query_text) is not None
    )
    has_price_info = _RP_MARKER.search(row['doc_lower']) is not None
    return 1 if (is_price_query and has_price_info) else 0
    
# Feature 6: price-intent flag, computed once per row of df
features = features.assign(is_price_match=df.apply(price_relevance, axis=1))
features.head()

Unnamed: 0,qdrant_score,doc_len,query_len,word_overlap,match_in_h1,fuzzy_ratio,is_price_match
284,0.626284,869,41,0.333333,25.641026,14.417745,0
285,0.557933,1110,41,0.166667,25.641026,13.678373,0
286,0.494095,944,41,0.5,21.052632,14.417745,0
287,0.446724,873,41,0.333333,21.052632,14.048059,0
479,0.521353,820,62,0.4,32.5,18.149466,1


In [None]:
# Target vector: the label column, carrying the same index as df.
y = df.loc[:, "label"]
y.head()

284    1
285    1
286    1
287    1
479    0
Name: label, dtype: int64

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Number of rows per query over the whole dataset.
groups = df.groupby("query_text").size().to_list()

# One 80/20 split in which all rows of a query land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
[(train_idx, test_idx)] = gss.split(features, y, groups=df["query_text"])


In [None]:
# Positional selection: train_idx / test_idx are row positions, not index labels.
X_train = features.iloc[train_idx]
X_test = features.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

In [None]:
# Per-query row counts for the ranker's `group` / `eval_group` arguments.
# sort=False lists groups in order of first appearance, i.e. in the same order
# as the rows themselves, instead of relying on groupby's key sorting to agree
# with the earlier sort_values call.
train_groups = df.iloc[train_idx].groupby("query_text", sort=False).size().to_list()
test_groups = df.iloc[test_idx].groupby("query_text", sort=False).size().to_list()

# groupby drops rows whose key is missing; catch that here rather than inside
# the training call.
assert sum(train_groups) == len(train_idx), "train group sizes do not cover all training rows"
assert sum(test_groups) == len(test_idx), "test group sizes do not cover all test rows"

In [None]:
# Row counts come from the feature frames, query counts from the group-size lists.
n_train_rows, n_train_queries = len(X_train), len(train_groups)
n_test_rows, n_test_queries = len(X_test), len(test_groups)
print(f"üìä Training on {n_train_rows} rows ({n_train_queries} queries)")
print(f"üìä Testing on {n_test_rows} rows ({n_test_queries} queries)")

üìä Training on 385 rows (92 queries)
üìä Testing on 95 rows (23 queries)


In [None]:
import mlflow
import time
import xgboost as xgb
import mlflow.xgboost
from time import sleep

# Endpoints of the MLflow services used by this notebook.
# NOTE(review): mlflow.set_registry_uri() is never called, so the
# log_model(registered_model_name=...) call below presumably registers against
# the default registry rather than registry_uri — confirm that `client` talks to
# the same registry.
tracking_uri = "http://127.0.0.1:5050"
registry_uri = "http://127.0.0.1:9001"
mlflow.set_tracking_uri(tracking_uri)
client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri, registry_uri=registry_uri)

# Wait for MLflow to be ready: poll with a cheap lookup, up to max_retries
# attempts with a 3 second pause between them.
max_retries = 10
for attempt in range(max_retries):
    try:
        mlflow.get_experiment_by_name(EXPERIMENT_NAME)
        print("‚úÖ MLflow is ready")
        break
    except Exception as e:
        if attempt < max_retries - 1:
            print(f"‚è≥ Waiting for MLflow... (attempt {attempt + 1}/{max_retries})")
            sleep(3)
        else:
            # RuntimeError is still an Exception, so existing handlers keep working.
            raise RuntimeError("MLflow failed to become ready") from e

# Get or create experiment
existing_exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_exp is None:
    mlflow.create_experiment(EXPERIMENT_NAME)
    print("Experiment created")

if existing_exp is not None and existing_exp.lifecycle_stage == "deleted":
    # The lookup by name can return a soft-deleted experiment, which cannot be
    # made active. restore_experiment expects the experiment ID, not its name.
    print("Restoring experiment")
    client.restore_experiment(existing_exp.experiment_id)
    mlflow.set_experiment(EXPERIMENT_NAME)
    print("Success")
else:
    mlflow.set_experiment(EXPERIMENT_NAME)

start_time = time.perf_counter()
with mlflow.start_run():
    params = {
        "objective": "rank:ndcg",
        "eval_metric": "ndcg@3",
        "tree_method": "hist",
        "learning_rate": 0.1,
        "max_depth": 4,
        "n_estimators": 100
    }

    print("Logging hyperparameters")
    mlflow.log_params(params)

    print("Training reranker")
    ranker = xgb.XGBRanker(**params)
    # group / eval_group carry the per-query row counts built from the split.
    ranker.fit(
        X_train, y_train,
        group=train_groups,
        eval_set=[(X_test, y_test)],
        eval_group=[test_groups],
        verbose=True
    )

    results = ranker.evals_result()
    # Despite its name this is the score of the LAST boosting round, not the
    # maximum over all rounds; the name is kept for any later use of it.
    best_score = results["validation_0"]["ndcg@3"][-1]
    metrics = {
        "ndcg_3": best_score
    }
    mlflow.log_metrics(metrics)

    print("Logging model")
    mlflow.xgboost.log_model(
        xgb_model=ranker,
        name="XGBoostReranker",
        registered_model_name="XGBoostReranker",
        input_example=X_train.head(10)
    )

    # Move the newest version that is still in stage "None" to Staging.
    # NOTE(review): the saved cell output shows warnings pointing at both calls
    # below — presumably deprecation notices; confirm before upgrading MLflow.
    latest_versions = client.get_latest_versions("XGBoostReranker", stages=["None"])
    if latest_versions:
        target_version = latest_versions[0].version

        client.transition_model_version_stage(
            name="XGBoostReranker",
            version=target_version,
            stage="Staging",
            archive_existing_versions=False,
        )

    print(f"\n‚úÖ Training Complete! NDCG@3 Score: {best_score:.4f}")
    print(f"üöÄ Model saved to MLflow Experiment: '{EXPERIMENT_NAME}'")

# Wall-clock time covers run creation, training, logging and registration.
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time
print(f"Training completed in {elapsed_seconds:.2f} seconds")

‚úÖ MLflow is ready
Experiment created
Logging hyperparameters
Training reranker
[0]	validation_0-ndcg@3:0.88602
[1]	validation_0-ndcg@3:0.88904
[2]	validation_0-ndcg@3:0.89392
[3]	validation_0-ndcg@3:0.90427
[4]	validation_0-ndcg@3:0.90427
[5]	validation_0-ndcg@3:0.90427
[6]	validation_0-ndcg@3:0.90427
[7]	validation_0-ndcg@3:0.90427
[8]	validation_0-ndcg@3:0.90427
[9]	validation_0-ndcg@3:0.90427
[10]	validation_0-ndcg@3:0.90427
[11]	validation_0-ndcg@3:0.91760
[1]	validation_0-ndcg@3:0.88904
[2]	validation_0-ndcg@3:0.89392
[3]	validation_0-ndcg@3:0.90427
[4]	validation_0-ndcg@3:0.90427
[5]	validation_0-ndcg@3:0.90427
[6]	validation_0-ndcg@3:0.90427
[7]	validation_0-ndcg@3:0.90427
[8]	validation_0-ndcg@3:0.90427
[9]	validation_0-ndcg@3:0.90427
[10]	validation_0-ndcg@3:0.90427
[11]	validation_0-ndcg@3:0.91760
[12]	validation_0-ndcg@3:0.91760
[13]	validation_0-ndcg@3:0.91760
[14]	validation_0-ndcg@3:0.91760
[15]	validation_0-ndcg@3:0.91760
[16]	validation_0-ndcg@3:0.92329
[17]	validatio

2025/12/01 23:05:55 INFO mlflow.models.model: Found the following environment variables used during model inference: [DEEPINFRA_API_TOKEN, FIRECRAWL_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
2025/12/01 23:05:55 INFO mlflow.models.model: Found the following environment variables used during model inference: [DEEPINFRA_API_TOKEN, FIRECRAWL_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'XGBoostReranker'.
2025/12/01 23:05:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostReranker, version 1
Successfully registered model 'XGBoostReranker'.
2025/12/01 23:05:56 INFO mlflow.store.model_registry.abstract_store: Waiting u


‚úÖ Training Complete! NDCG@3 Score: 0.8971
üöÄ Model saved to MLflow Experiment: 'xgboost_reranker_experiment'
üèÉ View run efficient-lark-970 at: http://127.0.0.1:5050/#/experiments/1/runs/e7274b8cf40c4c86bb236acebcbcd735
üß™ View experiment at: http://127.0.0.1:5050/#/experiments/1
Training completed in 3.54 seconds


Created version '1' of model 'XGBoostReranker'.
  latest_versions = client.get_latest_versions("XGBoostReranker", stages=["None"])
  client.transition_model_version_stage(
