In [1]:
import os
import sys

# Make the parent of the notebook's working directory importable so the
# `app` package resolves (presumably the project root — confirm).
# os.path is used instead of splitting on "/" so this also works on Windows.
new_path = os.path.dirname(os.getcwd())
if new_path not in sys.path:
    # Guard keeps sys.path from growing each time this cell is re-run.
    sys.path.append(new_path)

from app.utils.config_loader import ConfigLoader
from app.utils.logger import CustomLogger

# Load the application config once; later cells read `config` and `logger`.
config = ConfigLoader.load_config(os.path.join(new_path, "app", "config", "config.yaml"))
logger = CustomLogger.setup_logger(config["log_level"])

In [2]:
import xgboost as xgb
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import pickle
from app.utils.config_loader import ConfigLoader
from app.utils.helper import normalize_room_name


In [3]:
# Build an XGBClassifier with the configured settings, then load the saved
# model file into it. Paths are relative to the notebook's directory ("../").
xgb_config = config["model_configs"]["xgb"]
model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric=xgb_config["fixed_params"]["eval_metric"],
    random_state=config["random_state"],
)
model.load_model(f'../{xgb_config["model_path"]}')

# Unpickle the vectorizer that is later passed to get_feature_inference.
# NOTE: pickle.load executes arbitrary code; only load files you trust.
vectorizer_path = f'../{config["data"]["vectorizer"]}'
with open(vectorizer_path, "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [15]:
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity

# Cache of already-constructed SentenceTransformer objects, keyed by model name.
# Constructing the model is expensive, so it must not happen on every call.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name: str) -> SentenceTransformer:
    """Return the SentenceTransformer for ``model_name``, building it only on first use."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def get_feature_inference(ref_name: str, sup_name: str, vectorizer: TfidfVectorizer) -> np.ndarray:
    """Build the model input features for one (reference, supplier) room-name pair.

    The feature layout is selected by the ``feature_type`` entry of
    ``ConfigLoader.get_config()``; the sentence-embedding model is selected by
    its ``sentence_transformer_model`` entry.

    Args:
        ref_name: Reference room name (normalised internally).
        sup_name: Supplier room name (normalised internally).
        vectorizer: Fitted vectorizer used for the TF-IDF cosine feature.

    Returns:
        * ``"numeric_feature"``: array of shape ``(1, 6)`` holding
          ``[0.8 * tfidf_cosine, token_jaccard, substring_flag,
          sequence_ratio, 1.5 * embedding_cosine, char_trigram_jaccard]``.
        * ``"embedding_feature"``: 1-D array concatenating the supplier
          embedding, the reference embedding, their absolute difference and
          their element-wise product. Callers must ``reshape(1, -1)`` before
          passing it to a scikit-learn style ``predict_proba``.

    Raises:
        ValueError: If the configured ``feature_type`` is not one of the two
            supported values (previously this silently returned ``None``).
    """
    app_config = ConfigLoader.get_config()
    feature_type = app_config["feature_type"]
    embedding_model = _get_embedding_model(app_config["sentence_transformer_model"])

    ref_norm = normalize_room_name(ref_name)
    sup_norm = normalize_room_name(sup_name)

    if feature_type == "numeric_feature":
        ref_tokens = set(ref_norm.split()) if ref_norm else set()
        sup_tokens = set(sup_norm.split()) if sup_norm else set()

        cos_sim = cosine_similarity(vectorizer.transform([sup_norm]), vectorizer.transform([ref_norm]))[0, 0]
        jac = jaccard_similarity_score(sup_tokens, ref_tokens)
        substr = 1 if is_substring(sup_norm, ref_norm) else 0
        seq_ratio = SequenceMatcher(None, sup_norm.lower(), ref_norm.lower()).ratio()
        emb_cos_sim = embedding_cosine_similarity(sup_norm.lower(), ref_norm.lower(), embedding_model)
        char_ngram_jaccard_score = char_ngram_jaccard(sup_norm.lower(), ref_norm.lower())

        # The 0.8 / 1.5 factors are hand-set feature weights.
        # NOTE(review): presumably they must match the weights used when the model was trained — confirm.
        return np.array([[0.8 * cos_sim, jac, substr, seq_ratio, 1.5 * emb_cos_sim, char_ngram_jaccard_score]])

    if feature_type == "embedding_feature":
        sup_embedding = embedding_model.encode(sup_norm)
        ref_embedding = embedding_model.encode(ref_norm)
        abs_diff = np.abs(sup_embedding - ref_embedding)
        mult_embedding = sup_embedding * ref_embedding
        return np.concatenate([sup_embedding, ref_embedding, abs_diff, mult_embedding])

    raise ValueError(f"Unsupported feature_type: {feature_type!r}")

def jaccard_similarity_score(s1_tokens: set, s2_tokens: set) -> float:
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two token sets.

    Two empty sets are treated as identical (1.0); exactly one empty set
    gives 0.0.
    """
    first_is_empty = not s1_tokens
    second_is_empty = not s2_tokens
    if first_is_empty and second_is_empty:
        return 1.0
    if first_is_empty or second_is_empty:
        return 0.0
    shared_count = len(s1_tokens.intersection(s2_tokens))
    total_count = len(s1_tokens.union(s2_tokens))
    return float(shared_count / total_count)


def is_substring(s1: str, s2: str) -> bool:
    """Return True when either string contains the other, ignoring case."""
    first = s1.lower()
    second = s2.lower()
    if first in second:
        return True
    return second in first


def load_vectorizer_pkl_file() -> TfidfVectorizer:
    """Unpickle and return the vectorizer stored at the configured path.

    The path is read from ``ConfigLoader.get_config()["data"]["vectorizer"]``
    and opened as given (no prefix is added).
    """
    data_config = ConfigLoader.get_config()["data"]
    # NOTE: pickle.load executes arbitrary code; only load files you trust.
    with open(data_config["vectorizer"], "rb") as pickle_file:
        return pickle.load(pickle_file)


def embedding_cosine_similarity(s1: str, s2: str, embedding_model: SentenceTransformer) -> float:
    """Return the cosine similarity between the embeddings of two strings.

    Each string is encoded separately with ``embedding_model.encode``.
    If either embedding has zero norm the result is 0.0, which avoids a
    division by zero.
    """
    first_vec = embedding_model.encode([s1])[0]
    second_vec = embedding_model.encode([s2])[0]

    first_len = np.linalg.norm(first_vec)
    second_len = np.linalg.norm(second_vec)
    if first_len == 0 or second_len == 0:
        return 0.0

    return float(np.dot(first_vec, second_vec) / (first_len * second_len))

def char_ngram_jaccard(s1: str, s2: str, n: int = 3) -> float:
    """
    Compute Jaccard similarity of character n-grams between two strings.
    By default, uses n=3 (trigrams).

    Edge cases:
      * both strings empty -> 1.0
      * exactly one string empty -> 0.0
      * a string shorter than ``n`` has no n-grams; the result is then 1.0
        only if the two strings are equal, otherwise 0.0. Previously two
        *different* short strings (e.g. "ab" vs "cd") scored 1.0 because
        both n-gram sets were empty.
    """
    if not s1 and not s2:
        return 1.0
    if not s1 or not s2:
        return 0.0

    def ngrams(text, n):
        # Set of all n-length substrings; empty when len(text) < n.
        return {text[i:i + n] for i in range(len(text) - n + 1)}

    set1 = ngrams(s1, n)
    set2 = ngrams(s2, n)
    if not set1 or not set2:
        # No n-grams to compare: fall back to exact equality so unrelated
        # short strings are not reported as identical.
        return 1.0 if s1 == s2 else 0.0
    return jaccard_similarity_score(set1, set2)


In [16]:
# Show the loaded model's per-feature importances.
# NOTE(review): presumably in the column order built by the "numeric_feature"
# branch of get_feature_inference — confirm.
print("Feature importances:", model.feature_importances_)

Feature importances: [0.7914176  0.04135257 0.12760274 0.01846616 0.01104024 0.01012074]


In [10]:
# Show the loaded model's per-feature importances.
# NOTE(review): this cell duplicates an earlier one; the differing saved output
# suggests it was run against a different model file — confirm which is current.
print("Feature importances:", model.feature_importances_)

Feature importances: [0.8251176  0.0324874  0.10662588 0.0183772  0.00639508 0.01099679]


In [11]:
# Sample supplier room names to be matched against the reference names below.
supplier_room_names = [
    'Classic Room - Olympic Queen Bed - ROOM ONLY',
    'CLASSIC ROOM ADA - ROOM ONLY',
    'SUPERIOR ROOM ADA - ROOM ONLY',
    'Superior Room - Olympic Queen Bed - ROOM ONLY',
    'Superior City View - Olympic Queen Bed - ROOM ONLY',
    'Balcony Room - Olympic Queen Bed - ROOM ONLY',
]
# Sample reference room names that act as match candidates.
reference_room_names = [
    'Classic Room',
    'Superior Room',
    'Superior Room with City View',
    'Balcony Room',
    'Classic Room - Disability Access',
    'Superior Room - Disability Access',
    'Junior Suite - Disability Access',
]

In [10]:
# Sample supplier room names to be matched against the reference names below.
supplier_room_names = [
    "Classic Room - Olympic Queen Bed - ROOM ONLY",
    "CLASSIC ROOM ADA - ROOM ONLY",
    "SUPERIOR ROOM ADA - ROOM ONLY",
    "Superior Room - Olympic Queen Bed - ROOM ONLY",
    "Superior City View - Olympic Queen Bed - ROOM ONLY",
    "Balcony Room - Olympic Queen Bed - ROOM ONLY",
]
# Sample reference room names that act as match candidates.
reference_room_names = [
    "Classic Room",
    "Superior Room",
    "Superior Room with City View",
    "Balcony Room",
    "Classic Room - Disability Access",
    "Superior Room - Disability Access",
    "Junior Suite - Disability Access",
]


In [13]:
# Minimum match probability for a supplier room to count as mapped.
MATCH_THRESHOLD = 0.9

Results = []        # supplier names whose best candidate reached MATCH_THRESHOLD
UnmappedRooms = []  # supplier names whose best candidate stayed below it

for supplier_room_name in supplier_room_names:
    # Normalise once per supplier instead of once per log line.
    supplier_norm = normalize_room_name(supplier_room_name)
    best_match: str = ""
    best_score: float = 0.0
    for reference_room_name in reference_room_names:
        reference_norm = normalize_room_name(reference_room_name)
        features = get_feature_inference(ref_name=reference_room_name, sup_name=supplier_room_name, vectorizer=vectorizer)
        # reshape(1, -1) yields a single-row 2-D input whether the features
        # came back as shape (1, k) or as a flat 1-D vector.
        match_prob: float = model.predict_proba(features.reshape(1, -1))[0, 1]
        # ">=" means that on ties the later reference wins.
        if match_prob >= best_score:
            best_score = match_prob
            best_match = reference_norm

        # Separator fixed: the original "/t" was a typo for the tab escape "\t".
        logger.info(f"{supplier_norm} ->\t-> {reference_norm} score: {match_prob}")

    is_mapped = best_score >= MATCH_THRESHOLD
    logger.info(
        f"Best Match: {best_match} score: {best_score} {is_mapped} of {supplier_norm}"
    )
    if is_mapped:
        Results.append(supplier_room_name)
    else:
        UnmappedRooms.append(supplier_room_name)

2025-03-31 23:01:28,490 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> classic room score: 0.308853417634964
2025-03-31 23:01:30,147 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> superior room score: 0.8980321288108826
2025-03-31 23:01:31,846 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> superior room with city view score: 0.7342244386672974
2025-03-31 23:01:33,201 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> balcony room score: 0.8607859015464783
2025-03-31 23:01:34,716 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> classic room disability access score: 0.33043402433395386
2025-03-31 23:01:36,263 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> superior room disability access score: 0.0017569343326613307
2025-03-31 23:01:38,224 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> junior suite disability access score: 0.01659599505364895


In [14]:
# Display the supplier room names that the threshold loop accepted as mapped
# (the list holds supplier names, not the matched reference names).
Results

['CLASSIC ROOM ADA - ROOM ONLY',
 'SUPERIOR ROOM ADA - ROOM ONLY',
 'Superior City View - Olympic Queen Bed - ROOM ONLY']

In [13]:
# Inspect the raw feature vector for a single (reference, supplier) pair.
# NOTE(review): ref_name ends with "score", which looks like it was copied from
# a log line rather than being part of a room name — confirm before relying on
# this output.
features = get_feature_inference(ref_name="classic room disability access score", sup_name="classic room ada room only", vectorizer=vectorizer)
features

array([ 0.02304631, -0.02174101, -0.034021  , ...,  0.00080381,
        0.00955819,  0.0015241 ], dtype=float32)

In [None]:
# Minimum match probability for a supplier room to count as mapped.
MATCH_THRESHOLD = 0.9

Results = []        # supplier names whose best candidate reached MATCH_THRESHOLD
UnmappedRooms = []  # supplier names whose best candidate stayed below it

for supplier_room_name in supplier_room_names:
    # Normalise once per supplier instead of once per log line.
    supplier_norm = normalize_room_name(supplier_room_name)
    best_match: str = ""
    best_score: float = 0.0
    for reference_room_name in reference_room_names:
        reference_norm = normalize_room_name(reference_room_name)
        features = get_feature_inference(ref_name=reference_room_name, sup_name=supplier_room_name, vectorizer=vectorizer)
        # reshape(1, -1) yields a single-row 2-D input whether the features
        # came back as shape (1, k) or as a flat 1-D vector.
        match_prob: float = model.predict_proba(features.reshape(1, -1))[0, 1]
        # ">=" means that on ties the later reference wins.
        if match_prob >= best_score:
            best_score = match_prob
            best_match = reference_norm

        # Separator fixed: the original "/t" was a typo for the tab escape "\t".
        logger.info(f"{supplier_norm} ->\t-> {reference_norm} score: {match_prob}")

    is_mapped = best_score >= MATCH_THRESHOLD
    logger.info(
        f"Best Match: {best_match} score: {best_score} {is_mapped} of {supplier_norm}"
    )
    if is_mapped:
        Results.append(supplier_room_name)
    else:
        UnmappedRooms.append(supplier_room_name)

2025-04-02 03:24:27,470 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> classic room score: 0.5183444619178772
2025-04-02 03:24:28,890 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> superior room score: 0.9867157340049744
2025-04-02 03:24:30,432 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> superior room with city view score: 0.0058548711240291595
2025-04-02 03:24:32,039 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> balcony room score: 0.9667254090309143
2025-04-02 03:24:33,494 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> classic room disability access score: 0.9609687924385071
2025-04-02 03:24:35,049 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> superior room disability access score: 0.00042875681538134813
2025-04-02 03:24:37,117 - cupid-api - INFO - classic room olympic queen bed room only ->/t-> junior suite disability access score: 0.00041915351175

In [None]:
# Quick manual check of the token-level Jaccard score for one pair of names.
ref_name="classic room  access score"
sup_name="classic room  access only"

# jaccard_similarity_score calls .intersection/.union on its arguments, so it
# needs token sets; passing the raw strings raised AttributeError.
jac = jaccard_similarity_score(set(sup_name.split()), set(ref_name.split()))
jac

In [None]:
# Display the supplier room names left unmapped by the most recently executed
# matching cell.
UnmappedRooms

In [None]:
# Print the reference (candidate) room names, one per line.
for room in reference_room_names:
    print(room)

In [None]:
# Print the supplier room names, one per line.
for room in supplier_room_names:
    print(room)

In [None]:
Results = [None] * len(supplier_room_names)  # filled per supplier index in Step 3
UnmappedRooms = []

# Step 1: Build full similarity matrix
# similarity_matrix[i][j] is the predicted match probability between
# supplier_room_names[i] and reference_room_names[j].
similarity_matrix = []
for supplier_room_name in supplier_room_names:
    supplier_norm = normalize_room_name(supplier_room_name)
    sim_row = []
    for reference_room_name in reference_room_names:
        features = get_feature_inference(ref_name=reference_room_name, sup_name=supplier_room_name, vectorizer=vectorizer)
        # reshape(1, -1) is required: the "embedding_feature" branch returns a
        # flat 1-D vector, which predict_proba would not accept as one sample.
        match_prob = model.predict_proba(features.reshape(1, -1))[0, 1]

        sim_row.append(match_prob)

        logger.info(f"{supplier_norm} ->\t-> {normalize_room_name(reference_room_name)} score: {match_prob}")

    similarity_matrix.append(sim_row)

# Step 2: Conflict-aware matching
# Each reference room is given to at most one supplier room. A supplier that
# loses its reference to a better-scoring supplier is put back on the queue so
# it can try its next-best candidate (previously it was left unmatched).
MIN_MATCH_PROB = 0.3  # candidates scoring below this are never assigned

assigned_refs = {}  # ref_index -> sup_index
match_sup_to_ref = [None] * len(supplier_room_names)

# Process suppliers with the strongest best-candidate first.
# default=0.0 keeps max() from raising when there are no reference rooms.
sup_order = sorted(
    range(len(supplier_room_names)),
    key=lambda i: max(similarity_matrix[i], default=0.0),
    reverse=True,
)

pending_sups = list(sup_order)
cursor = 0
while cursor < len(pending_sups):
    sup_idx = pending_sups[cursor]
    cursor += 1

    # Candidate references for this supplier, best score first.
    ref_scores = sorted(enumerate(similarity_matrix[sup_idx]), key=lambda x: -x[1])

    for ref_idx, score in ref_scores:
        if score < MIN_MATCH_PROB:
            break  # everything after this is below the threshold too

        current_sup = assigned_refs.get(ref_idx)
        if current_sup is None:
            # ref not assigned yet
            assigned_refs[ref_idx] = sup_idx
            match_sup_to_ref[sup_idx] = ref_idx
            break

        if score > similarity_matrix[current_sup][ref_idx]:
            # Reassign to the better supplier and re-queue the displaced one.
            # The strict ">" means every takeover raises the score held for
            # this reference, so the loop is guaranteed to terminate.
            match_sup_to_ref[current_sup] = None
            match_sup_to_ref[sup_idx] = ref_idx
            assigned_refs[ref_idx] = sup_idx
            pending_sups.append(current_sup)
            break

# Step 3: Build Results and UnmappedRooms
# Matched suppliers get the normalised reference name stored at their own
# index in Results; unmatched suppliers are collected in UnmappedRooms.
for sup_idx, ref_idx in enumerate(match_sup_to_ref):
    supplier_name = supplier_room_names[sup_idx]
    if ref_idx is None:
        UnmappedRooms.append(supplier_name)
        continue

    matched_ref_name = normalize_room_name(reference_room_names[ref_idx])
    Results[sup_idx] = matched_ref_name
    logger.info(
        f"Best Match: {matched_ref_name} score: {similarity_matrix[sup_idx][ref_idx]} True of {normalize_room_name(supplier_name)}"
    )

In [None]:
import json
import random
# Load the processed data set from disk.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open("../data/processed_data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

In [None]:
# Print every record whose first field starts with "superior room".
# The original compared s[0:14] (14 characters) with a 13-character literal,
# which could only match the exact string and never a longer name.
for data in train_data:
    # Each record unpacks into three fields.
    # NOTE(review): presumably (supplier name, reference name, label) — confirm.
    sup_text, _ref_text, _label = data
    if sup_text.startswith("superior room"):
        print(data)