# use sentence transformer as the feature, instead of manual feature, should have better semantics from data. 

In [1]:
import json
import random
with open("../data/train.json", "r") as f:
    train_data = json.load(f)

total_data = train_data["train_positive"] + train_data["train_negative"]
random.shuffle(total_data)


In [2]:
from sentence_transformers import SentenceTransformer
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif


smodel = SentenceTransformer('all-MiniLM-L6-v2')

feature_matrix = []
labels = []


from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def token_jaccard(s1, s2):
    set1, set2 = set(s1.lower().split()), set(s2.lower().split())
    return len(set1 & set2) / len(set1 | set2) if set1 and set2 else 0.0

def is_substring(s1, s2):
    s1_low, s2_low = s1.lower(), s2.lower()
    return (s1_low in s2_low) or (s2_low in s1_low)


# Compute features
feature_matrix = []
labels = []


all_names = []
for s, r, l in total_data:
    all_names.extend([s,r])

vectorizer = TfidfVectorizer(stop_words='english').fit(all_names)


for sup_name, ref_name, label in total_data:
    embedding_sup = smodel.encode(sup_name)
    embedding_ref = smodel.encode(ref_name)
    abs_diff = np.abs(embedding_sup - embedding_ref)
    mult_embedding = embedding_sup * embedding_ref

    cos_sim = cosine_similarity(vectorizer.transform([sup_name]), vectorizer.transform([ref_name]))[0,0]
    jac = token_jaccard(sup_name, ref_name)
    substr = 1 if is_substring(sup_name, ref_name) else 0
    seq_ratio = SequenceMatcher(None, sup_name.lower(), ref_name.lower()).ratio()

    numeric_features = np.array([cos_sim, jac, substr, seq_ratio])
    embedding_features = np.concatenate([ embedding_sup, embedding_ref, mult_embedding])
    
    pair_embedding = np.concatenate([numeric_features, embedding_features])
    feature_matrix.append(pair_embedding)
    labels.append(label)

scaler = StandardScaler()
feature_matrix_scaled = scaler.fit_transform(feature_matrix)

# selector = SelectKBest(score_func=f_classif, k=1000)
# feature_matrix_selected = selector.fit_transform(feature_matrix_scaled, labels)

feature_matrix = np.array(feature_matrix_scaled)
labels = np.array(labels)


In [3]:
feature_matrix.shape

(4000, 1156)

In [4]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Convert to numpy for XGBoost
X = np.array(feature_matrix)
y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# If classes are imbalanced, set scale_pos_weight or use class_weight to give more weight to matches.
imbalance_ratio = sum(y==0) / sum(y==1)
model.set_params(scale_pos_weight=imbalance_ratio)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Predict on test set
y_pred = model.predict(X_test)

# Metrics calculation
print("Classification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[0]	validation_0-logloss:0.45902


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[1]	validation_0-logloss:0.32568
[2]	validation_0-logloss:0.24053
[3]	validation_0-logloss:0.18612
[4]	validation_0-logloss:0.14900
[5]	validation_0-logloss:0.12332
[6]	validation_0-logloss:0.10417
[7]	validation_0-logloss:0.09155
[8]	validation_0-logloss:0.08110
[9]	validation_0-logloss:0.07516
[10]	validation_0-logloss:0.06851
[11]	validation_0-logloss:0.06542
[12]	validation_0-logloss:0.06208
[13]	validation_0-logloss:0.06129
[14]	validation_0-logloss:0.05939
[15]	validation_0-logloss:0.05802
[16]	validation_0-logloss:0.05605
[17]	validation_0-logloss:0.05525
[18]	validation_0-logloss:0.05544
[19]	validation_0-logloss:0.05396
[20]	validation_0-logloss:0.05285
[21]	validation_0-logloss:0.05336
[22]	validation_0-logloss:0.05480
[23]	validation_0-logloss:0.05377
[24]	validation_0-logloss:0.05425
[25]	validation_0-logloss:0.05422
[26]	validation_0-logloss:0.05437
[27]	validation_0-logloss:0.05470
[28]	validation_0-logloss:0.05454
[29]	validation_0-logloss:0.05454
[30]	validation_0-loglo

In [5]:
print("Feature importances:", model.feature_importances_)

Feature importances: [0.2633692  0.00457201 0.01939864 ... 0.00726019 0.         0.        ]


In [6]:
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

all_names = []
for s, r, l in total_data:
    all_names.extend([s,r])

vectorizer = TfidfVectorizer(stop_words='english').fit(all_names)

def normalize_room_name(name: str) -> str:
    name = name.lower()                           # lowercasing
    name = re.sub(r'[^a-z0-9\s]', '', name)       # remove punctuation/special chars
    name = re.sub(r'\s+', ' ', name).strip()      # collapse multiple spaces
    return name

In [7]:


hotel3_supplier = ["Classic Room - Olympic Queen Bed - ROOM ONLY","CLASSIC ROOM ADA - ROOM ONLY","SUPERIOR ROOM ADA - ROOM ONLY","Superior Room - Olympic Queen Bed - ROOM ONLY","Superior City View - Olympic Queen Bed - ROOM ONLY","Balcony Room - Olympic Queen Bed - ROOM ONLY"]
hotel3_reference = ["Classic Room","Superior Room","Superior Room with City View","Balcony Room","Classic Room - Disability Access","Superior Room - Disability Access","Junior Suite - Disability Access"]





for sup_name in hotel3_supplier:
    best_match = None
    best_score = 0.0
    for ref_name in hotel3_reference:
        embedding_sup = smodel.encode(sup_name)
        embedding_ref = smodel.encode(ref_name)
        abs_diff = np.abs(embedding_sup - embedding_ref)
        mult_embedding = embedding_sup * embedding_ref
        pair_embedding = np.concatenate([embedding_sup, embedding_ref, mult_embedding])

        # print(pair_embedding.shape)
        # Predict probability of match

        cos_sim = cosine_similarity(vectorizer.transform([sup_name]), vectorizer.transform([ref_name]))[0,0]
        jac = token_jaccard(sup_name, ref_name)
        substr = 1 if is_substring(sup_name, ref_name) else 0
        seq_ratio = SequenceMatcher(None, sup_name.lower(), ref_name.lower()).ratio()
    
        numeric_features = np.array([cos_sim, jac, substr, seq_ratio])
        
        pair_embedding = np.concatenate([numeric_features, pair_embedding])

        # print(pair_embedding.shape)
        pair_embedding_scaled = scaler.transform(pair_embedding.reshape(1,-1))
        # pair_embedding_selected = selector.transform(pair_embedding_scaled).reshape(1, 1000)

        pair_embedding = np.array(pair_embedding_scaled)
        match_prob = model.predict_proba(pair_embedding)[0,1]
        if match_prob > best_score:
            best_score = match_prob
            best_match = ref_name
    # Decide if above threshold
    if best_score >= 0.1:  # using 0.5 for this example threshold
        print(f"{sup_name} → **MATCH** with \"{best_match}\" (score={best_score:.2f})")
    else:
        print(f"{sup_name} → **NO MATCH** (outlier) (score={best_score:.2f})")

Classic Room - Olympic Queen Bed - ROOM ONLY → **MATCH** with "Classic Room" (score=0.99)
CLASSIC ROOM ADA - ROOM ONLY → **MATCH** with "Classic Room" (score=1.00)
SUPERIOR ROOM ADA - ROOM ONLY → **MATCH** with "Superior Room" (score=0.76)
Superior Room - Olympic Queen Bed - ROOM ONLY → **MATCH** with "Superior Room" (score=1.00)
Superior City View - Olympic Queen Bed - ROOM ONLY → **MATCH** with "Superior Room with City View" (score=0.16)
Balcony Room - Olympic Queen Bed - ROOM ONLY → **MATCH** with "Balcony Room" (score=1.00)


In [8]:
hotel3_supplier

['Classic Room - Olympic Queen Bed - ROOM ONLY',
 'CLASSIC ROOM ADA - ROOM ONLY',
 'SUPERIOR ROOM ADA - ROOM ONLY',
 'Superior Room - Olympic Queen Bed - ROOM ONLY',
 'Superior City View - Olympic Queen Bed - ROOM ONLY',
 'Balcony Room - Olympic Queen Bed - ROOM ONLY']

In [9]:
hotel3_reference

['Classic Room',
 'Superior Room',
 'Superior Room with City View',
 'Balcony Room',
 'Classic Room - Disability Access',
 'Superior Room - Disability Access',
 'Junior Suite - Disability Access']