In [1]:
import json
import random
with open("../data/train.json", "r") as f:
    train_data = json.load(f)


In [2]:
total_data = train_data["train_positive"] + train_data["train_negative"]
random.shuffle(total_data)


In [3]:
total_data[:10]

[['apartment 2 bedrooms', 'apartment', 1],
 ['apartment ensuite 62209', 'apartment ensuite 62209', 1],
 ['family studio suite city view', 'family studio suite', 1],
 ['deluxe double room terrace', 'holiday home', 0],
 ['quadruple room 2 double beds refrigerator microwave ocean view',
  'onebedroom apartment',
  0],
 ['presidential apartment private bathroom 3',
  'presidential apartment private bathroom 3 no pets',
  1],
 ['family room spa access', 'family room spa access', 1],
 ['junior suite 2 queen beds accessible refrigerator microwave',
  'sixbedroom house',
  0],
 ['appartraum', 'appartraum', 1],
 ['twin room with views', 'onebedroom apartment', 0]]

In [4]:
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def token_jaccard(s1, s2):
    set1, set2 = set(s1.lower().split()), set(s2.lower().split())
    return len(set1 & set2) / len(set1 | set2) if set1 and set2 else 0.0

def is_substring(s1, s2):
    s1_low, s2_low = s1.lower(), s2.lower()
    return (s1_low in s2_low) or (s2_low in s1_low)


# Compute features
feature_matrix = []
labels = []


all_names = []
for s, r, l in total_data:
    all_names.extend([s,r])

vectorizer = TfidfVectorizer(stop_words='english').fit(all_names)


for sup_name, ref_name, label in total_data:
    # Similarities
    cos_sim = cosine_similarity(vectorizer.transform([sup_name]), vectorizer.transform([ref_name]))[0,0]
    jac = token_jaccard(sup_name, ref_name)
    substr = 1 if is_substring(sup_name, ref_name) else 0
    seq_ratio = SequenceMatcher(None, sup_name.lower(), ref_name.lower()).ratio()
    feature_matrix.append([cos_sim, jac, substr, seq_ratio])
    labels.append(label)

# print("Feature vector for each pair:", feature_matrix)
# print("Labels:", labels)


In [5]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Convert to numpy for XGBoost
X = np.array(feature_matrix)
y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# If classes are imbalanced, set scale_pos_weight or use class_weight to give more weight to matches.
imbalance_ratio = sum(y==0) / sum(y==1)
model.set_params(scale_pos_weight=imbalance_ratio)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Predict on test set
y_pred = model.predict(X_test)

# Metrics calculation
print("Classification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[0]	validation_0-logloss:0.45866
[1]	validation_0-logloss:0.33205
[2]	validation_0-logloss:0.25161
[3]	validation_0-logloss:0.19804
[4]	validation_0-logloss:0.16158
[5]	validation_0-logloss:0.13459
[6]	validation_0-logloss:0.11801
[7]	validation_0-logloss:0.10464
[8]	validation_0-logloss:0.09455
[9]	validation_0-logloss:0.08830
[10]	validation_0-logloss:0.08312
[11]	validation_0-logloss:0.08032
[12]	validation_0-logloss:0.07757
[13]	validation_0-logloss:0.07504
[14]	validation_0-logloss:0.07345
[15]	validation_0-logloss:0.07207
[16]	validation_0-logloss:0.07099
[17]	validation_0-logloss:0.07051
[18]	validation_0-logloss:0.07117
[19]	validation_0-logloss:0.07128
[20]	validation_0-logloss:0.07069
[21]	validation_0-logloss:0.07073
[22]	validation_0-logloss:0.07075
[23]	validation_0-logloss:0.07070
[24]	validation_0-logloss:0.07208
[25]	validation_0-logloss:0.07210
[26]	validation_0-logloss:0.07132
[27]	validation_0-logloss:0.07124
[28]	validation_0-logloss:0.07174
[29]	validation_0-loglos

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[42]	validation_0-logloss:0.07413
[43]	validation_0-logloss:0.07430
[44]	validation_0-logloss:0.07453
[45]	validation_0-logloss:0.07459
[46]	validation_0-logloss:0.07452
[47]	validation_0-logloss:0.07442
[48]	validation_0-logloss:0.07454
[49]	validation_0-logloss:0.07504
[50]	validation_0-logloss:0.07518
[51]	validation_0-logloss:0.07588
[52]	validation_0-logloss:0.07622
[53]	validation_0-logloss:0.07699
[54]	validation_0-logloss:0.07712
[55]	validation_0-logloss:0.07695
[56]	validation_0-logloss:0.07727
[57]	validation_0-logloss:0.07750
[58]	validation_0-logloss:0.07785
[59]	validation_0-logloss:0.07836
[60]	validation_0-logloss:0.07806
[61]	validation_0-logloss:0.07778
[62]	validation_0-logloss:0.07815
[63]	validation_0-logloss:0.07797
[64]	validation_0-logloss:0.07817
[65]	validation_0-logloss:0.07802
[66]	validation_0-logloss:0.07847
[67]	validation_0-logloss:0.07869
[68]	validation_0-logloss:0.07926
[69]	validation_0-logloss:0.07982
[70]	validation_0-logloss:0.07993
[71]	validatio

In [6]:
print("Feature importances:", model.feature_importances_)

Feature importances: [0.6288842  0.05110995 0.29283082 0.02717502]


In [7]:
import re

def normalize_room_name(name: str) -> str:
    name = name.lower()                           # lowercasing
    name = re.sub(r'[^a-z0-9\s]', '', name)       # remove punctuation/special chars
    name = re.sub(r'\s+', ' ', name).strip()      # collapse multiple spaces
    return name

In [8]:
hotel3_supplier = ["Classic Room - Olympic Queen Bed - ROOM ONLY","CLASSIC ROOM ADA - ROOM ONLY","SUPERIOR ROOM ADA - ROOM ONLY","Superior Room - Olympic Queen Bed - ROOM ONLY","Superior City View - Olympic Queen Bed - ROOM ONLY","Balcony Room - Olympic Queen Bed - ROOM ONLY"]
hotel3_reference = ["Classic Room","Superior Room","Superior Room with City View","Balcony Room","Classic Room - Disability Access","Superior Room - Disability Access","Junior Suite - Disability Access"]

for sup_name in hotel3_supplier:
    best_match = None
    best_score = 0.0
    for ref_name in hotel3_reference:
        # Compute features for this pair (using the same functions and vectorizer as before)
        cos_sim = cosine_similarity(vectorizer.transform([normalize_room_name(sup_name)]), vectorizer.transform([normalize_room_name(ref_name)]))[0,0]
        jac = token_jaccard(sup_name, ref_name)
        substr = 1 if is_substring(sup_name, ref_name) else 0
        seq_ratio = SequenceMatcher(None, sup_name.lower(), ref_name.lower()).ratio()
        features = np.array([[cos_sim, jac, substr, seq_ratio]])
        # Predict probability of match
        match_prob = model.predict_proba(features)[0,1]
        if match_prob > best_score:
            best_score = match_prob
            best_match = ref_name
    # Decide if above threshold
    if best_score >= 0.99:  # using 0.5 for this example threshold
        print(f"{sup_name} → **MATCH** with \"{best_match}\" (score={best_score:.2f})")
    else:
        print(f"{sup_name} → **NO MATCH** (outlier) (score={best_score:.2f})")

Classic Room - Olympic Queen Bed - ROOM ONLY → **NO MATCH** (outlier) (score=0.91)
CLASSIC ROOM ADA - ROOM ONLY → **MATCH** with "Classic Room" (score=1.00)
SUPERIOR ROOM ADA - ROOM ONLY → **NO MATCH** (outlier) (score=0.67)
Superior Room - Olympic Queen Bed - ROOM ONLY → **NO MATCH** (outlier) (score=0.96)
Superior City View - Olympic Queen Bed - ROOM ONLY → **MATCH** with "Superior Room with City View" (score=1.00)
Balcony Room - Olympic Queen Bed - ROOM ONLY → **NO MATCH** (outlier) (score=0.96)
