In [3]:
import json
import random
with open("../data/train.json", "r") as f:
    train_data = json.load(f)


In [4]:
total_data = train_data["train_positive"] + train_data["train_negative"]
random.shuffle(total_data)


In [5]:
total_data[:10]

[['room speical room twin', 'room speical room twin', 1],
 ['standard double or twin', 'studio with terrace', 0],
 ['studio 1 double bed', 'threebedroom apartment', 0],
 ['standard double room 1 queen bed', 'queen room', 1],
 ['cottage 1 queen bed ensuite mountain view vinkel',
  'cottage 1 queen bed ensuite mountain view vinkel',
  1],
 ['classic room', 'classic apartment', 0],
 ['single room without ac', 'single room without ac', 1],
 ['double room puerto viejo', 'apartment', 0],
 ['himmapana hills 2 bedroom villa', 'himmapana hills 2 bedroom villa', 1],
 ['deluxe room 2 double beds', 'fivebedroom house', 0]]

In [10]:
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def token_jaccard(s1, s2):
    set1, set2 = set(s1.lower().split()), set(s2.lower().split())
    return len(set1 & set2) / len(set1 | set2) if set1 and set2 else 0.0

def is_substring(s1, s2):
    s1_low, s2_low = s1.lower(), s2.lower()
    return (s1_low in s2_low) or (s2_low in s1_low)


# Compute features
feature_matrix = []
labels = []


all_names = []
for s, r, l in total_data:
    all_names.extend([s,r])

vectorizer = TfidfVectorizer(stop_words='english').fit(all_names)


for sup_name, ref_name, label in total_data:
    # Similarities
    cos_sim = cosine_similarity(vectorizer.transform([sup_name]), vectorizer.transform([ref_name]))[0,0]
    jac = token_jaccard(sup_name, ref_name)
    substr = 1 if is_substring(sup_name, ref_name) else 0
    seq_ratio = SequenceMatcher(None, sup_name.lower(), ref_name.lower()).ratio()
    feature_matrix.append([cos_sim, jac, substr, seq_ratio])
    labels.append(label)

# print("Feature vector for each pair:", feature_matrix)
# print("Labels:", labels)


In [25]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Convert to numpy for XGBoost
X = np.array(feature_matrix)
y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# If classes are imbalanced, set scale_pos_weight or use class_weight to give more weight to matches.
imbalance_ratio = sum(y==0) / sum(y==1)
model.set_params(scale_pos_weight=imbalance_ratio)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Predict on test set
y_pred = model.predict(X_test)

# Metrics calculation
print("Classification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[0]	validation_0-logloss:0.45526
[1]	validation_0-logloss:0.32329
[2]	validation_0-logloss:0.24001
[3]	validation_0-logloss:0.18456
[4]	validation_0-logloss:0.14712
[5]	validation_0-logloss:0.12023
[6]	validation_0-logloss:0.10193
[7]	validation_0-logloss:0.08798
[8]	validation_0-logloss:0.07796
[9]	validation_0-logloss:0.07147
[10]	validation_0-logloss:0.06571
[11]	validation_0-logloss:0.06149
[12]	validation_0-logloss:0.05943
[13]	validation_0-logloss:0.05786
[14]	validation_0-logloss:0.05671
[15]	validation_0-logloss:0.05548
[16]	validation_0-logloss:0.05512
[17]	validation_0-logloss:0.05505
[18]	validation_0-logloss:0.05445
[19]	validation_0-logloss:0.05505
[20]	validation_0-logloss:0.05504
[21]	validation_0-logloss:0.05471
[22]	validation_0-logloss:0.05502
[23]	validation_0-logloss:0.05527
[24]	validation_0-logloss:0.05680
[25]	validation_0-logloss:0.05702
[26]	validation_0-logloss:0.05703
[27]	validation_0-logloss:0.05844
[28]	validation_0-logloss:0.05902


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[29]	validation_0-logloss:0.05944
[30]	validation_0-logloss:0.05950
[31]	validation_0-logloss:0.05937
[32]	validation_0-logloss:0.05929
[33]	validation_0-logloss:0.05977
[34]	validation_0-logloss:0.06025
[35]	validation_0-logloss:0.06082
[36]	validation_0-logloss:0.06037
[37]	validation_0-logloss:0.06098
[38]	validation_0-logloss:0.06094
[39]	validation_0-logloss:0.06216
[40]	validation_0-logloss:0.06309
[41]	validation_0-logloss:0.06362
[42]	validation_0-logloss:0.06397
[43]	validation_0-logloss:0.06443
[44]	validation_0-logloss:0.06525
[45]	validation_0-logloss:0.06569
[46]	validation_0-logloss:0.06650
[47]	validation_0-logloss:0.06627
[48]	validation_0-logloss:0.06659
[49]	validation_0-logloss:0.06715
[50]	validation_0-logloss:0.06759
[51]	validation_0-logloss:0.06775
[52]	validation_0-logloss:0.06799
[53]	validation_0-logloss:0.06820
[54]	validation_0-logloss:0.06772
[55]	validation_0-logloss:0.06798
[56]	validation_0-logloss:0.06812
[57]	validation_0-logloss:0.06776
[58]	validatio

Feature importances: [0.6255156  0.04862132 0.29742002 0.02844303]


In [20]:
print("Feature importances:", model.feature_importances_)

Feature importances: [0.60646695 0.04998904 0.31703532 0.02650871]


In [27]:
import re

def normalize_room_name(name: str) -> str:
    name = name.lower()                           # lowercasing
    name = re.sub(r'[^a-z0-9\s]', '', name)       # remove punctuation/special chars
    name = re.sub(r'\s+', ' ', name).strip()      # collapse multiple spaces
    return name

In [29]:
hotel3_supplier = ["Classic Room - Olympic Queen Bed - ROOM ONLY","CLASSIC ROOM ADA - ROOM ONLY","SUPERIOR ROOM ADA - ROOM ONLY","Superior Room - Olympic Queen Bed - ROOM ONLY","Superior City View - Olympic Queen Bed - ROOM ONLY","Balcony Room - Olympic Queen Bed - ROOM ONLY"]
hotel3_reference = ["Classic Room","Superior Room","Superior Room with City View","Balcony Room","Classic Room - Disability Access","Superior Room - Disability Access","Junior Suite - Disability Access"]

for sup_name in hotel3_supplier:
    best_match = None
    best_score = 0.0
    for ref_name in hotel3_reference:
        # Compute features for this pair (using the same functions and vectorizer as before)
        cos_sim = cosine_similarity(vectorizer.transform([normalize_room_name(sup_name)]), vectorizer.transform([normalize_room_name(ref_name)]))[0,0]
        jac = token_jaccard(sup_name, ref_name)
        substr = 1 if is_substring(sup_name, ref_name) else 0
        seq_ratio = SequenceMatcher(None, sup_name.lower(), ref_name.lower()).ratio()
        features = np.array([[cos_sim, jac, substr, seq_ratio]])
        # Predict probability of match
        match_prob = model.predict_proba(features)[0,1]
        if match_prob > best_score:
            best_score = match_prob
            best_match = ref_name
    # Decide if above threshold
    if best_score >= 0.99:  # using 0.5 for this example threshold
        print(f"{sup_name} → **MATCH** with \"{best_match}\" (score={best_score:.2f})")
    else:
        print(f"{sup_name} → **NO MATCH** (outlier) (score={best_score:.2f})")

Classic Room - Olympic Queen Bed - ROOM ONLY → **NO MATCH** (outlier) (score=0.94)
CLASSIC ROOM ADA - ROOM ONLY → **MATCH** with "Classic Room" (score=1.00)
SUPERIOR ROOM ADA - ROOM ONLY → **NO MATCH** (outlier) (score=0.97)
Superior Room - Olympic Queen Bed - ROOM ONLY → **NO MATCH** (outlier) (score=0.95)
Superior City View - Olympic Queen Bed - ROOM ONLY → **MATCH** with "Superior Room with City View" (score=1.00)
Balcony Room - Olympic Queen Bed - ROOM ONLY → **NO MATCH** (outlier) (score=0.95)
