In [6]:
import re
import json

def normalize_room_name(name: str) -> str:
    """Return a canonical form of a room name for text comparison.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not ``a-z``, ``0-9`` or whitespace
         (characters are removed, not replaced by a space, so
         "one-bedroom" becomes "onebedroom");
      3. collapse each whitespace run to a single space and trim both ends.

    Args:
        name: Raw room name.

    Returns:
        The normalised room name.
    """
    lowered = name.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()


# print(normalize_room_name("Deluxe Suite with Balcony!"))  
# Load the merged hotel records. The encoding is given explicitly so the file
# is decoded as UTF-8 on every platform instead of with the locale default.
# The following cells iterate `data` as a sequence of records that each carry
# "reference" and "supplier" collections of room-name strings.
with open("../data/merged.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [8]:
# Build a normalised copy of every record: same two keys, with each room name
# passed through normalize_room_name. Record order and name order are kept.
clean_data = [
    {
        "reference": [normalize_room_name(ref) for ref in record["reference"]],
        "supplier": [normalize_room_name(sup) for sup in record["supplier"]],
    }
    for record in data
]
    

In [10]:
# Preview the first three normalised records as a quick sanity check.
clean_data[:3]

[{'reference': ['house 4 bedrooms'], 'supplier': ['house 4 bedrooms']},
 {'reference': ['onebedroom apartment with terrace',
   'onebedroom apartment with terrace and sea view'],
  'supplier': ['double room single use',
   'double or twin room',
   'triple room',
   'triple room 2 adults 1 child',
   'quadruple room 2 adults 2 children',
   'quadruple room']},
 {'reference': ['condo 1 queen bed with sofa bed kitchen'],
  'supplier': ['condo 1 queen bed with sofa bed kitchen']}]

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
# Words that are not counted as a real difference when they appear in a
# reference room name but not in the matched supplier name. The list is kept
# in alphabetical order, followed by the single digits "0".."9".
ignore_words = [
    'a', 'access', 'accessible', 'air', 'allowed', 'and',
    'basic', 'bath', 'bathroom', 'bathrooms', 'bed', 'bedroom', 'bedrooms', 'beds',
    'cabin', 'chalet', 'city', 'classic', 'comfort', 'companion', 'complimentary',
    'dinner',
    'first', 'floor',
    'garden', 'guest',
    'halfboard', 'hearing', 'high', 'hot',
    'included', 'is',
    'j', 'jacuzzi', 'japanese', 'junior',
    'kitchen',
    'luxury',
    'mandatory', 'menaggio', 'microwave', 'mobility', 'mountain', 'multiple',
    'netflixdis', 'no', 'non', 'nonsmoking', 'not',
    'ocean', 'one', 'open', 'or',
    'panoramic', 'partial', 'pet', 'pets', 'pool',
    'resort', 'romantic', 'room', 'royal',
    's', 'sea', 'shared', 'signature', 'smoking', 'sofa', 'spa', 'standard',
    'suite', 'superior', 'supreme',
    'terrace', 'third', 'three', 'tub', 'twin', 'two',
    'view', 'village',
    'w', 'waterfront', 'western', 'with',
] + list("0123456789")


In [45]:
def analyze_strings(data, word_to_check):
    """Tell whether the extra words were simply added before or after the supplier name.

    Comparison is case-insensitive and done on whole whitespace-separated words.

    Args:
        data: Mapping with key ``'s'`` (supplier room name) and key ``'r'``
            (reference room name).
        word_to_check: Ordered list of words that occur in ``r`` and are to be
            tested as a leading or trailing run of ``r``.

    Returns:
        True when ``word_to_check`` forms the leading words of ``r`` and the
        next word of ``r`` equals the first word of ``s``, or forms the
        trailing words of ``r`` and the preceding word of ``r`` equals the
        last word of ``s``. If the run is both leading and trailing, both
        neighbour checks must hold. False in every other case, including an
        empty ``word_to_check``, an empty ``s``, or an ``r`` that has no word
        left outside the run.
    """
    s_words = data['s'].lower().split()
    r_words = data['r'].lower().split()
    extra = [w.lower() for w in word_to_check]
    n_extra = len(extra)

    # Without extra words, supplier words, or a neighbouring reference word
    # there is nothing to line up, so this cannot be a pure prefix/suffix case.
    if not extra or not s_words or n_extra >= len(r_words):
        return False

    # Compare word lists rather than raw characters so "king" is not treated
    # as a prefix of "kingsize ...".
    starts_with_word = r_words[:n_extra] == extra
    ends_with_word = r_words[-n_extra:] == extra

    # The neighbour position is derived from the run length, not from
    # list.index(), which would pick the wrong slot when a word repeats.
    if starts_with_word and r_words[n_extra] != s_words[0]:
        return False

    if ends_with_word and r_words[-n_extra - 1] != s_words[-1]:
        return False

    return starts_with_word or ends_with_word

In [49]:
# Fit a single TF-IDF vocabulary over every supplier and reference name so
# that all hotels are vectorised in the same feature space.
all_names = [
    name
    for hotel in clean_data
    for name in hotel["supplier"] + hotel["reference"]
]
vectorizer = TfidfVectorizer(stop_words='english').fit(all_names)

# Identify likely matches by cosine similarity within each hotel

# Kept for ad-hoc inspection: extend it with `no_match` inside the loop and
# look at set(words_not_matching) to list the words that differ.
words_not_matching = []
threshold = 0.7                    # minimum similarity to report a pair
exact_match_cutoff = 0.9999999999  # scores at or above this are skipped as identical names
ignore_lookup = set(ignore_words)  # set membership instead of scanning the list per word

for hotels in clean_data:
    supplier = hotels['supplier']
    reference = hotels['reference']
    if not supplier or not reference:
        # cosine_similarity raises on an input with zero rows; there is
        # nothing to pair up for this hotel anyway.
        continue

    sims = cosine_similarity(vectorizer.transform(supplier), vectorizer.transform(reference))
    for i, sup_name in enumerate(supplier):
        # argmax returns the first index of the row maximum, i.e. the same
        # reference the strict ">" scan would pick for any positive score.
        best_j = sims[i].argmax()
        highest_score = sims[i, best_j]
        if not (threshold <= highest_score < exact_match_cutoff):
            continue

        highest_match = {"s": sup_name, "r": reference[best_j]}
        sup_words = set(sup_name.split())
        # Reference words, in order, that the supplier name lacks.
        no_match = [w for w in highest_match["r"].split() if w not in sup_words]

        # Skip pairs whose only differences are ignorable words.
        if not any(w not in ignore_lookup for w in no_match):
            continue

        if analyze_strings(highest_match, no_match):
            print(f"{highest_score:.2f}", highest_match["s"], "\t", highest_match["r"], no_match ,)

0.82 twin room 	 double or twin room ['double', 'or']
0.71 twin room 2 twin beds 	 double or twin room ['double', 'or']
0.74 twin room 	 deluxe twin room ['deluxe']
0.90 suite city view 	 deluxe suite city view ['deluxe']
0.70 comfort double room 	 comfort double room ensuite ['ensuite']
0.74 twin room 	 deluxe twin room ['deluxe']
0.81 villa 	 deluxe villa ['deluxe']
0.74 twin room 	 deluxe twin room ['deluxe']
0.77 tranquil studio double 	 tranquil studio double guest room 1 king city view ['guest', 'room', '1', 'king', 'city', 'view']
0.82 twin room 	 double or twin room ['double', 'or']
0.78 standard double or twin room 	 standard double or twin room with balcony ['with', 'balcony']
0.93 twin room multiple beds non smoking 	 deluxe twin room multiple beds non smoking ['deluxe']
0.76 double room 1 double bed smoking 	 business double room 1 double bed smoking ['business']
0.96 suite est 	 deluxe suite est ['deluxe']
0.83 suite 2 bedrooms sea view 	 executive suite 2 bedrooms sea vie

['0',
 '1',
 '2',
 '2200last',
 '3',
 '4',
 '6',
 '9',
 'a',
 'access',
 'accessible',
 'air',
 'allowed',
 'and',
 'apartment',
 'balcony',
 'basic',
 'bath',
 'bathroom',
 'bathrooms',
 'bed',
 'bedroom',
 'bedrooms',
 'beds',
 'bungalow',
 'business',
 'cabin',
 'chalet',
 'city',
 'classic',
 'comfort',
 'companion',
 'complimentary',
 'deluxe',
 'dinner',
 'double',
 'duplex',
 'economy',
 'ensuite',
 'executive',
 'family',
 'first',
 'floor',
 'garden',
 'grand',
 'guest',
 'halfboard',
 'hearing',
 'high',
 'hot',
 'house',
 'included',
 'is',
 'j',
 'jacuzzi',
 'japanese',
 'junior',
 'king',
 'kitchen',
 'luxury',
 'mandatory',
 'menaggio',
 'microwave',
 'mobility',
 'mountain',
 'multiple',
 'netflixdis',
 'no',
 'non',
 'nonsmoking',
 'not',
 'ocean',
 'one',
 'open',
 'or',
 'panoramic',
 'partial',
 'pet',
 'pets',
 'pool',
 'premium',
 'private',
 'quadruple',
 'queen',
 'resort',
 'romantic',
 'room',
 'royal',
 's',
 'sea',
 'shared',
 'signature',
 'single',
 'smokin