In [1]:
import re
import json

def normalize_room_name(name: str) -> str:
    """Return a canonical, comparison-friendly form of a room name.

    The name is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (deleted, not replaced by a space, so
    ``"one-bedroom"`` becomes ``"onebedroom"``), runs of whitespace are
    squeezed into a single space, and leading/trailing whitespace is trimmed.
    """
    alnum_only = re.sub(r'[^a-z0-9\s]', '', name.lower())
    return re.sub(r'\s+', ' ', alnum_only).strip()


# Load the merged hotel records. The encoding is stated explicitly because
# JSON text is UTF-8, while open() would otherwise fall back to the platform's
# locale encoding and could mis-decode non-ASCII room names.
with open("../data/merged.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [2]:
# Normalise every room name of every record, keeping the per-record grouping
# into "reference" and "supplier" lists.
clean_data = [
    {
        "reference": [normalize_room_name(ref) for ref in d["reference"]],
        "supplier": [normalize_room_name(sup) for sup in d["supplier"]],
    }
    for d in data
]
    

In [3]:
# Sanity check: show the first three normalised records.
clean_data[:3]

[{'reference': ['house 4 bedrooms'], 'supplier': ['house 4 bedrooms']},
 {'reference': ['onebedroom apartment with terrace',
   'onebedroom apartment with terrace and sea view'],
  'supplier': ['double room single use',
   'double or twin room',
   'triple room',
   'triple room 2 adults 1 child',
   'quadruple room 2 adults 2 children',
   'quadruple room']},
 {'reference': ['condo 1 queen bed with sofa bed kitchen'],
  'supplier': ['condo 1 queen bed with sofa bed kitchen']}]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Descriptive words, plus the single digits "0"-"9", that the matching step
# treats as ignorable when they occur in a reference name but not in the
# supplier name. Kept as a list, in alphabetical order followed by the digits.
ignore_words = """
access accessible air allowed and
basic bath bathroom bathrooms bed bedroom bedrooms beds
cabin chalet city classic comfort companion complimentary
dinner
first floor
garden guest
halfboard hearing high hot
included is
j jacuzzi japanese junior
kitchen
luxury
mandatory menaggio microwave mobility mountain multiple
netflixdis no non nonsmoking not
ocean one open or
panoramic partial pet pets pool
resort romantic room royal
s sea shared signature smoking sofa spa standard suite superior supreme
terrace third three tub twin two
view village
w waterfront western with
""".split() + list("0123456789")


In [23]:
def analyze_strings(data, word_to_check):
    """Check that the extra reference words sit cleanly at one end of the name.

    Parameters
    ----------
    data : mapping
        Holds the supplier name under ``'s'`` and the reference name under
        ``'r'``. Both are lower-cased before comparison.
    word_to_check : sequence of str
        The words, in order, that should form the leading or trailing run of
        the reference name (the caller passes the reference words missing
        from the supplier name).

    Returns
    -------
    bool
        ``True`` when ``word_to_check`` is empty. Otherwise ``True`` only if
        the words form a whole-word prefix and/or suffix of the reference name
        and the reference word right next to that run equals the first
        (prefix case) or last (suffix case) supplier word. ``False`` in every
        other case, including when the reference name has no word besides the
        extra ones or the supplier name is empty.
    """
    if not word_to_check:
        return True

    extra_words = list(word_to_check)
    extra_count = len(extra_words)
    s_words = data['s'].lower().split()
    r_words = data['r'].lower().split()

    # Compare whole words rather than raw string prefixes/suffixes, so that
    # e.g. "bed" is not mistaken for the start of "bedroom ...".
    starts_with_word = r_words[:extra_count] == extra_words
    ends_with_word = r_words[-extra_count:] == extra_words

    if not (starts_with_word or ends_with_word):
        return False

    # There must be a neighbouring reference word to anchor against and at
    # least one supplier word to compare it with.
    if extra_count >= len(r_words) or not s_words:
        return False

    # Prefix case: the word following the extra run must open the supplier name.
    if starts_with_word and r_words[extra_count] != s_words[0]:
        return False

    # Suffix case: the word preceding the extra run must close the supplier name.
    if ends_with_word and r_words[-extra_count - 1] != s_words[-1]:
        return False

    return True

In [58]:
# Fit a single TF-IDF vocabulary over every normalised room name (supplier
# and reference alike) so all similarity scores share one feature space.
all_names = [
    name
    for hotel in clean_data
    for name in hotel["supplier"] + hotel["reference"]
]
vectorizer = TfidfVectorizer(stop_words='english').fit(all_names)

# Identify likely matches by cosine similarity within each hotel

good_pairs = set()  # (supplier name, reference name, 1)
bad_pairs = set()   # (supplier name, reference name, 0)

words_not_matching = []  # not populated by this cell
# Minimum cosine similarity for a best match to be examined as a positive.
threshold = 0.5
# Set copy of the ignorable vocabulary for constant-time membership tests.
ignorable_words = set(ignore_words)

for hotels in clean_data:
    supplier = hotels['supplier']
    reference = hotels['reference']
    # No pair can be formed when either side is empty, and the similarity
    # computation is not defined for an empty list of names.
    if not supplier or not reference:
        continue

    sims = cosine_similarity(vectorizer.transform(supplier), vectorizer.transform(reference))
    for i, sup_name in enumerate(supplier):
        # Best-scoring reference for this supplier name; on ties the later
        # reference wins because of the >= comparison.
        highest_score = 0
        highest_match = {}
        for j, ref_name in enumerate(reference):
            if sims[i, j] >= highest_score:
                highest_match = {"s": sup_name, "r": ref_name}
                highest_score = sims[i, j]

        if highest_score >= threshold:
            sup_words = set(sup_name.split())
            # Reference words (in order, duplicates kept) absent from the supplier name.
            no_match = [w for w in highest_match["r"].split() if w not in sup_words]
            # Accept only when every missing word is ignorable and the missing
            # words sit cleanly at the start or end of the reference name.
            if all(w in ignorable_words for w in no_match) and analyze_strings(highest_match, no_match):
                good_pairs.add((highest_match["s"], highest_match["r"], 1))
                continue

        # Everything else, including best matches below the threshold, is a negative.
        bad_pairs.add((highest_match["s"], highest_match["r"], 0))

In [26]:
# Number of accepted vs. rejected pairs.
# NOTE(review): this cell's execution count (26) is lower than that of the
# matching cell (58), so the recorded output presumably comes from an earlier
# version of the matching logic - confirm or clear it.
len(good_pairs), len(bad_pairs)

(52293, 61425)

In [31]:
# Number of accepted vs. rejected pairs.
# NOTE(review): this cell's execution count (31) is lower than that of the
# matching cell (58), so the recorded output presumably comes from an earlier
# version of the matching logic - confirm or clear it.
len(good_pairs), len(bad_pairs)

(20161, 43648)

In [59]:
# Number of accepted vs. rejected pairs produced by the matching cell.
len(good_pairs), len(bad_pairs)

(20542, 43267)

In [33]:
# Spot-check ten accepted pairs. good_pairs is a set, so which ten pairs this
# slice shows is arbitrary and can differ between kernel sessions.
list(good_pairs)[50:60]

[('standard double room non smoking schafgarbe',
  'standard double room non smoking schafgarbe',
  1),
 ('luxury penthouse 2 bedrooms balcony city view',
  'luxury penthouse 2 bedrooms balcony city view',
  1),
 ('room momomonica', 'room momomonica', 1),
 ('standard double room upperdeck', 'standard double room upperdeck', 1),
 ('superior double room external private bathroom',
  'superior double room external private bathroom',
  1),
 ('apartment 2 bedrooms accessible private pool',
  'apartment 2 bedrooms accessible private pool',
  1),
 ('executive apartment 2 queen beds', 'executive apartment 2 queen beds', 1),
 ('comfort room 2 twin beds annexe', 'comfort room 2 twin beds annexe', 1),
 ('basic single private ensuite room', 'basic single private ensuite room', 1),
 ('family room balcony', 'family room balcony', 1)]

In [65]:
# Split the accepted pairs into those whose supplier and reference names are
# identical (same_list) and those whose names differ (diff_list).
# A set has no stable iteration order across kernel sessions, so sort first
# to make both lists, and anything sampled from them, reproducible.
ordered_good_pairs = sorted(good_pairs)
same_list = [pair for pair in ordered_good_pairs if pair[0] == pair[1]]
diff_list = [pair for pair in ordered_good_pairs if pair[0] != pair[1]]
        


In [66]:
# How many accepted pairs are identical-name vs. differing-name matches.
len(same_list), len(diff_list)

(19735, 807)

In [70]:
import random

# Positive examples: keep every differing-name pair and top the set up with
# randomly chosen identical-name pairs until it holds 2000 examples.
n_positive = 2000
# Clamp the draw so it can be neither negative (more than 2000 differing
# pairs) nor larger than the pool; random.sample raises ValueError otherwise.
n_identical = max(0, min(len(same_list), n_positive - len(diff_list)))
# A dedicated seeded generator makes the draw repeatable, provided same_list
# is built in a stable order.
train_positive = random.Random(42).sample(same_list, n_identical) + diff_list

In [72]:
# Negative examples: draw up to 2000 rejected pairs. Sorting gives the set a
# stable order and the seeded generator makes the draw repeatable; the size is
# capped at the pool size because random.sample raises ValueError otherwise.
negative_pool = sorted(bad_pairs)
train_negative = random.Random(42).sample(negative_pool, min(2000, len(negative_pool)))

In [75]:
import json

# Persist both example lists in one JSON document; the (supplier, reference,
# label) tuples are serialised as JSON arrays.
training_payload = {"train_positive": train_positive, "train_negative": train_negative}
with open("../data/train.json", "w") as out_file:
    json.dump(training_payload, out_file)

In [76]:
# Reload the saved training file into data_r.
with open("../data/train.json", "r") as saved:
    data_r = json.loads(saved.read())

In [None]:
# NOTE(review): this displays the entire reloaded payload (both example
# lists in full); consider showing a small slice or the list lengths instead.
data_r