# Load Cleaned Data in Jupyter Notebook

In [1]:
import pandas as pd

In [2]:
# Relative path to the bookings CSV that the rest of the notebook loads.
file_path = "../data/hotel_bookings_final.csv"

In [3]:
# Read the bookings CSV into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Print each column's name, non-null count and dtype as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87396 entries, 0 to 87395
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           87396 non-null  object 
 1   is_canceled                     87396 non-null  int64  
 2   lead_time                       87396 non-null  int64  
 3   arrival_date_year               87396 non-null  int64  
 4   arrival_date_month              87396 non-null  int64  
 5   arrival_date_week_number        87396 non-null  int64  
 6   arrival_date_day_of_month       87396 non-null  int64  
 7   stays_in_weekend_nights         87396 non-null  int64  
 8   stays_in_week_nights            87396 non-null  int64  
 9   adults                          87396 non-null  int64  
 10  children                        87396 non-null  float64
 11  babies                          87396 non-null  int64  
 12  meal                            

In [5]:
# Preview the first rows of the loaded bookings.
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date,total_revenue,total_nights
0,Resort Hotel,0,342,2015,7,27,1,0,0,2,...,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-07-01,0.0,0
1,Resort Hotel,0,737,2015,7,27,1,0,0,2,...,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-07-01,0.0,0
2,Resort Hotel,0,7,2015,7,27,1,0,1,1,...,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,75.0,1
3,Resort Hotel,0,13,2015,7,27,1,0,1,1,...,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,75.0,1
4,Resort Hotel,0,14,2015,7,27,1,0,2,2,...,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,196.0,2


# Select Features for Embeddings

Select the text-based (categorical) columns whose values will be combined into the text that is embedded.

In [6]:
# Columns whose values are concatenated (in this order) into the text that gets embedded.
selected_columns = [
    'hotel',
    'country',
    'reserved_room_type',
    'market_segment',
    'customer_type',
    'deposit_type',
    'reservation_status',
]

In [7]:
# Count missing values in each selected column before the columns are joined into text.
df[selected_columns].isnull().sum()

hotel                 0
country               0
reserved_room_type    0
market_segment        0
customer_type         0
deposit_type          0
reservation_status    0
dtype: int64

Create a combined text field

In [8]:
# Build one space-separated string per booking from the selected columns
# (every value is cast to str first so the join never sees a non-string).
df['text_data'] = (
    df[selected_columns]
    .astype(str)
    .apply(' '.join, axis=1)
)

Display sample text data

In [9]:
# Preview the combined text for the first rows.
df[['text_data']].head()

Unnamed: 0,text_data
0,Resort Hotel Portugal C Direct Transient No De...
1,Resort Hotel Portugal C Direct Transient No De...
2,Resort Hotel United Kingdom A Direct Transient...
3,Resort Hotel United Kingdom A Corporate Transi...
4,Resort Hotel United Kingdom A Online TA Transi...


# Generate Embeddings Using Sentence Transformers

In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np

Load a pre-trained sentence transformer model

In [11]:
# Load the pre-trained MiniLM sentence encoder.
# No device is forced here: when `device` is left unset, sentence-transformers
# selects a GPU if one is available and otherwise uses the CPU, so this cell
# also runs on machines without CUDA instead of raising.
model = SentenceTransformer('all-MiniLM-L6-v2')

Generate embeddings for each booking's text data

In [12]:
# Encode every booking's text; row i of the result corresponds to row i of df.
embeddings_array = model.encode(
    df['text_data'].tolist(),
    show_progress_bar=True,
)

Batches:   0%|          | 0/2732 [00:00<?, ?it/s]

Save the embeddings as a NumPy array

In [13]:
# Persist the embedding matrix so the index can be rebuilt without re-encoding.
np.save(file="../hotel_bookings_embeddings.npy", arr=embeddings_array)

Add an `embedding_index` column that maps each booking to its row in the embeddings array, then save the updated DataFrame.

In [14]:
# Record each booking's row position (0..n-1) as its embedding reference.
df['embedding_index'] = range(df.shape[0])

In [15]:
# Write the augmented frame (including text_data and embedding_index) without the pandas index.
df.to_csv(path_or_buf="../data/hotel_bookings_with_embeddings.csv", index=False)

In [16]:
# Report the shape of the embedding matrix as a completion check.
print(f"✅ Embeddings generated and saved! Shape: {embeddings_array.shape}")

✅ Embeddings generated and saved! Shape: (87396, 384)


# Build FAISS Index

In [17]:
import faiss

Load the saved embeddings

In [18]:
# Reload the embedding matrix from disk so this section can run without re-encoding.
embeddings_array = np.load(file="../hotel_bookings_embeddings.npy")

Get the embedding dimension (should be 384 for 'all-MiniLM-L6-v2')

In [19]:
# The embedding matrix is 2-D (rows, dim); its last axis is the vector dimension.
embedding_dim = embeddings_array.shape[-1]

In [20]:
# Display the dimension to confirm it matches the encoder's output size.
embedding_dim

384

Create a FAISS index for fast nearest-neighbor search

In [21]:
# Create an empty flat L2-distance index sized to the embedding dimension.
index = faiss.IndexFlatL2(embedding_dim)

Add embeddings to FAISS index

In [22]:
# Empty the index first so that re-running this cell does not append the same
# vectors a second time (which would double index.ntotal and produce ids that
# no longer correspond to row positions in df).
index.reset()

# FAISS requires a C-contiguous float32 matrix; this is a no-op when the
# array already has that layout and dtype.
index.add(np.ascontiguousarray(embeddings_array, dtype=np.float32))

Save the FAISS index

In [23]:
# Persist the populated index to disk so it can be reloaded without rebuilding.
faiss.write_index(index, "../faiss_index.bin")

# Report how many vectors the index holds as a completion check.
print(f"✅ FAISS Index Created & Saved! Total vectors: {index.ntotal}")

✅ FAISS Index Created & Saved! Total vectors: 87396


# Define the Search Function

In [24]:
def search_faiss(query, top_k=5):
    """
    Embed a free-text query and return the most similar hotel bookings.

    The query is encoded with the notebook-level sentence-transformer
    ``model``, the notebook-level FAISS ``index`` is searched for its nearest
    neighbours, and the matching rows are looked up by position in the
    notebook-level ``df``.

    Parameters
    ----------
    query : str
        Natural-language search text.
    top_k : int, default 5
        Maximum number of bookings to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` restricted to the columns ``text_data``, ``hotel``,
        ``country``, ``market_segment`` and ``reservation_status``, in the
        order FAISS returned them. Contains fewer than ``top_k`` rows when
        FAISS finds fewer neighbours than requested.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # FAISS expects a 2-D float32 matrix of query vectors.
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)

    # Distances are not used; only the neighbour ids are needed for the lookup.
    _distances, neighbor_ids = index.search(query_embedding, k=top_k)

    # FAISS pads the id list with -1 when it finds fewer than top_k
    # neighbours. iloc would read -1 as "the last row" and return a bogus
    # match, so those placeholder ids are dropped before the lookup.
    row_positions = neighbor_ids[0]
    row_positions = row_positions[row_positions >= 0]

    result_columns = ['text_data', 'hotel', 'country', 'market_segment', 'reservation_status']
    return df.iloc[row_positions][result_columns]

In [25]:
# Example: run a natural-language query through the FAISS search helper.
user_query = "Find me hotels in Portugal with no deposit."
search_results = search_faiss(query=user_query)

# Show the heading and the retrieved bookings as plain text, one after the other.
print("✅ Retrieved Bookings:", search_results, sep="\n")

✅ Retrieved Bookings:
                                               text_data       hotel  \
34239  City Hotel Portugal A Complementary Transient ...  City Hotel   
34256  City Hotel Portugal A Complementary Transient ...  City Hotel   
34272  City Hotel Portugal A Complementary Transient ...  City Hotel   
34309  City Hotel Portugal A Complementary Transient ...  City Hotel   
34480  City Hotel Portugal A Complementary Transient ...  City Hotel   

        country market_segment reservation_status  
34239  Portugal  Complementary          Check-Out  
34256  Portugal  Complementary          Check-Out  
34272  Portugal  Complementary          Check-Out  
34309  Portugal  Complementary          Check-Out  
34480  Portugal  Complementary          Check-Out  
