# 5. BONUS: Advanced Search Engine

This section implements an advanced search engine that allows users to query a dataset of restaurants using multiple filters. The system combines text-based search with additional filters such as price range, accepted credit cards, and offered services to deliver highly relevant results.

In [64]:
import ipywidgets as widgets
from IPython.display import display

import pandas as pd
import re
from nltk.corpus import stopwords

from collections import defaultdict

The following cell defines the **`UserInputInterface`** class, designed to create an interactive user interface using **ipywidgets**. The goal is to allow users to specify advanced search criteria for restaurants.

Users can specify search terms for the following features (some or all):
- **restaurantName**
- **city**
- **cuisineType**

The other tabs are for filtering by:
- **Price range**
- **Region** (collected by the interface, but not applied as a filter by the search function in this section)
- **Accepted credit cards**
- **Offered services**

In [171]:
class UserInputInterface:
    """Interactive ipywidgets form that collects restaurant search criteria.

    The constructor builds three free-text inputs (restaurant name, city,
    cuisine type), a price range slider, a multi-select for regions, a grid
    of credit-card checkboxes and a multi-select for services.
    ``display()`` renders them in tabs together with a search button and an
    output area; ``get_values()`` returns the current selections as a dict.
    """

    # Price labels in ascending order; the slider yields a (low, high) pair
    # of these labels.
    PRICE_LABELS = ('€', '€€', '€€€', '€€€€')

    # One checkbox is created per entry, using the entry as its description.
    CREDIT_CARD_OPTIONS = (
        'Amex', 'Dinersclub', 'Mastercard', 'Visa', 'Discover',
        'JCB', 'Unionpay', 'Maestro', 'CartaSi',
    )

    def __init__(self):
        """Create every widget and wire the search button to its callback."""
        self.restaurant_name = widgets.Text(placeholder='Name of restaurant', description='Restaurant:')
        self.city = widgets.Text(placeholder='City', description='City:')
        self.cuisine_type = widgets.Text(placeholder='Cuisine type', description='Cuisine:')

        # By default the slider spans the whole range (cheapest to most expensive).
        self.price_range = widgets.SelectionRangeSlider(
            options=list(self.PRICE_LABELS),
            index=(0, len(self.PRICE_LABELS) - 1),
            description='Price:',
            continuous_update=False
        )

        self.regions = widgets.SelectMultiple(
            options=['Abruzzo', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna',
                     'Friuli Venezia Giulia', 'Lazio', 'Liguria', 'Lombardia', 'Marche',
                     'Molise', 'Piemonte', 'Puglia', 'Sardegna', 'Sicilia', 'Toscana',
                     'Trentino-Alto Adige', 'Umbria', 'Valle d\'Aosta', 'Veneto'],
            description='Region:',
            rows=7
        )

        self.credit_card_checkboxes = [
            widgets.Checkbox(value=False, description=card)
            for card in self.CREDIT_CARD_OPTIONS
        ]

        # Lay the checkboxes out on a three-column grid.
        self.credit_cards_grid = widgets.GridBox(
            children=self.credit_card_checkboxes,
            layout=widgets.Layout(grid_template_columns="repeat(3, 1fr)", gap="10px")
        )

        self.facilities = widgets.SelectMultiple(
            options=['Air conditioning', 'Interesting wine list', 'Wheelchair access', 'Terrace', 'Counter dining',
                     'Great view', 'Garden or park', 'Car park', 'Restaurant offering vegetarian menus', 'Brunch','Valet parking'],
            description='Services:',
            rows=11,
            layout=widgets.Layout(width='400px')
        )

        self.output = widgets.Output()
        self.search_button = widgets.Button(description="Start Search")
        self.search_button.on_click(self.on_search_button_clicked)

    def display(self):
        """Render the tabbed form, the search button and the output area."""
        tab = widgets.Tab()
        tab.children = [
            widgets.VBox([self.restaurant_name, self.city, self.cuisine_type]),
            widgets.VBox([self.price_range]),
            widgets.VBox([self.regions]),
            widgets.VBox([self.credit_cards_grid]),
            widgets.VBox([self.facilities])
        ]
        # Titles are assigned by position and must follow the order of the children above.
        titles = ('General Criteria', 'Price', 'Region', 'Accepted Cards', 'Offered Services')
        for position, title in enumerate(titles):
            tab.set_title(position, title)

        display(tab, self.search_button, self.output)

    def get_values(self):
        """Return the current widget values as a plain dict.

        Keys: ``restaurantName``, ``city``, ``cuisineType`` (text values),
        ``priceRange`` (the slider value), ``regions`` and ``facilities``
        (lists of selected options) and ``creditCards`` (descriptions of the
        ticked checkboxes).
        """
        return {
            'restaurantName': self.restaurant_name.value,
            'city': self.city.value,
            'cuisineType': self.cuisine_type.value,
            'priceRange': self.price_range.value,
            'regions': list(self.regions.value),
            'creditCards': [checkbox.description for checkbox in self.credit_card_checkboxes if checkbox.value],
            'facilities': list(self.facilities.value)
        }

    def on_search_button_clicked(self, b):
        """Button callback: print the collected values into the output widget.

        ``b`` is the argument supplied by the button's click handler and is
        not used. This callback only echoes the values; it does not run a
        search.
        """
        with self.output:
            self.output.clear_output()
            values = self.get_values()
            print("Collected Values:")
            for key, value in values.items():
                print(f"{key}: {value}")

The following cell performs **pre-processing** on the restaurant dataset:

- **Stopwords**: Defines and combines Italian and English stopwords.
- **Cleaning Functions**:
  - **`clean_text_with_stopwords`**: Cleans text by removing stopwords and special characters.
  - **`clean_text_basic`**: Cleans text while keeping all words (used for the `city` field).
- **Cleaning Application**:
  - Creates normalized (`_clean`) versions of the `restaurantName`, `cuisineType`, and `city` fields.
- **Decoding**: Converts `facilitiesServices` and `creditCards` fields into Python lists.
- **Output**: Displays a preview of the pre-processed DataFrame.


In [172]:
# Stopword vocabularies: a hand-written Italian set plus NLTK's English list.
stopwords_italian = {
    'il', 'lo', 'la', 'i', 'gli', 'le', 'un', 'una', 'del', 'della', 'dello',
    'dei', 'di', 'a', 'ai', 'al', 'allo', 'alle', 'degli', 'd', 'da', 'dal',
    'dallo', 'in', 'nel', 'nello', 'su', 'sul', 'sullo', 'con', 'per', 'tra',
    'fra', 'e',
}
stopwords_english = set(stopwords.words('english'))
# Union of both languages, used when stripping stopwords from text.
stopwords_combined = stopwords_italian | stopwords_english

# Text normalisation that also drops stopwords
def clean_text_with_stopwords(text):
    """Return a normalised copy of *text* with stopwords removed.

    Missing values (as detected by ``pd.isna``) yield ``''``. Otherwise the
    text is lower-cased and trimmed, every character that is not a word
    character, whitespace or an apostrophe is deleted, and the remaining
    whitespace-separated tokens that are not in ``stopwords_combined`` are
    joined with single spaces.
    """
    if pd.isna(text):
        return ''
    normalized = re.sub(r"[^\w\s']", '', text.lower().strip())
    kept_tokens = (
        token for token in normalized.split()
        if token not in stopwords_combined
    )
    return ' '.join(kept_tokens)

# Minimal normalisation: stopwords and punctuation are left in place
def clean_text_basic(text):
    """Return *text* trimmed and lower-cased, or ``''`` for a missing value."""
    return '' if pd.isna(text) else text.strip().lower()

import ast

# Read CSV
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook elsewhere.
file_path = r'C:\Users\matti\Documents\ADM\HW3\merged_output.csv'
df = pd.read_csv(file_path)


def _parse_list_literal(value):
    """Decode a stringified Python list (e.g. "['Amex', 'Visa']") into a list.

    Missing values become an empty list. ``ast.literal_eval`` is used instead
    of ``eval`` because it only accepts Python literals, so text coming from
    the CSV file cannot execute arbitrary code.
    """
    return ast.literal_eval(value) if pd.notna(value) else []


# Normalised copies of the text fields that get indexed for search.
df['restaurantName_clean'] = df['restaurantName'].apply(clean_text_with_stopwords)
df['cuisineType_clean'] = df['cuisineType'].apply(clean_text_with_stopwords)
# For the city field, we do not remove stopwords but apply basic cleaning, as removing them could affect the meaning of the city name.
df['city_clean'] = df['city'].apply(clean_text_basic)

# The list-valued columns are stored in the CSV as text; turn them into real lists.
df['facilitiesServices_clean'] = df['facilitiesServices'].apply(_parse_list_literal)
df['creditCards_clean'] = df['creditCards'].apply(_parse_list_literal)

print("Pre_processed data:")
display(df[['restaurantName','restaurantName_clean', 'city','city_clean', 'cuisineType','cuisineType_clean', 'facilitiesServices','facilitiesServices_clean', 'creditCards','creditCards_clean','priceRange']].head())


Pre_processed data:


Unnamed: 0,restaurantName,restaurantName_clean,city,city_clean,cuisineType,cuisineType_clean,facilitiesServices,facilitiesServices_clean,creditCards,creditCards_clean,priceRange
0,Burro & Alici,burro alici,Marotta,marotta,"Seafood, Classic Cuisine",seafood classic cuisine,['Terrace'],[Terrace],"['Amex', 'Mastercard', 'Visa']","[Amex, Mastercard, Visa]",€€
1,Oishi,oishi,Teramo,teramo,"Japanese, Fusion",japanese fusion,"['Air conditioning', 'Counter dining', 'Terrace']","[Air conditioning, Counter dining, Terrace]","['Amex', 'Unionpay', 'Dinersclub', 'Discover',...","[Amex, Unionpay, Dinersclub, Discover, Jcb, Ma...",€€
2,Uliassi,uliassi,Senigallia,senigallia,"Creative, Seafood",creative seafood,"['Air conditioning', 'Great view', 'Interestin...","[Air conditioning, Great view, Interesting win...","['Amex', 'Dinersclub', 'Mastercard', 'Visa']","[Amex, Dinersclub, Mastercard, Visa]",€€€€
3,Oberlechner,oberlechner,Algund,algund,"Regional Cuisine, Classic Cuisine",regional cuisine classic cuisine,"['Car park', 'Garden or park', 'Great view', '...","[Car park, Garden or park, Great view, Terrace]","['Mastercard', 'Visa']","[Mastercard, Visa]",€€
4,Ai Gondolieri,gondolieri,sestiere Dorsoduro 366,sestiere dorsoduro 366,"Venetian, Traditional Cuisine",venetian traditional cuisine,['Air conditioning'],[Air conditioning],"['Amex', 'Maestrocard', 'Mastercard', 'Visa']","[Amex, Maestrocard, Mastercard, Visa]",€€€


In [173]:
# Inverted index builder: term -> set of row labels whose field contains it
def build_inverted_index(df, column):
    """Build an inverted index over the whitespace-separated terms of a column.

    Args:
        df: DataFrame holding the column to index.
        column: Name of a column whose values support ``.split()``.

    Returns:
        A ``defaultdict(set)`` mapping each term to the set of index labels
        of the rows in which it occurs.
    """
    postings = defaultdict(set)
    term_label_pairs = (
        (token, row_label)
        for row_label, text in df[column].items()
        for token in text.split()
    )
    for token, row_label in term_label_pairs:
        postings[token].add(row_label)
    return postings

from itertools import islice

# Creation of inverted indices, one per searchable text field
restaurant_name_index = build_inverted_index(df, 'restaurantName_clean')
city_index = build_inverted_index(df, 'city_clean')
cuisine_type_index = build_inverted_index(df, 'cuisineType_clean')

# The printout is labelled as partial, so show only the first few terms
# instead of materialising and dumping the whole index.
PREVIEW_TERMS = 15
print("Indice invertito per restaurantName (parziale):")
for term, postings in islice(restaurant_name_index.items(), PREVIEW_TERMS):
    print(f"'{term}': {postings}")


Indice invertito per restaurantName (parziale):
'burro': {0}
'alici': {0, 1390}
'oishi': {1}
'uliassi': {2}
'oberlechner': {3}
'gondolieri': {4}
'locanda': {1152, 1028, 5, 645, 392, 649, 1674, 1419, 1548, 655, 1552, 273, 20, 1815, 1948, 1823, 803, 1957, 1191, 683, 1838, 559, 53, 1590, 1591, 1465, 1978, 1468, 574, 1859, 196, 198, 327, 845, 719, 464, 209, 1103, 1361, 1487, 213, 726, 1618, 88, 1243, 732, 1116, 866, 994, 1890, 614, 615, 1128, 745, 1001, 1384, 1388, 749, 1134, 367, 496, 241, 1775, 628, 887, 1274, 892}
'belvedere': {578, 1076, 5}
'tenuta': {433, 1949, 6}
'gallo': {166, 934, 6, 1395, 122, 894}
'fico': {7}
'trentacareghe': {7}
'trattoria': {1923, 1019, 8, 906, 654, 400, 1298, 22, 1303, 24, 1176, 922, 1178, 1690, 1820, 1944, 672, 800, 1575, 1194, 555, 301, 1457, 826, 187, 188, 1596, 1853, 1980, 192, 965, 583, 1483, 593, 1617, 1369, 90, 1754, 861, 1248, 1377, 228, 1510, 231, 635, 1641, 234, 491, 874, 493, 621, 1768, 1773, 1137, 1777, 1395, 1651, 122, 507, 764}
'parco': {8, 582}


The following cell implements the **advanced search**:

- **`search_inverted_indices`**:
  - Searches indexed fields for terms provided by the user.
  - Calculates a score for each result based on term matches.

- **`advanced_search_with_filters`**:
  - Applies the same cleaning process to both user input and indexed fields.
  - Filters raw results using additional criteria: **price range**, **accepted credit cards**, and **offered services**.
  - Sorts and returns the most relevant results.


In [174]:
# Look up query terms in the per-field inverted indices and score the hits
def search_inverted_indices(query, indices):
    """Score rows by how many query-term occurrences hit them.

    Args:
        query: Mapping of field name to a whitespace-separated string of terms.
        indices: Mapping of field name to an inverted index (term -> row ids).

    Returns:
        A ``defaultdict(int)`` mapping row id to its score. A row gains one
        point for every query term occurrence found in the index of that
        term's field. Fields that have no index are ignored.
    """
    scores = defaultdict(int)

    for field_name in query:
        if field_name in indices:
            field_postings = indices[field_name]
            for token in query[field_name].split():
                # .get avoids creating entries when the index is a defaultdict.
                for row_id in field_postings.get(token, ()):
                    scores[row_id] += 1

    return scores

def advanced_search_with_filters(user_input, indices, df, top_n=10):
    """Run a text search over the inverted indices, then apply the filters.

    Args:
        user_input: Dict with the keys ``restaurantName``, ``city``,
            ``cuisineType`` (free text), ``priceRange`` (a ``(low, high)``
            pair of price labels), ``creditCards`` and ``facilities``
            (lists of required values).
        indices: Mapping of ``*_clean`` field name to its inverted index.
        df: The pre-processed restaurants DataFrame.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``restaurantName``, ``address``,
        ``cuisineType``, ``priceRange`` and ``website`` for the best-scoring
        rows that pass every filter, highest score first.

    Notes:
        ``user_input['regions']`` is not applied here.
        NOTE(review): no region column is visible in this section; add the
        filter once the column name is confirmed.
    """
    price_order = {
        '€': 1,
        '€€': 2,
        '€€€': 3,
        '€€€€': 4
    }

    # Clean each query field exactly as the matching dataset column was
    # cleaned. city_clean is built with clean_text_basic (stopwords and
    # punctuation kept), so the city query must use the same function,
    # otherwise its tokens may not line up with the index.
    query = {
        'restaurantName_clean': clean_text_with_stopwords(user_input['restaurantName']),
        'city_clean': clean_text_basic(user_input['city']),
        'cuisineType_clean': clean_text_with_stopwords(user_input['cuisineType'])
    }

    if any(query.values()):
        raw_results = search_inverted_indices(query, indices)
    else:
        # No text criteria at all: treat every row as a candidate (score 0)
        # so that a filter-only search still returns results.
        raw_results = dict.fromkeys(df.index, 0)

    # Loop-invariant filter bounds and requirement sets.
    min_price = price_order[user_input['priceRange'][0]]
    max_price = price_order[user_input['priceRange'][1]]
    # Card names are compared case-insensitively (the form offers 'JCB'
    # while the data preview shows 'Jcb').
    # NOTE(review): names that differ by more than case (e.g. 'Maestro' vs
    # 'Maestrocard' in the data preview) still do not match.
    required_cards = {card.lower() for card in user_input['creditCards']}
    required_facilities = set(user_input['facilities'])

    # Filter the results based on price, accepted credit cards, and offered services.
    filtered_results = []
    for idx, score in raw_results.items():
        row = df.loc[idx]

        # Price filter; rows with a missing or unknown price label are skipped.
        price = price_order.get(row['priceRange'])
        if price is None or not min_price <= price <= max_price:
            continue

        # Cards filter: every requested card must be accepted.
        accepted_cards = {str(card).lower() for card in row['creditCards_clean']}
        if not required_cards <= accepted_cards:
            continue

        # Services filter: every requested service must be offered.
        if not required_facilities <= set(row['facilitiesServices_clean']):
            continue

        filtered_results.append((idx, score))

    # sorted() is stable, so rows with equal scores keep their discovery order.
    sorted_results = sorted(filtered_results, key=lambda x: x[1], reverse=True)
    top_results = df.loc[[r[0] for r in sorted_results[:top_n]]]

    return top_results[['restaurantName', 'address', 'cuisineType', 'priceRange', 'website']]


In [None]:
# UI: build the search form and render it (tabs, search button, output area)
ui = UserInputInterface()
ui.display()

Tab(children=(VBox(children=(Text(value='', description='Restaurant:', placeholder='Name of restaurant'), Text…

Button(description='Start Search', style=ButtonStyle())

Output()

In [176]:
# Result: run the search with the values currently entered in the form
indices = dict(
    restaurantName_clean=restaurant_name_index,
    city_clean=city_index,
    cuisineType_clean=cuisine_type_index,
)
user_input = ui.get_values()
print(user_input)
results = advanced_search_with_filters(user_input, indices, df)
display(results)


{'restaurantName': '', 'city': 'Rome', 'cuisineType': '', 'priceRange': ('€€€€', '€€€€'), 'regions': [], 'creditCards': [], 'facilities': ['Terrace']}


Unnamed: 0,restaurantName,address,cuisineType,priceRange,website
770,All'Oro,via Giuseppe Pisanelli 25,"Creative, Italian Contemporary",€€€€,https://www.ristorantealloro.it/
54,Mirabelle,via di Porta Pinciana 14,Italian Contemporary,€€€€,https://www.mirabelle.it
1601,Per Me Giulio Terrinoni,vicolo del Malpasso 9,"Italian, Creative",€€€€,https://www.giulioterrinoni.it
708,La Pergola,via Cadlolo 101,"Mediterranean Cuisine, Contemporary",€€€€,https://romecavalieri.com/it/la-pergola-it/
610,Orma Roma,via Boncompagni 31,"Italian Contemporary, Colombian",€€€€,http://www.ormaroma.it
1382,Mater Terrae,largo Febo 2,"Vegetarian, Contemporary",€€€€,https://www.biohotelraphael.com/
624,Il Ristorante - Niko Romito,via di Ripetta 73,Italian Contemporary,€€€€,https://www.bulgarihotels.com/it_IT/rome/dinin...
506,Aroma,via Labicana 125,Modern Cuisine,€€€€,https://www.manfredihotels.com/aroma/
