### *Import Necessary Libraries and Define Preprocessing Functions*

In [2]:
import pandas as pd
import re
import math
import json
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Initialize NLP tools for preprocessing.
# These three module-level objects are read by preprocess_text on every call.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' data to be
# downloaded beforehand - confirm for a fresh environment.
stop_words = set(stopwords.words('english'))  # English stopwords, held in a set for fast membership tests
stemmer = PorterStemmer()  # Porter stemmer; preprocess_text applies it to each token first
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer; preprocess_text applies it to the stemmer's output

def preprocess_text(text):
    """
    Turn raw text into a list of normalized tokens.

    Steps, in order:
    - Non-string input (e.g. a missing value) yields an empty list
    - Every character other than a-z, A-Z, 0-9 or whitespace becomes a space
    - Whitespace runs are collapsed, then the text is lowercased and split
    - Tokens found in the stopword set are dropped
    - Each remaining token is stemmed, and the stem is then lemmatized
    """
    if not isinstance(text, str):
        return []

    cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # Punctuation and symbols -> space
    cleaned = re.sub(r"\s+", " ", cleaned).strip()  # Squeeze repeated whitespace

    normalized = []
    for token in cleaned.lower().split():
        if token in stop_words:
            continue
        normalized.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return normalized

## *Create Vocabulary and Inverted Index (Task 2.1.1)*

In [6]:
def build_vocabulary(documents):
    """
    Assign an integer term ID to every distinct word found in the documents.

    IDs are handed out in sorted word order, starting at 0.
    """
    unique_words = set()
    for doc in documents:
        unique_words.update(doc)

    vocabulary = {}
    for term_id, word in enumerate(sorted(unique_words)):
        vocabulary[word] = term_id
    return vocabulary

def build_inverted_index(documents, vocabulary):
    """
    Create an inverted index that maps each term ID to the list of document IDs where the term appears.

    Parameters:
    - documents: sequence of token lists; a document's ID is its position in the sequence.
    - vocabulary: dict mapping word -> term ID. Words missing from it are skipped.

    Returns:
    - defaultdict(list) mapping term ID -> document IDs in ascending order, each listed once.
    """
    inverted_index = defaultdict(list)
    for doc_id, doc in enumerate(documents):
        for word in doc:
            if word not in vocabulary:  # Only process words in the vocabulary
                continue
            postings = inverted_index[vocabulary[word]]
            # Documents are visited in increasing doc_id order, so a duplicate can only
            # be the last entry; checking it avoids scanning the whole postings list.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
    return inverted_index

# Load dataset
# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
df = pd.read_csv("/Users/roberto/Desktop/ADM-HM3/REPO/michelin_restaurants.csv")

# Preprocess restaurant descriptions: one token list per row, in row order,
# so a document ID is the positional index of the restaurant in df.
descriptions = [preprocess_text(desc) for desc in df['description']]

# Create vocabulary and inverted index
vocabulary = build_vocabulary(descriptions)  # Map words to unique IDs
inverted_index = build_inverted_index(descriptions, vocabulary)  # Map term IDs to document IDs

# Save vocabulary and inverted index for future use
vocabulary_path = "/Users/roberto/Desktop/ADM-HM3/REPO_ADM_HM3/ADM-HM3/directory_HM3_ADM/vocabulary.csv"
inverted_index_path = "/Users/roberto/Desktop/ADM-HM3/REPO_ADM_HM3/ADM-HM3/directory_HM3_ADM/json/inverted_index.json"

# Save vocabulary as CSV with one (Word, Term ID) row per vocabulary entry
pd.DataFrame(list(vocabulary.items()), columns=["Word", "Term ID"]).to_csv(vocabulary_path, index=False)

# Save inverted index as JSON.
# JSON object keys are strings, so the integer term IDs are written as strings;
# code that reloads this file must convert them back before looking up by term ID.
with open(inverted_index_path, 'w') as f:
    json.dump(inverted_index, f)

print(f"Vocabulary and inverted index created and saved to files.")


Vocabulary and inverted index created and saved to files.


## *Execute Conjunctive Query (Task 2.1.2)*

In [11]:
def conjunctive_query(query, vocabulary, inverted_index):
    """
    Execute a conjunctive (AND) query:
    - Find restaurants where all query words are present in their description.

    Parameters:
    - query: raw query string; it is normalized with preprocess_text.
    - vocabulary: dict mapping word -> term ID.
    - inverted_index: mapping term ID -> list of document IDs.

    Returns:
    - Sorted list of document IDs containing every query term. The list is empty
      when the query has no usable terms, or when any query term is unknown to the
      vocabulary (no document can contain such a term, so the AND cannot hold).
    """
    query_tokens = preprocess_text(query)  # Preprocess the query terms
    if not query_tokens:
        return []

    # Map each distinct query word to its term ID; one unknown word empties the result.
    term_ids = []
    for word in dict.fromkeys(query_tokens):
        if word not in vocabulary:
            return []
        term_ids.append(vocabulary[word])

    # Intersect the postings of all terms. `.get` is used throughout so the lookup
    # never raises on (or inserts into) the index.
    matching_docs = None
    for term_id in term_ids:
        postings = set(inverted_index.get(term_id, ()))
        matching_docs = postings if matching_docs is None else matching_docs & postings
        if not matching_docs:  # Nothing left to intersect with
            return []

    return sorted(matching_docs)  # Deterministic order for display


# Ask the user for the search terms
query = input("Enter your query: ")

# Run the conjunctive (AND) search over the description index
matching_docs = conjunctive_query(query, vocabulary, inverted_index)

# Collect the requested fields of every matching restaurant, one record per match
columns = ("restaurantName", "address", "description", "website")
table = []
for idx in matching_docs:
    row = df.iloc[idx]  # Document IDs are positional row indices
    table.append({column: row[column] for column in columns})

# Show the records as a table, followed by the match count
display(pd.DataFrame(table))
print(f"Number of matching restaurants: {len(matching_docs)}")


Unnamed: 0,restaurantName,address,description,website
0,Pipero Roma,corso Vittorio Emanuele II 250,Situated opposite the church of Santa Maria in...,https://www.piperoroma.it/
1,Il Luogo Aimo e Nadia,via Montecuccoli 6,This long-established restaurant has been part...,https://www.aimoenadia.com/il-luogo-aimo-e-nadia
2,[àbitat],via Henry Dunant 1,"A young, enthusiastic and professional couple ...",https://www.abitatproject.it
3,Babette,via Michelangelo 17,Situated just beyond the centre of Albenga in ...,https://www.ristorantebabette.net/
4,Contrasto,via Roma 55,"Having returned to his native village, owner-c...",https://contrastoristorante.it
5,San Michele,via Castello di Fagagna 33,Situated next to the ruins of the old castle a...,http://sanmichele.restaurant
6,Gallery Bistrot Contemporaneo,via Regina Margherita 3/b,"Modern, tasty and carefully curated cuisine, w...",
7,Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, al...",https://ristorantesaur.it
8,Sintesi,viale dei Castani 17,"A modern, welcoming restaurant whose motto “Tr...",http://ristorantesintesi.it
9,Osteria del Miglio 2.10,via Patrioti 2,Although the town may not be of major importan...,


Number of matching restaurants: 41


### *Build Ranked Search Engine with TF-IDF (Task 2.2.1)*

In [13]:
def compute_tf(document):
    """
    Compute term frequency (TF):
    - For each distinct word, its occurrence count divided by the document length.
    - An empty document yields an empty dict.
    """
    counts = {}
    for token in document:
        counts[token] = counts.get(token, 0) + 1

    tf = {}
    for token, occurrences in counts.items():
        tf[token] = occurrences / len(document)  # Relative frequency within this document
    return tf

def compute_idf(documents, vocabulary):
    """
    Compute inverse document frequency (IDF) for every vocabulary word:
    - idf(word) = log((N + 1) / (df(word) + 1)) + 1
      where N is the number of documents and df(word) the number of documents
      containing the word at least once.
    """
    num_docs = len(documents)

    # Count, per vocabulary word, how many documents contain it
    doc_freq = {}
    for doc in documents:
        for word in set(doc):  # Each document contributes at most once per word
            if word in vocabulary:
                doc_freq[word] = doc_freq.get(word, 0) + 1

    idf = {}
    for word in vocabulary:
        idf[word] = math.log((num_docs + 1) / (doc_freq.get(word, 0) + 1)) + 1
    return idf

def compute_tfidf(document, idf):
    """
    Compute TF-IDF weights for one document:
    - Each word's term frequency (from compute_tf) times its IDF.
    - Words without an IDF entry are left out of the result.
    """
    tf = compute_tf(document)

    weighted = {}
    for word in document:
        if word in idf:
            weighted[word] = tf[word] * idf[word]
    return weighted

# Calculate IDF for all words in the vocabulary.
# `idf` is kept as a global: ranked_query reads it to weight the query terms.
idf = compute_idf(descriptions, vocabulary)

# Calculate TF-IDF scores for all documents (one {word: score} dict per document,
# in document order). `tfidf` is also a global read by ranked_query.
tfidf = [compute_tfidf(doc, idf) for doc in descriptions]

# Build updated inverted index with TF-IDF scores: term ID -> [(doc_id, score), ...]
tfidf_inverted_index = defaultdict(list)
for doc_id, doc_tfidf in enumerate(tfidf):
    for word, score in doc_tfidf.items():  # Each word appears once per document dict
        term_id = vocabulary[word]
        tfidf_inverted_index[term_id].append((doc_id, score))  # Store document ID and TF-IDF score

# Save updated inverted index.
# NOTE(review): this is the same path string as `inverted_index_path` used when the
# plain inverted index was saved, so opening it with 'w' replaces that file - confirm
# the overwrite is intended. In the JSON output the term-ID keys become strings and
# each (doc_id, score) tuple becomes a two-element array.
tfidf_inverted_index_path = "/Users/roberto/Desktop/ADM-HM3/REPO_ADM_HM3/ADM-HM3/directory_HM3_ADM/json/inverted_index.json"
with open(tfidf_inverted_index_path, 'w') as f:
    json.dump(tfidf_inverted_index, f)

print("TF-IDF scores computed and updated inverted index saved.")


TF-IDF scores computed and updated inverted index saved.


### Execute Ranked Query (Task 2.2.2)

In [15]:
def cosine_similarity(vec1, vec2):
    """
    Compute cosine similarity between two sparse vectors stored as {word: weight} dicts.
    - Returns 0 when either vector has zero magnitude (including empty vectors).
    """
    def magnitude(vec):
        # Euclidean length of a sparse vector
        return math.sqrt(sum(val ** 2 for val in vec.values()))

    norm_vec1 = magnitude(vec1)
    norm_vec2 = magnitude(vec2)
    if not (norm_vec1 and norm_vec2):  # Guard against division by zero
        return 0

    shared_words = set(vec1.keys()) & set(vec2.keys())  # Only shared words contribute
    dot_product = sum(vec1[word] * vec2[word] for word in shared_words)
    return dot_product / (norm_vec1 * norm_vec2)

def ranked_query(query, k=5):
    """
    Execute a ranked query:
    - Find and rank restaurants based on cosine similarity with the query.

    Reads the module-level `idf` (word -> IDF) and `tfidf` (one {word: score} dict
    per document, indexed by document ID).

    Parameters:
    - query: raw query string; it is normalized with preprocess_text.
    - k: maximum number of results. None returns every matching document;
      zero or a negative value returns no results.

    Returns:
    - List of (doc_id, similarity) tuples, highest similarity first, ties kept in
      document order. Documents sharing no weighted term with the query
      (similarity 0) are not results and are never returned.
    """
    if k is not None and k <= 0:  # A negative k would otherwise slice as scores[:-n]
        return []

    query_tokens = preprocess_text(query)  # Preprocess the query terms
    query_tfidf = compute_tfidf(query_tokens, idf)  # Compute TF-IDF for the query
    if not query_tfidf:  # Empty query, or no query word known to the corpus
        return []

    scores = []
    for doc_id, doc_tfidf in enumerate(tfidf):
        similarity = cosine_similarity(query_tfidf, doc_tfidf)
        if similarity > 0:  # Skip documents unrelated to the query
            scores.append((doc_id, similarity))

    # Sort by similarity score and return top-k results
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:k]

# Ask the user for the query text and the number of results wanted
query = input("Enter your query: ")
k = int(input("Enter the number of top results to display: "))

# Rank the restaurants against the query
top_k_results = ranked_query(query, k)

# Build one output record per ranked restaurant; each row is fetched once
# and the score is shown with four decimals
results_table = [
    {
        "Restaurant Name": row["restaurantName"],
        "Address": row["address"],
        "Description": row["description"],
        "Website": row["website"],
        "Similarity Score": f"{score:.4f}"
    }
    for row, score in ((df.iloc[doc_id], score) for doc_id, score in top_k_results)
]

display(pd.DataFrame(results_table))  # Render the results table


Unnamed: 0,Restaurant Name,Address,Description,Website,Similarity Score
0,Aroma,via Labicana 125,The open-view kitchen is the first thing guest...,https://www.manfredihotels.com/aroma/,1.0
1,La Terrazza,via Ludovisi 49,"Situated on the rooftop of the Eden hotel, thi...",https://www.dorchestercollection.com/it/rome/h...,0.2054
2,Mirabelle,via di Porta Pinciana 14,"Without a shadow of a doubt, the view of Rome ...",https://www.mirabelle.it,0.175
3,Eea,via Umberto I,"Named after the Ancient Greek name for Ponza, ...",https://www.mondoeea.it,0.1526
4,Repubblica di Perno,vicolo Cavour 5,Situated in the heart of the gastronomic Langa...,http://www.repubblicadiperno.it,0.1396
5,La Risacca Blu,via Tunisia,The first thing you see when you enter this re...,https://www.larisaccablu.com/,0.139
6,Casa del Nonno 13,via Caracciolo 13,"Recently re-opened under new management, this ...",https://www.casadelnonno13.it/,0.1291
7,Augurio,via Dietro le Mura B 16,Opened by three brothers (Augurio is the famil...,https://www.augurioristorante.it,0.1218
8,Hostaria del Teatro,via Ordanino 5b,This welcoming restaurant with an open-view ki...,https://www.hostariadelteatro.it/,0.1184
9,Quattro Gigli,piazza Michele da Montopoli 2,This restaurant occupying a 15C palazzo in the...,https://www.quattrogigli.it/,0.1178


## Ex 3. Define a New Score!
Now, we will define a custom ranking metric to prioritize restaurants based on user queries.

Steps:
* User Query: The user provides a text query. We’ll retrieve relevant documents using the search engine built in Step 2.1.
* New Ranking Metric: After retrieving relevant documents, we’ll rank them using a new custom score. Instead of limiting the scoring to only the description field, we can include other attributes like priceRange, facilitiesServices, and cuisineType.
* You will use a heap data structure (e.g., Python’s heapq library) to maintain the top-k restaurants.

#### New Scoring Function:  
Define a scoring function that takes into account various attributes:
* Description Match: Give weight based on the query similarity to the description (using TF-IDF scores).
* Cuisine Match: Increase the score for matching cuisine types.
* Facilities and Services: Give more points for matching facilities/services (e.g., “Terrace,” “Air conditioning”).
* Price Range: Higher scores could be given to more affordable options based on the user’s choice.

### Output:
The output should include:
* restaurantName
* address
* description
* website
* The new similarity score based on the custom metric.  

Are the results you obtain better than with the previous scoring function? Explain and compare results.

In [17]:
import numpy as np
from collections import defaultdict
import math
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from heapq import nlargest
import re


# Initialize NLP tools.
# Module-level objects read by the preprocess_text defined in this cell; they rebind
# the same three names assigned at the top of the notebook.
stop_words = set(stopwords.words('english'))  # English stopwords as a set
stemmer = PorterStemmer()  # Applied to each token before lemmatization
lemmatizer = WordNetLemmatizer()  # Applied to the stemmed token

def preprocess_text(text):
    """
    Normalize text into a token list: strip everything except letters a-z/A-Z,
    digits and whitespace, lowercase, drop stopwords, then stem and lemmatize
    each remaining token. Non-string input returns an empty list.
    """
    if not isinstance(text, str):
        return []
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # Special characters -> space
    squeezed = re.sub(r"\s+", " ", alnum_only).strip()  # Collapse whitespace runs
    kept = filter(lambda word: word not in stop_words, squeezed.lower().split())
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in kept]

def build_vocabulary(documents):
    """
    Map every distinct word across the documents to an integer ID.
    IDs follow sorted word order and start at 0.
    """
    distinct_words = sorted(set().union(*documents))
    return dict(zip(distinct_words, range(len(distinct_words))))

def optimized_get_idf(documents, vocabulary):
    """
    Compute a smoothed IDF for every vocabulary word:
    idf(word) = log((N + 1) / (df(word) + 1)) + 1, where N is the number of
    documents and df(word) the number of documents containing the word.
    """
    doc_freq = defaultdict(int)
    for doc in documents:
        # A document counts once per word, and only vocabulary words are tracked
        for word in set(doc).intersection(vocabulary):
            doc_freq[word] += 1

    num_docs = len(documents)
    idf = {}
    for word in vocabulary:
        idf[word] = math.log((num_docs + 1) / (doc_freq[word] + 1)) + 1
    return idf

def compute_tfidf(document, idf):
    """
    Compute TF-IDF weights for a single tokenized document.
    Each word's weight is (count / document length) * idf[word]; words that
    have no IDF entry are omitted.
    """
    counts = {}
    for word in document:
        counts[word] = counts.get(word, 0) + 1

    weights = {}
    for word, count in counts.items():
        if word in idf:
            weights[word] = (count / len(document)) * idf[word]
    return weights

def cosine_similarity(vec1, vec2):
    """
    Cosine similarity of two TF-IDF vectors given as {word: weight} dicts.
    Returns 0 if either vector has zero length.
    """
    shared = set(vec1.keys()) & set(vec2.keys())
    dot = sum(vec1[word] * vec2[word] for word in shared)

    length1 = math.sqrt(sum(map(lambda val: val ** 2, vec1.values())))
    length2 = math.sqrt(sum(map(lambda val: val ** 2, vec2.values())))

    if length1 and length2:
        return dot / (length1 * length2)
    return 0

def compute_custom_score(tfidf_query, tfidf_doc, query_cuisine, doc_cuisines, query_facility, doc_facilities, max_price, doc_price, weights):
    """
    Compute a custom score combining multiple attributes.

    Parameters:
    - tfidf_query, tfidf_doc: {word: weight} TF-IDF vectors of the query and the description.
    - query_cuisine, doc_cuisines: token lists for the requested and the restaurant's cuisines.
    - query_facility, doc_facilities: token lists for the requested and the restaurant's facilities.
    - max_price: the user's price ceiling on the numeric price scale.
    - doc_price: the restaurant's price on the same scale.
    - weights: (w_desc, w_cuis, w_facil, w_price).

    Returns:
    - The sum of the four weighted components.
    """
    w_desc, w_cuis, w_facil, w_price = weights

    # Cosine similarity for description
    sim_desc = cosine_similarity(tfidf_query, tfidf_doc) * w_desc

    # Matching cuisines: number of distinct tokens shared with the request
    matching_cuis = len(set(query_cuisine) & set(doc_cuisines)) * w_cuis

    # Matching facilities: number of distinct tokens shared with the request
    matching_facil = len(set(query_facility) & set(doc_facilities)) * w_facil

    # Price fit: max_price is a ceiling, so anything at or below it gets full credit.
    # Only the amount by which a restaurant exceeds the ceiling is penalized; a
    # cheaper restaurant must not score worse than one priced exactly at the ceiling.
    over_budget = max(doc_price - max_price, 0)
    sim_price = (1 / (1 + over_budget)) * w_price

    # Total score
    return sim_desc + matching_cuis + matching_facil + sim_price

def get_top_k(query, cuis, facil, max_price, descriptions, cuisines, facilities, prices, idf_desc, tfidf_desc, idf_cuis, idf_facil, k=5, weights=(0.4, 0.2, 0.2, 0.2)):
    """
    Return the k highest-scoring restaurants under the custom metric.

    Every restaurant (one per entry of `descriptions`) is scored with
    compute_custom_score against the preprocessed description, cuisine and
    facility queries; the result is a list of (index, score) pairs, best first.
    `idf_cuis` and `idf_facil` are accepted but not used by the scoring.
    """
    # Normalize the three free-text inputs the same way the documents were
    query_tokens = preprocess_text(query)
    cuis_tokens = preprocess_text(cuis)
    facil_tokens = preprocess_text(facil)

    # TF-IDF vector of the description query
    tfidf_query_desc = compute_tfidf(query_tokens, idf_desc)

    def score_restaurant(idx):
        # Custom score of the restaurant at position idx
        return compute_custom_score(
            tfidf_query_desc,
            tfidf_desc[idx],
            cuis_tokens,
            cuisines[idx],
            facil_tokens,
            facilities[idx],
            max_price,
            prices[idx],
            weights
        )

    scores = [(idx, score_restaurant(idx)) for idx in range(len(descriptions))]

    # Heap-based selection of the k best (index, score) pairs
    return nlargest(k, scores, key=lambda pair: pair[1])

# Example Usage
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv("//Users/roberto/Desktop/ADM-HM3/REPO_ADM_HM3/ADM-HM3/directory_HM3_ADM/michelin_restaurants.csv")

    # Tokenize the three text columns; row position is the restaurant index used below
    descriptions = [preprocess_text(desc) for desc in df['description']]
    cuisines = [preprocess_text(cuisine) for cuisine in df['cuisineType']]
    facilities = [preprocess_text(facility) for facility in df['facilitiesServices']]

    # Convert € symbols to a numeric scale (number of symbols). Values are stripped
    # the same way the user's input is, and a missing price (non-string cell, e.g. NaN)
    # maps to 0 instead of raising TypeError in len().
    prices = [len(price.strip()) if isinstance(price, str) else 0 for price in df['priceRange']]

    # Build vocabularies and IDF
    vocabulary_desc = build_vocabulary(descriptions)
    idf_desc = optimized_get_idf(descriptions, vocabulary_desc)

    vocabulary_cuis = build_vocabulary(cuisines)
    idf_cuis = optimized_get_idf(cuisines, vocabulary_cuis)

    vocabulary_facil = build_vocabulary(facilities)
    idf_facil = optimized_get_idf(facilities, vocabulary_facil)

    # Compute TF-IDF for documents
    tfidf_desc = [compute_tfidf(doc, idf_desc) for doc in descriptions]

    # User inputs
    query = input("Enter your query for the description: ")
    cuis = input("Enter the cuisine types: ")
    facil = input("Enter the facilities: ")
    max_price = len(input("Enter the maximum price (€, €€, etc.): ").strip())  # Convert € symbols to numeric

    # Get top-k results
    top_k_results = get_top_k(query, cuis, facil, max_price, descriptions, cuisines, facilities, prices, idf_desc, tfidf_desc, idf_cuis, idf_facil, k=5)

    # Display results: fetch each ranked row once and format the score to four decimals
    results_table = []
    for idx, score in top_k_results:
        row = df.iloc[idx]
        results_table.append({
            "Restaurant Name": row['restaurantName'],
            "Address": row['address'],
            "Description": row['description'],
            "Cuisine Type": row['cuisineType'],
            "Facilities": row['facilitiesServices'],
            "Price Range": row['priceRange'],
            "Score": f"{score:.4f}"
        })

    display(pd.DataFrame(results_table))


Unnamed: 0,Restaurant Name,Address,Description,Cuisine Type,Facilities,Price Range,Score
0,Aroma,via Labicana 125,The open-view kitchen is the first thing guest...,Modern Cuisine,Air conditioning; Great view; Restaurant offer...,€€€€,0.4667
1,Repubblica di Perno,vicolo Cavour 5,Situated in the heart of the gastronomic Langa...,Piedmontese,Terrace,€€,0.2558
2,Quattro Gigli,piazza Michele da Montopoli 2,This restaurant occupying a 15C palazzo in the...,"Tuscan, Regional Cuisine",Garden or park; Terrace,€€,0.2471
3,La RiMa,viale Dante Alighieri 11/c,This restaurant in the heart of the city provi...,Contemporary,Air conditioning,€€,0.2451
4,Bistrot,piazza San Rufo 25,"Overlooking a delightful, quiet little square,...","Italian, Classic Cuisine",Terrace,€€,0.241
