In [122]:
import os
import re
import string
from functools import lru_cache

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.notebook import tqdm

# Fetch the NLTK data packages this notebook relies on: 'stopwords' backs the
# stopwords.words("english") calls below; 'punkt' is presumably the tokenizer
# data used by word_tokenize (verify against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacopocaldana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jacopocaldana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [123]:
def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of Porter stems.

    The input is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are English stop words, punctuation marks, or not purely alphanumeric are
    discarded; every remaining token is stemmed.

    Note: a later cell defines ``preprocess_text`` again (Snowball stemmer and
    extra filters), so this version is shadowed once that cell has run.

    Args:
        text: Raw text to normalise; it must provide ``.lower()``.

    Returns:
        str: The surviving stems joined by single spaces.
    """
    english_stops = set(stopwords.words("english"))
    porter = PorterStemmer()

    stems = []
    for token in word_tokenize(text.lower()):
        # Skip stop words, bare punctuation and anything not alphanumeric
        if token in english_stops:
            continue
        if token in string.punctuation:
            continue
        if not token.isalnum():
            continue
        stems.append(porter.stem(token))

    return ' '.join(stems)

In [124]:
@lru_cache(maxsize=None)
def _english_text_tools():
    """Return ``(stop_words, stemmer)`` for English, built only once.

    Neither object depends on the text being processed, so they are created on
    first use and reused by every later ``preprocess_text`` call.
    """
    return frozenset(stopwords.words("english")), SnowballStemmer("english")


def preprocess_text(text, min_word_length=3):
    """Normalise ``text`` into a space-separated string of Snowball stems.

    Steps: lower-case the text, delete words that begin with the digit 0,
    tokenise with ``word_tokenize``, drop tokens that are English stop words,
    not purely alphanumeric, or shorter than ``min_word_length``, and stem
    what is left.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example the
            NaN/None that pandas uses for a missing description) is treated
            as empty text.
        min_word_length: Minimum number of characters a token needs to be
            kept. The default of 3 matches the previous hard-coded
            ``len(word) > 2`` filter.

    Returns:
        str: The surviving stems joined by single spaces ('' when nothing
        survives or the input is not a string).
    """
    if not isinstance(text, str):
        return ''

    stop_words, stemmer = _english_text_tools()

    # Lower-case first, then remove words that begin with the digit 0
    text = re.sub(r'\b0\w*\b', '', text.lower())

    # isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation test is needed.
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text)
        if len(word) >= min_word_length
        and word.isalnum()
        and word not in stop_words
    )

In [125]:
# Directory holding one parsed TSV file per restaurant
DATA_DIR = 'restaurants_parsed_data'

# os.listdir returns entries in arbitrary order. Row positions in `df` are
# later used as restaurant ids, so the listing is sorted to make those ids
# reproducible from one run (or machine) to the next.
file_paths = [os.path.join(DATA_DIR, name) for name in sorted(os.listdir(DATA_DIR))]

# Read every file into its own DataFrame
dfs = [pd.read_csv(file, sep='\t') for file in tqdm(file_paths)]

# Stack them into a single DataFrame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/1982 [00:00<?, ?it/s]

In [126]:
# Raw description text of every restaurant, in DataFrame row order
restaurant_descriptions = df.loc[:, 'description'].values

In [127]:
# Normalise each description on its own; list position == row position in df
processed_descriptions = list(map(preprocess_text, restaurant_descriptions))

In [128]:
def create_vocabulary(df, vocabulary_path='vocabulary.csv'):
    """Build the term vocabulary from ``df['description']`` and save it as CSV.

    Every description is normalised with ``preprocess_text`` on its own, so
    the vocabulary contains exactly the terms obtained when the same
    descriptions are preprocessed one by one. Missing descriptions (NaN/None)
    are skipped instead of breaking the run.

    Args:
        df: DataFrame with a ``description`` column of raw text.
        vocabulary_path: Destination CSV file. It receives two columns:
            ``term_id`` (position of the term in sorted order, starting at 0)
            and ``term``.

    Returns:
        None. The vocabulary is written to ``vocabulary_path`` and its size is
        printed.
    """
    # Raw (not yet preprocessed) descriptions, without missing values
    descriptions = df['description'].dropna().astype(str)

    # Collect the distinct terms description by description
    unique_terms = set()
    for description in descriptions:
        unique_terms.update(preprocess_text(description).split())
    unique_terms = sorted(unique_terms)

    # Create vocabulary DataFrame
    vocab_df = pd.DataFrame({
        'term_id': range(len(unique_terms)),
        'term': unique_terms
    })

    # Save vocabulary to CSV
    vocab_df.to_csv(vocabulary_path, index=False)

    print(f"Created vocabulary with {len(vocab_df)} unique terms")
# `df` already holds every restaurant file (loaded in an earlier cell), so the
# vocabulary is built from it directly instead of re-reading all files.
create_vocabulary(df)

  0%|          | 0/1982 [00:00<?, ?it/s]

Created vocabulary with 6956 unique terms


In [129]:
import csv

In [137]:
def build_inverted_index(processed_descriptions, vocabulary_path='vocabulary.csv'):
    """Map every vocabulary term id to the restaurants whose text contains it.

    Args:
        processed_descriptions: Iterable of preprocessed descriptions
            (whitespace-separated terms). The position of a description in
            the iterable is used as its restaurant id.
        vocabulary_path: CSV file with ``term`` and ``term_id`` columns.

    Returns:
        dict: ``{term_id: [restaurant_id, ...]}``. Each list is in ascending
        order without duplicates. Terms that are not in the vocabulary are
        ignored.
    """
    # Read the vocabulary as UTF-8 (the default encoding of pandas' to_csv,
    # which writes this file) rather than the platform's locale encoding;
    # newline='' is what the csv module asks for when it is handed a file.
    with open(vocabulary_path, 'r', encoding='utf-8', newline='') as csvfile:
        vocabulary = {
            row['term']: int(row['term_id'])
            for row in csv.DictReader(csvfile)
        }

    inverted_index = {}
    for restaurant_id, description in enumerate(processed_descriptions):
        for term in str(description).lower().split():
            term_id = vocabulary.get(term)
            if term_id is None:
                continue
            postings = inverted_index.setdefault(term_id, [])
            # Ids arrive in ascending order, so a repeated id can only be the
            # last entry: an O(1) check replaces scanning the whole list.
            if not postings or postings[-1] != restaurant_id:
                postings.append(restaurant_id)

    return inverted_index

In [138]:
def search_restaurants(query, inverted_index, df, vocabulary_path='vocabulary.csv', verbose=True):
    """Return the restaurants whose description contains every query term.

    The query is normalised with ``preprocess_text``; each resulting term is
    looked up in the vocabulary and then in ``inverted_index``, and the
    posting lists are intersected. A term that is missing from the vocabulary
    or from the index cannot be contained in any restaurant, so such a query
    yields no rows (it is not silently dropped from the query).

    Args:
        query: Free-text query.
        inverted_index: ``{term_id: [restaurant_id, ...]}`` whose restaurant
            ids are index labels of ``df``.
        df: DataFrame with the columns ``restaurantName``, ``address``,
            ``description`` and ``website``.
        vocabulary_path: CSV file with ``term`` and ``term_id`` columns.
        verbose: When True (default) print the normalised query terms and the
            matching restaurant ids.

    Returns:
        pandas.DataFrame: Columns ``Restaurant Name``, ``Address``,
        ``Description`` and ``Website`` for the matching rows, indexed by
        restaurant id; empty when nothing matches.
    """
    # Source column -> output column
    output_columns = {
        'restaurantName': 'Restaurant Name',
        'address': 'Address',
        'description': 'Description',
        'website': 'Website',
    }

    # Read the vocabulary as UTF-8 (the default encoding of pandas' to_csv)
    # rather than the platform's locale encoding.
    with open(vocabulary_path, 'r', encoding='utf-8', newline='') as csvfile:
        vocabulary = {
            row['term']: int(row['term_id'])
            for row in csv.DictReader(csvfile)
        }

    # Preprocess the query
    query_terms = set(preprocess_text(query).split())
    if verbose:
        print(query_terms)

    # Intersect the posting lists of all query terms
    matching_restaurant_ids = None
    for term in query_terms:
        term_id = vocabulary.get(term)
        postings = inverted_index.get(term_id) if term_id is not None else None
        if not postings:
            # Unknown term: no restaurant can contain all query terms
            matching_restaurant_ids = set()
            break
        if matching_restaurant_ids is None:
            matching_restaurant_ids = set(postings)
        else:
            matching_restaurant_ids.intersection_update(postings)
    if verbose:
        print(matching_restaurant_ids)

    # No query terms at all, or an empty intersection
    if not matching_restaurant_ids:
        return pd.DataFrame(columns=list(output_columns.values()))

    # One label-based lookup for all rows and columns, then rename
    return (
        df.loc[list(matching_restaurant_ids), list(output_columns)]
        .rename(columns=output_columns)
    )

In [139]:

# Build the index from the preprocessed descriptions, then run a sample query;
# the resulting DataFrame is the cell's displayed value.
query = "modern seasonal cuisine"
inverted_index = build_inverted_index(processed_descriptions)
search_restaurants(query=query, inverted_index=inverted_index, df=df)


{'modern', 'season', 'cuisin'}
{512, 387, 391, 904, 265, 266, 526, 1551, 1430, 1950, 801, 546, 676, 1192, 172, 46, 177, 1587, 1971, 1463, 1592, 1852, 830, 63, 1475, 840, 1356, 1105, 1236, 601, 1504, 1634, 1765, 485, 364, 492, 628, 757, 1270, 1527, 1147}


Unnamed: 0,Restaurant Name,Address,Description,Website
512,Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, al...",https://ristorantesaur.it
387,Gallery Bistrot Contemporaneo,via Regina Margherita 3/b,"Modern, tasty and carefully curated cuisine, w...",
391,Materia | Spazio Cucina,via Teatro Massimo 29,The entrance to this restaurant is typical of ...,https://www.materiaspaziocucina.it/
904,Ronchi Rò,località Cime di Dolegna 12,Ronchi Rò is an estate-cum-agriturismo surroun...,https://www.ronchiro.it
265,Contrasto,via Roma 55,"Having returned to his native village, owner-c...",https://contrastoristorante.it
266,Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant st...",
526,San Michele,via Castello di Fagagna 33,Situated next to the ruins of the old castle a...,http://sanmichele.restaurant
1551,Le Vie del Borgo,via alla Piazza 6,Le Vie del Borgo is situated in a restored rus...,https://www.leviedelborgoguesthouse.it/
1430,Savô,piazza XXV Aprile 8,The reopening in 2022 of the Hotel Windsor wit...,http://www.thewindsor.it
1950,Casin del Gamba,via Roccolo Pizzati 1,The journey to get here – a winding road throu...,https://www.casindelgamba.it/
