In [2]:
import pandas as pd
import numpy as np

import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

import os

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacopocaldana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jacopocaldana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def preprocess_text(text):

    # Initialize stopwords, stemmer, and punctuation
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Convert to lowercase
    text = text.lower()
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords, punctuation, and apply stemming
    processed_words = [
        stemmer.stem(word)  # Apply stemming
        for word in words
        if word not in stop_words and word not in string.punctuation and word.isalnum()
    ]
    return ' '.join(processed_words)

In [4]:
# List of file paths (or you can use a pattern to match all TSV files)
file_paths = os.listdir('restaurants_parsed_data')  # Replace with your actual file names or paths
file_paths = ['restaurants_parsed_data/' + i for i in file_paths]

# Read all files and store them in a list of DataFrames
dfs = [pd.read_csv(file, sep='\t') for file in tqdm(file_paths)]

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/1982 [00:00<?, ?it/s]

In [5]:
restaurant_descriptions = df['description'].values

In [6]:
processed_descriptions = [preprocess_text(description) for description in restaurant_descriptions]

In [7]:
def create_vocabulary(df):
    
    # Concatenate all preprocessed descriptions into a single list
    all_descriptions = ' '.join(df['description'].tolist())
    
    # Preprocess the concatenated text
    processed_descriptions = preprocess_text(all_descriptions)
    
    # Get unique terms
    unique_terms = sorted(set(processed_descriptions.split()))
    # Create vocabulary DataFrame
    vocab_df = pd.DataFrame({
        'term_id': range(len(unique_terms)),
        'term': unique_terms
    })
    
    # Save vocabulary to CSV
    vocab_df.to_csv('vocabulary.csv', index=False)
    
    print(f"Created vocabulary with {len(vocab_df)} unique terms")
    # Read all files and store them in a list of DataFrames
dfs = [pd.read_csv(file, sep='\t') for file in tqdm(file_paths)]

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

# Create vocabulary
create_vocabulary(df)

  0%|          | 0/1982 [00:00<?, ?it/s]

Created vocabulary with 7106 unique terms


In [13]:
import csv

In [29]:
def build_inverted_index(processed_descriptions):
    
    inverted_index = {}
    
    # Load the existing vocabulary from the CSV file
    vocabulary = {}
    with open('vocabulary.csv', 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            vocabulary[row['term']] = int(row['term_id'])
    
    for restaurant_id, description in enumerate(restaurant_descriptions):
        for term in str(description).lower().split():
            if term in vocabulary:
                term_id = vocabulary[term]
                if term_id not in inverted_index:
                    inverted_index[term_id] = []
                if restaurant_id not in inverted_index[term_id]:
                   inverted_index[term_id].append(restaurant_id)
    
    return inverted_index

In [30]:
print(build_inverted_index(processed_descriptions))

{5438: [0, 22, 60, 62, 141, 192, 237, 420, 499, 519, 553, 556, 572, 657, 666, 709, 714, 722, 752, 851, 860, 920, 938, 987, 1022, 1109, 1224, 1237, 1266, 1386, 1489, 1566, 1673, 1729, 1803, 1816, 1819, 1832, 1950, 1978], 771: [0, 2, 87, 153, 195, 263, 339, 456, 560, 600, 815, 816, 840, 945, 1022, 1105, 1152, 1415, 1430, 1523, 1719, 1841, 1857, 1931, 1934, 1964, 1965], 4487: [0, 13, 21, 46, 71, 104, 113, 126, 264, 297, 381, 413, 493, 502, 513, 552, 566, 572, 598, 653, 704, 729, 804, 838, 856, 894, 906, 928, 938, 951, 1012, 1029, 1053, 1057, 1091, 1092, 1162, 1185, 1193, 1207, 1275, 1312, 1342, 1351, 1375, 1393, 1401, 1403, 1420, 1433, 1447, 1499, 1506, 1536, 1538, 1541, 1567, 1570, 1628, 1643, 1657, 1676, 1723, 1778, 1793, 1827, 1864, 1881, 1885, 1901, 1904, 1928, 1944], 5479: [0, 2, 6, 7, 13, 14, 16, 22, 26, 32, 34, 38, 40, 42, 47, 50, 55, 64, 65, 66, 71, 76, 77, 95, 99, 104, 110, 112, 113, 120, 122, 129, 139, 140, 143, 150, 152, 155, 164, 165, 176, 177, 178, 179, 186, 188, 190, 198, 20

In [42]:
def search_restaurants(query, inverted_index, df):
    # Load the vocabulary from the CSV file
    vocabulary = {}
    with open('vocabulary.csv', 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            vocabulary[row['term']] = int(row['term_id'])
    
    # Preprocess the query
    query_terms = set(preprocess_text(query).split())
    
    # Find the intersection of restaurant IDs that contain all query terms
    matching_restaurant_ids = None
    for term in query_terms:
        if term in vocabulary:
            term_id = vocabulary[term]
            if term_id in inverted_index:
                if matching_restaurant_ids is None:
                    matching_restaurant_ids = set(inverted_index[term_id])
                else:
                    matching_restaurant_ids &= set(inverted_index[term_id])
    
    # If no matching restaurants, return an empty list
    if matching_restaurant_ids is None or not matching_restaurant_ids:
        return pd.DataFrame(columns=['restaurantName', 'address', 'description', 'website'])
    
    # Construct the output list of matching restaurants
    matching_restaurants = pd.DataFrame(columns=['restaurantName', 'address', 'description', 'website'])
    for restaurant_id in matching_restaurant_ids:
        matching_restaurants.append({
            'restaurantName': df.loc[restaurant_id, 'restaurantName'],
            'address': df.loc[restaurant_id, 'address'],
            'description': df.loc[restaurant_id, 'description'],
            'website': df.loc[restaurant_id, 'website']
        })
    
    return pd.DataFrame(matching_restaurants)

In [51]:
def search_restaurants(query, inverted_index, df):
    # Load the vocabulary from the CSV file
    vocabulary = {}
    with open('vocabulary.csv', 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            vocabulary[row['term']] = int(row['term_id'])
    
    # Preprocess the query
    query_terms = set(preprocess_text(query).split())
    
    # Find the intersection of restaurant IDs that contain all query terms
    matching_restaurant_ids = None
    for term in query_terms:
        if term in vocabulary:
            term_id = vocabulary[term]
            if term_id in inverted_index:
                if matching_restaurant_ids is None:
                    matching_restaurant_ids = set(inverted_index[term_id])
                else:
                    matching_restaurant_ids &= set(inverted_index[term_id])
    
    # If no matching restaurants, return an empty DataFrame
    if matching_restaurant_ids is None or not matching_restaurant_ids:
        return pd.DataFrame(columns=['Restaurant Name', 'Address', 'Description', 'Website'])
    
    # Construct the output DataFrame of matching restaurants
    matching_restaurants = pd.DataFrame({
        'Restaurant Name': df.loc[list(matching_restaurant_ids), 'restaurantName'],
        'Address': df.loc[list(matching_restaurant_ids), 'address'],
        'Description': df.loc[list(matching_restaurant_ids), 'description'],
        'Website': df.loc[list(matching_restaurant_ids), 'website']
    })
    
    return matching_restaurants

In [52]:
query="modern seasonal cuisine"
inverted_index=build_inverted_index(processed_descriptions)
search_restaurants(query,inverted_index,df)


Unnamed: 0,Restaurant Name,Address,Description,Website
1105,Vesta Mare,viale Roma 41,"This typical, elegant Versilian beach club wit...",https://vestafiorichiari.com/mare/
