In [31]:
import pandas as pd
import numpy as np

import string

# import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

import os

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# nltk.download('stopwords')
# nltk.download('punkt')

In [26]:
# Walkthrough of the text normalisation on a single sample description.
# Resources: English stop words, the ASCII punctuation characters and a
# Porter stemmer.
stop_words = set(stopwords.words("english"))
punctuation = string.punctuation
stemmer = PorterStemmer()

# Sample restaurant description
restaurant_description = """
The best Italian restaurant in the city, offering delicious pasta and pizza with fresh ingredients.
Experience a cozy atmosphere and enjoy an unforgettable meal!
"""

# Lowercase the description, then split it into tokens.
text = restaurant_description.lower()
words = word_tokenize(text)

# Keep only alphanumeric tokens that are neither stop words nor punctuation.
# Both normalisations below start from this same filtered token list.
content_words = [
    token
    for token in words
    if not (token in stop_words or token in punctuation or not token.isalnum())
]

# Variant 1: reduce every kept token to its Porter stem.
processed_words = [stemmer.stem(token) for token in content_words]
processed_text = ' '.join(processed_words)

print("Stemmed Processed Description:", processed_text)

# Variant 2: lemmatize the kept tokens instead. Note that this runs on the
# filtered tokens themselves, not on the stems produced above.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(token) for token in content_words]
final_text = ' '.join(lemmatized_words)

print("Lemmatized Processed Description:", final_text)


Stemmed Processed Description: best italian restaur citi offer delici pasta pizza fresh ingredi experi cozi atmospher enjoy unforgett meal
Lemmatized Processed Description: best italian restaurant city offering delicious pasta pizza fresh ingredient experience cozy atmosphere enjoy unforgettable meal


In [32]:
# Filled on first use and shared by every preprocess_text call, so the
# stop-word set and the stemmer are not rebuilt for each document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (English stop-word set, PorterStemmer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["stemmer"] = PorterStemmer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalise free text into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are English stop words or are not purely alphanumeric are dropped, and the
    remaining tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Text to normalise. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``''`` when nothing is
        left or when ``text`` is not a string.
    """
    # Missing descriptions arrive as NaN/None rather than str; treat them as
    # empty documents instead of failing on .lower().
    if not isinstance(text, str):
        return ''

    stop_words, stemmer = _get_preprocess_resources()

    # isalnum() already rejects every punctuation-only token, so no separate
    # check against string.punctuation is needed.
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    )

In [33]:
# Directory containing the parsed restaurant TSV files.
data_dir = 'restaurants_parsed_data'

# os.listdir returns entries in arbitrary order. Sort them so the rows of the
# concatenated frame (and therefore the integer row indices reported for the
# search results) come out in the same order on every run and every machine.
file_paths = [os.path.join(data_dir, name) for name in sorted(os.listdir(data_dir))]

# Read every file as a tab-separated table, with a progress bar.
dfs = [pd.read_csv(file, sep='\t') for file in tqdm(file_paths)]

# Stack all tables into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1982.0), HTML(value='')))




In [34]:
# Free-text search query to match against the restaurant descriptions.
query = "modern seasonal cuisine"

# Raw values of the description column, in the same row order as df.
restaurant_descriptions = df['description'].values

In [35]:
# Run the query and every description through the same normalisation so that
# their tokens are comparable.
processed_query = preprocess_text(query)
processed_descriptions = list(map(preprocess_text, restaurant_descriptions))

In [36]:
# All processed descriptions followed by the processed query as the last item.
corpus = [*processed_descriptions, processed_query]

In [37]:
# Unigram TF-IDF (ngram_range=(1, 1) is the TfidfVectorizer default, spelled
# out here on purpose). A variant using n-grams up to min(3, number of query
# tokens) was tried and is currently disabled.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# Learn vocabulary and IDF weights from the corpus and vectorize it in one go.
# NOTE(review): the corpus includes the query as its last item, so the query's
# terms also contribute to the fitted vocabulary and IDF statistics.
tfidf_matrix = vectorizer.fit_transform(corpus)

In [38]:
# The query was placed last in the corpus, so its vector is the final row of
# the matrix; every row before it is a restaurant description.
query_vector = tfidf_matrix[-1]
description_vectors = tfidf_matrix[:-1]

# Similarity of the query against each description.
cosine_similarities = cosine_similarity(query_vector, description_vectors)

In [39]:
# Collapse the 2-D similarity result into a 1-D array of scores (flatten
# returns a copy), one score per row compared against the query.
similarity_scores = cosine_similarities.flatten()

In [40]:
# Number of best-matching descriptions to keep.
top_k = 5

# argsort orders positions by ascending score, so reverse it and keep the
# first top_k positions: the result lists the best matches, highest first.
top_k_indices = similarity_scores.argsort()[::-1][:top_k]

In [41]:
# Scores of the selected matches, in the order given by top_k_indices.
similarity_scores[top_k_indices]

array([0.3238558 , 0.270665  , 0.2610231 , 0.24801757, 0.23625421])

In [42]:
# top_k_indices holds positions from argsort, so the rows are selected by
# position with .iloc. Label-based .loc only gives the same rows while df
# still has its default 0..n-1 index.
df.iloc[top_k_indices]

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
1241,Saur,via Filippo Turati 8,Barco,25034,Italy,€€,Italian Contemporary,"In a tiny rural village, this contemporary, al...","['Air conditioning', 'Terrace', 'Wheelchair ac...","['Mastercard', 'Visa']",+39 030 941149,https://ristorantesaur.it
230,Razzo,via Andrea Doria 17/f,Turin,10123,Italy,€€,"Modern Cuisine, Mediterranean Cuisine","A quiet restaurant with a relaxed, young and m...","['Air conditioning', 'Terrace', 'Wheelchair ac...","['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 011 020 1580,https://vadoarazzo.it/
749,La Botte,via Giuseppe Garibaldi 8,Stresa,28838,Italy,€€,Modern Cuisine,A modern and welcoming contemporary bistro sit...,"['Air conditioning', 'Wheelchair access']","['Mastercard', 'Visa']",+39 0323 30462,http://www.trattorialabottestresa.it
86,Piccolo Lord,corso San Maurizio 69 bis/g,Turin,10124,Italy,€€,"Mediterranean Cuisine, Seasonal Cuisine","Professional service in a welcoming, modern re...",['Air conditioning'],"['Amex', 'Maestrocard', 'Mastercard', 'Visa']",+39 011 836145,https://www.ristorantepiccololord.it/
1581,La Valle,via Umberto I 25,località Valle Sauglio,Trofarello,10028,€€€,Contemporary,A well - run restaurant in a quiet area just o...,"['Air conditioning', 'Interesting wine list', ...","['Amex', 'Mastercard', 'Visa']",+39 011 649 9238,https://www.ristorantelavalle.it/


In [43]:
# Full description text of the best matches. top_k_indices holds positions
# from argsort, so select by position (.iloc) rather than by label (.loc).
df['description'].iloc[top_k_indices].values

array(['In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.',
       'A quiet restaurant with a relaxed, young and modern feel serving contemporary cuisine prepared from seasonal, regional products. Charming romantic outdoor area with soft lighting.',
       'A modern and welcoming contemporary bistro situated in the heart of Stresa’s historic centre. Run by an entire family, the restaurant serves modern and imaginative fish and meat dishes where the focus is always on seasonal ingredients. The interesting wine list also includes a selection of wines by the glass.',
       'Professional service in a welcoming, modern restaurant run by a young couple. He works in the kitchen while she (having also worked as a chef in the past) runs the front of house. Delicious Mediterranean cuisine with a seasonal focus.',
       'A well - run restaurant in a quiet area just outside the village, where the o