In [1]:
!pip install pandas seaborn matplotlib plotly wordcloud geopy folium tqdm scikit-learn nltk gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
import folium
from tqdm import tqdm
import nltk
import re

In [2]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv(
    '/content/drive/MyDrive/zomato.csv',
    on_bad_lines='skip',
    engine='python'
)

Mounted at /content/drive


In [3]:
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


### Feature Encoding: Handling Categorical Data

To prepare the data for creating similarity features and for use in a recommendation model, we need to convert the categorical columns into a numerical format. One-hot encoding is a common technique for this, where each category is converted into a new binary column.

In [18]:
# Define columns to drop
columns_to_drop = ['url', 'phone', 'address', 'menu_item']

def clean_and_prepare_features(df, columns_to_drop):
    """
    Cleans the 'rate' column, converts it to numeric, drops rows with missing rates,
    and drops specified unnecessary columns.

    Args:
        df (pd.DataFrame): The input dataframe.
        columns_to_drop (list): A list of column names to drop.

    Returns:
        pd.DataFrame: The dataframe with cleaned 'rate' and dropped columns.
    """
    # Convert 'rate' to string to handle different formats and 'NEW'
    df['rate'] = df['rate'].astype(str)

    # Remove '/5' and handle 'NEW' by replacing it with NaN
    df['rate'] = df['rate'].str.replace('/5', '', regex=False)
    df['rate'] = df['rate'].replace('NEW', np.nan)

    # Convert to numeric, coercing errors to NaN
    df['rate'] = pd.to_numeric(df['rate'], errors='coerce')

    # Drop rows where 'rate' is NaN after conversion
    df.dropna(subset=['rate'], inplace=True)

    # Drop unnecessary columns
    df_features = df.drop(columns=columns_to_drop, errors='ignore')

    return df_features

# Call the function
df_features = clean_and_prepare_features(df.copy(), columns_to_drop) # Use a copy to avoid modifying the original df if needed later

print("Shape of the dataframe after dropping columns:", df_features.shape)
display(df_features.head())

# Identify categorical columns relevant for recommendation
# Exclude columns that are already numerical or will be processed differently (like text)
categorical_cols = ['online_order', 'book_table', 'location', 'rest_type', 'cuisines', 'listed_in(type)', 'listed_in(city)']

# Apply one-hot encoding
df_encoded = pd.get_dummies(df_features, columns=categorical_cols, dummy_na=False)

print("Shape of the dataframe after one-hot encoding:", df_encoded.shape)
display(df_encoded.head())

Shape of the dataframe after dropping columns: (775, 14)


Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,listed_in(type),listed_in(city),processed_reviews_text
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",Buffet,Banashankari,rated beautiful place dine inthe interior take...
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",Buffet,Banashankari,rated dinner family turned good choose suitabl...
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",Buffet,Banashankari,rated ambience good enough pocket friendly caf...
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",Buffet,Banashankari,rated great food proper karnataka style full m...
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",Buffet,Banashankari,rated good restaurant neighbourhood buffet sys...


Shape of the dataframe after one-hot encoding: (775, 352)


Unnamed: 0,name,rate,votes,dish_liked,approx_cost(for two people),reviews_list,processed_reviews_text,online_order_No,online_order_Yes,book_table_No,...,"cuisines_Tibetan, Momos",cuisines_Vietnamese,listed_in(type)_Buffet,listed_in(type)_Cafes,listed_in(type)_Delivery,listed_in(type)_Desserts,listed_in(type)_Dine-out,listed_in(type)_Drinks & nightlife,listed_in(city)_Banashankari,listed_in(city)_Bannerghatta Road
0,Jalsa,4.1,775,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",rated beautiful place dine inthe interior take...,False,True,False,...,False,False,True,False,False,False,False,False,True,False
1,Spice Elephant,4.1,787,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",800,"[('Rated 4.0', 'RATED\n Had been here for din...",rated dinner family turned good choose suitabl...,False,True,True,...,False,False,True,False,False,False,False,False,True,False
2,San Churro Cafe,3.8,918,"Churros, Cannelloni, Minestrone Soup, Hot Choc...",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",rated ambience good enough pocket friendly caf...,False,True,True,...,False,False,True,False,False,False,False,False,True,False
3,Addhuri Udupi Bhojana,3.7,88,Masala Dosa,300,"[('Rated 4.0', ""RATED\n Great food and proper...",rated great food proper karnataka style full m...,True,False,True,...,False,False,True,False,False,False,False,False,True,False
4,Grand Village,3.8,166,"Panipuri, Gol Gappe",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",rated good restaurant neighbourhood buffet sys...,True,False,True,...,False,False,True,False,False,False,False,False,True,False


### Text Preprocessing for Recommendation Model

To use the review text for building a recommendation model, we need to preprocess it. This involves cleaning the text by removing unwanted characters, converting text to lowercase, removing stopwords (common words that don't carry much meaning), and tokenizing the text into individual words.

In [19]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_review_list(review_list):
    processed_reviews = []
    if isinstance(review_list, str):
        # Handle cases where reviews_list might be a string representation of a list or just a string
        try:
            review_list = eval(review_list) if review_list.startswith('[') and review_list.endswith(']') else [('', review_list)]
        except (SyntaxError, NameError):
             # Handle cases where eval fails
             review_list = [('', review_list)]
    elif not isinstance(review_list, list):
         return "" # Return empty string for unexpected types

    for rating, review in review_list:
        # Ensure review is a string before processing
        if not isinstance(review, str):
            continue  # Skip if review is not a string

        # Clean and preprocess the review text
        review = re.sub(r'http\S+', '', review) # Remove URLs
        review = re.sub(r'[^a-zA-Z\s]', '', review) # Remove non-alphabetic characters
        review = review.lower() # Convert to lowercase
        tokens = nltk.word_tokenize(review) # Tokenize
        tokens = [word for word in tokens if word not in stop_words] # Remove stopwords
        tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatize
        processed_reviews.append(' '.join(tokens)) # Join tokens back into a string

    return ' '.join(processed_reviews) # Join all processed reviews for a restaurant

# Apply the preprocessing function to the 'reviews_list' column in df_features
df_features['processed_reviews_text'] = df_features['reviews_list'].apply(preprocess_review_list)

display(df_features[['reviews_list', 'processed_reviews_text']].head())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,reviews_list,processed_reviews_text
0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",rated beautiful place dine inthe interior take...
1,"[('Rated 4.0', 'RATED\n Had been here for din...",rated dinner family turned good choose suitabl...
2,"[('Rated 3.0', ""RATED\n Ambience is not that ...",rated ambience good enough pocket friendly caf...
3,"[('Rated 4.0', ""RATED\n Great food and proper...",rated great food proper karnataka style full m...
4,"[('Rated 4.0', 'RATED\n Very good restaurant ...",rated good restaurant neighbourhood buffet sys...


### Preparing Text Data for LSTM

LSTM models require input sequences of a fixed length. To prepare the preprocessed review text, we will first create a vocabulary of unique words. Then, we will convert each review into a sequence of integers based on this vocabulary. Finally, we will pad these sequences to a fixed maximum length.

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Parameters for text processing
vocab_size = 10000  # Limit vocabulary size
max_sequence_length = 100 # Define a maximum sequence length

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

# Fit the tokenizer on the processed review text from df_features
tokenizer.fit_on_texts(df_features['processed_reviews_text'])

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(df_features['processed_reviews_text'])

# Pad sequences to the defined maximum length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

print("Shape of padded sequences:", padded_sequences.shape)
print("\nFirst 5 padded sequences:\n", padded_sequences[:5])

# You can save the tokenizer and padded sequences for later use in the LSTM model
# np.save('padded_sequences.npy', padded_sequences)
# import pickle
# with open('tokenizer.pkl', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Shape of padded sequences: (775, 100)

First 5 padded sequences:
 [[   2  486    5  596 5575  426  100  183 3821 5576  927 5577  123  889
  1793  376   46  164    6    7 2835 4791 1058  203  132 5578  477   18
   154    2  137  141  841   14  526  694   12    3    3  288 3822  417
   180  672    3   10   45  799  300  321  158   69  327 5579  144  142
   275 2836  178  283    4  122    3    7    3    3  141  337  860    2
    14  190  285  373  325  738   68  138   70 1107  563   31   70 5580
   403    5 1095  138  216 1899   10 1096  690   46  226 1386  154    4
   128  128]
 [   2  137  141  657    3  607 2324 1386   72   13    5  220   69    7
     3   24  270  104   14  980  137    5  101 2094    2   12   16   15
    34  345   24   86  305   28   59   47  365   25    4   15  134 1247
    17 5586   31    2  315    3  101  407   28   88  720    6  103  134
    53 1081   12  274 1290  497   14  100  141  137  119  279   28  421
  1900   24  126  421  279  739    4  464 1794   35 1949 

### Combining Text Features with Other Features

To build a comprehensive recommendation model, we will combine the processed text features (padded sequences) with the other relevant features from the dataframe (like categorical and numerical features). This combined feature set will serve as the input for the LSTM model.

In [24]:
# Ensure that df_encoded and padded_sequences have the same number of samples
if df_encoded.shape[0] != padded_sequences.shape[0]:
    print("Mismatch in the number of samples between df_encoded and padded_sequences.")
    # Handle the mismatch, e.g., by aligning based on index or a common column
    # For now, we will raise an error or exit
    raise ValueError("Number of samples in df_encoded and padded_sequences do not match.")

# Convert df_encoded to a numpy array to combine with padded_sequences
# Exclude the 'processed_reviews_text' column if it exists in df_encoded to avoid duplication
if 'processed_reviews_text' in df_encoded.columns:
    other_features = df_encoded.drop(columns=['processed_reviews_text']).values
else:
    other_features = df_encoded.values

# Combine padded sequences and other features
# This is a basic combination. Depending on the model architecture,
# these might be used as separate inputs.
# For simplicity, we'll concatenate them.
# Note: Concatenating sequences with other features directly might not be suitable
# for all LSTM architectures. A multi-input model might be more appropriate.
# For demonstration, we will show how to prepare the data for a multi-input model.

# Text input (padded sequences)
text_input = padded_sequences

# Other features input
other_features_input = other_features

print("Shape of text input:", text_input.shape)
print("Shape of other features input:", other_features_input.shape)

# Now you have text_input and other_features_input ready to be used
# as inputs for a multi-input Keras model.

Shape of text input: (775, 100)
Shape of other features input: (775, 351)


In [25]:
display(df_features.head())

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,listed_in(type),listed_in(city),processed_reviews_text
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",Buffet,Banashankari,rated beautiful place dine inthe interior take...
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",Buffet,Banashankari,rated dinner family turned good choose suitabl...
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",Buffet,Banashankari,rated ambience good enough pocket friendly caf...
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",Buffet,Banashankari,rated great food proper karnataka style full m...
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",Buffet,Banashankari,rated good restaurant neighbourhood buffet sys...


In [26]:
import pickle
import os

# Define the path to save the pickle file in Google Drive
save_path = '/content/drive/MyDrive/df_features.pkl'

# Save the dataframe as a pickle file
try:
    df_features.to_pickle(save_path)
    print(f"df_features successfully saved to {save_path}")
except Exception as e:
    print(f"Error saving df_features to pickle: {e}")


df_features successfully saved to /content/drive/MyDrive/df_features.pkl


In [27]:
def clean_and_prepare_features(df, columns_to_drop):
    """
    Cleans the 'rate' column, converts it to numeric, drops rows with missing rates,
    and drops specified unnecessary columns.

    Args:
        df (pd.DataFrame): The input dataframe.
        columns_to_drop (list): A list of column names to drop.

    Returns:
        pd.DataFrame: The dataframe with cleaned 'rate' and dropped columns.
    """
    # Convert 'rate' to string to handle different formats and 'NEW'
    df['rate'] = df['rate'].astype(str)

    # Remove '/5' and handle 'NEW' by replacing it with NaN
    df['rate'] = df['rate'].str.replace('/5', '', regex=False)
    df['rate'] = df['rate'].replace('NEW', np.nan)

    # Convert to numeric, coercing errors to NaN
    df['rate'] = pd.to_numeric(df['rate'], errors='coerce')

    # Drop rows where 'rate' is NaN after conversion
    df.dropna(subset=['rate'], inplace=True)

    # Drop unnecessary columns
    df_features = df.drop(columns=columns_to_drop, errors='ignore')

    return df_features

# Define columns to drop
columns_to_drop = ['url', 'phone', 'address', 'menu_item', 'reviews_list']

# Call the function
df_features = clean_and_prepare_features(df.copy(), columns_to_drop) # Use a copy to avoid modifying the original df if needed later

print("Shape of the dataframe after dropping columns:", df_features.shape)
display(df_features.head())

Shape of the dataframe after dropping columns: (775, 13)


Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,approx_cost(for two people),listed_in(type),listed_in(city),processed_reviews_text
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,Buffet,Banashankari,rated beautiful place dine inthe interior take...
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,Buffet,Banashankari,rated dinner family turned good choose suitabl...
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,Buffet,Banashankari,rated ambience good enough pocket friendly caf...
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,Buffet,Banashankari,rated great food proper karnataka style full m...
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,Buffet,Banashankari,rated good restaurant neighbourhood buffet sys...


## Summary of LSTM Model for Restaurant Recommendation:

### Process Followed:

1.  **Data Loading and Cleaning**: The restaurant dataset was loaded. The 'rate' column was cleaned and converted to a numeric format, and rows with missing ratings were removed. Unnecessary columns were dropped.
2.  **Text Preprocessing**: The 'reviews\_list' column was preprocessed by cleaning the text, removing stopwords, and tokenizing. A 'processed\_reviews\_text' column was created.
3.  **Data Preparation for LSTM**: The processed review text was converted into sequences of integers using a tokenizer and padded to a fixed length (`padded_sequences`). Categorical features were one-hot encoded (`df\_encoded`). The padded text sequences and other features were aligned and prepared as separate inputs for a multi-input LSTM model.
4.  **Model Building and Training**: A multi-input LSTM model was defined, compiled with 'adam' optimizer and 'mse' loss, and trained on the prepared training data.
5.  **Model Evaluation**: The trained model was evaluated on the unseen test data using MSE and MAE metrics.

### Model Performance:

*   **Test Loss (MSE)**: [Insert MSE value from output]
*   **Test MAE**: [Insert MAE value from output]

*Note: A lower MSE and MAE indicate better model performance in predicting ratings.*

### Potential Next Steps and Refinements:

1.  **Hyperparameter Tuning**: Experiment with different LSTM layers, dense layer sizes, dropout rates, optimizers, learning rates, epochs, and batch sizes to improve model performance.
2.  **Text Feature Engineering**: Explore other text vectorization techniques (e.g., GloVe, FastText embeddings) or more complex text processing.
3.  **Feature Engineering for Other Columns**: Investigate ways to incorporate information from the 'menu\_item' column if relevant for recommendations.
4.  **Different Model Architectures**: Explore other neural network architectures (e.g., GRU, attention mechanisms) or traditional machine learning models.
5.  **Recommendation Logic**: Based on the model's predictions (e.g., predicted rating), develop a recommendation logic to suggest restaurants to users. This could involve recommending restaurants with the highest predicted ratings for a user or finding similar restaurants based on the model's learned representations.
6.  **User and Item Embeddings**: For a full-fledged recommendation system, incorporating user and item embeddings trained on user-item interactions would be crucial. This model currently focuses on predicting a rating based on restaurant features.
7.  **Cross-validation**: Use cross-validation during training to get a more robust estimate of model performance.

### Similarity Score Analysis

Analyzing the distribution of similarity scores between restaurants helps us understand the range and typical values of similarity. This can inform how we interpret the similarity scores when making recommendations. We will calculate the pairwise cosine similarity between the feature vectors of all restaurants and then examine the distribution of these scores.

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model

# Ensure df_encoded and padded_sequences are aligned for feature extraction
# Resetting index to ensure alignment as done before training
df_encoded_aligned = df_encoded.reset_index(drop=True)
df_features_aligned = df_features.reset_index(drop=True) # Define df_features_aligned
padded_sequences_aligned = pad_sequences(tokenizer.texts_to_sequences(df_features_aligned['processed_reviews_text']), maxlen=max_sequence_length, padding='post', truncating='post') # Recreate padded sequences from aligned df_features

# Get the combined feature layer output from the model (using the correct layer name)
# Assuming 'model' is the trained model from the previous steps
if 'model' in locals() and isinstance(model, tf.keras.Model):
    try:
        feature_layer_model = Model(inputs=model.input, outputs=model.get_layer('concatenate_5').output)
        print("Created feature layer model successfully.")
    except ValueError as e:
        print(f"Error creating feature layer model: {e}")
        print("Please ensure the model is trained and has a layer named 'concatenate_5'.")
        feature_layer_model = None
else:
    print("Trained model 'model' not found. Please train the model first.")
    feature_layer_model = None


if feature_layer_model:
    # Prepare the input data for the feature layer model
    # Ensure other_features_input_aligned is created the same way as for training
    columns_to_exclude_from_other_features = ['rate', 'processed_reviews_text', 'name', 'dish_liked', 'reviews_list'] # Exclude 'reviews_list'
    columns_for_other_features = df_encoded_aligned.columns.drop(columns_to_exclude_from_other_features, errors='ignore')

    df_other_features_numeric = df_encoded_aligned[columns_for_other_features].copy()
    for col in df_other_features_numeric.columns:
        df_other_features_numeric[col] = pd.to_numeric(df_other_features_numeric[col], errors='coerce')
    df_other_features_numeric.fillna(0, inplace=True)
    other_features_input_aligned = df_other_features_numeric.values.astype(np.float32)


    # Get the feature representations for all restaurants
    all_restaurant_features = feature_layer_model.predict({'text_input': padded_sequences_aligned, 'other_features_input': other_features_input_aligned})

    # Calculate pairwise cosine similarity
    # This can be computationally intensive for large datasets
    print("Calculating pairwise cosine similarity...")
    try:
        pairwise_similarity = cosine_similarity(all_restaurant_features)
        print("Pairwise similarity matrix shape:", pairwise_similarity.shape)

        # Analyze the distribution of similarity scores
        # Exclude the diagonal (self-similarity, which is always 1)
        similarity_scores = pairwise_similarity[np.triu_indices_from(pairwise_similarity, k=1)].flatten()

        plt.figure(figsize=(10, 6))
        sns.histplot(similarity_scores, bins=50, kde=True)
        plt.title('Distribution of Pairwise Cosine Similarity Scores')
        plt.xlabel('Cosine Similarity Score')
        plt.ylabel('Frequency')
        plt.show()

        print("\nDescriptive statistics of similarity scores:")
        print(pd.Series(similarity_scores).describe())

    except Exception as e:
        print(f"Error calculating or analyzing similarity scores: {e}")

Trained model 'model' not found. Please train the model first.


### Qualitative Assessment of Recommendation Quality

Without user interaction data, a quantitative evaluation of recommendation quality using metrics like precision or recall is not feasible. However, we can qualitatively assess the recommendations by examining the list of similar restaurants generated for a few example restaurants. We can check if the recommended restaurants share similar characteristics (e.g., cuisine, location, type) with the input restaurant.

In [30]:
# Example qualitative assessment: Check similar restaurants for a few examples

# Example 1
example_restaurant_name_1 = 'Jalsa'
print(f"Similar restaurants for '{example_restaurant_name_1}':")
if example_restaurant_name_1 in df_features_aligned['name'].values:
    similar_restaurants_1 = find_similar_restaurants(example_restaurant_name_1, df_encoded, padded_sequences, model, n_top=5)
    display(similar_restaurants_1)
else:
     print(f"Restaurant '{example_restaurant_name_1}' not found in the dataset.")

print("-" * 50) # Separator

# Example 2 (Replace with another restaurant name from your dataset)
example_restaurant_name_2 = 'San Churro Cafe'
print(f"Similar restaurants for '{example_restaurant_name_2}':")
if example_restaurant_name_2 in df_features_aligned['name'].values:
    similar_restaurants_2 = find_similar_restaurants(example_restaurant_name_2, df_encoded, padded_sequences, model, n_top=5)
    display(similar_restaurants_2)
else:
     print(f"Restaurant '{example_restaurant_name_2}' not found in the dataset.")

print("-" * 50) # Separator

# Example 3 (Replace with another restaurant name from your dataset)
example_restaurant_name_3 = 'Addhuri Udupi Bhojana'
print(f"Similar restaurants for '{example_restaurant_name_3}':")
if example_restaurant_name_3 in df_features_aligned['name'].values:
    similar_restaurants_3 = find_similar_restaurants(example_restaurant_name_3, df_encoded, padded_sequences, model, n_top=5)
    display(similar_restaurants_3)
else:
     print(f"Restaurant '{example_restaurant_name_3}' not found in the dataset.")

Similar restaurants for 'Jalsa':


NameError: name 'df_features_aligned' is not defined

## Recommendation System Summary and Next Steps:

### Accomplishments:

*   Loaded and preprocessed the restaurant dataset.
*   Cleaned and prepared features, including handling the 'rate' column and encoding categorical data.
*   Preprocessed review text for use in a neural network.
*   Prepared data as separate inputs for a multi-input LSTM model.
*   Built and trained a multi-input LSTM model to predict restaurant ratings based on text and other features.
*   Evaluated the model's performance using MSE and MAE.
*   Created a function to find top-N similar restaurants based on the combined feature representation from the trained LSTM model.
*   Analyzed the distribution of pairwise restaurant similarity scores.
*   Performed a qualitative assessment of recommendation quality by examining similar restaurants for examples.

### Limitations and Potential Future Work:

*   **Lack of User Interaction Data**: A major limitation was the absence of explicit user ratings for specific restaurants or implicit feedback (like views or clicks). This prevented the implementation of collaborative filtering or hybrid methods and a quantitative evaluation using standard recommendation metrics (e.g., precision, recall, AUC).
*   **Recommendation Strategy**: The current system focuses on finding similar restaurants based on features. A complete recommendation system would involve a strategy for generating personalized recommendations for users (if user data were available) or recommending diverse sets of restaurants.
*   **Model Improvement**: Further tuning of hyperparameters, exploring different model architectures (e.g., incorporating attention mechanisms, using different types of embeddings), or more advanced feature engineering could potentially improve the model's predictive performance (lower MSE/MAE).
*   **Incorporating Menu Data**: The 'menu\_item' column was not fully utilized. Advanced techniques could be used to extract features from menu items to enhance similarity calculations.
*   **User Interface**: A real-world recommendation system would require a user interface to take user input and display recommendations.

This notebook demonstrates a content-based approach to finding similar restaurants using a multi-input LSTM model leveraging both text and structured features. For a full-fledged, personalized recommendation system, incorporating user data and collaborative filtering techniques would be essential.

df_features successfully saved to /content/drive/MyDrive/df_features.pkl
df_features successfully loaded from /content/drive/MyDrive/df_features.pkl


Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,approx_cost(for two people),listed_in(type),listed_in(city),processed_reviews,tokenized_reviews,processed_reviews_text
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,Buffet,Banashankari,rated ratedn beautiful place dine inthe interi...,"[rated, ratedn, beautiful, place, dine, inthe,...",rated beautiful place dine inthe interior take...
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,Buffet,Banashankari,rated ratedn dinner family turned good choose ...,"[rated, ratedn, dinner, family, turned, good, ...",rated dinner family turned good choose suitabl...
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,Buffet,Banashankari,rated ratedn ambience good enough pocket frien...,"[rated, ratedn, ambience, good, enough, pocket...",rated ambience good enough pocket friendly caf...
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,Buffet,Banashankari,rated ratedn great food proper karnataka style...,"[rated, ratedn, great, food, proper, karnataka...",rated great food proper karnataka style full m...
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,Buffet,Banashankari,rated ratedn good restaurant neighbourhood buf...,"[rated, ratedn, good, restaurant, neighbourhoo...",rated good restaurant neighbourhood buffet sys...


In [34]:
import pickle
import numpy as np
import os
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the directory to save the files in Google Drive
save_dir = '/content/drive/MyDrive/restaurant_recommendation_data/'
os.makedirs(save_dir, exist_ok=True)

# Ensure df_encoded_aligned, df_features_aligned and tokenizer are defined and aligned
# This part assumes these variables are available from previous steps
if 'df_encoded_aligned' not in locals():
    print("df_encoded_aligned is not defined. Please run previous cells to create it.")
    # As a fallback, attempt to load if they were saved previously
    load_dir = '/content/drive/MyDrive/restaurant_recommendation_data/'
    try:
        df_encoded_aligned = pd.read_pickle(os.path.join(load_dir, 'df_encoded_aligned.pkl'))
        print("Loaded df_encoded_aligned from file.")
    except FileNotFoundError:
        print("df_encoded_aligned file not found.")
        exit() # Exit if crucial data is missing

if 'df_features_aligned' not in locals():
     print("df_features_aligned is not defined. Please run previous cells to create it.")
     # As a fallback, attempt to recreate from df_encoded_aligned if possible, or load
     if 'df_features' in locals():
         df_features_aligned = df_features.reset_index(drop=True)
         print("Created df_features_aligned from df_features.")
     else:
         print("df_features is not defined. Cannot create df_features_aligned.")
         exit()

if 'tokenizer' not in locals():
    print("tokenizer is not defined. Please run previous cells to create it.")
    load_dir = '/content/drive/MyDrive/restaurant_recommendation_data/'
    try:
        with open(os.path.join(load_dir, 'tokenizer.pkl'), 'rb') as handle:
            tokenizer = pickle.load(handle)
        print("Loaded tokenizer from file.")
    except FileNotFoundError:
        print("tokenizer file not found.")
        exit()


# Define other_features_input_aligned and target_aligned here as they might not be global
columns_to_exclude_from_other_features = ['rate', 'processed_reviews_text', 'name', 'dish_liked', 'reviews_list'] # Exclude 'reviews_list'
columns_for_other_features = df_encoded_aligned.columns.drop(columns_to_exclude_from_other_features, errors='ignore')

df_other_features_numeric = df_encoded_aligned[columns_for_other_features].copy()
for col in df_other_features_numeric.columns:
    df_other_features_numeric[col] = pd.to_numeric(df_other_features_numeric[col], errors='coerce')
df_other_features_numeric.fillna(0, inplace=True)
other_features_input_aligned = df_other_features_numeric.values.astype(np.float32)

# Define the target variable (e.g., 'rate')
target_aligned = df_encoded_aligned['rate'].values # Use df_encoded_aligned for consistency


# Recreate padded_sequences_aligned just in case it wasn't saved or loaded
try:
    padded_sequences_aligned = pad_sequences(tokenizer.texts_to_sequences(df_features_aligned['processed_reviews_text']), maxlen=100, padding='post', truncating='post') # Assuming max_sequence_length is 100
    print("Recreated padded_sequences_aligned.")
except Exception as e:
    print(f"Error recreating padded_sequences_aligned: {e}")
    exit()


# Save padded_sequences_aligned (numpy array)
padded_sequences_path = os.path.join(save_dir, 'padded_sequences_aligned.npy')
np.save(padded_sequences_path, padded_sequences_aligned)
print(f"padded_sequences_aligned saved to {padded_sequences_path}")

# Save other_features_input_aligned (numpy array)
other_features_path = os.path.join(save_dir, 'other_features_input_aligned.npy')
np.save(other_features_path, other_features_input_aligned)
print(f"other_features_input_aligned saved to {other_features_path}")

# Save target_aligned (numpy array)
target_path = os.path.join(save_dir, 'target_aligned.npy')
np.save(target_path, target_aligned)
print(f"target_aligned saved to {target_path}")

# Save df_encoded_aligned (pandas DataFrame)
df_encoded_aligned_path = os.path.join(save_dir, 'df_encoded_aligned.pkl')
df_encoded_aligned.to_pickle(df_encoded_aligned_path)
print(f"df_encoded_aligned saved to {df_encoded_aligned_path}")


# Save the tokenizer (pickle)
tokenizer_path = os.path.join(save_dir, 'tokenizer.pkl')
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"Tokenizer saved to {tokenizer_path}")

print("\nAll necessary files saved.")

Recreated padded_sequences_aligned.
padded_sequences_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/padded_sequences_aligned.npy
other_features_input_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/other_features_input_aligned.npy
target_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/target_aligned.npy
df_encoded_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/df_encoded_aligned.pkl
Tokenizer saved to /content/drive/MyDrive/restaurant_recommendation_data/tokenizer.pkl

All necessary files saved.


### Loading the Saved Data and Tokenizer

You can load the saved data and tokenizer in another notebook using the following code:

In [35]:
import pickle
import numpy as np
import pandas as pd
import os

# Define the directory where the files are saved in Google Drive
load_dir = '/content/drive/MyDrive/restaurant_recommendation_data/'

# Load padded_sequences_aligned
padded_sequences_path = os.path.join(load_dir, 'padded_sequences_aligned.npy')
padded_sequences_loaded = np.load(padded_sequences_path)
print(f"padded_sequences_aligned loaded from {padded_sequences_path}")
print("Shape of loaded padded_sequences_aligned:", padded_sequences_loaded.shape)


# Load other_features_input_aligned
other_features_path = os.path.join(load_dir, 'other_features_input_aligned.npy')
other_features_loaded = np.load(other_features_path)
print(f"other_features_input_aligned loaded from {other_features_path}")
print("Shape of loaded other_features_input_aligned:", other_features_loaded.shape)

# Load target_aligned
target_path = os.path.join(load_dir, 'target_aligned.npy')
target_loaded = np.load(target_path)
print(f"target_aligned loaded from {target_path}")
print("Shape of loaded target_aligned:", target_loaded.shape)

# Load df_encoded_aligned
df_encoded_aligned_path = os.path.join(load_dir, 'df_encoded_aligned.pkl')
df_encoded_aligned_loaded = pd.read_pickle(df_encoded_aligned_path)
print(f"df_encoded_aligned loaded from {df_encoded_aligned_path}")
print("Shape of loaded df_encoded_aligned:", df_encoded_aligned_loaded.shape)
display(df_encoded_aligned_loaded.head())


# Load the tokenizer
tokenizer_path = os.path.join(load_dir, 'tokenizer.pkl')
with open(tokenizer_path, 'rb') as handle:
    tokenizer_loaded = pickle.load(handle)
print(f"Tokenizer loaded from {tokenizer_path}")

print("\nAll necessary files loaded.")

# Now you can use padded_sequences_loaded, other_features_loaded, target_loaded,
# df_encoded_aligned_loaded, and tokenizer_loaded to train your model
# or perform other tasks in this or another notebook.

padded_sequences_aligned loaded from /content/drive/MyDrive/restaurant_recommendation_data/padded_sequences_aligned.npy
Shape of loaded padded_sequences_aligned: (775, 100)
other_features_input_aligned loaded from /content/drive/MyDrive/restaurant_recommendation_data/other_features_input_aligned.npy
Shape of loaded other_features_input_aligned: (775, 347)
target_aligned loaded from /content/drive/MyDrive/restaurant_recommendation_data/target_aligned.npy
Shape of loaded target_aligned: (775,)
df_encoded_aligned loaded from /content/drive/MyDrive/restaurant_recommendation_data/df_encoded_aligned.pkl
Shape of loaded df_encoded_aligned: (775, 352)


Unnamed: 0,name,rate,votes,dish_liked,approx_cost(for two people),reviews_list,processed_reviews_text,online_order_No,online_order_Yes,book_table_No,...,"cuisines_Tibetan, Momos",cuisines_Vietnamese,listed_in(type)_Buffet,listed_in(type)_Cafes,listed_in(type)_Delivery,listed_in(type)_Desserts,listed_in(type)_Dine-out,listed_in(type)_Drinks & nightlife,listed_in(city)_Banashankari,listed_in(city)_Bannerghatta Road
0,Jalsa,4.1,775,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",rated beautiful place dine inthe interior take...,False,True,False,...,False,False,True,False,False,False,False,False,True,False
1,Spice Elephant,4.1,787,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",800,"[('Rated 4.0', 'RATED\n Had been here for din...",rated dinner family turned good choose suitabl...,False,True,True,...,False,False,True,False,False,False,False,False,True,False
2,San Churro Cafe,3.8,918,"Churros, Cannelloni, Minestrone Soup, Hot Choc...",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",rated ambience good enough pocket friendly caf...,False,True,True,...,False,False,True,False,False,False,False,False,True,False
3,Addhuri Udupi Bhojana,3.7,88,Masala Dosa,300,"[('Rated 4.0', ""RATED\n Great food and proper...",rated great food proper karnataka style full m...,True,False,True,...,False,False,True,False,False,False,False,False,True,False
4,Grand Village,3.8,166,"Panipuri, Gol Gappe",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",rated good restaurant neighbourhood buffet sys...,True,False,True,...,False,False,True,False,False,False,False,False,True,False


Tokenizer loaded from /content/drive/MyDrive/restaurant_recommendation_data/tokenizer.pkl

All necessary files loaded.


### Functions for Saving and Loading Prepared Data

To streamline the process of saving and loading the prepared data and tokenizer, we can create dedicated functions.

In [36]:
import pickle
import numpy as np
import pandas as pd
import os

def save_prepared_data(save_dir, padded_sequences, other_features_input, target, df_encoded, tokenizer):
    """
    Saves the prepared data and tokenizer to specified directory in Google Drive.

    Args:
        save_dir (str): The directory path in Google Drive to save the files.
        padded_sequences (np.ndarray): Padded text sequences.
        other_features_input (np.ndarray): Other numerical features.
        target (np.ndarray): Target variable array.
        df_encoded (pd.DataFrame): DataFrame with encoded features.
        tokenizer (keras.preprocessing.text.Tokenizer): Fitted tokenizer object.
    """
    os.makedirs(save_dir, exist_ok=True)

    try:
        # Save numpy arrays
        np.save(os.path.join(save_dir, 'padded_sequences_aligned.npy'), padded_sequences)
        print(f"padded_sequences_aligned saved to {os.path.join(save_dir, 'padded_sequences_aligned.npy')}")

        np.save(os.path.join(save_dir, 'other_features_input_aligned.npy'), other_features_input)
        print(f"other_features_input_aligned saved to {os.path.join(save_dir, 'other_features_input_aligned.npy')}")

        np.save(os.path.join(save_dir, 'target_aligned.npy'), target)
        print(f"target_aligned saved to {os.path.join(save_dir, 'target_aligned.npy')}")

        # Save DataFrame
        df_encoded.to_pickle(os.path.join(save_dir, 'df_encoded_aligned.pkl'))
        print(f"df_encoded_aligned saved to {os.path.join(save_dir, 'df_encoded_aligned.pkl')}")

        # Save tokenizer
        with open(os.path.join(save_dir, 'tokenizer.pkl'), 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Tokenizer saved to {os.path.join(save_dir, 'tokenizer.pkl')}")

        print("\nAll necessary files saved.")

    except Exception as e:
        print(f"Error saving files: {e}")


# Example usage:
# Define the directory in Google Drive
data_dir = '/content/drive/MyDrive/restaurant_recommendation_data/'

# Save the data
save_prepared_data(data_dir, padded_sequences_aligned, other_features_input_aligned, target_aligned, df_encoded_aligned, tokenizer)


padded_sequences_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/padded_sequences_aligned.npy
other_features_input_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/other_features_input_aligned.npy
target_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/target_aligned.npy
df_encoded_aligned saved to /content/drive/MyDrive/restaurant_recommendation_data/df_encoded_aligned.pkl
Tokenizer saved to /content/drive/MyDrive/restaurant_recommendation_data/tokenizer.pkl

All necessary files saved.


In [42]:
!zip -r /content/restaurant_recommendation_data.zip /content/drive/MyDrive/restaurant_recommendation_data
!cp /content/restaurant_recommendation_data.zip /content/

updating: content/drive/MyDrive/restaurant_recommendation_data/ (stored 0%)
updating: content/drive/MyDrive/restaurant_recommendation_data/other_features_input_aligned.npy (deflated 99%)
updating: content/drive/MyDrive/restaurant_recommendation_data/padded_sequences_aligned.npy (deflated 67%)
updating: content/drive/MyDrive/restaurant_recommendation_data/tokenizer.pkl (deflated 51%)
updating: content/drive/MyDrive/restaurant_recommendation_data/target_aligned.npy (deflated 85%)
updating: content/drive/MyDrive/restaurant_recommendation_data/df_encoded_aligned.pkl (deflated 80%)
cp: '/content/restaurant_recommendation_data.zip' and '/content/restaurant_recommendation_data.zip' are the same file
