In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
embedding_data = pd.read_csv('../processed_data/restaurants_for_embedding.csv')
print(f"Loaded {len(embedding_data)} restaurants for embedding")
print(f"Columns: {list(embedding_data.columns)}")

Loaded 10343 restaurants for embedding
Columns: ['name', 'address', 'location', 'cuisines', 'restaurant_type', 'embedding_text', 'rating', 'cost', 'rating_normalized', 'cost_normalized', 'votes_normalized', 'online_order', 'book_table', 'votes']


In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and efficient model
print(f"Loaded embedding model: {model}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Loaded embedding model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [5]:
restaurant_texts = embedding_data['embedding_text'].tolist()
restaurant_names = embedding_data['name'].tolist()

print(f"Creating embeddings for {len(restaurant_texts)} restaurants...")
print(f"Sample text: {restaurant_texts[0]}")

# %%
# Generate embeddings
embeddings = model.encode(restaurant_texts, show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")

Creating embeddings for 10343 restaurants...
Sample text: cuisines: oriya, fast food | type: quick bites | location: btm | popular dishes: rasgulla, mutton kosha, chicken kasha, samosa chaat, kheer, veg thali | services: online ordering available


Batches: 100%|██████████| 324/324 [05:31<00:00,  1.02s/it]


Embeddings shape: (10343, 384)


In [6]:
# Extract normalized numerical features
numerical_features = embedding_data[['rating_normalized', 'cost_normalized', 'votes_normalized']].values

# Combine text embeddings with numerical features (weighted)
text_weight = 0.8
numerical_weight = 0.2

# Scale numerical features to match embedding magnitude
numerical_features_scaled = numerical_features * np.std(embeddings)

# Create hybrid embeddings
hybrid_embeddings = np.concatenate([
    embeddings * text_weight,
    numerical_features_scaled * numerical_weight
], axis=1)

print(f"Hybrid embeddings shape: {hybrid_embeddings.shape}")

Hybrid embeddings shape: (10343, 387)


In [7]:
# Create restaurant index mapping
restaurant_index = {
    'names': restaurant_names,
    'data': embedding_data,
    'text_embeddings': embeddings,
    'hybrid_embeddings': hybrid_embeddings,
    'model': model
}

print("Restaurant index created successfully")

Restaurant index created successfully


In [8]:
def find_restaurant_by_name(query_name, restaurant_data, top_k=5):
    """Find restaurants with similar names"""
    query_name_lower = query_name.lower()
    
    # Exact match
    exact_matches = restaurant_data[restaurant_data['name'].str.lower() == query_name_lower]
    if not exact_matches.empty:
        return exact_matches.index[0], exact_matches.iloc[0]
    
    # Partial match
    partial_matches = restaurant_data[restaurant_data['name'].str.lower().str.contains(query_name_lower, na=False)]
    if not partial_matches.empty:
        print(f"Found {len(partial_matches)} restaurants matching '{query_name}':")
        for idx, row in partial_matches.head(top_k).iterrows():
            print(f"  - {row['name']} ({row['location']})")
        return partial_matches.index[0], partial_matches.iloc[0]
    
    return None, None

# Test the function
test_idx, test_restaurant = find_restaurant_by_name("Cafe", embedding_data)
if test_restaurant is not None:
    print(f"\nTest result: Found {test_restaurant['name']}")

Found 703 restaurants matching 'Cafe':
  - Cafe Talkhouse (hsr)
  - Cross Road Cafe (jayanagar)
  - Cafe D'hide (koramangala 5th block)
  - Sidewalk Cafe - Nahar's Heritage Hotel (st. marks road)
  - Brew Cafe (btm)

Test result: Found Cafe Talkhouse


In [10]:
def get_similar_restaurants(query_restaurant_idx, embeddings, restaurant_data, top_k=10, use_hybrid=True):
    """Get similar restaurants using cosine similarity"""
    
    # Choose embedding type
    embedding_matrix = embeddings['hybrid_embeddings'] if use_hybrid else embeddings['text_embeddings']
    
    # Get query restaurant embedding
    query_embedding = embedding_matrix[query_restaurant_idx].reshape(1, -1)
    
    # Calculate similarities
    similarities = cosine_similarity(query_embedding, embedding_matrix)[0]
    
    # Get top-k most similar (excluding the query restaurant itself)
    similar_indices = np.argsort(similarities)[::-1]
    
    # Filter out the query restaurant and get top-k
    filtered_indices = [idx for idx in similar_indices if idx != query_restaurant_idx][:top_k]
    
    # Prepare results
    results = []
    for idx in filtered_indices:
        similarity_score = similarities[idx]
        restaurant = restaurant_data.iloc[idx]
        
        results.append({
            'index': idx,
            'name': restaurant['name'],
            'location': restaurant['location'],
            'cuisines': restaurant['cuisines'],
            'restaurant_type': restaurant['restaurant_type'],
            'rating': restaurant['rating'],
            'cost': restaurant['cost'],
            'similarity_score': similarity_score,
            'online_order': restaurant['online_order'],
            'book_table': restaurant['book_table']
        })
    
    return results


def recommend_restaurants(restaurant_name, restaurant_index, top_k=10, use_hybrid=True):
    """Main recommendation function"""
    
    print(f"Looking for restaurants similar to: '{restaurant_name}'")
    print("-" * 60)
    
    # Find the query restaurant
    query_idx, query_restaurant = find_restaurant_by_name(restaurant_name, restaurant_index['data'])
    
    if query_restaurant is None:
        print(f"Restaurant '{restaurant_name}' not found!")
        return []
    
    print(f"Query Restaurant: {query_restaurant['name']}")
    print(f"Location: {query_restaurant['location']}")
    print(f"Cuisines: {query_restaurant['cuisines']}")
    print(f"Rating: {query_restaurant['rating']:.1f}")
    print(f"Cost for two: ₹{query_restaurant['cost']:.0f}")
    print("-" * 60)
    
    # Get similar restaurants
    recommendations = get_similar_restaurants(
        query_idx, 
        restaurant_index, 
        restaurant_index['data'], 
        top_k=top_k,
        use_hybrid=use_hybrid
    )
    
    print(f"Top {len(recommendations)} Similar Restaurants:")
    print("-" * 60)
    
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec['name']}")
        print(f"   Location: {rec['location']}")
        print(f"   Cuisines: {rec['cuisines']}")
        print(f"   Rating: {rec['rating']:.1f} | Cost: ₹{rec['cost']:.0f}")
        print(f"   Services: {rec['online_order']} online order, {rec['book_table']} table booking")
        print(f"   Similarity: {rec['similarity_score']:.3f}")
        print()
    
    return recommendations


# Test with a specific restaurant
test_recommendations = recommend_restaurants("kfc", restaurant_index, top_k=5)


# Test with another restaurant
# test_recommendations_2 = recommend_restaurants("Empire Restaurant", restaurant_index, top_k=5, use_hybrid=False)



Looking for restaurants similar to: 'kfc'
------------------------------------------------------------
Query Restaurant: KFC
Location: koramangala 6th block
Cuisines: burger, fast food
Rating: 3.8
Cost for two: ₹450
------------------------------------------------------------
Top 5 Similar Restaurants:
------------------------------------------------------------
1. KFC
   Location: koramangala 6th block
   Cuisines: burger, fast food
   Rating: 3.8 | Cost: ₹450
   Services: Yes online order, No table booking
   Similarity: 1.000

2. The Charcoal Factory
   Location: koramangala 6th block
   Cuisines: burger, pizza, momos
   Rating: 4.1 | Cost: ₹600
   Services: Yes online order, No table booking
   Similarity: 0.954

3. The Charcoal Factory
   Location: koramangala 6th block
   Cuisines: burger, pizza, momos
   Rating: 4.1 | Cost: ₹600
   Services: Yes online order, No table booking
   Similarity: 0.954

4. Meat And Eat
   Location: koramangala 7th block
   Cuisines: fast food, burger


In [11]:
os.makedirs('../models', exist_ok=True)

# Save the restaurant index (without the model to avoid serialization issues)
index_to_save = {
    'names': restaurant_index['names'],
    'text_embeddings': restaurant_index['text_embeddings'],
    'hybrid_embeddings': restaurant_index['hybrid_embeddings']
}

with open('../models/restaurant_embeddings.pkl', 'wb') as f:
    pickle.dump(index_to_save, f)

# Save restaurant data separately
embedding_data.to_csv('../models/restaurant_data_indexed.csv', index=True)

print("Recommendation system saved!")
print("Files created:")
print("- ../models/restaurant_embeddings.pkl")
print("- ../models/restaurant_data_indexed.csv")

Recommendation system saved!
Files created:
- ../models/restaurant_embeddings.pkl
- ../models/restaurant_data_indexed.csv


In [12]:
def load_recommendation_system():
    """Load the saved recommendation system"""
    
    # Load embeddings
    with open('../models/restaurant_embeddings.pkl', 'rb') as f:
        embeddings_data = pickle.load(f)
    
    # Load restaurant data
    restaurant_data = pd.read_csv('../models/restaurant_data_indexed.csv', index_col=0)
    
    # Recreate the model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Recreate restaurant index
    restaurant_index = {
        'names': embeddings_data['names'],
        'data': restaurant_data,
        'text_embeddings': embeddings_data['text_embeddings'],
        'hybrid_embeddings': embeddings_data['hybrid_embeddings'],
        'model': model
    }
    
    return restaurant_index

# Test loading
loaded_index = load_recommendation_system()
print(f"Successfully loaded recommendation system with {len(loaded_index['names'])} restaurants")


Successfully loaded recommendation system with 10343 restaurants


In [13]:

print("Testing loaded recommendation system:")
loaded_recommendations = recommend_restaurants("Pizza", loaded_index, top_k=3)

Testing loaded recommendation system:
Looking for restaurants similar to: 'Pizza'
------------------------------------------------------------
Found 108 restaurants matching 'Pizza':
  - La Pino'z Pizza (koramangala 4th block)
  - Garage Pizza (basavanagudi)
  - Pizza Hut (old airport road)
  - Eagles Pizza (kalyan nagar)
  - Pizza Republic (electronic city)
Query Restaurant: La Pino'z Pizza
Location: koramangala 4th block
Cuisines: italian, pizza, beverages
Rating: 4.1
Cost for two: ₹600
------------------------------------------------------------
Top 3 Similar Restaurants:
------------------------------------------------------------
1. La Pino'z Pizza
   Location: koramangala 4th block
   Cuisines: italian, pizza, beverages
   Rating: 4.1 | Cost: ₹600
   Services: Yes online order, No table booking
   Similarity: 1.000

2. La Pino'z Pizza
   Location: koramangala 4th block
   Cuisines: italian, pizza, beverages
   Rating: 4.1 | Cost: ₹600
   Services: Yes online order, No table booki