In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.neighbors import NearestNeighbors
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load processed data
df_clean = pd.read_csv('processed_restaurant_data.csv')
similarity_features = pd.read_csv('similarity_features.csv')
content_features = pd.read_csv('content_features.csv')
text_features = pd.read_csv('text_features.csv')
hybrid_features = pd.read_csv('hybrid_features.csv')

print("=== DATA LOADED ===")
print(f"Restaurants: {len(df_clean)}")
print(f"Feature dimensions: {similarity_features.shape[1]}")

=== DATA LOADED ===
Restaurants: 15515
Feature dimensions: 298


In [9]:
print("NaNs in similarity_features:", similarity_features.isna().sum().sum())
print("NaNs in content_features:", content_features.isna().sum().sum())
print("NaNs in text_features:", text_features.isna().sum().sum())
print("NaNs in hybrid_features:", hybrid_features.isna().sum().sum())


NaNs in similarity_features: 3222870
NaNs in content_features: 2119740
NaNs in text_features: 1103130
NaNs in hybrid_features: 3222870


In [7]:
# feature_matrix = feature_matrix.fillna(0)   # replace

text_features.isnull().sum()  # check for NaN values in text features

rating_scaled        10815
popularity_scaled    10815
tfidf_0              10815
tfidf_1              10815
tfidf_2              10815
                     ...  
tfidf_95             10815
tfidf_96             10815
tfidf_97             10815
tfidf_98             10815
tfidf_99             10815
Length: 102, dtype: int64

In [2]:
df_clean.isnull().sum()

url                               0
address                           0
name                              0
online_order                      0
book_table                        0
rate                           2347
votes                             0
phone                           389
location                          0
rest_type                         0
dish_liked                     8426
cuisines                          0
approx_cost(for two people)     115
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
rating                            0
cost_for_two                      0
phone_clean                       0
cuisine_list                      0
dish_list                         0
combined_features                 0
review_text                    2311
online_order_binary               0
book_table_binary                 0
location_grouped                  0
rest_type_grouped           

In [3]:
df_clean.rating.unique()

array([3.9, 2.8, 3.8, 3.5, 3.2, 3.7, 4.2, 3.4, 2.9, 3.6, 3. , 4.3, 4.1,
       3.3, 3.1, 4.4, 4.5, 4. , 4.6, 2.7, 2.6, 4.7, 4.9, 2.5, 4.8, 2.3,
       2.4, 2.2, 2.1, 1.8, 2. ])

In [4]:
class RestaurantRecommender:
    def __init__(self, restaurant_data, feature_matrix, similarity_metric='cosine'):
        """
        Initialize the recommender system
        
        Args:
            restaurant_data: DataFrame with restaurant information
            feature_matrix: DataFrame with features for similarity calculation
            similarity_metric: 'cosine', 'euclidean', or 'knn'
        """
        self.restaurant_data = restaurant_data.reset_index(drop=True)
        self.feature_matrix = feature_matrix.reset_index(drop=True)
        self.similarity_metric = similarity_metric
        
        # Create restaurant name to index mapping
        self.name_to_idx = {name: idx for idx, name in enumerate(self.restaurant_data['name'])}
        
        # Precompute similarity matrix
        self._compute_similarity_matrix()
        
    def _compute_similarity_matrix(self):
        """Compute similarity matrix based on chosen metric"""
        print(f"Computing {self.similarity_metric} similarity matrix...")
        
        if self.similarity_metric == 'cosine':
            self.similarity_matrix = cosine_similarity(self.feature_matrix)
        
        elif self.similarity_metric == 'euclidean':
            distances = euclidean_distances(self.feature_matrix)
            # Convert distances to similarities (higher = more similar)
            max_distance = distances.max()
            self.similarity_matrix = 1 - (distances / max_distance)
        
        elif self.similarity_metric == 'knn':
            # Use KNN for similarity
            self.knn_model = NearestNeighbors(
                n_neighbors=50, 
                metric='cosine', 
                algorithm='brute'
            )
            self.knn_model.fit(self.feature_matrix)
            self.similarity_matrix = None  # Will compute on-demand
        
        print(f"Similarity computation complete!")
    
    def get_restaurant_index(self, restaurant_name):
        """Get restaurant index by name"""
        if restaurant_name not in self.name_to_idx:
            # Fuzzy matching
            possible_matches = [name for name in self.name_to_idx.keys() 
                              if restaurant_name.lower() in name.lower()]
            if possible_matches:
                return self.name_to_idx[possible_matches[0]]
            else:
                raise ValueError(f"Restaurant '{restaurant_name}' not found!")
        return self.name_to_idx[restaurant_name]
    
    def get_recommendations(self, restaurant_name, n_recommendations=10, 
                          include_similar_cuisine=True, include_similar_location=True):
        """
        Get restaurant recommendations
        
        Args:
            restaurant_name: Name of the reference restaurant
            n_recommendations: Number of recommendations to return
            include_similar_cuisine: Whether to boost similar cuisine restaurants
            include_similar_location: Whether to boost same location restaurants
        """
        try:
            restaurant_idx = self.get_restaurant_index(restaurant_name)
        except ValueError as e:
            return str(e)
        
        if self.similarity_metric == 'knn':
            # Use KNN for recommendations
            distances, indices = self.knn_model.kneighbors(
                [self.feature_matrix.iloc[restaurant_idx]], 
                n_neighbors=n_recommendations+1
            )
            # Remove the restaurant itself (first result)
            recommended_indices = indices[0][1:]
            similarity_scores = 1 - distances[0][1:]  # Convert distances to similarities
        else:
            # Use precomputed similarity matrix
            similarity_scores = self.similarity_matrix[restaurant_idx]
            
            # Apply filters and boosts
            if include_similar_cuisine or include_similar_location:
                similarity_scores = self._apply_filters_and_boosts(
                    restaurant_idx, similarity_scores, 
                    include_similar_cuisine, include_similar_location
                )
            
            # Get top similar restaurants (excluding the restaurant itself)
            similar_indices = np.argsort(similarity_scores)[::-1]
            recommended_indices = [idx for idx in similar_indices if idx != restaurant_idx][:n_recommendations]
            similarity_scores = similarity_scores[recommended_indices]
        
        # Create recommendations DataFrame
        recommendations = self.restaurant_data.iloc[recommended_indices].copy()
        recommendations['similarity_score'] = similarity_scores[:len(recommended_indices)]
        
        # Select relevant columns
        columns_to_show = [
            'name', 'cuisines', 'location', 'rating', 'cost_for_two', 
            'rest_type', 'online_order', 'book_table', 'similarity_score'
        ]
        
        available_columns = [col for col in columns_to_show if col in recommendations.columns]
        recommendations = recommendations[available_columns]
        
        return recommendations
    
    def _apply_filters_and_boosts(self, restaurant_idx, similarity_scores, 
                                 include_similar_cuisine, include_similar_location):
        """Apply cuisine and location boosts to similarity scores"""
        reference_restaurant = self.restaurant_data.iloc[restaurant_idx]
        
        if include_similar_cuisine:
            # Boost restaurants with similar cuisines
            reference_cuisines = set(str(reference_restaurant['cuisines']).lower().split(', '))
            for idx in range(len(similarity_scores)):
                if idx != restaurant_idx:
                    restaurant_cuisines = set(str(self.restaurant_data.iloc[idx]['cuisines']).lower().split(', '))
                    cuisine_overlap = len(reference_cuisines.intersection(restaurant_cuisines))
                    if cuisine_overlap > 0:
                        similarity_scores[idx] *= (1 + 0.2 * cuisine_overlap)  # 20% boost per overlapping cuisine
        
        if include_similar_location:
            # Boost restaurants in the same location
            reference_location = reference_restaurant['location']
            for idx in range(len(similarity_scores)):
                if idx != restaurant_idx:
                    if self.restaurant_data.iloc[idx]['location'] == reference_location:
                        similarity_scores[idx] *= 1.3  # 30% boost for same location
        
        return similarity_scores
    
    def get_restaurant_details(self, restaurant_name):
        """Get detailed information about a restaurant"""
        try:
            restaurant_idx = self.get_restaurant_index(restaurant_name)
            return self.restaurant_data.iloc[restaurant_idx]
        except ValueError as e:
            return str(e)
    
    def find_similar_restaurants_by_criteria(self, cuisine=None, location=None, 
                                           price_range=None, rating_min=None, n_results=20):
        """Find restaurants based on specific criteria"""
        filtered_data = self.restaurant_data.copy()
        
        # Apply filters
        if cuisine:
            filtered_data = filtered_data[
                filtered_data['cuisines'].str.contains(cuisine, case=False, na=False)
            ]
        
        if location:
            filtered_data = filtered_data[
                filtered_data['location'].str.contains(location, case=False, na=False)
            ]
        
        if price_range:
            if price_range == 'budget':
                filtered_data = filtered_data[filtered_data['cost_for_two'] <= 300]
            elif price_range == 'mid':
                filtered_data = filtered_data[
                    (filtered_data['cost_for_two'] > 300) & (filtered_data['cost_for_two'] <= 600)
                ]
            elif price_range == 'expensive':
                filtered_data = filtered_data[filtered_data['cost_for_two'] > 600]
        
        if rating_min:
            filtered_data = filtered_data[filtered_data['rating'] >= rating_min]
        
        # Sort by rating and popularity
        filtered_data = filtered_data.sort_values(['rating', 'votes'], ascending=[False, False])
        
        return filtered_data.head(n_results)

In [5]:
class ContentBasedRecommender(RestaurantRecommender):
    """Recommender based on restaurant content features (cuisine, location, price, etc.)"""
    
    def __init__(self, restaurant_data, feature_matrix):
        super().__init__(restaurant_data, feature_matrix, similarity_metric='cosine')
        
    def get_cuisine_based_recommendations(self, restaurant_name, n_recommendations=10):
        """Get recommendations based primarily on cuisine similarity"""
        return self.get_recommendations(
            restaurant_name, n_recommendations, 
            include_similar_cuisine=True, include_similar_location=False
        )
    
    def get_location_based_recommendations(self, restaurant_name, n_recommendations=10):
        """Get recommendations based primarily on location similarity"""
        return self.get_recommendations(
            restaurant_name, n_recommendations,
            include_similar_cuisine=False, include_similar_location=True
        )


class TextBasedRecommender(RestaurantRecommender):
    """Recommender based on text features (reviews, dishes, descriptions)"""
    
    def __init__(self, restaurant_data, feature_matrix):
        super().__init__(restaurant_data, feature_matrix, similarity_metric='cosine')


class HybridRecommender:
    """Hybrid recommender combining multiple approaches"""
    
    def __init__(self, restaurant_data, content_features, text_features):
        self.restaurant_data = restaurant_data
        
        # Initialize individual recommenders
        self.content_recommender = ContentBasedRecommender(restaurant_data, content_features)
        self.text_recommender = TextBasedRecommender(restaurant_data, text_features)
        
        # Weights for different recommendation types
        self.weights = {
            'content': 0.6,
            'text': 0.4
        }
    
    def get_hybrid_recommendations(self, restaurant_name, n_recommendations=10):
        """Get recommendations using hybrid approach"""
        
        # Get recommendations from both approaches
        content_recs = self.content_recommender.get_recommendations(
            restaurant_name, n_recommendations * 2
        )
        text_recs = self.text_recommender.get_recommendations(
            restaurant_name, n_recommendations * 2
        )
        
        if isinstance(content_recs, str) or isinstance(text_recs, str):
            return "Restaurant not found!"
        
        # Combine and weight scores
        combined_scores = {}
        
        # Process content-based recommendations
        for idx, row in content_recs.iterrows():
            restaurant_name_rec = row['name']
            score = row['similarity_score'] * self.weights['content']
            combined_scores[restaurant_name_rec] = combined_scores.get(restaurant_name_rec, 0) + score
        
        # Process text-based recommendations
        for idx, row in text_recs.iterrows():
            restaurant_name_rec = row['name']
            score = row['similarity_score'] * self.weights['text']
            combined_scores[restaurant_name_rec] = combined_scores.get(restaurant_name_rec, 0) + score
        
        # Sort by combined score
        sorted_restaurants = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
        
        # Get top N recommendations
        top_restaurant_names = [name for name, score in sorted_restaurants[:n_recommendations]]
        
        # Create final recommendations DataFrame
        final_recommendations = []
        for name in top_restaurant_names:
            restaurant_info = self.restaurant_data[self.restaurant_data['name'] == name].iloc[0]
            final_recommendations.append({
                'name': restaurant_info['name'],
                'cuisines': restaurant_info['cuisines'],
                'location': restaurant_info['location'],
                'rating': restaurant_info['rating'],
                'cost_for_two': restaurant_info['cost_for_two'],
                'rest_type': restaurant_info['rest_type'],
                'online_order': restaurant_info['online_order'],
                'book_table': restaurant_info['book_table'],
                'hybrid_score': combined_scores[name]
            })
        
        return pd.DataFrame(final_recommendations)

In [8]:
content_features.isnull().sum()  # Check for NaN values in content features

rating_scaled                        10815
cost_scaled                          10815
online_order_binary                  10815
book_table_binary                    10815
cuisine_afghan                       10815
                                     ...  
rest_type_Quick Bites                10815
rest_type_Sweet Shop                 10815
rest_type_Sweet Shop, Quick Bites    10815
rest_type_Takeaway                   10815
rest_type_Takeaway, Delivery         10815
Length: 196, dtype: int64

In [6]:
# Initialize all recommenders
print("=== INITIALIZING RECOMMENDERS ===")

content_recommender = ContentBasedRecommender(df_clean, content_features)
text_recommender = TextBasedRecommender(df_clean, text_features)
hybrid_recommender = HybridRecommender(df_clean, content_features, text_features)

print("All recommenders initialized successfully!")

# Test with a sample restaurant
sample_restaurant = df_clean['name'].iloc[0]
print(f"\nTesting with restaurant: {sample_restaurant}")

# Test content-based recommendations
print("\n=== CONTENT-BASED RECOMMENDATIONS ===")
content_recs = content_recommender.get_recommendations(sample_restaurant, 5)
print(content_recs[['name', 'cuisines', 'location', 'rating', 'similarity_score']])

# Test text-based recommendations
print("\n=== TEXT-BASED RECOMMENDATIONS ===")
text_recs = text_recommender.get_recommendations(sample_restaurant, 5)
print(text_recs[['name', 'cuisines', 'location', 'rating', 'similarity_score']])

# Test hybrid recommendations
print("\n=== HYBRID RECOMMENDATIONS ===")
hybrid_recs = hybrid_recommender.get_hybrid_recommendations(sample_restaurant, 5)
print(hybrid_recs[['name', 'cuisines', 'location', 'rating', 'hybrid_score']])

=== INITIALIZING RECOMMENDERS ===
Computing cosine similarity matrix...


ValueError: Input contains NaN.