In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from prettytable import PrettyTable

# Data Cleaning

In [2]:
df=pd.read_csv('zomato_data.csv')

In [3]:
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/krishna-bhoja...,"2A/3, 15th Cross, Green Garden Layout, Shirdi ...",Krishna Bhojan,No,No,,0,+91 9606123181,Marathahalli,Quick Bites,,North Indian,300,"[('Rated 4.0', 'RATED\n Good food\n\n\nA litt...",[],Dine-out,Brookefield
1,https://www.zomato.com/bangalore/el-salver-btm...,"18, Shreenidhi Arcade, Maruthi Nagar Main Road...",El Salver,Yes,Yes,3.9 /5,74,+91 9916802277,BTM,Casual Dining,Fish Curry,"North Indian, Chinese, South Indian",900,"[('Rated 4.0', 'RATED\n This is a new restaur...","['El Salver Chicken Roast', 'Fish Finger', 'Ch...",Delivery,Koramangala 6th Block
2,https://www.zomato.com/bangalore/atithi-araman...,"56, Near Passport Office, Outer Ring Road, Bel...",Atithi Aramane Veg,Yes,No,3.9/5,48,+91 9036009717\r\n+91 9480356930,Bellandur,Quick Bites,,"South Indian, North Indian, Chinese",300,"[('Rated 3.0', 'RATED\n So finally I found so...","['Gobi Manchurian', 'Baby Corn Manchurian', 'P...",Dine-out,Bellandur
3,https://www.zomato.com/bangalore/mahesh-friend...,"14/6, 9th Main Road, Opposite Water Tank, 100 ...",Mahesh Friends Food Center,Yes,No,2.9/5,126,+91 9731081364,BTM,Quick Bites,Lassi,"Chinese, North Indian",300,"[('Rated 1.0', ""RATED\n Only if zero star was...",[],Dine-out,Bannerghatta Road
4,https://www.zomato.com/bangalore/mayur-hosur-r...,"321/3A, Sharif Complex, Hosur Main Road, Oppos...",Mayur,Yes,Yes,3.7/5,21,+91 9676181454,Hosur Road,Casual Dining,,"Andhra, North Indian, Chinese, Seafood",700,"[('Rated 5.0', 'RATED\n Hope this is the best...","['Chicken Fried Rice', 'Butter Naan', 'Chicken...",Delivery,Koramangala 4th Block


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25858 entries, 0 to 25857
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          25858 non-null  object
 1   address                      25858 non-null  object
 2   name                         25858 non-null  object
 3   online_order                 25858 non-null  object
 4   book_table                   25858 non-null  object
 5   rate                         21891 non-null  object
 6   votes                        25858 non-null  int64 
 7   phone                        25234 non-null  object
 8   location                     25849 non-null  object
 9   rest_type                    25747 non-null  object
 10  dish_liked                   11803 non-null  object
 11  cuisines                     25838 non-null  object
 12  approx_cost(for two people)  25691 non-null  object
 13  reviews_list                 25

we can drop the unnecessary columns

In [5]:
df.drop(columns=['url','votes','listed_in(city)','dish_liked','address','menu_item','reviews_list'], inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25858 entries, 0 to 25857
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   name                         25858 non-null  object
 1   online_order                 25858 non-null  object
 2   book_table                   25858 non-null  object
 3   rate                         21891 non-null  object
 4   phone                        25234 non-null  object
 5   location                     25849 non-null  object
 6   rest_type                    25747 non-null  object
 7   cuisines                     25838 non-null  object
 8   approx_cost(for two people)  25691 non-null  object
 9   listed_in(type)              25858 non-null  object
dtypes: object(10)
memory usage: 2.0+ MB


Lets rename the column names

In [7]:
df.rename(columns={'approx_cost(for two people)': 'cost'}, inplace=True)
df.rename(columns={'listed_in(type)': 'type'}, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25858 entries, 0 to 25857
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          25858 non-null  object
 1   online_order  25858 non-null  object
 2   book_table    25858 non-null  object
 3   rate          21891 non-null  object
 4   phone         25234 non-null  object
 5   location      25849 non-null  object
 6   rest_type     25747 non-null  object
 7   cuisines      25838 non-null  object
 8   cost          25691 non-null  object
 9   type          25858 non-null  object
dtypes: object(10)
memory usage: 2.0+ MB


We need to drop the rows which we cannot impute

In [9]:
df.dropna(subset=['cuisines'], inplace=True)
df.dropna(subset=['phone'], inplace=True)
df.dropna(subset=['rest_type'], inplace=True)
df.dropna(subset=['cost'], inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25116 entries, 0 to 25857
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          25116 non-null  object
 1   online_order  25116 non-null  object
 2   book_table    25116 non-null  object
 3   rate          21385 non-null  object
 4   phone         25116 non-null  object
 5   location      25116 non-null  object
 6   rest_type     25116 non-null  object
 7   cuisines      25116 non-null  object
 8   cost          25116 non-null  object
 9   type          25116 non-null  object
dtypes: object(10)
memory usage: 2.1+ MB


Now we need to handle the missing values in the "rate" column

In [11]:
df['rate'].unique()

array([nan, '3.9 /5', '3.9/5', '2.9/5', '3.7/5', '3.1 /5', '2.8 /5',
       '3.5/5', '3.4 /5', '3.5 /5', '3.6 /5', '3.4/5', '3.8/5', '4.5/5',
       '4.1 /5', '4.0/5', '4.5 /5', '4.2/5', '4.2 /5', '3.0/5', '3.2/5',
       '4.1/5', 'NEW', '4.0 /5', '3.6/5', '3.8 /5', '3.3 /5', '3.1/5',
       '3.3/5', '3.2 /5', '4.3/5', '4.4 /5', '4.3 /5', '3.7 /5', '2.9 /5',
       '4.4/5', '4.6 /5', '3.0 /5', '2.8/5', '2.6/5', '-', '4.7 /5',
       '4.7/5', '2.3/5', '4.6/5', '4.8/5', '2.5 /5', '2.7/5', '2.4 /5',
       '4.8 /5', '2.5/5', '2.6 /5', '4.9/5', '2.7 /5', '2.1 /5', '2.4/5',
       '1.8/5', '2.2 /5', '2.1/5', '4.9 /5', '2.2/5', '2.0/5', '2.3 /5',
       '2.0 /5', '1.8 /5'], dtype=object)

First, we need to remove '/5' first before replacing the missing values

In [12]:
df['rate'] = df['rate'].str.replace('/5', '')

In [13]:
df['rate'].unique()

array([nan, '3.9 ', '3.9', '2.9', '3.7', '3.1 ', '2.8 ', '3.5', '3.4 ',
       '3.5 ', '3.6 ', '3.4', '3.8', '4.5', '4.1 ', '4.0', '4.5 ', '4.2',
       '4.2 ', '3.0', '3.2', '4.1', 'NEW', '4.0 ', '3.6', '3.8 ', '3.3 ',
       '3.1', '3.3', '3.2 ', '4.3', '4.4 ', '4.3 ', '3.7 ', '2.9 ', '4.4',
       '4.6 ', '3.0 ', '2.8', '2.6', '-', '4.7 ', '4.7', '2.3', '4.6',
       '4.8', '2.5 ', '2.7', '2.4 ', '4.8 ', '2.5', '2.6 ', '4.9', '2.7 ',
       '2.1 ', '2.4', '1.8', '2.2 ', '2.1', '4.9 ', '2.2', '2.0', '2.3 ',
       '2.0 ', '1.8 '], dtype=object)

In order to identify the new restaurent we can replace tyhe new value with 0.1

In [14]:
df[df['rate']=='NEW']

Unnamed: 0,name,online_order,book_table,rate,phone,location,rest_type,cuisines,cost,type
37,Agraharaa,No,No,NEW,+91 9945006529,Koramangala 6th Block,Quick Bites,South Indian,150,Dine-out
151,Bhartiya Jalpan,No,No,NEW,+91 8041649637,Commercial Street,Casual Dining,"North Indian, Street Food, Chinese",800,Delivery
166,Rajsi Rasoi,Yes,No,NEW,+91 9916904421,South Bangalore,"Takeaway, Delivery","North Indian, Chinese",600,Dine-out
173,Dunkin' Donuts,Yes,No,NEW,+91 9739222288,Sarjapur Road,"Quick Bites, Dessert Parlor","Desserts, Cafe, Beverages, Burger, Fast Food",300,Delivery
185,Sree Banashankari Donne Biriyani,Yes,No,NEW,+91 8861188512,Rajajinagar,Quick Bites,"North Indian, Kebab",250,Delivery
...,...,...,...,...,...,...,...,...,...,...
25808,Delicious Mom's Kitchen,No,No,NEW,+91 6202500859\r\n+91 9708037348,BTM,Quick Bites,"Bengali, North Indian",200,Delivery
25811,Lassi House,No,No,NEW,+91 8722266000,Commercial Street,Beverage Shop,"Beverages, Desserts",150,Delivery
25823,Authentic Aandhra,Yes,No,NEW,+91 9986865444\r\n+91 9886865444,BTM,Delivery,"Andhra, North Indian",300,Delivery
25846,My Tea House,Yes,Yes,NEW,080 49652475,Banashankari,Casual Dining,"Continental, Asian",800,Delivery


In [15]:
df['rate'] = df['rate'].replace('NEW', '0.1')

In [16]:
df[df['rate']=='0.1']

Unnamed: 0,name,online_order,book_table,rate,phone,location,rest_type,cuisines,cost,type
37,Agraharaa,No,No,0.1,+91 9945006529,Koramangala 6th Block,Quick Bites,South Indian,150,Dine-out
151,Bhartiya Jalpan,No,No,0.1,+91 8041649637,Commercial Street,Casual Dining,"North Indian, Street Food, Chinese",800,Delivery
166,Rajsi Rasoi,Yes,No,0.1,+91 9916904421,South Bangalore,"Takeaway, Delivery","North Indian, Chinese",600,Dine-out
173,Dunkin' Donuts,Yes,No,0.1,+91 9739222288,Sarjapur Road,"Quick Bites, Dessert Parlor","Desserts, Cafe, Beverages, Burger, Fast Food",300,Delivery
185,Sree Banashankari Donne Biriyani,Yes,No,0.1,+91 8861188512,Rajajinagar,Quick Bites,"North Indian, Kebab",250,Delivery
...,...,...,...,...,...,...,...,...,...,...
25808,Delicious Mom's Kitchen,No,No,0.1,+91 6202500859\r\n+91 9708037348,BTM,Quick Bites,"Bengali, North Indian",200,Delivery
25811,Lassi House,No,No,0.1,+91 8722266000,Commercial Street,Beverage Shop,"Beverages, Desserts",150,Delivery
25823,Authentic Aandhra,Yes,No,0.1,+91 9986865444\r\n+91 9886865444,BTM,Delivery,"Andhra, North Indian",300,Delivery
25846,My Tea House,Yes,Yes,0.1,080 49652475,Banashankari,Casual Dining,"Continental, Asian",800,Delivery


Now we need to change the datatype to impute

In [17]:
df['rate'] = pd.to_numeric(df['rate'], errors='coerce')
mean_rating = df['rate'].mean()

In [18]:
df['rate'].fillna(mean_rating, inplace=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25116 entries, 0 to 25857
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          25116 non-null  object 
 1   online_order  25116 non-null  object 
 2   book_table    25116 non-null  object 
 3   rate          25116 non-null  float64
 4   phone         25116 non-null  object 
 5   location      25116 non-null  object 
 6   rest_type     25116 non-null  object 
 7   cuisines      25116 non-null  object 
 8   cost          25116 non-null  object 
 9   type          25116 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.1+ MB


now we have filled the missing value, now we can change the dtype of cost

In [20]:
df['cost'] = df['cost'].str.replace(',', '').astype(float)

In [21]:
df['rest_type'].unique()

array(['Quick Bites', 'Casual Dining', 'Beverage Shop', 'Delivery', 'Pub',
       'Takeaway, Delivery', 'Dessert Parlor', 'Bar', 'Bar, Quick Bites',
       'Microbrewery, Casual Dining', 'Cafe', 'Sweet Shop', 'Fine Dining',
       'Casual Dining, Cafe', 'Beverage Shop, Dessert Parlor',
       'Food Court, Quick Bites', 'Lounge', 'Cafe, Casual Dining',
       'Casual Dining, Pub', 'Takeaway', 'Dessert Parlor, Quick Bites',
       'Bar, Casual Dining', 'Casual Dining, Bar', 'Mess',
       'Casual Dining, Microbrewery', 'Bakery, Quick Bites',
       'Lounge, Casual Dining', 'Club', 'Food Truck',
       'Quick Bites, Dessert Parlor', 'Cafe, Bakery', 'Food Court',
       'Bakery', 'Beverage Shop, Quick Bites', 'Kiosk',
       'Cafe, Quick Bites', 'Quick Bites, Cafe', 'Dhaba', 'Pub, Bar',
       'Pub, Cafe', 'Pub, Casual Dining', 'Dessert Parlor, Cafe',
       'Bakery, Dessert Parlor', 'Quick Bites, Beverage Shop',
       'Cafe, Dessert Parlor', 'Quick Bites, Sweet Shop', 'Cafe, Bar',
      

In [22]:
type_={
    'Casual Dining':'Dining',
    'Cafe, Casual Dining':'Dining',
    'Casual Dining, Cafe':'Dining',
    'Fine Dining':'Dining', 
    'Pub, Casual Dining':'Pub',     
    'Casual Dining, Bar':'Bar',      
    'Microbrewery, Casual Dining':'Dining', 
    'Bar, Casual Dining':'Bar',      
    'Casual Dining, Pub':'Pub',       
    'Food Court, Casual Dining':'Dining', 
    'Casual Dining, Microbrewery':'Dining',      
    'Lounge, Casual Dining':'Dining',       
    'Casual Dining, Irani Cafee':'Dining', 
    'Fine Dining, Lounge':'Dining',       
    'Fine Dining, Microbrewery':'Dining',
    'Fine Dining, Bar':'Bar',  
    'Casual Dining, Quick Bites':'Dining',
    'Casual Dining, Lounge':'Dining',  
    'Club, Casual Dining':'Club',
    'Quick Bites':'Quick Bites',
    'Quick Bites, Cafe':'Quick Bites',
    'Cafe, Quick Bites':'Quick Bites',
    'Beverage Shop, Quick Bites':'Quick Bites',      
    'Quick Bites, Beverage Shop':'Quick Bites', 
    'Bakery, Quick Bites':'Quick Bites', 
    'Sweet Shop, Quick Bites':'Quick Bites',
    'Quick Bites, Dessert Parlor':'Quick Bites',       
    'Quick Bites, Bakery':'Quick Bites', 
    'Dessert Parlor, Quick Bites':'Quick Bites',        
    'Quick Bites, Sweet Shop':'Quick Bites',       
    'Microbrewery':'Microbrewery', 
    'Food Court, Quick Bites':'Quick Bites',       
    'Quick Bites, Food Court':'Quick Bites',       
    'Bar, Quick Bites':'Bar', 
    'Mess, Quick Bites':'Quick Bites',       
    'Quick Bites, Mess':'Quick Bites', 
    'Quick Bites, Meat Shop':'Quick Bites', 
    'Quick Bites, Kiosk':'Quick Bites',
    'Cafe':'Cafe',
    'Cafe, Bakery':'Cafe', 
    'Cafe, Food Court':'Cafe', 
    'Beverage Shop, Cafe':'Cafe',
    'Cafe, Dessert Parlor':'Cafe', 
    'Dessert Parlor, Cafe':'Cafe',
    'Bakery, Cafe':'Cafe', 
    'Pub, Cafe':'Cafe',
    'Cafe, Bar':'Cafe',
    'Cafe, Lounge':'Cafe',
    'Lounge, Cafe':'Cafe',
    'Dessert Parlor':'Dessert Parlor',
    'Bakery, Dessert Parlor':'Dessert Parlor', 
    'Dessert Parlor, Sweet Shop':'Dessert Parlor',            
    'Beverage Shop, Dessert Parlor':'Dessert Parlor', 
    'Dessert Parlor, Beverage Shop':'Dessert Parlor',        
    'Food Court, Dessert Parlor':'Dessert Parlor',    
    'Sweet Shop, Dessert Parlor':'Dessert Parlor', 
    'Dessert Parlor, Bakery':'Dessert Parlor',
    'Dessert Parlor, Kiosk':'Dessert Parlor',
    'Dessert Parlor, Food Court':'Dessert Parlor',
    'Mess':'Mess',
    'Pub':'Pub', 
    'Pub, Bar':'Pub',  
    'Microbrewery, Pub':'Pub',  
    'Pub, Microbrewery':'Pub',
    'Bar, Pub':'Pub', 
    'Bar, Lounge':'Bar',
    'Bar':'Bar', 
    'Lounge, Bar':'Bar',
    'Microbrewery, Bar':'Bar', 
    'Bar, Lounge':'Bar',
    'Bakery':'Bakery',
    'Bakery, Beverage Shop':'Bakery',  
    'Bakery, Kiosk':'Bakery', 
    'Bakery, Sweet Shop':'Bakery', 
    'Bakery, Food Court':'Bakery',
    'Beverage Shop':'Beverage Shop',
    'Confectionery':'Confectionery', 
    'Kiosk':'Kiosk', 
    'Food Truck':'Food Truck', 
    'Takeaway':'Takeaway', 
    'Lounge':'Lounge', 
    'Food Court':'Food Court', 
    'Dhaba':'Dhaba', 
    'Microbrewery, Lounge':'Microbrewery', 
    'Bhojanalya':'Bhojanalya', 
    'Pop Up':'Pop Up', 
    'Lounge, Microbrewery':'Microbrewery', 
    'Food Court, Beverage Shop':'Food Court'

}

In [23]:
df.replace(type_, inplace=True)

In [24]:
df['rest_type'].unique()

array(['Quick Bites', 'Dining', 'Beverage Shop', 'Delivery', 'Pub',
       'Takeaway, Delivery', 'Dessert Parlor', 'Bar', 'Cafe',
       'Sweet Shop', 'Lounge', 'Takeaway', 'Mess', 'Club', 'Food Truck',
       'Food Court', 'Bakery', 'Kiosk', 'Dhaba', 'Microbrewery',
       'Confectionery', 'Bhojanalya', 'Pop Up'], dtype=object)

Here we have 24 types of restuarents.

# Preprocessing

In [25]:
# Combine location, restaurant type, and cost into a single text feature
df["combined_features"] = df["location"] + ' ' + df["rest_type"] + ' ' + df["cost"].astype(str)

# Assuming df["combined_features"] contains the combined textual information
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["combined_features"])

# Normalize the "cost" column to be between 0 and 1
df["cost_normalized"] = (df["cost"] - df["cost"].min()) / (df["cost"].max() - df["cost"].min())

# Concatenate the matrices horizontally
combined_tfidf_matrix = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), df["cost_normalized"]], axis=1)

# Fill NaN values with 0
combined_tfidf_matrix.fillna(0, inplace=True)

# Calculate cosine similarities
cosine_similarities = cosine_similarity(combined_tfidf_matrix)


# Model Training

Function to recommend hotels

In [26]:
def recommend_hotels(hotel_location, hotel_rest_type, hotel_cost, top_n=5):
    # Combine selected features into a single text feature
    selected_features = hotel_location + ' ' + hotel_rest_type + ' ' + str(hotel_cost)

    # Transform the selected features using the same vectorizer
    selected_features_matrix = vectorizer.transform([selected_features])

    # Calculate cosine similarities with the selected features
    cosine_similarities_selected = cosine_similarity(selected_features_matrix, tfidf_matrix)

    # Get the indices of the most similar entries
    similar_indices = cosine_similarities_selected.argsort()[0][-top_n:][::-1]

    # Filter hotels based on user's cost input
    similar_indices = [i for i in similar_indices if df.iloc[i]['cost_normalized'] <= hotel_cost]

    # Check if there are any recommendations
    if similar_indices:
        # Return the recommended hotels
        recommended_hotels = df.iloc[similar_indices]
        # Print the recommended hotels
        print("\nHere are the top 5 recommended hotels:")
        for index, row in recommended_hotels.iterrows():
            rating_info = f" - Rating: {row['rate']:.2f}" if row['rate'] != 0.1 else " - New Restaurent"
            phone_info = f" - Phone: {row['phone']}" if not pd.isnull(row['phone']) else ""
            print(f"\n{row['name']}")
            print(f"  - Location: {row['location']}")
            print(f"  - Restaurant Type: {row['rest_type']}")
            print(f"  - Cost: ₹{row['cost']:.2f}{rating_info}{phone_info}")
    else:
        print("No matching hotels found within the specified cost range.")

In [28]:
# Get user input for the selected hotel factors
selected_hotel_location = input("Enter the location of the selected hotel: ").capitalize()
selected_hotel_rest_type = input("Enter the restaurant type of the selected hotel: ").capitalize()
selected_hotel_cost = float(input("Enter the cost of the selected hotel: "))

# Get recommended hotels based on user input
recommend_hotels(selected_hotel_location, selected_hotel_rest_type, selected_hotel_cost)


Enter the location of the selected hotel: BTM
Enter the restaurant type of the selected hotel: CAFE
Enter the cost of the selected hotel: 500

Here are the top 5 recommended hotels:

De Street Cafe
  - Location: BTM
  - Restaurant Type: Cafe
  - Cost: ₹500.00 - Rating: 3.60 - Phone: +91 8048534323

De Street Cafe
  - Location: BTM
  - Restaurant Type: Cafe
  - Cost: ₹500.00 - Rating: 3.60 - Phone: +91 8048534323

Cafe Hush
  - Location: BTM
  - Restaurant Type: Cafe
  - Cost: ₹500.00 - Rating: 3.60 - Phone: +91 9113838818

Ruh's Cafe
  - Location: BTM
  - Restaurant Type: Cafe
  - Cost: ₹500.00 - Rating: 4.10 - Phone: +91 9986271611
+91 9844803695

The 90s Hash House
  - Location: BTM
  - Restaurant Type: Cafe
  - Cost: ₹500.00 - Rating: 3.80 - Phone: 080 75961114
+91 9900873434
