In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [22]:


# Attempt to read the CSV file
try:
    df = pd.read_csv(
        '/content/zomato.csv',
        on_bad_lines='skip',  # Skips bad lines with inconsistent columns
        encoding='utf-8',     # Specify encoding (adjust if necessary)
        delimiter=',',        # Specify delimiter (adjust if your file uses something else)
        engine='python'       # Use Python engine for more flexibility
    )
    print("File loaded successfully!")
    print(df.head())  # Display the first few rows to verify
except pd.errors.ParserError as e:
    print("ParserError encountered:", e)
except Exception as e:
    print("An error occurred:", e)

File loaded successfully!
                                                 url  \
0  https://www.zomato.com/bangalore/jalsa-banasha...   
1  https://www.zomato.com/bangalore/spice-elephan...   
2  https://www.zomato.com/SanchurroBangalore?cont...   
3  https://www.zomato.com/bangalore/addhuri-udupi...   
4  https://www.zomato.com/bangalore/grand-village...   

                                             address                   name  \
0  942, 21st Main Road, 2nd Stage, Banashankari, ...                  Jalsa   
1  2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...         Spice Elephant   
2  1112, Next to KIMS Medical College, 17th Cross...        San Churro Cafe   
3  1st Floor, Annakuteera, 3rd Stage, Banashankar...  Addhuri Udupi Bhojana   
4  10, 3rd Floor, Lakshmi Associates, Gandhi Baza...          Grand Village   

  online_order book_table   rate  votes                             phone  \
0          Yes        Yes  4.1/5    775    080 42297555\r\n+91 9743772233   
1       

In [23]:
df1=df.head(20000)

In [24]:
df1.shape

(1408, 17)

In [25]:
df1 = df1.rename(columns={'listed_in(type)': 'type'})
df1 = df1.rename(columns={'listed_in(city)': 'city'})

In [26]:
df1 = df1.rename(columns={'approx_cost(for two people)': 'cost'})
df1.dropna(how='any', inplace=True)
df1['cost']=df1['cost'].astype(str)
df1['cost']=df1['cost'].apply(lambda x: x.replace(',',''))
df1['cost']=df1['cost'].astype(float)

In [27]:
df1.shape

(611, 17)

In [28]:
df1=df1[df1.rate!='NEW']
df1=df1[df1.rate!='-'].reset_index(drop=True)
remove_slash=lambda x: x.replace('/5','')if type(x)==str else x
df1.rate=df1.rate.apply(remove_slash).str.strip().astype('float')

In [29]:
df1.name = df1.name.apply(lambda x:x.title())
df1.online_order.replace(('Yes','No'),(True, False),inplace=True)
df1.book_table.replace(('Yes','No'),(True, False),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1.online_order.replace(('Yes','No'),(True, False),inplace=True)
  df1.online_order.replace(('Yes','No'),(True, False),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1.book_table.replace(('Yes','No'),(True, False),inplace=True)
  df1.book_table.replace(('Yes','N

In [30]:
rest_co = df1['name'].nunique()
rest_co

435

In [31]:
df1['Mean Rating'] = 0

In [32]:
resturants = list(df1['name'].unique())

In [33]:
!pip install textblob
from textblob import TextBlob
import re
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
from nltk.tokenize import word_tokenize

In [35]:
df1['reviews_list'] = df1['reviews_list'].str.lower()

In [36]:
STOPWORD = set(stopwords.words('english'))
def remove_stopwords(text):
    return ' '.join([word for word in str(text).split() if word not in STOPWORD])

In [37]:
df1['reviews_list'] = df1['reviews_list'].apply(lambda text: remove_stopwords(text))

In [38]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [39]:
df1['reviews_list'] = df1['reviews_list'].apply(lambda text: remove_urls(text))

In [40]:
import re

In [41]:
df1[['reviews_list', 'cuisines']].sample(10)

Unnamed: 0,reviews_list,cuisines
297,"[('rated 4.0', 'rated\n well, maggi lover plac...","Fast Food, Beverages"
141,"[('rated 3.0', ""rated\n place visited wait fri...","Cafe, American, Pizza"
208,"[('rated 1.0', 'rated\n think mistake visiting...","North Indian, Street Food"
92,"[('rated 4.0', 'rated\n sunday noon friend dam...",South Indian
511,"[('rated 4.0', ""rated\n place serves great var...","Street Food, South Indian"
233,"[('rated 3.0', ""rated\n always fan corner hous...","Ice Cream, Desserts"
54,"[('rated 3.0', 'rated\n ididnt like much.\n\nn...",North Indian
571,"[('rated 3.0', ""rated\n great place quiet time...","Burger, Cafe, Desserts, Sandwich, Italian"
264,"[('rated 4.0', 'rated\n excellent food spicy c...","North Indian, Chinese"
285,"[('rated 3.0', 'rated\n place near banashankar...","Chinese, Thai"


In [42]:
def remove_rated(text):
  text = re.sub(r'rated\s*\d+\.?\d*', '', str(text))
  return text

In [43]:
df1['reviews_list'] = df1['reviews_list'].apply(remove_rated)

In [44]:
df1[['reviews_list', 'cuisines']].sample(10)

Unnamed: 0,reviews_list,cuisines
431,"[('', 'rated\n thing good place packaging.\n\n...","Healthy Food, Chinese, Biryani, North Indian, ..."
374,"[('', 'rated\n buffet lunch @ empire restauran...","North Indian, Mughlai, South Indian, Chinese"
286,"[('', 'rated\n selected pizzas gud.rest everyt...","Pizza, Italian, Salad"
475,"[('', 'rated\n ordered bombay masala quesidill...","Mexican, Italian"
500,"[('', 'rated\n ordered egg masala omlette comb...",Cafe
331,"[('', 'rated\n one oldest restaurant basavanag...","South Indian, Chinese, North Indian, Street Fo..."
39,"[('', ""rated\n dear biryanis more...\ni cannot...","Biryani, North Indian, Chinese, Andhra, South ..."
334,"[('', 'rated\n no-frills place nr colony serve...",South Indian
490,"[('', ""rated\n kanti sweets extremely close ho...","Mithai, Street Food"
115,"[('', 'rated\n words paneer...one places u get...","North Indian, Cafe, Chinese, Fast Food"


In [45]:
def remove_ratedn(text):
  text = re.sub(r'ratedn', '', str(text))
  return text

In [46]:
df1[['reviews_list', 'cuisines']].sample(10)

Unnamed: 0,reviews_list,cuisines
399,"[('', 'rated\n lovely place lovely ambience lo...","Cafe, Burger, Beverages"
488,"[('', 'rated\n good punjabi food find branch b...",North Indian
562,"[('', 'rated\n good stumptous parantha, tasty ...",North Indian
343,"[('', 'rated\n good dosa place.love kali dosa ...",South Indian
93,"[('', ""rated\n tiny cafã\x83ã\x83ã\x82ã\x8...","Cafe, French, North Indian"
98,"[('', 'rated\n minute away place . glad veg pl...","Pizza, Sandwich, Burger, Fast Food"
607,"[('', ""rated\n got zomato gold 1+1 food here.\...","North Indian, Continental"
215,"[('', 'rated\n food - 1/5\n\nambience - 4/5\n\...","Pizza, Italian, Continental, Desserts"
100,"[('', 'rated\n great place team lunch. well pl...","North Indian, Chinese, BBQ"
75,"[('', 'rated\n quite small cute place visit. a...","Beverages, Ice Cream"


In [47]:
resturant_name = list(df1['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [48]:
df1=df1.drop(['address','rest_type','type','menu_item','votes'],axis=1)

In [49]:
df_percent=df1.sample(frac=0.5)

In [50]:
df_percent.shape

(305, 13)

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [53]:
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import linear_kernel # Import linear_kernel

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [55]:
cosine_similarities

array([[1.        , 0.0411331 , 0.03251528, ..., 0.01700662, 0.01509178,
        0.03898666],
       [0.0411331 , 1.        , 0.00661856, ..., 0.00248461, 0.01812532,
        0.01574315],
       [0.03251528, 0.00661856, 1.        , ..., 0.00443755, 0.00789837,
        0.0309591 ],
       ...,
       [0.01700662, 0.00248461, 0.00443755, ..., 1.        , 0.00326299,
        0.01143161],
       [0.01509178, 0.01812532, 0.00789837, ..., 0.00326299, 1.        ,
        0.01791213],
       [0.03898666, 0.01574315, 0.0309591 , ..., 0.01143161, 0.01791213,
        1.        ]])

In [56]:
def recommend(name, cosine_similarities=cosine_similarities):
    recommend_restaurant = []

    idx = indices[indices == name].index[0]

    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    top30_indexes = list(score_series.iloc[0:31].index)

    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])

    # Instead of appending, create a list to store DataFrames
    df_list = []

    for each in recommend_restaurant:
        # Append the DataFrame to the list
        df_list.append(pd.DataFrame(df_percent[['cuisines', 'Mean Rating', 'cost']][df_percent.index == each].sample()))

    # Concatenate all DataFrames in the list into a single DataFrame
    df_new = pd.concat(df_list, ignore_index=True)

    df_new = df_new.drop_duplicates(subset=['cuisines', 'Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)

    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))

    return df_new

In [58]:
recommend('Jalsa')

TOP 10 RESTAURANTS LIKE Jalsa WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
4,"North Indian, South Indian, Chinese",0,800.0
19,"North Indian, Chinese, Continental",0,800.0
29,Finger Food,0,1300.0
28,"North Indian, Continental",0,1000.0
27,North Indian,0,800.0
26,"Cafe, Asian, Burger, Continental, Italian, Salad",0,600.0
25,South Indian,0,300.0
24,"North Indian, Continental",0,1100.0
23,"North Indian, Chinese, Biryani",0,500.0
22,"North Indian, Chinese, BBQ",0,1500.0
