**In this analysis I have tried to:**
* Clean the data as much as possible
* Find insights and visualise the data
* Explore the data and find hidden information
* Extract ratings from reviews
* Identify the food trend per location
* Extract frequently used dish names from reviews




In [None]:
#import the necessary libraries
import ast
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the Zomato Bangalore restaurant listings into a DataFrame.
data = pd.read_csv(
    '../input/zomato-bangalore-restaurants/zomato.csv'
)

In [None]:
# Give the unwieldy source columns short, code-friendly names.
column_aliases = {
    'approx_cost(for two people)': 'cost',
    'listed_in(type)': 'food_type',
    'reviews_list': 'review',
    'listed_in(city)': 'city',
}
data1 = data.rename(columns=column_aliases)


In [None]:
# Share of missing values per column, as a percentage rounded to two decimals.
print("Percentage null or na values in df")
missing_mask = data1.isna()
(missing_mask.sum() * 100 / len(data1)).round(2)

In [None]:
# Print a summary of the frame: each column's dtype and non-null count
data1.info()

In [None]:
# Cast the rating column to plain strings.
# Note: a missing rating becomes the literal text 'nan' after this cast.
data1['rate'] = data1['rate'].astype(str)

In [None]:
# List the distinct values of the rate column to spot non-numeric entries
data1['rate'].unique()

* This column contains the values 'NEW' and '-', which should be replaced by np.nan.


In [None]:
# Replace the placeholder ratings 'NEW' and '-' with NaN.
# The result is assigned back to the column instead of calling an in-place
# replace on the attribute-accessed Series: an in-place change on an extracted
# column is a chained assignment and, under pandas copy-on-write, no longer
# propagates to the parent DataFrame.
data1['rate'] = data1['rate'].replace(['NEW', '-'], np.nan)

As you can see, the rate column is a string with an extra '/5' appended to every rating. This should be cleaned. It is important to convert the string back to float!

In [None]:
# Cast ratings to text, then remove the '/5' suffix and surrounding whitespace.
rate_text = data1['rate'].astype(str)
data1['rate'] = rate_text.str.replace('/5', '', regex=False).str.strip()

In [None]:
# Keep only the rows that have a value in every column.
data1 = data1.dropna(how='any')

In [None]:
# Remove rows whose rating is missing. The earlier string casts turn a missing
# rating into the literal text 'nan', which dropna() does not treat as missing,
# so the rate column has to be filtered explicitly. The original cell only
# compared `votes` with 'nan'; that condition is kept for compatibility, but it
# does not touch the 'nan' ratings (NOTE(review): votes looks numeric, in which
# case that comparison never removes anything - confirm).
# .copy() yields an independent frame, so later column assignments do not act
# on a slice of the previous one.
has_rating = data1['rate'] != 'nan'
has_votes = data1['votes'] != 'nan'
data1 = data1.loc[has_rating & has_votes].copy()

In [None]:
# Remove the columns that play no part in the analysis below.
column_to_drop = ['address', 'url', 'phone']
data1 = data1.drop(columns=column_to_drop)

In [None]:
# Count the rows that are exact duplicates of an earlier row
data1.duplicated().sum() 

In [None]:
# Discard repeated rows, keeping the first occurrence of each.
data1 = data1.drop_duplicates()

In [None]:
# The '/5' suffix has been stripped, so the ratings can be stored as floats.
data1['rate'] = data1['rate'].astype(float)

In [None]:
# (rows, columns) of the frame at this point
data1.shape

In [None]:
# Ten restaurant names with the highest mean rating.
test1 = data1.groupby('name', as_index=False)['rate'].mean()
test1.sort_values(by='rate', ascending=False).head(10)


In [None]:
# Distinct restaurant types present in the data
data1['rest_type'].unique()

# Most preferred restaurant type

In [None]:
# Horizontal bar chart of the 20 most common restaurant types.
plt.rcParams['figure.figsize'] = 6,8
rest = data1['rest_type'].value_counts()[:20]
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.barplot(x=rest.values, y=rest.index)
plt.title('preferred rest_type', fontdict={'fontweight':'bold', 'fontsize': 18})
plt.xlabel("count")
plt.ylabel("rest_type")
plt.show()

*  Casual dining & Quick Bites type restaurants dominate.

In [None]:
# Make sure the rating column is float before it is binned below.
data1['rate'] = data1['rate'].astype(float)


we are going to set a range for rate column which will be as follows
* not recommended : 0-2
* average : 2-3
* good : 3-4
* highly recommended : 4-5

In [None]:
# Bucket each rating into one of four labelled ranges.
bins = [0, 2, 3, 4, 5]
labels = ['not recommended', 'average', 'good', 'highly recommended']
data1['rate_range'] = pd.cut(x=data1['rate'], bins=bins, labels=labels)
# Preview the rating next to its bucket for the index labels up to 5.
data1.loc[:5, ['rate', 'rate_range']]


In [None]:
# Stacked bars: number of listings per food type, split by rating bucket.
ct = pd.crosstab(index=data1['food_type'], columns=data1['rate_range'])
ct.plot(kind='bar', stacked=True)
plt.legend(title='rate')
title_font = {'fontweight': 'bold', 'fontsize': 18}
plt.title('preferring food type', fontdict=title_font)
plt.show()

**Table booking vs cost**

In [None]:
# Share of listings with and without table booking at each cost level.
plt.rcParams['figure.figsize'] = (15, 9)
# `cost` can still be text such as '1,200' when this cell runs (it is only
# converted to integers further down). A numeric copy is used here so the
# x axis is ordered by value instead of alphabetically.
cost_numeric = pd.to_numeric(
    data1['cost'].astype(str).str.replace(',', '', regex=False).str.strip(),
    errors='coerce',
)
x = pd.crosstab(cost_numeric, data1['book_table'])
# Normalise every row so each stacked bar shows proportions summing to 1.
x.div(x.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True, color=['red', 'yellow'])
plt.title('Table booking vs cost', fontweight = 30, fontsize = 20)
plt.legend(loc="upper right")
plt.show()

# Let's figure out which location has the most food lovers

In [None]:
# Bar chart of the 20 locations with the most listings.
plt.rcParams['figure.figsize'] = 15,8
rest = data1['location'].value_counts()[:20]
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.barplot(x=rest.index, y=rest.values)
plt.title('food lovers belongs to', fontdict={'fontweight':'bold', 'fontsize': 18})
plt.xlabel('location')
plt.ylabel('count')
plt.xticks(rotation='vertical')
plt.show()

# Ordering online vs booking a table

In [None]:
# Two stacked panels: listings per location split by online ordering (top)
# and by table booking (bottom).
plt.rcParams['figure.figsize'] = 15,8
title_font = {'fontweight':'bold', 'fontsize': 18}
panels = [
    ('online_order', 'preferring online order'),
    ('book_table', 'preferring book table '),
]
# Both panels are created on one figure up front. Calling plt.show() between
# two plt.subplot() calls finishes the first figure, leaving each chart alone
# on a half-empty canvas.
fig, axes = plt.subplots(2, 1)
for ax, (column, title) in zip(axes, panels):
    # `x=` must be a keyword: on seaborn >= 0.12 a positional 'location'
    # is taken as `data` and collides with the data= argument.
    sns.countplot(x='location', hue=column, data=data1, ax=ax)
    ax.set_title(title, fontdict=title_font)
    ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

* No doubt, people prefer ordering online.

In [None]:
#expensive food_type

# `cost` can still be text such as '1,200' when this cell runs, and sorting
# text is alphabetical ('950' would rank above '6,000'). The sort and the
# colour gradient therefore use a numeric version of the column.
cost_numeric = pd.to_numeric(
    data1['cost'].astype(str).str.replace(',', '', regex=False).str.strip(),
    errors='coerce',
)
temp = (
    data1[['food_type']]
    .assign(cost=cost_numeric)
    .sort_values('cost', ascending=False)
    .reset_index(drop=True)
    .head(20)
)
temp.style.background_gradient(cmap='Blues')

# Affordable costs for ordering online

In [None]:

# Number of listings at each rating value, split by online-order availability.
plt.rcParams['figure.figsize'] = (15, 8)
sns.countplot(data=data1, y='rate', hue='online_order')
plt.title('rate vs online', fontdict=dict(fontweight='bold', fontsize=18))
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Inspect the distinct cost values to see how they are formatted
data1['cost'].unique()

Remove the commas so that the cost values can be converted to integers.

In [None]:
# Treat every cost as text so the thousands separator can be removed next.
data1['cost'] = data1['cost'].astype(str)

In [None]:
# Remove the commas and any surrounding whitespace from the cost text.
data1['cost'] = data1['cost'].str.replace(',', '', regex=False).str.strip()

In [None]:
# With the commas gone, the cost text can be stored as integers.
data1['cost'] = data1['cost'].astype(int)

# Preferred price rate

In [None]:
# Histogram of the cost column across all listings.
plt.rcParams['figure.figsize'] = 8,15
# NOTE(review): distplot is deprecated in recent seaborn releases. It is kept
# so the cell also runs on older seaborn; consider sns.histplot once the
# minimum seaborn version is known.
sns.distplot(data1['cost'],color='#abcdef',kde=False)
# Title typo fixed ('peferred' -> 'preferred').
plt.title('preferred price rate', fontdict={'fontweight':'bold', 'fontsize': 18})
plt.xlabel('cost')
plt.ylabel('count')
plt.xticks(rotation='vertical')
plt.show()

* preferred cost per location

In [None]:
# Cost at each location, coloured by online-order availability.
plt.rcParams['figure.figsize'] = (18, 15)
sns.scatterplot(data=data1, x='cost', y='location', hue='online_order')
plt.title('affordable cost per location', fontdict=dict(fontweight='bold', fontsize=18))
plt.xticks(rotation='vertical')
plt.grid()
plt.show()

# How many restaurants have an online order service, and how many have a table booking service?

In [None]:
from palettable.colorbrewer.qualitative import Pastel1_7

# Side-by-side donut charts showing how the listings split on online ordering
# (left) and on table booking (right).
plt.rcParams['figure.figsize'] = 9,6
donut_specs = [
    ('online_order', 'Online Order Service'),
    ('book_table', 'Book Table Service'),
]
for position, (column, title) in enumerate(donut_specs, start=1):
    ax = plt.subplot(1, 2, position)
    # `kind` must be a keyword: pandas >= 1.0 raises a TypeError when
    # Series.plot() receives positional arguments.
    data1[column].value_counts().plot(kind='pie', colors=Pastel1_7.hex_colors, ax=ax)
    # A white circle over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
    ax.set_title(title, weight='bold')
plt.tight_layout()
plt.show()


# Top dish

Many of the values in the dish_liked column are comma-separated lists of dishes; splitting them and counting each dish separately reveals much more information.

In [None]:
# BEST DISH
from collections import Counter

# Split every comma-separated dish_liked entry into individual dish names.
# Names are stripped because the entries are separated by ', ', which would
# otherwise count ' Pasta' and 'Pasta' as two different dishes.
lst = [
    dish.strip()
    for line in data1['dish_liked']
    for dish in line.split(',')
]

# The tally gets its own name; rebinding `Counter` would shadow the class and
# break any later cell that calls Counter(...) without re-importing it.
dish_counter = Counter(lst)
most_occur = dish_counter.most_common(10)
print(most_occur)

# Extracting rate and text from the review column
* To find hidden information

In [None]:
#extracting rate from review column
def _mean_review_rating(raw_reviews):
    """Return the mean rating found in one `review` cell, rounded to 2 decimals.

    `raw_reviews` is the text of a Python list of (rating, text) tuples. It is
    parsed with ast.literal_eval, which only accepts literals, instead of
    eval(), which would execute whatever code the CSV happens to contain.
    The last three characters of each rating string are read as the number;
    entries whose rating is None are skipped. Returns 0.0 when no entry
    carries a rating.
    """
    ratings = [
        float(review[0][-3:])
        for review in ast.literal_eval(raw_reviews)
        if review[0] is not None
    ]
    if not ratings:
        return 0.0
    return round(sum(ratings) / len(ratings), 2)


# One pass over the column instead of a row-by-row .loc assignment. The column
# is now uniformly float, where it previously mixed formatted strings with the
# integer 0.
data1['review_rate'] = data1['review'].apply(_mean_review_rating)
       
            
            

In [None]:
#extracting text only from review column
def _review_text(raw_reviews):
    """Return the texts of all reviews in one `review` cell as a single string.

    `raw_reviews` is the text of a Python list of (rating, text) tuples. It is
    parsed with ast.literal_eval, which only accepts literals, instead of
    eval(), which would execute whatever code the CSV happens to contain.
    The 'RATED\\n' marker is removed from each text and the texts are joined
    with a single space, so the last word of one review and the first word of
    the next are not fused together.
    """
    return ' '.join(
        review[1].replace('RATED\n', '').strip()
        for review in ast.literal_eval(raw_reviews)
    )


# One pass over the column instead of a row-by-row .loc assignment.
data1['review_only'] = data1['review'].apply(_review_text)


In [None]:
# Show the two columns derived from the raw review data
data1[['review_rate','review_only']]

* Extract the most-used dish words from the review text that also appear in the dish_liked column

In [None]:
# Every distinct dish name that appears anywhere in dish_liked.
menu_set = {
    dish.strip()
    for dishes in data1['dish_liked']
    for dish in dishes.split(',')
}

# Choose the sample review by position. `review_only[1000]` is a label lookup
# and raises KeyError when the row labelled 1000 was removed during cleaning.
sample_position = min(1000, len(data1) - 1)
sample_review = data1['review_only'].iloc[sample_position]

# Dish names that occur as single words in the sample review. Multi-word dish
# names cannot match because the review is split on spaces.
menu_set.intersection(sample_review.split(' '))

# Top dish

In [None]:
from collections import Counter

# Tally every individual dish across all rows and keep the 20 most frequent.
all_dishes = ','.join(data1['dish_liked'])
line = [name.strip() for name in all_dishes.split(',')]
counter = Counter(line).most_common(20)
dish_count = pd.DataFrame(counter, columns=['dish', 'count'])

In [None]:
# Bar chart of the ten most frequently liked dishes.
dish_count = dish_count.iloc[:10]
plt.rcParams['figure.figsize'] = (8, 6)
sns.barplot(data=dish_count, x='dish', y='count')
plt.title('most loved dishes', fontdict=dict(fontweight='bold', fontsize=18))
plt.xticks(rotation='vertical')
plt.grid()
plt.show()

# Best dish per location

In [None]:
from collections import Counter

# For every location, find the dish name that occurs in the largest number of
# distinct dish_liked entries.
loc_dish = data1.groupby('location')['dish_liked'].value_counts()
ind = loc_dish.index.levels[0]
location, dish, count = [], [], []
for place in ind:
    # Join this location's distinct dish_liked strings, then split into names.
    joined = ','.join(loc_dish[place].index)
    tally = Counter(name.strip() for name in joined.split(','))
    for top_name, top_count in tally.most_common(1):
        location.append(place)
        dish.append(top_name)
        count.append(top_count)

In [None]:
# Assemble the per-location winners into a table and show its first 20 rows.
top_dish_columns = {'location': location, 'top_dish': dish, 'count': count}
loc_dish_df = pd.DataFrame(top_dish_columns).iloc[:20]
loc_dish_df