In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for entry in files_in_folder:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show the top-level entries of the input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Zomato Bangalore restaurants CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/zomato-bangalore-restaurants/zomato.csv'
)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Show every row whose restaurant name is exactly 'Onesta'.
df.loc[df['name'].eq('Onesta')]

In [None]:
# Remove rows that repeat the same (address, name) pair, keeping only the
# last occurrence of each pair.
df = df.drop_duplicates(
    ['address', 'name'],
    keep='last',
)

In [None]:
# Fill missing values in the 'rate' column with the default string '3.0/5'.
# fillna(..., inplace=True) returns None, so its result must not be assigned;
# use the returned Series instead and write it back to 'rate' explicitly so
# the fill does not depend on an in-place write through a column selection.
df['rate'] = df['rate'].fillna('3.0/5')
# Keep a 'Rate' column (now a filled copy rather than all-None) because a
# later cell drops it by name.
df['Rate'] = df['rate']

In [None]:
# A rate looks like '4.1/5': keep the first three characters of its string
# form ('4.1') and parse them as a number. Anything that does not parse is
# coerced to NaN.
df['Rating'] = pd.to_numeric(
    df['rate'].map(lambda raw: str(raw)[:3]),
    errors='coerce',
)

In [None]:
# Convert the approximate cost for two people to a numeric column 'Ncost':
# give the column a short name, strip the comma separators, then parse.
# Values that do not parse are coerced to NaN.
df = df.rename(columns={'approx_cost(for two people)': 'cost'})
df['Ncost'] = pd.to_numeric(
    df['cost'].str.replace(',', ''),
    errors='coerce',
)

In [None]:
# Drop the columns that are not needed for the rest of the analysis,
# including the raw 'cost' and 'rate' columns already converted above.
df = df.drop(
    columns=['phone', 'cost', 'rate', 'url', 'Rate', 'menu_item'],
)

In [None]:
# 'listed_in(city)' and 'location' hold almost the same information.
# Print the number of distinct values in each to decide which one to drop.
for area_column in ('listed_in(city)', 'location'):
    print(df[area_column].nunique())

In [None]:
# Keep 'location' and discard the overlapping 'listed_in(city)' column.
df = df.drop(columns='listed_in(city)')

In [None]:
# Number of rows per location, most frequent first.
df.loc[:, 'location'].value_counts()

In [None]:
# Histogram (30 bins) of the numeric cost-for-two column 'Ncost'.
df.hist('Ncost', bins=30)

In [None]:
# Bar plot of row counts for the five locations that occur most often.
plt.figure(figsize=(6, 5))
top_locations = df['location'].value_counts().head(5).index
sns.countplot(
    data=df,
    x='location',
    order=top_locations,
    palette='rainbow',
)
plt.xticks(rotation=90)

In [None]:
# Scatter of vote count against rating for every restaurant, with the five
# most-voted restaurants annotated by name.
# Sorting by votes (descending) puts the most-voted rows first.
df = df.sort_values('votes', ascending=False)
fl = df.head(5)
plt.figure(figsize=(10, 10))

plt.scatter(
    df['Rating'],
    df['votes'],
    label='Best Restraunts In Bangalore',
    marker='o',
)

# Names and coordinates of the five rows to annotate.
label = list(fl['name'])
x = list(fl['Rating'])
y = list(fl['votes'])
for point_name, point_rating, point_votes in zip(label, x, y):
    plt.annotate(point_name, (point_rating, point_votes), ha='right')
plt.xlabel('Average Rating')
plt.ylabel('No of Votes')

In [None]:
#So the top 5 restaurants by number of votes, based on the above graph, are:
#Byg Brewski
#Toit
#Truffles
#AB's Absolute Barbeques
#The Black Pearl

In [None]:
# Pie chart of the seven most common values of 'rest_type'.
# The percentages shown are shares within these seven types only.

# value_counts() already returns the counts sorted in descending order.
restaurantTypeCount = df['rest_type'].value_counts()

# Select the leading entries by position with head(). Plain integer keys such
# as restaurantTypeCount[0] on a string-labelled Series rely on a deprecated
# positional fallback.
top_types = restaurantTypeCount.head(7)
slices = top_types.tolist()
# Take the labels from the data so that each wedge is named after the type it
# actually counts, instead of a hard-coded list that can disagree with it.
labels = top_types.index.tolist()
colors = ['#3333cc','#ffff1a','#ff3333','#c2c2d6','#6699ff','#c4ff4d','#339933']
plt.pie(
    slices,
    colors=colors[:len(slices)],  # fewer than seven types -> fewer colours
    labels=labels,
    autopct='%1.0f%%',
    pctdistance=.5,
    labeldistance=1.2,
    shadow=True,
)
fig = plt.gcf()
plt.title("Percentage of Restaurants according to their Type", bbox={'facecolor':'1', 'pad':5})

fig.set_size_inches(12,12)
plt.show()



In [None]:
# Stacked bar chart: for each rating value, the share of rows in each
# 'online_order' category.
plt.rcParams['figure.figsize'] = (15, 9)
x = pd.crosstab(df['Rating'], df['online_order'])
# Divide each row by its total so the bars show proportions, not counts.
row_totals = x.sum(axis=1).astype(float)
x.div(row_totals, axis=0).plot(
    kind='bar',
    stacked=True,
    color=['red', 'yellow'],
)
plt.title('online order vs rate', fontweight=30, fontsize=20)
plt.legend(loc="upper right")
plt.show()