In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the Zomato restaurant table; the CSV is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    '../input/zomato.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Peek at the first two rows to see which columns the file provides.
df.head(2)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

In [None]:
## Drop the following columns, which are not used later in this notebook:
## Longitude, Latitude, Locality, Locality Verbose, Address, Switch to order menu.
to_drop = ["Longitude", "Latitude", "Locality", "Locality Verbose", "Address", "Switch to order menu"]
# Re-assign instead of mutating in place and tolerate already-missing columns,
# so that re-running this cell does not raise KeyError.
df = df.drop(to_drop, axis=1, errors="ignore")

In [None]:
# Check if the IDs are unique (True means no Restaurant ID value is repeated).
df['Restaurant ID'].is_unique

In [None]:
# Keep a string copy of the numeric country code in "Country Code1", then
# translate the original column into country names and rename it to
# "Country Name".
# The code -> name pairs are meant to follow the Country-Code lookup sheet that
# ships with the Zomato dataset.
# NOTE(review): confirm these pairs against that sheet.
country_names = {
    1: 'India',
    30: 'Brazil',
    148: 'New Zealand',
    189: 'South Africa',
    214: 'UAE',
    215: 'United Kingdom',
    216: 'United States',
}
# Every remaining code is grouped under a single "Others" label.
other_codes = [208, 14, 94, 191, 162, 184, 166, 37]

# Guard so that re-running the cell after the rename is a no-op instead of a KeyError.
if "Country Code" in df.columns:
    df["Country Code1"] = df["Country Code"].apply(str)
    df['Country Code'] = df['Country Code'].replace(country_names)
    df['Country Code'] = df['Country Code'].replace(other_codes, 'Others')
    df = df.rename(columns={"Country Code": "Country Name"})

In [None]:
# List the rows whose "Average Cost for two" exceeds 450000.
df[df["Average Cost for two"]>450000]

In [None]:
# Remove three restaurants by ID in a single filter.
# NOTE(review): presumably these are the extreme-cost rows listed by the
# previous cell - confirm.
excluded_ids = [7402935, 7410290, 7420899]
df = df[~df["Restaurant ID"].isin(excluded_ids)]

In [None]:
# Number of restaurants per country, most frequent first.
country_counts = df["Country Name"].value_counts()
cnd = pd.DataFrame(country_counts)  # kept for any later cell that expects `cnd`

# Plot straight from the counts Series: the column name of `cnd` differs between
# pandas versions ("Country Name" before 2.0, "count" from 2.0 on), so looking
# the column up by name is not portable.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=country_counts.index, y=country_counts.values, palette='Blues_d', ax=ax)
ax.set_title('Distribution of restaurants in different Countries')
ax.set_xlabel("Country")
ax.set_ylabel("Number of restaurants")
plt.show()

In [None]:
## Change the columns -> 'Has Table booking', 'Has Online delivery', 'Is delivering now'
## from Yes/No to binary (1/0) values.
# A direct membership test gives a plain integer Series (1 for "Yes", 0 otherwise).
# Unlike assigning a one-column get_dummies frame, it does not depend on the
# pandas version's dummy dtype, works when a column holds a single value, and
# accepting 1 as well as "Yes" keeps the cell safe to re-run.
for flag_col in ["Has Table booking", "Has Online delivery", "Is delivering now"]:
    df[flag_col] = df[flag_col].isin(["Yes", 1]).astype(int)

In [None]:
# Two pie charts side by side: online delivery (left) and table booking (right).
labelyn = ['no', 'yes']
pie_colors = ['#1674b1', '#ed6d50']

# Count the 0s and 1s in a fixed order (0 first, then 1) so the slices always
# line up with `labelyn`, whichever value happens to be more frequent.
htb = df["Has Table booking"].astype(int).value_counts().reindex([0, 1], fill_value=0)
htbd = pd.DataFrame(htb)
hod = df["Has Online delivery"].astype(int).value_counts().reindex([0, 1], fill_value=0)
hodd = pd.DataFrame(hod)

fig, (ax_online, ax_booking) = plt.subplots(1, 2, figsize=(8, 4))

# '%.2f%%' prints two decimals followed by a percent sign.
p2 = ax_online.pie(hod.values, labels=labelyn, colors=pie_colors,
                   explode=[0.05, 0.05], autopct='%.2f%%')
ax_online.set_title('Distribution of number of restaurant who \n takes order online.')

p1 = ax_booking.pie(htb.values, labels=labelyn, colors=pie_colors,
                    explode=[0.05, 0.05], autopct='%.2f%%')
ax_booking.set_title('Distribution of number of restaurant\n which have the option of booking table.')

plt.show()

In [None]:
## The first pie chart (left) shows the percentage of restaurants that take
## orders online.
## The second pie chart (right) shows the percentage of restaurants that let
## customers book a table in advance.

In [None]:
# Share of each price range within every rating-text category (each row sums to 1).
table = pd.crosstab(df["Rating text"], df["Price range"])
ax = table.div(table.sum(1).astype(float), axis=0).plot(
    kind='bar', stacked=True, color=['#5a5255', '#1b85b8', '#559e83', '#ae5a41'])
# Resize the figure this plot was drawn on; reusing a `fig` left over from an
# earlier cell would resize the wrong figure.
fig = ax.get_figure()
fig.set_size_inches(12, 8)
plt.show()

In [None]:
# Simplify the currency labels by dropping the symbol in parentheses.
df['Currency'] = df['Currency'].replace({
    'Dollar($)': 'Dollar',
    'Brazilian Real(R$)': 'Brazilian Real',
    'NewZealand($)': 'NewZealand Dollar',
})
# The pound label carries non-ASCII characters inside the parentheses; a literal
# copy of them is fragile (it had degraded to replacement characters and could
# no longer match), so match the label by its ASCII prefix instead.
df['Currency'] = df['Currency'].replace(r'^Pounds\(.*\)$', 'Pounds', regex=True)


In [None]:
# Bar height is seaborn's default estimate (the mean) of "Average Cost for two"
# within each price range.
sns.barplot(
    data=df,
    x='Price range',
    y='Average Cost for two',
    palette="Blues_d",
)
plt.show()

In [None]:
# The above chart shows that Price range and Average Cost for two are not
# correlated, as we did not get the expected results.

In [None]:
# Share of each rating text within every country (each row sums to 1).
table = pd.crosstab(df["Country Name"], df["Rating text"])
n_ratings = table.shape[1]
# The four hand-picked colours are used as long as they suffice; with more
# rating categories they would repeat and make stacked segments
# indistinguishable, so switch to a palette with one colour per category.
rating_colors = ['#5a5255', '#1b85b8', '#559e83', '#ae5a41']
if n_ratings > len(rating_colors):
    rating_colors = list(sns.color_palette("husl", n_ratings))
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True, color=rating_colors)
figz = plt.gcf()
figz.set_size_inches(12, 6)
plt.show()


In [None]:
# Votes per country (seaborn's default estimate, the mean), split by whether
# the restaurant offers table booking.
votes_ax = sns.barplot(
    data=df,
    x="Country Name",
    y="Votes",
    hue="Has Table booking",
    palette="Blues_d",
)
fig2 = votes_ax.figure
fig2.set_size_inches(12, 6)
plt.show()


In [None]:
# Count how many restaurants offer each individual cuisine.
# A "Cuisines" entry is a ", "-separated list, so each distinct combination is
# split and its restaurant count is credited to every cuisine it contains.
cus = df["Cuisines"].value_counts()
cuisines = {}
# Pair each combination with its count directly; looking counts up by position
# (cus[cnt]) on a label-indexed Series is deprecated in recent pandas.
for combo, n_restaurants in zip(cus.index, cus.values):
    for cuisine in combo.split(", "):
        cuisines[cuisine] = cuisines.get(cuisine, 0) + n_restaurants

cuisines = pd.Series(cuisines).sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the 15 most common cuisines.
top_cuisines = cuisines.head(15)
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional arguments.
sns.barplot(x=top_cuisines.values, y=top_cuisines.index, palette="Blues_d")
plt.xlabel("Number of restaurants")
plt.ylabel("Cuisine")
fig2 = plt.gcf()
fig2.set_size_inches(16, 6)
plt.show()

In [None]:
# Restrict the rest of the analysis to rows priced in Indian rupees.
India = df.loc[df["Currency"] == 'Indian Rupees(Rs.)']

In [None]:
# Box plot of the vote counts, used to eyeball outliers.
# The Series is passed as x= explicitly: seaborn 0.12+ treats a lone positional
# argument as `data` rather than `x`.
sns.boxplot(x=India["Votes"])

In [None]:
# Outlier fences for "Votes": 1.5 * IQR below the first and above the third quartile.
q1_v, q3_v = India["Votes"].quantile([0.25, 0.75])
iqr_v = q3_v - q1_v
lowervotes = q1_v - 1.5 * iqr_v
uppervotes = q3_v + 1.5 * iqr_v
uppervotes

In [None]:
# Keep only the rows with fewer than 244 votes.
# NOTE(review): 244 is hard-coded; presumably it is the upper fence
# (`uppervotes`) shown by the previous cell - confirm, and keep it in sync with
# the "Votes" slider maximum further down.
India = India.loc[India["Votes"] < 244]

In [None]:
# Box plot of "Average Cost for two", used to eyeball outliers.
# The Series is passed as x= explicitly: seaborn 0.12+ treats a lone positional
# argument as `data` rather than `x`.
sns.boxplot(x=India["Average Cost for two"])

In [None]:
# Outlier fences for "Average Cost for two": 1.5 * IQR beyond the quartiles.
q1_avg, q3_avg = India["Average Cost for two"].quantile([0.25, 0.75])
iqr_avg = q3_avg - q1_avg
loweravg = q1_avg - 1.5 * iqr_avg
upperavg = q3_avg + 1.5 * iqr_avg
upperavg

In [None]:
# Keep only the rows whose average cost for two is below 1050.
# NOTE(review): 1050 is hard-coded; presumably it is the upper fence
# (`upperavg`) shown by the previous cell - confirm, and keep it in sync with
# the "Average Cost for two" slider maximum further down.
India = India.loc[India["Average Cost for two"] < 1050]

# Train_Test_Split


In [None]:
# Feature matrix: every column except identifiers, free-text/categorical
# descriptors and the target itself.
non_feature_cols = [
    "Restaurant ID",
    "Restaurant Name",
    "Rating text",
    "Country Name",
    "City",
    "Rating color",
    "Cuisines",
    "Currency",
    "Country Code1",
    "Aggregate rating",
]
X = India.drop(non_feature_cols, axis=1)
# Target: the numeric aggregate rating.
y = India["Aggregate rating"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# LinearRegression

In [None]:
# Baseline model: linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the aggregate rating for the held-out rows with the linear model.
y_predict=model.predict(X_test)


In [None]:
from sklearn.metrics import r2_score
# R^2 (coefficient of determination) of the linear model on the test split.
r2_score(y_test,y_predict)

# RandomForestRegressor

In [None]:
# Random forest with 1000 trees.
# random_state pins the bootstrap samples and feature choices so the score in
# the next cell is reproducible; n_jobs=-1 only parallelises the fit and does
# not change the result.
modelrf = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1)
modelrf.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report R^2 for the random forest.
y_predictrf=modelrf.predict(X_test)
r2_score(y_test,y_predictrf)

# DecisionTreeRegressor

In [None]:
# Decision tree limited to depth 6.
# random_state makes tie-breaking between equally good splits deterministic, so
# the fitted tree (and the model saved below) is the same on every run.
modeldt = DecisionTreeRegressor(max_depth=6, random_state=42)
modeldt.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report R^2 for the decision tree.
y_predictdt=modeldt.predict(X_test)
r2_score(y_test,y_predictdt)

In [None]:
# Persist the fitted decision tree to disk.
# sklearn.externals.joblib was removed from scikit-learn; prefer the standalone
# joblib package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
filename = "Decision_tree.sav"
joblib.dump(modeldt, filename)

In [None]:
# Explicit widget imports: the Box used below is the ipywidgets container.
from ipywidgets import Box, Button, IntSlider
from IPython.display import display

# Reload the tree saved by the previous cell. joblib.load unpickles the file,
# so only use it on files this notebook wrote itself.
best_model = joblib.load(filename)

submit = Button(description='Get my Rating')
# One slider per model input. The list order is the order in which the values
# are handed to the model, so it must match the column order of X.
# NOTE(review): verify the order against X.columns, and whether 0 is a valid
# "Price range" value in the data.
item = [
    IntSlider(min=0, max=1050, description="Average Cost for two"),
    IntSlider(min=0, max=1, step=1, value=0, description="Has Table booking"),
    IntSlider(min=0, max=1, step=1, value=0, description="Has Online delivery"),
    IntSlider(min=0, max=1, step=1, value=0, description="Is delivering now"),
    IntSlider(min=0, max=4, step=1, value=0, description="Price range"),
    IntSlider(min=0, max=244, description="Votes"),
]


def on_button_clicked(b):
    """Predict and print the aggregate rating for the current slider values.

    Parameters
    ----------
    b : the Button instance that was clicked (supplied by ipywidgets; unused).
    """
    emp_row = [widget.value for widget in item]
    # The model expects a 2-D array: one row, one column per slider.
    y_predicts = best_model.predict(np.array(emp_row).reshape(1, -1))
    # predict returns a length-1 array; take its single element explicitly.
    print('Aggregate Rating : {:.1f}'.format(float(y_predicts[0])))


form = Box(item)
submit.on_click(on_button_clicked)


In [None]:
# Render the slider form followed by the submit button.
display(form, submit)