In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Part 0. Reading and cleaning data

In [None]:
# Read the restaurants CSV and keep only the rows without missing values.
data = (
    pd.read_csv("../input/asian-restaurants/asia.csv")
    .dropna()
)
# Preview the first rows of the cleaned frame.
data.head()

In [None]:
# Print a summary of the frame left after dropna(): columns, non-null counts and dtypes.
data.info()

# Part 1. Visualization

**Let's visualize the categorical data.** The categorical parameters are 'price' and 'town'. The other parameters can either be treated as continuous parameters or do not carry any useful information.

In [None]:
# Count restaurants per price category.
# The column is named through the `x=` keyword: seaborn 0.12+ accepts only
# `data` positionally, so a bare Series as the first argument is no longer
# interpreted as the x variable.
ax = sns.countplot(x="price", data=data)
ax.set_title("Number of restaurants per price category");

As we can see, most of the restaurants registered in the dataset have middle prices.

In [None]:
# Count restaurants per town.
# The column is named through the `x=` keyword: seaborn 0.12+ accepts only
# `data` positionally. Passing only `x` already yields vertical bars, so the
# explicit orient="v" is not needed.
ax = sns.countplot(x="town", data=data)
ax.set_title("Number of restaurants per town");

In [None]:
# Count restaurants per town, split by price category.
# "town" must be given as `x=`: in seaborn 0.12+ the first positional
# parameter is `data`, so the original positional "town" collides with
# `data=data` and raises a TypeError. Passing only `x` already yields
# vertical bars, so orient="v" is not needed.
ax = sns.countplot(x="town", hue="price", data=data)
ax.set_title("Number of restaurants per town and price category");

**Now let's visualize the continuous parameters**.

In [None]:
# Columns that are dropped to obtain the frame of continuous parameters.
non_continuous_cols = ["case", "restaurant", "price", "town"]
cont_data = data.drop(columns=non_continuous_cols)
# Preview the remaining columns.
cont_data.head()

In [None]:
# Pairwise correlation matrix of the continuous columns, drawn as a heatmap.
corr_matrix = cont_data.corr()
sns.heatmap(corr_matrix)

Let's generate a table of histograms showing the distribution of every parameter.

In [None]:
# Draw one histogram per continuous column on a grid, three panels per row.
# `cols` keeps the column order of cont_data; later cells index into it.
cols = list(cont_data)

# Size the grid from the number of columns instead of assuming exactly nine.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(4 * n_grid_cols, 3 * n_grid_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)

# Fill the panels left to right, top to bottom, in column order. The previous
# (i+1)%3 column index rotated each row, so the third parameter of every row
# was drawn in the first panel.
panels = axes.ravel()
for ax, col in zip(panels, cols):
    ax.hist(data[col])
    # A title identifies the panel; `label=` alone is never displayed
    # unless a legend is drawn.
    ax.set_title(col)

# Hide panels that have no column to show.
for ax in panels[len(cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()



**As we can see, none of the continuous parameters is normally distributed.**

As we can see, **there is one interesting parameter**: the 'ddRating' parameter.

In [None]:
# Finer-grained (25-bin) histogram of the fourth continuous column.
rating_col = cols[3]
plt.hist(data[rating_col], bins=25, label=rating_col)
plt.legend()
plt.show()

After some manipulation (keeping only the values greater than 1), we get the following result.

In [None]:
# Histogram of the fourth continuous column, restricted to values above 1.
rating_col = cols[3]
values_above_one = data.loc[data[rating_col] > 1, rating_col]
plt.hist(values_above_one, label=rating_col)
plt.legend()
plt.show()

In [None]:
# Compare kurtosis and skewness of the fourth continuous column on the full
# column and on the subset of values greater than 1.
target_col = cols[3]
full_values = data[target_col]
trimmed_values = data.loc[full_values > 1, target_col]

print("Before modification:")
print("Kurtosis:", full_values.kurt(), sep="")
print("Skewness:", full_values.skew(), sep="", end="\n\n")
print("After modification:")
print("Kurtosis:", trimmed_values.kurt(), sep="")
print("Skewness:", trimmed_values.skew(), sep="")

# Part 2. Some additional calculations

As we can see in the table of histograms, there are some pairs of parameters with similar distributions. Let's calculate the correlation coefficients between these parameters.

In [None]:
# Collect every pair of continuous columns whose absolute correlation
# (Series.corr with its default method) exceeds the threshold.
# The loops are driven by `cols` itself rather than by hard-coded 8 / 9
# bounds, so the cell keeps working if the number of columns changes.
CORRELATION_THRESHOLD = 0.7

may_well_correlated_pairs = []
for i, first_col in enumerate(cols):
    # Only look at columns after `first_col` so each pair is visited once.
    for second_col in cols[i + 1:]:
        if abs(data[first_col].corr(data[second_col])) > CORRELATION_THRESHOLD:
            may_well_correlated_pairs.append([first_col, second_col])

In [None]:
# Report each strongly correlated pair together with its correlation value.
for first_col, second_col in may_well_correlated_pairs:
    pair_corr = data[first_col].corr(data[second_col])
    print(first_col + " and " + second_col)
    print(pair_corr, end="\n\n")

As we can see, there are two pairs of well-correlated continuous parameters. This means that we can delete one parameter from each pair and expect that a prediction (or classification) model won't become much worse.

# Part 3. Dependencies between ratings and reviews

First, let's look at the correlation coefficients between the different ratings.

In [None]:
# Correlation between every pair of rating columns, one blank line after each.
rating_pairs = [
    ("gRating", "ddRating"),
    ("gRating", "yRating"),
    ("yRating", "ddRating"),
]
for left, right in rating_pairs:
    coefficient = data[left].corr(data[right])
    print(left + " and " + right + ": " + str(coefficient), end="\n\n")

Now, let's look at the correlation coefficients between the review counts of the different ratings.

In [None]:
# Correlation between every pair of review columns, one blank line after each.
review_pairs = [
    ("gReviews", "ddReviews"),
    ("gReviews", "yReviews"),
    ("yReviews", "ddReviews"),
]
for left, right in review_pairs:
    coefficient = data[left].corr(data[right])
    print(left + " and " + right + ": " + str(coefficient), end="\n\n")

Now, what are the correlation coefficients between each rating and its related reviews?

In [None]:
# Correlation between each reviews column and the rating column with the
# same prefix, one blank line after each.
review_rating_pairs = [
    ("gReviews", "gRating"),
    ("yReviews", "yRating"),
    ("ddReviews", "ddRating"),
]
for reviews_col, rating_col in review_rating_pairs:
    coefficient = data[reviews_col].corr(data[rating_col])
    print(reviews_col + " and " + rating_col + ": " + str(coefficient), end="\n\n")

As we can see, DoorDash ratings and reviews are not linearly related to Google and Yelp ratings and reviews. So there is an interesting question: do any of the registered DoorDash parameters have an effect on restaurant prices or not?

# Part 4. DoorDash and price

In [None]:
# Split the row labels by whether ddRating is zero or positive.
dd_rating = data["ddRating"]
indexesWithoutDDRating = data.loc[dd_rating == 0].index
indexesWithDDRating = data.loc[dd_rating > 0].index

In [None]:
# Price distribution of the rows whose ddRating equals zero.
price_without_dd = data.loc[indexesWithoutDDRating, "price"]
sns.histplot(price_without_dd)

In [None]:
# Price distribution of the rows whose ddRating is greater than zero.
price_with_dd = data.loc[indexesWithDDRating, "price"]
sns.histplot(price_with_dd)

As we can see, the availability of a DoorDash rating affects the price, but not strongly. But can we say the same about DoorDash reviews and price?

In [None]:
# Two-dimensional histogram: price on the x axis, ddReviews on the y axis.
sns.histplot(
    data=data,
    x="price",
    y="ddReviews",
)

As the histogram shows, we may speak about some (maybe nonlinear) relation between the number of DoorDash reviews and the number of restaurants having that number of reviews.