In [None]:
import pandas as pd
import numpy as np
# Show every row when a DataFrame is displayed. The fully qualified key is
# required: the bare pattern "max_rows" is ambiguous on pandas >= 1.4 (it also
# matches "styler.render.max_rows") and raises OptionError there.
pd.set_option("display.max_rows", None)
# Read data from CSV file for Indian Food
data = pd.read_csv('../input/indian-food-101/indian_food.csv')
# Summary statistics (count, mean, std, quartiles) of the numeric columns,
# used to judge how skewed they are. At this point the -1 placeholders for
# missing values are still included in the statistics.
# NOTE(review): the original analysis read this output as "75% of prep_time /
# cook_time values lie below 20 / 40, and mean < std" -- confirm against the
# displayed table.
data.describe()

In [None]:
# -1 (stored either as a number or as the string '-1') is the dataset's
# placeholder for a missing value, so turn it into a real NaN everywhere.
data.replace(['-1', -1], np.nan, inplace=True)

# Fraction of rows that contain at least one missing value. The rule of thumb
# used here: below 5% the incomplete rows could simply be dropped.
# Bound to a name because a bare expression in the middle of a cell is never
# displayed.
missing_row_fraction = (data.shape[0] - data.dropna().shape[0]) / data.shape[0]

# NOTE(review): the original analysis found the fraction above 5%, so rows are
# kept and repaired instead. Values above the thresholds (prep_time > 20,
# cook_time > 40) are treated as outliers and blanked out so they can be
# imputed together with the genuinely missing values below.
data.loc[data.prep_time > 20, 'prep_time'] = np.nan
data.loc[data.cook_time > 40, 'cook_time'] = np.nan

# Fill every NaN in the numeric columns with that column's median. The median
# is computed after the outliers were blanked, so it reflects only the
# retained values. numeric_only=True is required on pandas >= 2.0, where
# median() over a frame that also holds string columns raises TypeError;
# string columns are left untouched, exactly as older pandas did implicitly.
data.fillna(data.median(numeric_only=True), inplace=True)
data.describe()

In [None]:
# Rows where BOTH region and state are missing carry no location information
# at all. Drop them, since the results below are grouped by these columns.
no_location = data.region.isnull() & data.state.isnull()
data.drop(index=data.index[no_location], inplace=True)

# Prepare final results using groupby based on columns.
# NOTE: groupby excludes rows with NaN in any key column by default
# (dropna=True), so a row still missing e.g. only its region or its
# flavor_profile does not appear in the result.
data_gr = data.groupby(['ingredients', 'name', 'region', 'state', 'diet', 'flavor_profile'])
# Average only the numeric columns. Without numeric_only=True pandas >= 2.0
# raises TypeError on the remaining non-key string columns instead of
# silently skipping them.
data_gr.mean(numeric_only=True)
