In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center> <img src="https://cdn.britannica.com/q:60/08/177308-050-94D9D6BE/Food-Pizza-Basil-Tomato.jpg" width="50%" height="50%"> </center>

In [None]:
# Importing all the important libraries first

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CSV once into `main_df` and work on an independent deep copy (`df`),
# so the untouched original stays available for reference.

main_df = pd.read_csv(filepath_or_buffer="/kaggle/input/pizza-price-prediction/pizza_v2.csv")
df = main_df.copy(deep=True)
df.head(5)

In [None]:
# Dataset dimensions as a (rows, columns) tuple.

df.shape

In [None]:
# Column names, non-null counts and dtypes of the freshly loaded frame.

df.info()

In [None]:
# Number of distinct values in each column.

df.nunique()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

In [None]:
# Visual check for missing values: the heatmap shows the boolean null mask,
# so a single uniform colour means no nulls are present.

sns.heatmap(data=df.isna())

In [None]:
# Remove the "inch" unit and the "Rp" currency prefix (plus any thousands
# separators) and store both columns as float64.
#
# The numeric-dtype guard makes this cell idempotent: once a column has been
# converted, re-running the cell skips it instead of failing on the `.str`
# accessor, which only works on string data.
# regex=False treats the removed text as a literal string, independent of the
# pandas version's default for `regex`.

for col, unit in (('diameter', 'inch'), ('price_rupiah', 'Rp')):
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (
            df[col]
            .str.replace(unit, '', regex=False)
            .str.replace(',', '', regex=False)
            .str.strip()
            .astype('float64')
        )

df.loc[:, ['price_rupiah', 'diameter']]

In [None]:
# Preview the first rows after the unit clean-up.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.

df.describe()

In [None]:
# Correlation matrix of the numeric columns.
# The frame still holds string (object) columns at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# numeric columns are selected explicitly before correlating.

plt.figure(figsize=(6,5))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

## Univariate Analysis

In [None]:
# Distribution of price_rupiah: histogram with KDE (left) and a Q-Q plot
# against the normal distribution (right).

fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(15,4))
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(df['price_rupiah'], kde=True, stat='density', ax=ax1, color='red')
ax1.set(title='price_rupiah distribution')
qqplot(df['price_rupiah'], ax=ax2, line='s')
ax2.set(title='Quantile quantile plot')
plt.show()

In [None]:
# Distribution of diameter: histogram with KDE (left) and a Q-Q plot
# against the normal distribution (right).

fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(15,4))
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(df['diameter'], kde=True, stat='density', ax=ax1, color='red')
ax1.set(title='diameter distribution')
qqplot(df['diameter'], ax=ax2, line='s')
ax2.set(title='Quantile quantile plot')
plt.show()

In [None]:
# Skewness of each numeric column, largest first.
# Rule of thumb: a value between -0.5 and 0.5 indicates an approximately
# symmetric distribution; outside that range the column is considered skewed.
# numeric_only=True is required because the frame still contains string
# columns, on which DataFrame.skew() raises in pandas >= 2.0.

skew_val = df.skew(numeric_only=True).sort_values(ascending=False)
skew_val

## Bivariate Analysis

In [None]:
# Price by diameter, coloured by pizza size.
# The original call had misplaced quotes (y='price_rupiah,color='size),
# which is a SyntaxError; y and color are now separate keyword arguments.
fig = px.bar(df, x='diameter', y='price_rupiah', color='size')
fig.show()

In [None]:
# Re-check column names and dtypes before plotting the categorical breakdowns.
df.info()

In [None]:
# Price by diameter, coloured by pizza variant.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color="variant",
)
fig.show()

In [None]:
# Price by diameter, coloured by company.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color="company",
)
fig.show()

In [None]:
# Price by diameter, coloured by topping.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color="topping",
)
fig.show()

In [None]:
# Mean price per pizza size (seaborn draws the mean with an error bar).
sns.barplot(data=df, x='size', y='price_rupiah', saturation=0.9)

In [None]:
# Mean price per company (seaborn draws the mean with an error bar).
sns.barplot(data=df, x='company', y='price_rupiah', saturation=0.9)

In [None]:
# Price histogram, one facet row per company.
fig = px.histogram(
    data_frame=df,
    x="price_rupiah",
    facet_row="company",
    template='plotly_dark',
)
fig.show()

In [None]:
# Share of rows per pizza size (donut chart).
fig = px.pie(data_frame=df, names="size", hole=0.4, template="gridon")
fig.show()

In [None]:
# Share of rows per company (donut chart).
fig = px.pie(data_frame=df, names="company", hole=0.4, template="plotly_dark")
fig.show()

In [None]:
# Share of rows per variant (donut chart).
fig = px.pie(data_frame=df, names="variant", hole=0.4, template="plotly_dark")
fig.show()

In [None]:
# Share of rows per topping (donut chart).
fig = px.pie(data_frame=df, names="topping", hole=0.4, template="plotly_dark")
fig.show()

In [None]:
# Share of rows per extra_sauce value (donut chart).
fig = px.pie(data_frame=df, names="extra_sauce", hole=0.4, template="plotly_dark")
fig.show()

Columns noted from the `df.info()` output:

        variant          129 non-null    object
     5   size             129 non-null    object
     6   extra_sauce      129 non-null    object
     7   extra_cheese     129 non-null    object
     8   extra_mushrooms

In [None]:
# Share of rows per extra_cheese value (donut chart).
fig = px.pie(data_frame=df, names="extra_cheese", hole=0.4, template="plotly_dark")
fig.show()

In [None]:
# Share of rows per extra_mushrooms value (donut chart).
fig = px.pie(data_frame=df, names="extra_mushrooms", hole=0.4, template="plotly_dark")
fig.show()

In [None]:
# Re-check column names and dtypes before the scatter plots.
df.info()

In [None]:
# Diameter against price, coloured by extra_mushrooms, with an OLS trendline per group.
fig = px.scatter(
    data_frame=df,
    x="price_rupiah",
    y="diameter",
    color="extra_mushrooms",
    template="plotly_dark",
    trendline="ols",
)
fig.show()

In [None]:
# Diameter against price, coloured by extra_sauce, with a LOWESS trendline per group.
fig = px.scatter(
    data_frame=df,
    x="price_rupiah",
    y="diameter",
    color="extra_sauce",
    template="plotly_dark",
    trendline="lowess",
)
fig.show()

## Multivariate Analysis

In [None]:
# Pairwise regression plots of diameter and price, split by pizza size,
# with KDE curves on the diagonal.
# `height` replaces the former `size` keyword, which seaborn renamed in 0.9
# and no longer accepts (passing it raises a TypeError).
sns.pairplot(
    data=df,
    vars=['diameter', 'price_rupiah'],
    hue='size',
    kind='reg',
    diag_kind='kde',
    height=5,
    palette='husl',
)

In [None]:
# Pairwise scatter plots / histograms for the numeric columns of the frame.
sns.pairplot(data=df)

In [None]:
# Column dtypes before encoding; the object columns are the ones to be label-encoded.
df.info()

## Encoding

In [None]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder 

# Single LabelEncoder instance; it is re-fitted for each column it is applied to.
encoder = LabelEncoder() 

In [None]:
# Label-encode every object (string) column and store the codes as float64.
#
# fit_transform both fits the encoder and returns the codes, so the separate
# transform() call of the earlier version was redundant. Casting the result
# directly also replaces the nested loop that re-scanned every column after
# each encoding and relied on `dtype == 'int'`, whose meaning depends on the
# platform's default integer width.
for col in df.select_dtypes(include='object').columns:
    df[col] = encoder.fit_transform(df[col]).astype('float64')

# Keep the earlier behaviour of promoting any remaining integer column to float64.
int_cols = df.select_dtypes(include='integer').columns
if len(int_cols) > 0:
    df[int_cols] = df[int_cols].astype('float64')

In [None]:
# Confirm the dtypes after label encoding.
df.info()

In [None]:
# Preview the encoded frame.
df.head()

## Splitting data

In [None]:
# Target is price_rupiah; every other column is a feature.
y = df.loc[:, 'price_rupiah']
X = df.drop(columns=['price_rupiah'])

X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from sklearn.ensemble import  RandomForestRegressor

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Model Building 

### Random Forest Regressor 

In [None]:
# Fit a random forest on the training split (fit returns the estimator itself)
# and predict prices for the held-out test rows.
model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)

In [None]:
# Random forest: metrics on the test split (R2 is reported as a percentage).
for label, score in (
    ('Testing R2 Score: ', r2_score(y_test, pred_rf) * 100),
    ('Testing RMSE: ', np.sqrt(mean_squared_error(y_test, pred_rf))),
    ('Testing MAE: ', mean_absolute_error(y_test, pred_rf)),
    ('Testing MSE: ', mean_squared_error(y_test, pred_rf)),
):
    print(label, score)

In [None]:
# Predictions on the training split, used to compare training and testing metrics.
pred_rf_trn = model_rf.predict(X_train)

In [None]:
# Random forest: metrics on the training split (R2 is reported as a percentage).
for label, score in (
    ('Training R2 Score: ', r2_score(y_train, pred_rf_trn) * 100),
    ('Training RMSE: ', np.sqrt(mean_squared_error(y_train, pred_rf_trn))),
    ('Training MAE: ', mean_absolute_error(y_train, pred_rf_trn)),
    ('Training MSE: ', mean_squared_error(y_train, pred_rf_trn)),
):
    print(label, score)

In [None]:
# Horizontal bar chart of the (up to) ten largest random-forest feature importances.
feat_importances_rf = pd.Series(data=model_rf.feature_importances_, index=X.columns)
feat_importances_rf.nlargest(n=10).plot.barh()
plt.show()

* Here, `diameter` has the highest feature importance.

### XGBoost Regressor

In [None]:
from xgboost import XGBRFRegressor

In [None]:
# Fit an XGBRFRegressor (max depth 8, 10 estimators) on the training split
# and predict prices for the held-out test rows.
# NOTE(review): XGBRFRegressor is xgboost's random-forest-style estimator; the
# section heading says "XGBoost Regressor", so the gradient-boosted XGBRegressor
# may have been intended — confirm.
model_xgb = XGBRFRegressor(max_depth=8, n_estimators = 10)
model_xgb.fit(X_train, y_train)
pred_xgb = model_xgb.predict(X_test)

In [None]:
# XGBoost model: metrics on the test split (R2 is reported as a percentage).
for label, score in (
    ('Testing R2 Score: ', r2_score(y_test, pred_xgb) * 100),
    ('Testing RMSE: ', np.sqrt(mean_squared_error(y_test, pred_xgb))),
    ('Testing MAE: ', mean_absolute_error(y_test, pred_xgb)),
    ('Testing MSE: ', mean_squared_error(y_test, pred_xgb)),
):
    print(label, score)

In [None]:
# Predictions on the training split, used to compare training and testing metrics.
pred_xgb_trn = model_xgb.predict(X_train)

In [None]:
# XGBoost model: metrics on the training split (R2 is reported as a percentage).
for label, score in (
    ('Training R2 Score: ', r2_score(y_train, pred_xgb_trn) * 100),
    ('Training RMSE: ', np.sqrt(mean_squared_error(y_train, pred_xgb_trn))),
    ('Training MAE: ', mean_absolute_error(y_train, pred_xgb_trn)),
    ('Training MSE: ', mean_squared_error(y_train, pred_xgb_trn)),
):
    print(label, score)

In [None]:
# Horizontal bar chart of the (up to) ten largest XGBoost-model feature importances.
feat_importances_xbg = pd.Series(data=model_xgb.feature_importances_, index=X.columns)
feat_importances_xbg.nlargest(n=10).plot.barh()
plt.show()

* `diameter` and `size` have the highest feature importance in this case.

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit a decision tree on the training split and predict prices for the test rows.
# random_state is fixed (matching the random forest above) because the tree
# permutes features at each split; without a seed, ties between equally good
# splits can be broken differently from run to run, changing the reported metrics.
model_dt = DecisionTreeRegressor(random_state=42)
model_dt.fit(X_train, y_train)
pred_dt = model_dt.predict(X_test)

In [None]:
# Decision tree: metrics on the test split (R2 is reported as a percentage).
for label, score in (
    ('Testing R2 Score: ', r2_score(y_test, pred_dt) * 100),
    ('Testing RMSE: ', np.sqrt(mean_squared_error(y_test, pred_dt))),
    ('Testing MAE: ', mean_absolute_error(y_test, pred_dt)),
    ('Testing MSE: ', mean_squared_error(y_test, pred_dt)),
):
    print(label, score)

In [None]:
# Predictions on the training split, used to compare training and testing metrics.
pred_dt_trn = model_dt.predict(X_train)

In [None]:
# Decision tree: metrics on the training split (R2 is reported as a percentage).
for label, score in (
    ('Training R2 Score: ', r2_score(y_train, pred_dt_trn) * 100),
    ('Training RMSE: ', np.sqrt(mean_squared_error(y_train, pred_dt_trn))),
    ('Training MAE: ', mean_absolute_error(y_train, pred_dt_trn)),
    ('Training MSE: ', mean_squared_error(y_train, pred_dt_trn)),
):
    print(label, score)

In [None]:
# Horizontal bar chart of the (up to) ten largest decision-tree feature importances.
feat_importances_dt = pd.Series(data=model_dt.feature_importances_, index=X.columns)
feat_importances_dt.nlargest(n=10).plot.barh()
plt.show()

* The Decision Tree and Random Forest models give almost the same feature importance values.

## Thank you for reading this notebook. Please give it a like 👍 if you find it useful, and also check out my other notebooks. 😀