In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn — consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw CSV once; later cells work on a copy of this frame.
DATA_PATH = '/kaggle/input/pizza-price-prediction/pizza_v2.csv'
main_df = pd.read_csv(DATA_PATH)

In [4]:
# Work on an independent copy so the raw frame stays untouched, then preview it.
df = main_df.copy(deep=True)
df.head(n=5)

In [5]:
# (number of rows, number of columns) of the working frame.
df.shape

In [6]:
# Column names, non-null counts and dtypes.
df.info()

In [7]:
# Number of distinct values in each column.
df.nunique()

In [8]:
# dtype of each column.
df.dtypes

In [9]:
# Per-column count of missing values.
df.isna().sum() # author's note: no null values present (confirm against the output)

In [10]:
# Visualise the boolean missing-value mask as a heatmap.
null_mask = df.isnull()
sns.heatmap(data=null_mask)

In [11]:
# Strip the unit / currency text and the thousands separators, then cast to float.
# The values are re-derived from the untouched `main_df` so this cell can be re-run
# safely: reading from `df` itself fails on a second run because the columns are
# already numeric and no longer expose the `.str` accessor.
# regex=False makes every replacement a literal substring replacement.
df['diameter'] = (
    main_df['diameter']
    .str.replace('inch', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype('float64')
)
df['price_rupiah'] = (
    main_df['price_rupiah']
    .str.replace('Rp', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype('float64')
)

In [12]:
# Preview the frame after the numeric conversion of diameter and price_rupiah.
df.head()

In [13]:
df.describe() # statistical summary of the data set

In [14]:
# Correlation matrix of the numeric columns.
# The frame still contains text columns at this point; calling DataFrame.corr() on
# them raises in pandas >= 2.0, so the numeric columns are selected explicitly.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(6,5))
sns.heatmap(numeric_df.corr(), annot=True)
plt.show()

# Univariate Analysis

In [15]:
# Left: density-scaled histogram with KDE overlay of price_rupiah.
# Right: Q-Q plot against a normal distribution with a standardized reference line.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the documented
# replacement, and kde_kws=dict(cut=3) keeps the KDE extent distplot used.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,4))
sns.histplot(df['price_rupiah'], kde=True, stat='density', kde_kws=dict(cut=3),
             ax=ax1, color='red')
ax1.set(title='price_rupiah distribution')
qqplot(df['price_rupiah'], ax=ax2, line='s')
ax2.set(title='Quantile quantile plot')
plt.show()  # also suppresses the text repr of the last expression

In [16]:
# Left: density-scaled histogram with KDE overlay of diameter.
# Right: Q-Q plot against a normal distribution with a standardized reference line.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the documented
# replacement, and kde_kws=dict(cut=3) keeps the KDE extent distplot used.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,4))
sns.histplot(df['diameter'], kde=True, stat='density', kde_kws=dict(cut=3),
             ax=ax1, color='red')
ax1.set(title='diameter distribution')
qqplot(df['diameter'], ax=ax2, line='s')
ax2.set(title='Quantile quantile plot')
plt.show()  # also suppresses the text repr of the last expression

In [17]:
# Skewness of each numeric column, largest first.
# Rule of thumb: a value between -0.5 and 0.5 indicates a roughly symmetric
# distribution (not necessarily a normal one); outside that range it is skewed.
# Only numeric columns are used: DataFrame.skew() on the text columns raises in
# pandas >= 2.0.
skedval = df.select_dtypes(include='number').skew().sort_values(ascending=False)
skedval

# Bivariate Analysis

In [18]:
# Bar chart of price_rupiah against diameter, coloured by `size`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='size',
)
fig.show()

In [19]:
# Bar chart of price_rupiah against diameter, coloured by `topping`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='topping',
)
fig.show()

In [20]:
# Bar chart of price_rupiah against diameter, coloured by `variant`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='variant',
)
fig.show()

In [21]:
# Bar chart of price_rupiah against diameter, coloured by `extra_sauce`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='extra_sauce',
)
fig.show()

In [22]:
# Bar chart of price_rupiah against diameter, coloured by `extra_cheese`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='extra_cheese',
)
fig.show()

In [23]:
# Bar chart of price_rupiah against diameter, coloured by `extra_mushrooms`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='extra_mushrooms',
)
fig.show()

In [24]:
# Bar chart of price_rupiah against diameter, coloured by `company`.
fig = px.bar(
    data_frame=df,
    x='diameter',
    y='price_rupiah',
    color='company',
)
fig.show()

In [25]:
# Mean price_rupiah per `size` category (seaborn's default estimator).
sns.barplot(data=df, x='size', y='price_rupiah', saturation=0.90)

In [26]:
# Mean price_rupiah per `company` (seaborn's default estimator).
sns.barplot(data=df, x='company', y='price_rupiah', saturation=0.90)

In [27]:
# Histogram of price_rupiah, one facet row per company.
fig = px.histogram(
    data_frame=df,
    x="price_rupiah",
    facet_row="company",
    template='plotly_dark',
)
fig.show()

In [28]:
# Donut chart of the share of rows per `size` value.
fig = px.pie(
    data_frame=df,
    names="size",
    hole=0.4,
    template="gridon",
)
fig.show()

In [29]:
# Donut chart of the share of rows per `topping` value.
fig = px.pie(
    data_frame=df,
    names="topping",
    hole=0.4,
    template="gridon",
)
fig.show()

In [30]:
# Donut chart of the share of rows per `extra_cheese` value.
fig = px.pie(
    data_frame=df,
    names="extra_cheese",
    hole=0.4,
    template="gridon",
)
fig.show()

In [31]:
# Donut chart of the share of rows per `extra_mushrooms` value.
fig = px.pie(
    data_frame=df,
    names="extra_mushrooms",
    hole=0.4,
    template="gridon",
)
fig.show()

In [32]:
# Donut chart of the share of rows per `extra_sauce` value.
fig = px.pie(
    data_frame=df,
    names="extra_sauce",
    hole=0.4,
    template="gridon",
)
fig.show()

In [33]:
# Re-check column dtypes and non-null counts after the numeric conversion above.
df.info()

In [34]:
# Diameter (y) against price (x) with one OLS trendline per `extra_mushrooms` value.
fig = px.scatter(
    data_frame=df,
    x="price_rupiah",
    y="diameter",
    color="extra_mushrooms",
    template="plotly_dark",
    trendline="ols",
)
fig.show()

In [35]:
# Diameter (y) against price (x) with one OLS trendline per `extra_sauce` value.
fig = px.scatter(
    data_frame=df,
    x="price_rupiah",
    y="diameter",
    color="extra_sauce",
    template="plotly_dark",
    trendline="ols",
)
fig.show()

# Multivariate Analysis

In [36]:
# Pairwise regression plots of diameter and price, split by `size`, with KDEs on
# the diagonal. `height` replaces the `size` keyword, which seaborn renamed.
sns.pairplot(
    data=df,
    vars=['diameter', 'price_rupiah'],
    hue='size',
    kind='reg',
    diag_kind='kde',
    height=5,
    palette='husl',
)

In [37]:
# Pairwise relationships across the frame using seaborn's default settings.
sns.pairplot(df)

# Encoding

In [38]:
# importing LabelEncoder from sklearn.preprocessing
from sklearn.preprocessing import LabelEncoder

In [39]:
# Single LabelEncoder instance; the encoding cell below refits it for each column.
encoder = LabelEncoder()

In [40]:
# Label-encode every text (object) column and leave the whole frame as float64.
#
# Changes from the earlier version of this cell:
#   * one fit_transform per column instead of a discarded fit_transform plus a
#     second transform;
#   * the integer -> float pass is no longer nested inside the encoding loop and no
#     longer compares against the platform-dependent 'int' dtype alias;
#   * the learned categories of each column are kept in `category_levels`, so the
#     codes can be mapped back to labels (the shared encoder is refit per column
#     and only remembers the last one).
category_levels = {}
for col in df.select_dtypes(include='object').columns:
    df[col] = encoder.fit_transform(df[col]).astype('float64')
    category_levels[col] = list(encoder.classes_)

# Any remaining integer columns are cast so every feature shares the float64 dtype.
for col in df.columns:
    if pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].astype('float64')

In [41]:
# Preview the frame after label encoding.
df.head()

# splitting the data 

In [42]:
# Separate the prediction target from the feature matrix.
TARGET = 'price_rupiah'
y = df[TARGET]
X = df.drop(columns=[TARGET])


In [43]:
from sklearn.model_selection import train_test_split


In [44]:
 # Hold out 33% of the rows for testing; the seed is fixed so the split is repeatable.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.33,
     random_state=42,
 )

In [45]:
# Display the training target series.
y_train

# Machine learning algorithm implementation

In [46]:
# importing Random forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error

# Fit a random forest on the training split and report R^2 on the held-out split.
# random_state is fixed (same seed as the train/test split) so the score and the
# feature importances below are reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)



In [47]:
# Random-forest predictions for the held-out test features.
pred  = rf.predict(X_test)
pred

In [67]:
# Random-forest metrics on the test split; the MSE is computed once and reused.
mse_rf_test = mean_squared_error(y_test, pred)
print('Testing R2 Score: ', r2_score(y_test, pred)*100)
print('Testing RMSE: ', np.sqrt(mse_rf_test))
print('Testing MAE: ', mean_absolute_error(y_test, pred))
print('Testing MSE: ', mse_rf_test)

In [49]:
# Random-forest predictions on the TRAINING features.
# NOTE(review): despite the name, `pred_test` holds training-set predictions; it is
# consumed by the training-metrics cell that follows.
pred_test = rf.predict(X_train)
pred_test

In [50]:
# Random-forest metrics on the training split (`pred_test` holds the training-set
# predictions). The R2 line had been commented out because it called the model
# object instead of r2_score; it is restored so train and test scores can be compared.
print('Training R2 Score: ', r2_score(y_train, pred_test)*100)
print('Training RMSE: ', np.sqrt(mean_squared_error(y_train, pred_test)))
print('Training MAE: ', mean_absolute_error(y_train, pred_test))
print('Training MSE: ', mean_squared_error(y_train, pred_test))

In [51]:
# Plot the (up to) ten largest random-forest feature importances.
feat_importances_rf = pd.Series(data=rf.feature_importances_, index=X.columns)
top_rf = feat_importances_rf.nlargest(10)
top_rf.plot(kind='barh')
plt.show()

**Diameter has the highest feature importance.**

# XGBoost Random Forest Regressor (XGBRFRegressor)

In [52]:
from xgboost import XGBRFRegressor

In [53]:
# Fit XGBoost's XGBRFRegressor estimator and predict on the test split.
xgb_params = {'max_depth': 8, 'n_estimators': 10}
model_xgb = XGBRFRegressor(**xgb_params)
model_xgb.fit(X_train, y_train)
pred_xgb = model_xgb.predict(X_test)

In [54]:
# XGBRFRegressor predictions on the training features, for the train-side metrics.
pred_xgb_trn = model_xgb.predict(X_train)

In [55]:
# XGBRFRegressor metrics on the training split.
print('Training R2 Score: ', r2_score(y_train, pred_xgb_trn)*100)
print('Training RMSE: ', np.sqrt(mean_squared_error(y_train, pred_xgb_trn)))
print('Training MAE: ', mean_absolute_error(y_train, pred_xgb_trn))
print('Training MSE: ', mean_squared_error(y_train, pred_xgb_trn))

# Test-split metrics: `pred_xgb` was computed when the model was fitted but never
# evaluated, so this model could not be compared with the random forest and the
# decision tree, which both report test scores.
print('Testing R2 Score: ', r2_score(y_test, pred_xgb)*100)
print('Testing RMSE: ', np.sqrt(mean_squared_error(y_test, pred_xgb)))
print('Testing MAE: ', mean_absolute_error(y_test, pred_xgb))
print('Testing MSE: ', mean_squared_error(y_test, pred_xgb))

In [56]:
# Plot the (up to) ten largest XGBRFRegressor feature importances.
feat_importances_xbg = pd.Series(data=model_xgb.feature_importances_, index=X.columns)
top_xgb = feat_importances_xbg.nlargest(10)
top_xgb.plot(kind='barh')
plt.show()

**Diameter and size have the highest feature importances.**

# Decision Tree

In [58]:
from sklearn.tree import DecisionTreeRegressor

In [62]:
# Fit a decision tree on the training split and predict on the test split.
# random_state is fixed (same seed as the train/test split) so tie-breaking between
# equally good splits, and therefore the reported scores, are reproducible.
model_dt = DecisionTreeRegressor(random_state=42)
model_dt.fit(X_train, y_train)
pred_dt = model_dt.predict(X_test)

In [63]:
# Decision-tree metrics on the test split; the MSE is computed once and reused.
mse_dt_test = mean_squared_error(y_test, pred_dt)
print('Testing R2 Score: ', r2_score(y_test, pred_dt)*100)
print('Testing RMSE: ', np.sqrt(mse_dt_test))
print('Testing MAE: ', mean_absolute_error(y_test, pred_dt))
print('Testing MSE: ', mse_dt_test)

In [64]:
# Decision-tree predictions on the training features, for the train-side metrics.
pred_dt_trn = model_dt.predict(X_train)

In [65]:
# Decision-tree metrics on the training split; the MSE is computed once and reused.
mse_dt_train = mean_squared_error(y_train, pred_dt_trn)
print('Training R2 Score: ', r2_score(y_train, pred_dt_trn)*100)
print('Training RMSE: ', np.sqrt(mse_dt_train))
print('Training MAE: ', mean_absolute_error(y_train, pred_dt_trn))
print('Training MSE: ', mse_dt_train)

In [66]:
# Plot the (up to) ten largest decision-tree feature importances.
feat_importances_dt = pd.Series(data=model_dt.feature_importances_, index=X.columns)
top_dt = feat_importances_dt.nlargest(10)
top_dt.plot(kind='barh')
plt.show()

**The decision tree and the random forest give almost the same feature importances.**