# Machine Learning for Pizza Prediction

### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics

from xgboost import XGBRegressor

In [None]:
# Read the raw pizza dataset from the competition input folder and preview it
df = pd.read_csv(
    '../input/pizza-price-prediction/pizza_v2.csv'
)
df.head()

# Visualization

In [None]:
# Rename the raw price column to a shorter name
df = df.rename(columns={'price_rupiah': 'price'})

# Strip the 'Rp' prefix and the thousands separators (literal, non-regex
# replacements), then convert the remaining digits to integer
df['price'] = (
    df['price']
    .str.replace('Rp', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Extract the numeric part of diameter and convert it to float.
# The raw string avoids the invalid '\d' escape warning, and expand=False
# makes str.extract return a Series instead of a one-column DataFrame.
df['diameter'] = (
    df['diameter']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)

In [None]:
# Distribution of pizza prices, 30 bins
sns.histplot(x='price', data=df, bins=30)

In [None]:
# Point plot of price against diameter, with rows ordered by diameter
df_by_diameter = df.sort_values(by='diameter')
sns.pointplot(x='diameter', y='price', data=df_by_diameter)
plt.xticks(rotation=90);

In [None]:
# Price distribution per company: violin plot (left) and box plot (right),
# drawn on a shared y-axis so the two panels are directly comparable
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(13, 4))
violin_ax, box_ax = ax[0], ax[1]

sns.violinplot(x='company', y='price', data=df, inner="quartile", ax=violin_ax)
sns.boxplot(x='company', y='price', data=df, ax=box_ax)

plt.tight_layout()

In [None]:
# Box plot of price for each pizza variant; x labels rotated for legibility
plt.figure(figsize=(13, 3))
sns.boxplot(x='variant', y='price', data=df)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Box plot of price for each topping; x labels rotated for legibility
plt.figure(figsize=(10, 3))
sns.boxplot(x='topping', y='price', data=df)
plt.xticks(rotation=90)
plt.show()

# Encoding Categorical Data

In [None]:
# Encode the yes/no "extra" columns as 1/0
map_dict = {'yes':1, 'no':0}

for extra_col in ('extra_cheese', 'extra_mushrooms', 'extra_sauce'):
    df[extra_col] = df[extra_col].map(map_dict)

df.head(3)

In [None]:
# One-hot encode the remaining categorical columns into dummy variables.
# With no `columns` argument, get_dummies only converts object/string/category
# columns, so the numeric 'price' target is left untouched.
# NOTE(review): presumably this covers company, topping and variant (plus any
# other text columns in the CSV) — confirm against df.dtypes.
df = pd.get_dummies(df)

# Modeling

In [None]:
# Target is the price column; features are every other column
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Histogram of the training target before any transformation
y_train.hist()

* The target variable is right-skewed. To bring its distribution closer to normal, we apply a log transformation.

In [None]:
# Apply a base-2 log to both target splits, then re-plot the training target.
# Note: running this cell twice applies the log twice.
y_train, y_test = np.log2(y_train), np.log2(y_test)

y_train.hist()

#### Simple GridSearch with 5-fold cross validation

In [None]:
# Hyper-parameter grid: 3 * 4 * 5 = 60 combinations
param_grid = {
    'n_estimators': [2000, 4000, 6000],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5],
}

# cv is set explicitly so the search is 5-fold, as the section title states,
# regardless of the scikit-learn version's default (60 combinations x 5 folds
# = 300 fits, plus the final refit on the full training set).
grid = model_selection.GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)

grid.fit(X_train, y_train)

##### Best parameters based on the GridSearch

In [None]:
# Parameter combination with the best mean cross-validated score
grid.best_params_

# Metrics

In [None]:
# Predict on the held-out test set with the refit best estimator.
# y_test was log2-transformed above, so all metrics below are in log2(price) units.
y_pred = grid.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE and RMSE are
# symmetric, but R^2 is not, so the order matters for the final score.
mae = metrics.mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae:.5f}')

# Take the square root of MSE directly instead of using `squared=False`,
# which is deprecated and removed in recent scikit-learn releases.
rsme = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(f'RMSE: {rsme:.5f}')

r2_score = metrics.r2_score(y_test, y_pred)
print(f'r2 score: {r2_score:.5f}')