In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Load the raw pizza price table from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/pizza-price-prediction/pizza_v1.csv')

In [None]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(data.shape)
data.head()

In [None]:
# Summarise column dtypes and non-null counts to spot columns that need cleaning.
data.info()

#### 1. Data Cleaning

In [None]:
# Clean "price" column
# The raw column holds currency strings; rename it and convert it to integers.
data.rename(columns={'price_rupiah': 'price'}, inplace=True)

# Skip the string parsing when the column is already numeric so that re-running
# this cell does not fail on the `.str` accessor.
if not pd.api.types.is_numeric_dtype(data['price']):
    # Capture the first run of digits and thousands separators. Unlike
    # r'(\d*,\d*)' this also handles values with several commas or none at all.
    price_digits = data['price'].str.extract(r'(\d[\d,]*)', expand=False)
    data['price'] = price_digits.str.replace(',', '', regex=False).astype(int)

In [None]:
# Fix typos
# Misspelled label -> corrected label, grouped by the column they occur in.
typo_fixes = {
    'size': {'reguler': 'regular'},
    'variant': {'spicy tuna': 'spicy_tuna', 'gournet_greek': 'gourmet_greek'},
    'topping': {'papperoni': 'pepperoni', 'black papper': 'black pepper'},
}
for column, corrections in typo_fixes.items():
    # Exact-value replacement; labels not listed above are left untouched.
    data[column] = data[column].replace(corrections)

In [None]:
# Display the first rows again to check the result of the cleaning cells.
data.head()

In [None]:
# Count how many rows each topping value has.
data['topping'].value_counts()

In [None]:
# Remove every row whose topping is 'onion' (dropped in place, by index label).
onion = data.index[data['topping'].eq('onion')]
data.drop(index=onion, inplace=True)

#### 2. Data Visualization

In [None]:
# Mean price per company, drawn as a vertical bar chart.
(
    data.groupby('company')['price']
    .mean()
    .plot.bar(title="Price vs Company")
)
plt.show()

In [None]:
# Mean price per pizza size, drawn as a vertical bar chart.
(
    data.groupby('size')['price']
    .mean()
    .plot.bar(title="Price vs Size")
)
plt.show()

In [None]:
# Mean price per variant; horizontal bars keep the many labels readable.
(
    data.groupby('variant')['price']
    .mean()
    .plot.barh(figsize=(10, 6), title="Price vs Variant")
)
plt.show()

In [None]:
# Mean price per topping; horizontal bars keep the many labels readable.
(
    data.groupby('topping')['price']
    .mean()
    .plot.barh(figsize=(10, 6), title="Price vs Topping")
)
plt.show()

In [None]:
# Side-by-side mean price for the two yes/no extras.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (ax1, 'extra_sauce', "Price vs Extra Sauce"),
    (ax2, 'extra_cheese', "Price vs Extra Cheese"),
)
for axis, extra_col, panel_title in panels:
    data.groupby(extra_col)['price'].mean().plot.bar(
        ylabel='price', ax=axis, title=panel_title
    )
plt.show()

In [None]:
# Scatter of price (x) against diameter (y), coloured by company.
plt.figure(figsize=(10,5))
sns.scatterplot(x='price', y='diameter', data=data, hue='company')
plt.title("Price vs Diameter")
plt.grid(True)
# The stray plt.subplot() call that used to sit here was removed: the figure
# already has its axes, so the call drew nothing.
plt.show()

#### 2a. Count Plots

In [None]:
# Number of rows per company.
sns.countplot(data=data, x='company')
plt.show()

In [None]:
# Number of rows per topping; tick labels are rotated so they do not overlap.
sns.countplot(data=data, x='topping')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of rows per pizza size.
sns.countplot(data=data, x='size')
plt.show()

In [None]:
# Number of rows per variant; tick labels are rotated so they do not overlap.
sns.countplot(data=data, x='variant')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of rows per distinct diameter value.
sns.countplot(data=data, x='diameter')
plt.show()

In [None]:
# Row counts for the two yes/no extras, one panel each.
fig, ax = plt.subplots(1,2)
sns.countplot(x=data['extra_sauce'], ax=ax[0])
sns.countplot(x=data['extra_cheese'], ax=ax[1])
# tight_layout must run after the artists exist; calling it on the empty
# figure cannot account for the labels drawn afterwards.
fig.tight_layout()
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

#### 3. Preprocessing

In [None]:
# Preprocess a copy so the cleaned `data` frame itself is not modified below.
df = data.copy()

Split data

In [None]:
# Target is the price column; every other column is a feature.
y = df['price']
X = df.drop(columns=['price'])

# 80 % of the rows go to training; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [None]:
# Report the shape of each split; every line uses the same "<name> <shape>" format.
print(f"X_train {X_train.shape}")
print(f"X_test {X_test.shape}")
print(f"y_train {y_train.shape}")
print(f"y_test {y_test.shape}")

In [None]:
# Feature columns grouped by the encoding applied to them in the cells below.

# yes/no flags -> mapped to 1/0
binary_cols = [
    'extra_sauce',
    'extra_cheese',
]

# few distinct values -> one-hot encoded
low_cardinality_cols = [
    'company',
    'size',
]

# many distinct values -> ordinal encoded
high_cardinality_cols = [
    'topping',
    'variant',
]

# already numeric -> passed through unchanged
numerical_cols = [
    'diameter',
]

Encode

In [None]:
def binary_encoder(df, cols):
    """Encode yes/no columns as 1/0 and return them as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode. It is not modified.
    cols : list of str
        Names of the yes/no columns to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df[cols]`` where "yes" becomes 1 and "no" becomes 0.
        String matching ignores case and surrounding whitespace. Values that
        are already 0/1 are kept, so encoding an already-encoded frame (for
        example when the notebook cell is re-run) gives the same result
        instead of NaN. Any other value becomes NaN.
    """
    binary_mapping = {"yes": 1, "no": 0, 1: 1, 0: 0}

    def _encode(value):
        # Normalise strings so "Yes", " no " etc. are recognised.
        if isinstance(value, str):
            value = value.strip().lower()
        return binary_mapping.get(value, float("nan"))

    temp = df[cols].copy()

    for col in temp.columns:
        temp[col] = temp[col].map(_encode)

    return temp

# Replace the yes/no columns of both splits with their 1/0 encoding.
for split in (X_train, X_test):
    split[binary_cols] = binary_encoder(split, binary_cols)

In [None]:
# --- One-hot encode the low-cardinality columns ---------------------------
# No `sparse` / `sparse_output` argument is passed because its name differs
# between scikit-learn releases; the default sparse result is densified with
# .toarray() instead. Categories that occur only in the test split are encoded
# as all-zero rows rather than raising.
OH_encoder = OneHotEncoder(handle_unknown='ignore')

X_train_oh = OH_encoder.fit_transform(X_train[low_cardinality_cols]).toarray()
# get_feature_names_out replaces get_feature_names, which newer releases removed.
colnames = OH_encoder.get_feature_names_out(low_cardinality_cols)
X_train_oh = pd.DataFrame(X_train_oh, index=X_train.index, columns=colnames)

# The encoder is fitted on the training split only and reused for the test split.
X_test_oh = OH_encoder.transform(X_test[low_cardinality_cols]).toarray()
X_test_oh = pd.DataFrame(X_test_oh, index=X_test.index, columns=colnames)

# --- Ordinal-encode the high-cardinality columns --------------------------
# Categories unseen during fit are mapped to -1 instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Keep the original column names: the default integer labels (0, 1) would mix
# int and str column names after the concat below, which newer scikit-learn
# estimators reject.
X_train_label = pd.DataFrame(
    ordinal_encoder.fit_transform(X_train[high_cardinality_cols]),
    index=X_train.index,
    columns=high_cardinality_cols,
)
X_test_label = pd.DataFrame(
    ordinal_encoder.transform(X_test[high_cardinality_cols]),
    index=X_test.index,
    columns=high_cardinality_cols,
)

# --- Reassemble the feature matrices ---------------------------------------
# NOTE: X_train / X_test are overwritten here, so this cell must run exactly
# once after the split and binary-encoding cells.
X_train = pd.concat([X_train[numerical_cols], X_train_oh, X_train_label, X_train[binary_cols]], axis=1)
X_test = pd.concat([X_test[numerical_cols], X_test_oh, X_test_label, X_test[binary_cols]], axis=1)

Scale

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)

# Wrap the scaled arrays back into DataFrames so index and column names survive.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

#### 4. Fitting XGB Regressor

In [None]:
# Gradient-boosted regressor with fixed hyperparameters and seed.
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.9,
    max_depth=10,
    alpha=1,
    random_state=1,
    n_jobs=4,
)
xgb.fit(X_train, y_train, verbose=False)

# Evaluate on the held-out split.
y_preds = xgb.predict(X_test)
mae = mean_absolute_error(y_test, y_preds)

print(f"Mean Absolute Error: {mae:.2f}\n")
print(f"R^2 score: {xgb.score(X_test, y_test):.4f}")

In [None]:
# Actual vs predicted price on the test split, with a fitted regression line in red.
plt.figure(figsize=(10,5))
sns.regplot(x=y_test, y=y_preds, line_kws={"color": "red"})
plt.title("Actual vs Predicted Price")
plt.xlabel("Actual")
plt.ylabel("Predicted")
# Same limits on both axes so the plot can be compared against the diagonal.
plt.xlim(0, 250000)
plt.ylim(0, 250000)
plt.grid(True)
# The stray plt.subplot() call that used to sit here was removed: the figure
# already has its axes, so the call drew nothing.
plt.show()