In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pizza CSV from the relative input directory into `df`, then show it.
df = pd.read_csv(filepath_or_buffer="../input/pizza-price-prediction/pizza_v1.csv")
df

In [None]:
# Count the missing entries in each column of the dataset.
df.isnull().sum()

In [None]:
# Convert price_rupiah from text to float by stripping the 'Rp' marker and
# the ',' characters. The conversion is vectorised with the .str accessor
# and guarded on dtype, so re-running this cell once the column is already
# numeric is a no-op instead of failing on float.replace.
if not pd.api.types.is_numeric_dtype(df['price_rupiah']):
    df['price_rupiah'] = (
        df['price_rupiah']
        .str.replace('Rp', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

## **Exploratory Data Analysis**

## **Check Data Type**

In [None]:
# Display the dtype of every column in df.
df.dtypes

In [None]:
# Count how many columns share each dtype. The values are passed through the
# explicit `x=` keyword (as the other countplot calls in this notebook do)
# because seaborn >= 0.12 treats the first positional argument as `data`,
# and seaborn 0.11 emits a FutureWarning for a positional vector.
sns.countplot(x=df.dtypes.map(str))

In [None]:
# Names of all columns whose dtype is 'object', in column order.
categoric_cols = [
    col_name
    for col_name, col_dtype in df.dtypes.items()
    if col_dtype == 'object'
]
categoric_cols

In [None]:
# Names of all columns whose dtype is 'float64', in column order.
numeric_cols = [
    col_name
    for col_name, col_dtype in df.dtypes.items()
    if col_dtype == 'float64'
]
numeric_cols

## **Univariate Analysis**

In [None]:
# One count plot per listed column, laid out side by side in a single row
# of six panels that share the y-axis.
count_features = ['company', 'topping', 'variant', 'size', 'extra_sauce', 'extra_cheese']
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(20, 5), sharey=True)
for panel, feature in zip(axes, count_features):
    sns.countplot(x=df[feature], ax=panel)

- From the visualization above, we can conclude that company, extra_sauce and extra_cheese can be turned into new columns with one-hot encoding.

- The topping, variant and size columns could be label encoded.

In [None]:
# Line plot of the diameter column against the row index.
diameter_values = df['diameter']
diameter_values.plot(kind='line', figsize=(12, 8))
plt.show()

In [None]:
# Histogram of every column listed in numeric_cols.
numeric_frame = df.loc[:, numeric_cols]
numeric_frame.hist(figsize=(12, 12))
plt.show()

## **Bivariate Analysis**

In [None]:
# Scatter price_rupiah (y-axis) against every column of df (x-axis),
# one panel per column.
g = sns.PairGrid(
    data=df,
    y_vars=['price_rupiah'],
    x_vars=df.columns,
)
g.map(sns.scatterplot)

In [None]:
# Sum price_rupiah within each company and plot the totals.
price_sum_by_company = df.groupby('company')['price_rupiah'].sum()
price_sum_by_company.plot()
plt.show()

- We can see which company has the highest total price (the sum of `price_rupiah` over its pizzas); note that this is a sum of prices, not profit.

- The price format was changed so that the column is numeric rather than object dtype, with the "Rp" marker and the commas removed.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Separate the target column (price_rupiah) from the predictors, then hold
# out 20% of the rows as a test set using a fixed random seed.
target_col = 'price_rupiah'
y = df[target_col]
X = df.drop(target_col, axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Numeric branch: standardise the values.
numerical_pipeline = Pipeline(steps=[('scaling', StandardScaler())])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time (handle_unknown='ignore').
categorical_pipeline = Pipeline(
    steps=[('label', OneHotEncoder(handle_unknown='ignore'))]
)

# Send 'diameter' through the numeric branch and every column named in
# categoric_cols through the categorical branch.
preprocessing = ColumnTransformer(
    transformers=[
        ('numeric', numerical_pipeline, ['diameter']),
        ('cat_sec', categorical_pipeline, categoric_cols),
    ]
)

# Full model: preprocessing followed by a seeded random-forest regressor.
pipeline = Pipeline(
    steps=[
        ('pre', preprocessing),
        ('algo', RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
# Fit the preprocessing + random-forest pipeline on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# List every parameter of the pipeline; the 'algo__...' names shown here are
# the keys used by the hyper-parameter grid defined below.
pipeline.get_params()

In [None]:
# Hyper-parameter grid for the 'algo' step of the pipeline
# (5 depths x 3 leaf sizes x 5 forest sizes = 75 combinations).
parameter = dict(
    algo__max_depth=list(range(20, 101, 20)),
    algo__min_samples_leaf=[1, 2, 4],
    algo__n_estimators=list(range(200, 1001, 200)),
)

In [None]:
# Exhaustive search over `parameter` with 3-fold cross-validation on the
# training split, running on all available cores (n_jobs=-1).
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

In [None]:
# Print the tuned model's score on the training split, then on the test
# split. These are two separate statements: joining the print calls with a
# comma builds a tuple, which the notebook would echo as a stray
# "(None, None)" cell output.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

In [None]:
# Predict on the held-out test split and report the mean absolute error.
# mean_absolute_error is documented as (y_true, y_pred), so the ground truth
# goes first; the value is the same either way because MAE is symmetric.
y_pred = model.predict(X_test)
mae_val = mean_absolute_error(y_test, y_pred)

print("Validation MAE for RandomForestRegressor Model: {:,.0f}".format(mae_val))