# Data Pre-Processing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


## Importing the dataset

In [None]:
# Load the pizza price dataset into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle layout —
# adjust it when running the notebook elsewhere.
df = pd.read_csv('../input/pizza-price-prediction/pizza_v1.csv')


In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()


In [None]:
# Print a summary of the columns: dtypes and non-null counts.
df.info()


In [None]:
# Count the missing values in each column.
df.isnull().sum()


In [None]:
# Descriptive statistics (with pandas' defaults this covers the numeric columns).
df.describe()

In [None]:
# Partition the columns by dtype: object-typed columns on one side, everything
# else on the other.
# NOTE(review): neither frame is referenced again in the visible part of this
# notebook.
df_categorical = df.select_dtypes(include='object')
df_numerical = df.select_dtypes(exclude='object')


In [None]:
# Show the dtype of every column.
df.dtypes


In [None]:
# Frequency tables for the 'company' and 'price_rupiah' columns.
# A notebook cell only renders its last expression, so the first table is
# printed explicitly; as a bare expression its result was silently discarded.
print(df['company'].value_counts())
df["price_rupiah"].value_counts()


In [None]:
# Normalise the column labels in place: the price column becomes 'Cost' and the
# other listed columns get a capitalised first letter. 'topping' is not part of
# the mapping and therefore keeps its original name.
column_renames = {
    'price_rupiah': 'Cost',
    'company': 'Company',
    'diameter': 'Diameter',
    'variant': 'Variant',
    'size': 'Size',
    'extra_sauce': 'Extra_sauce',
    'extra_cheese': 'Extra_cheese',
}
df.rename(columns=column_renames, inplace=True)


In [None]:
# Names (after renaming) of the categorical columns that get one-hot encoded.
# The order matches the order of the encoding calls made further down.
encoding_columns = ['Company', 'topping', 'Variant',
                    'Size', 'Extra_sauce', 'Extra_cheese']


In [None]:
def refining_cost(col, df):
    """Strip the 'Rp' marker and comma separators from a price column, in place.

    Every string entry of ``df[col]`` has all occurrences of ``'Rp'`` and
    ``','`` removed (e.g. ``'Rp235,000'`` becomes ``'235000'``). The values
    stay strings; casting to a numeric dtype is left to the caller.

    Entries that are not strings (missing values, or numbers when the column
    has already been converted) are left untouched, so the function is safe to
    call on partially clean data and safe to call twice.

    Args:
        col: Label of the column to clean.
        df: DataFrame holding the column; it is modified in place.

    Returns:
        None.
    """
    def _strip_currency(value):
        # Only strings carry currency formatting; NaN/None/numbers have no
        # ``replace`` method and nothing to strip.
        if not isinstance(value, str):
            return value
        return value.replace('Rp', '').replace(',', '')

    # Single pass over the column instead of one pass per substring.
    df[col] = df[col].map(_strip_currency)


In [None]:
def categorical_encoding(value, df):
    """One-hot encode the column(s) named by ``value`` and return the result.

    A new ``category_encoders.OneHotEncoder`` is created and fitted on ``df``
    on every call, configured with ``use_cat_names=True`` and
    ``return_df=True``.

    Args:
        value: Column selector forwarded to the encoder's ``cols`` argument.
        df: DataFrame to fit on and transform.

    Returns:
        The output of the encoder's ``fit_transform(df)``.
    """
    encoder = ce.OneHotEncoder(cols=value, use_cat_names=True, return_df=True)
    return encoder.fit_transform(df)


In [None]:
# One-hot encode every categorical column, one column at a time.
# Driving the loop from `encoding_columns` (defined in an earlier cell, same
# names in the same order) avoids six copy-pasted calls and keeps the list of
# encoded columns in a single place.
for column in encoding_columns:
    df = categorical_encoding(column, df)


In [None]:
# Remove the currency formatting from 'Cost' first; the cast to float below
# would fail on the raw formatted strings.
refining_cost('Cost', df)
df['Cost'] = df['Cost'].astype('float64')
# Inspect the dtypes after the conversion.
df.dtypes


In [None]:
# Separate the regression target ('Cost') from the feature matrix (all other
# columns), then report the feature matrix dimensions.
y = df['Cost']
X = df.drop(columns=['Cost'])
print(X.shape)


## Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility) and
# convert each of the four resulting pieces to a NumPy array.
X_train, X_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(X, y, test_size=0.2, random_state=0)
)


# Modeling

## Multiple Linear Regression

In [None]:
# Ordinary least-squares linear regression on all training features.
mul_regressor = LinearRegression()
mul_regressor.fit(X_train, y_train)

## Decision Tree Regression

In [None]:
# Decision tree regressor; random_state is fixed so repeated runs give the
# same tree.
decision_regressor = DecisionTreeRegressor(random_state = 0)
decision_regressor.fit(X_train, y_train)

## Polynomial Regression

In [None]:
# Polynomial regression: expand the training features into polynomial terms up
# to degree 4, then fit a linear model on the expanded matrix.
# `poly_reg` is kept because the same fitted transformer must be applied to
# any data passed to `poly_regressor` later.
# NOTE(review): the number of generated terms grows quickly with the number of
# input columns — confirm degree 4 is intended for this feature matrix.
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
poly_regressor = LinearRegression()
poly_regressor.fit(X_poly, y_train)

## Random Forest Regression

In [None]:
# Random forest of 10 trees; random_state is fixed so repeated runs give the
# same forest.
forest_regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
forest_regressor.fit(X_train, y_train)


# R Squared Score

In [None]:

# Score every fitted model on the held-out test set with R^2.
mul_score = r2_score(y_test, mul_regressor.predict(X_test))
decision_score = r2_score(y_test, decision_regressor.predict(X_test))
# The polynomial model was trained on expanded features, so the test set must
# go through the same, already fitted, transformer before predicting.
poly_score = r2_score(
    y_test, poly_regressor.predict(poly_reg.transform(X_test)))
forest_score = r2_score(y_test, forest_regressor.predict(X_test))

# Key the lookup by model name, not by score: with scores as keys, two models
# with the same R^2 would collide and one of them would silently vanish.
model_scores = {
    "mul_score": mul_score,
    "decision_score": decision_score,
    "poly_score": poly_score,
    "forest_score": forest_score,
}
# On a tie, max() returns the first of the tied names in insertion order.
best_model = max(model_scores, key=model_scores.get)
print("The Regression model with the highest score is:",
      best_model, "=", model_scores[best_model])
