In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

## Data Preprocessing and cleaning

In [None]:
# Load the pizza price dataset from the Kaggle input directory.
pizza_data = pd.read_csv("/kaggle/input/pizza-price-prediction/pizza_v1.csv")
# Summary statistics of the loaded frame, shown as the cell output.
pizza_data.describe()

In [None]:
# Preview the first rows of the loaded data.
pizza_data.head()

In [None]:
# Check for missing data: isnull() flags missing cells, sum() counts them per column.
pizza_data.isnull().sum()

In [None]:
# Re-format the price_rupiah column: strip the "Rp" prefix and the thousands
# separators, then cast to integer.
# The conversion is guarded by a dtype check so that re-running this cell after
# the column is already numeric is a no-op instead of raising AttributeError
# (ints have no .replace method).
if not pd.api.types.is_numeric_dtype(pizza_data['price_rupiah']):
    pizza_data['price_rupiah'] = (
        pizza_data['price_rupiah']
        .str.replace('Rp', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype('int64')
    )
pizza_data.head()

In [None]:
# Check the varieties: frequency of each value in the main categorical columns,
# printed one after another separated by a blank line.
category_columns = ['topping', 'variant', 'size', 'company']
print(*(pizza_data[name].value_counts() for name in category_columns), sep='\n\n')


In [None]:
# Convert the categorical columns to integer codes on a copy of the data so they
# can be plotted and correlated with price; pizza_data itself is left untouched.
from sklearn.preprocessing import LabelEncoder
col = ['topping', 'variant', 'size','company','extra_sauce','extra_cheese']

pizza_labelled = pizza_data.copy()
for column_name in col:
    # Only string-typed (object) columns need encoding; skip anything else.
    if pizza_labelled[column_name].dtype != 'object':
        continue
    # Missing entries become their own category 'N' before encoding.
    filled = pizza_labelled[column_name].fillna('N')
    lbl = LabelEncoder()
    pizza_labelled[column_name] = lbl.fit_transform(filled.values)
pizza_labelled.head()

In [None]:
# Draw histograms of the label-encoded frame's columns on a 13x13 figure.
pizza_labelled.hist(figsize=(13,13))
plt.show()

In [None]:
# Pair plot of the label-encoded data to see the relation between every two columns.
all_columns = list(pizza_labelled.columns)
data_plt = pizza_labelled[all_columns]
sns.pairplot(data_plt)

In [None]:
# Go back to the original data and one-hot encode the categorical columns:
# pd.get_dummies replaces each listed column with one indicator column per
# category value (this is not label encoding — no integer codes are assigned).
col = ['topping', 'variant', 'size','company','extra_sauce','extra_cheese']

pizza_categoried = pizza_data.copy()

# Keep the encoded frame in a variable instead of discarding it, and show only
# the first rows, consistent with the other preview cells.
pizza_onehot = pd.get_dummies(pizza_categoried, columns=col)
pizza_onehot.head()

In [None]:
pip install sweetviz

In [None]:
# Automatic EDA with sweetviz: analyze pizza_data and write the report to an HTML file.
import sweetviz as sv
report = sv.analyze(pizza_data)
report.show_html('pizza_data.html')

In [None]:
# Compare two positional slices of the data with sweetviz: rows from position
# 100 onward against the first 100 rows, written to an HTML file.
# NOTE(review): the split at row 100 is positional and looks arbitrary — confirm it is intentional.
comparision = sv.compare(pizza_data[100:], pizza_data[:100])
comparision.show_html('compare.html')

In [None]:
# Correlation of each column of the label-encoded frame with price_rupiah,
# sorted so the columns with the largest positive correlation come last.
price_correlation = pizza_labelled.corr()['price_rupiah']
price_correlation.sort_values()

In [None]:
# Plot price against diameter with seaborn's regplot (scatter plus a fitted regression line).
# LinearRegression is imported here and used by the model-fitting cells below,
# where the R^2 score is actually computed.
from sklearn.linear_model import LinearRegression

sns.regplot(x="diameter", y='price_rupiah', data = pizza_data)

In [None]:
# Simple linear regression: price as a function of diameter only.
X = pizza_data[['diameter']]
Y = pizza_data['price_rupiah']
lm = LinearRegression().fit(X, Y)
# R^2 score (coefficient of determination), measured on the same data the model was fitted on.
lm.score(X, Y)


In [None]:
# Linear regression on the label-encoded categorical columns.
features = col
# pizza_labelled holds the numeric codes; pizza_data still has the category strings.
X = pizza_labelled[features]
Y = pizza_data['price_rupiah']
lm = LinearRegression().fit(X, Y)
# R^2 score (coefficient of determination), measured on the same data the model was fitted on.
lm.score(X, Y)

In [None]:
# Build a pipeline from a list of (name, estimator) tuples:
# scaling -> polynomial features -> linear regression.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]

# Fit the pipeline on X, Y and report its R^2 on that same data.
pipe = Pipeline(Input).fit(X, Y)
pipe.score(X, Y)

## MODEL EVALUATION AND REFINEMENT


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Every column of the label-encoded frame except the target is a feature.
features = [name for name in pizza_labelled.columns if name != 'price_rupiah']
X = pizza_labelled[features]
Y = pizza_labelled['price_rupiah']

# Hold out 15% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)
print("number of test samples:", len(x_test))
print("number of training samples:", len(x_train))

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with alpha=0.1, fitted on the training split.
ridge = Ridge(alpha=0.1).fit(x_train, y_train)
# R^2 on the held-out test split.
ridge.score(x_test, y_test)

In [None]:
# Second-order polynomial features followed by ridge regression.
# The transformer is fitted on the training split only and then applied to the
# test split with transform(): preprocessing must never be fitted on test data.
pf = PolynomialFeatures(degree=2)
x_train_data = pf.fit_transform(x_train)
x_test_data = pf.transform(x_test)
ridge1 = Ridge(alpha=0.1)
ridge1.fit(x_train_data, y_train)
# R^2 of the polynomial ridge model on the held-out test split.
ridge1.score(x_test_data, y_test)