In [None]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

import warnings
# NOTE(review): this silences every Python warning raised from here on
# (deprecations, pandas assignment warnings, ...), not just known noisy ones.
warnings.filterwarnings("ignore")

# **<span style="background-color:#add8e6;">Flow</span>**
1. Inputting & Importing
2. Understanding the Data
3. Data Preprocessing
4. Visualisation
5. Categorical Encoding
6. Correlation
7. Splitting data
8. Feature Scaling
9. Model Selection

# **<span style="background-color:yellow;">1. Inputting & Importing</span>**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global plot look: large default figure size, white axes background,
# light-grey grid lines and the Times New Roman font family.
sns.set(rc={'figure.figsize': (18, 10)})
sns.set_style(
    {
        'axes.facecolor': 'white',
        'grid.color': '.8',
        'font.family': 'Times New Roman',
    }
)

In [None]:
# Named hex colours used by the plots in this notebook.

# Bright primary palette
cyan = '#00FFD1'
red = '#FF007D'
prussian = '#0075FF'
green = '#EEF622'
yellow = '#FFF338'
violet = '#9B65FF'
orange = '#FFA500'
blue = '#00EBFF'
vermillion = '#FF6900'

# Alternative shades
red2 = '#FF2626'
seagreen = '#28FFBF'
green2 = '#FAFF00'
navyblue = '#04009A'

# Softer / darker tones
darkgreen = '#206A5D'
lightgreen = '#CCF6C8'
pink = '#F35588'
mauve = '#BAABDA'
lightblue = '#1CC5DC'
mustard = '#FDB827'
deeppurple = '#723881'

# Order matters: this is the sequence in which the colours are cycled.
color_list = [
    cyan, red, prussian, green, violet,
    orange, yellow, blue, vermillion,
    red2, seagreen, green2, navyblue,
    darkgreen, lightgreen, pink, mauve,
    lightblue, mustard, deeppurple,
]
# Install color_list as matplotlib's default colour cycle for axes.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)

In [None]:
import os
from IPython.display import Image
# Render the JPEG stored at this relative path inline as the cell output.
Image(filename="../input/pizzaimg/ivan-torres-MQUqbmszGGM-unsplash (1).jpg")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load both versions of the pizza price table (v1 -> pizza1, v2 -> pizza2).
pizza1, pizza2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/pizza-price-prediction/pizza_v1.csv',
        '/kaggle/input/pizza-price-prediction/pizza_v2.csv',
    )
)

# **<span style="background-color:yellow;">2. Understanding the data</span>**

In [None]:
# Preview the first rows of the v1 table.
pizza1.head()

In [None]:
# Preview the first rows of the v2 table.
pizza2.head()

**Apart from the preprocessing step, which is applied to both tables, the rest of this notebook focuses only on `pizza1`.**

In [None]:
# What I want to know about each column, depending on its type:
    # numerical: range, min, max, mean, median, number of nulls
    # categorical: number of distinct values, value counts / frequencies, number of nulls

In [None]:
def describefreq(df,col_index):
    """Display the value counts of one categorical column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    col_index : int
        Zero-based position of the column within ``df.columns``.

    Returns
    -------
    None
        The frequency table is rendered through ``display``. Numeric columns
        are not categorical, so nothing is displayed for them.
    """
    col = df.iloc[:, col_index]

    # Decide from the column's values, not from the type of its name:
    # column names are strings even when the data they label is numeric.
    if pd.api.types.is_numeric_dtype(col):
        return None

    # Only the requested column is counted; the other columns are never needed.
    return display(pd.DataFrame(col.value_counts()))

### **Value counts for all categorical variables in `pizza1`:**

In [None]:
# Run describefreq once for every column position of pizza1.
for col_pos, _ in enumerate(pizza1.columns):
    describefreq(pizza1, col_pos)

# **<span style="background-color:yellow;">3. Data Preprocessing</span>**

In [None]:
def _rupiah_to_float(prices):
    """Convert price strings such as ``'Rp235,000'`` into floats (``235000.0``)."""
    # Keep the text after the 'Rp' prefix, drop the thousands separators, cast.
    return prices.str.split('Rp').str[1].str.replace(',', '', regex=False).astype(float)


def _with_numeric_price(df):
    """Return ``df`` with ``price_rupiah`` replaced by a trailing float ``Price`` column."""
    if 'price_rupiah' not in df.columns:
        # Already converted: return the frame unchanged so the cell can be re-run.
        return df
    # assign() appends Price as the last column, which later cells rely on
    # when they take "every column but the last" as features.
    return df.assign(Price=_rupiah_to_float(df['price_rupiah'])).drop(columns='price_rupiah')


# Same conversion for both tables, vectorised and independent of the row index.
pizza1 = _with_numeric_price(pizza1)
pizza2 = _with_numeric_price(pizza2)

In [None]:
# Keep the leading number of each diameter entry (the text before the first
# space) and store it as float.
# The column is addressed by name: the previous cell drops 'price_rupiah' and
# appends 'Price', so a hard-coded position no longer reliably points at
# 'diameter'. Going through astype(str) also makes the cell safe to re-run.
pizza2['diameter'] = (
    pizza2['diameter']
    .astype(str)
    .str.split(' ')
    .str[0]
    .astype(float)
)

In [None]:
# For every column except the last one (Price), show Price aggregated per
# distinct value of that column (pivot_table's default aggregation).
for col_name in pizza1.columns[:-1]:
    price_by_level = pd.pivot_table(pizza1, index=[col_name], values=['Price'])
    display(price_by_level)

# **<span style="background-color:yellow;">4. Visualisation</span>**

## Violin Plots

In [None]:
# Price distribution per company.
ax = sns.violinplot(data=pizza1, x='company', y='Price')
ax.set_title('Company', fontsize=30);

In [None]:
# Price distribution per topping; the figure is also written to fig.png.
ax = sns.violinplot(data=pizza1, x='topping', y='Price')
plt.xticks(rotation=45)
ax.set_title('Topping', fontsize=30)
plt.savefig('fig.png')

In [None]:
# Price distribution per variant.
ax = sns.violinplot(data=pizza1, x='variant', y='Price')
plt.xticks(rotation=45)
ax.set_title('Variant', fontsize=30);

In [None]:
# Price distribution per size.
ax = sns.violinplot(data=pizza1, x='size', y='Price')
plt.xticks(rotation=45)
ax.set_title('Size', fontsize=30);

In [None]:
# Price distribution with and without extra sauce.
ax = sns.violinplot(data=pizza1, x='extra_sauce', y='Price')
ax.set_title('Extra Sauce', fontsize=30);

In [None]:
# Price distribution with and without extra cheese.
ax = sns.violinplot(data=pizza1, x='extra_cheese', y='Price')
ax.set_title('Extra Cheese', fontsize=30);

In [None]:
# Show pizza1 again before the categorical columns are encoded.
pizza1.head()

# **<span style="background-color:yellow;">5. Categorical Encoding</span>**

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct company labels, then replace each one by its integer code.
le = LabelEncoder().fit(pizza1['company'])
pizza1['company'] = le.transform(pizza1['company'])

In [None]:
# Learn the distinct topping labels, then replace each one by its integer code.
le = LabelEncoder().fit(pizza1['topping'])
pizza1['topping'] = le.transform(pizza1['topping'])

In [None]:
# Learn the distinct variant labels, then replace each one by its integer code.
le = LabelEncoder().fit(pizza1['variant'])
pizza1['variant'] = le.transform(pizza1['variant'])

In [None]:
# Learn the distinct size labels, then replace each one by its integer code.
le = LabelEncoder().fit(pizza1['size'])
pizza1['size'] = le.transform(pizza1['size'])

In [None]:
# Learn the distinct extra_sauce labels, then replace each one by its integer code.
le = LabelEncoder().fit(pizza1['extra_sauce'])
pizza1['extra_sauce'] = le.transform(pizza1['extra_sauce'])

In [None]:
# Learn the distinct extra_cheese labels, then replace each one by its integer code.
le = LabelEncoder().fit(pizza1['extra_cheese'])
pizza1['extra_cheese'] = le.transform(pizza1['extra_cheese'])

In [None]:
# List the column names of pizza1 after encoding.
pizza1.columns

In [None]:
#pizza1 = pizza1.reindex(columns = ['diameter','topping','variant','extra_sauce','extra_cheese','size_jumbo','size_large', 'size_medium', 'size_reguler',
#       'size_small', 'company_B', 'company_C', 'company_D', 'company_E','Price'])

In [None]:
# Inspect the first 15 rows of the label-encoded table.
pizza1.head(15)

# **<span style="background-color:yellow;">6. Correlation</span>**

In [None]:
fig, ax = plt.subplots(figsize=(18,16))

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr = pizza1.corr()

# Hide the upper triangle (diagonal included) with a positional boolean mask.
# Masking with the correlation values themselves would leave any cell whose
# correlation is exactly 0 visible, because 0.0 converts to False.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr,cmap='BrBG',linewidths=1.5,ax=ax,annot=True,center=0,square=True,mask=mask)
plt.title('Correlation for Pizza1 after Encoding',fontsize=30);

In [None]:
# Show the encoded table once more before it is split into features and target.
pizza1.head()

# **<span style="background-color:yellow;">7. Splitting Data</span>**

### Train/test split (80% / 20%)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column but the last; the target is the last column.
X, y = pizza1.iloc[:, :-1], pizza1.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# First rows of the training features (before scaling).
X_train.head()

In [None]:
# First rows of the test features (before scaling).
X_test.head()

# **<span style="background-color:yellow;">8. Feature Scaling</span>**

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on independent copies so the assignments below never write into a
# frame that is still tied to the data it was split from.
X_train = X_train.copy()
X_test = X_test.copy()

# Only the first five feature columns are standardised; the rest stay as is.
scaled_cols = list(X_train.columns[:5])

sc = StandardScaler()
# Replace whole columns by name instead of writing float values into the
# existing (label-encoded, integer) columns through .iloc, which newer pandas
# versions warn about as an incompatible-dtype assignment.
# The scaler is fitted on the training data only and reused for the test data.
X_train[scaled_cols] = sc.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = sc.transform(X_test[scaled_cols])

In [None]:
# First rows of the training features after scaling.
X_train.head()

# **<span style="background-color:yellow;">9. Model selection</span>**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor

from sklearn.metrics import r2_score

# Candidate models compared below; seeds are fixed where one is passed.
rfr = RandomForestRegressor(max_depth=13,random_state=0)
gbr = GradientBoostingRegressor(random_state=0)
lr = LinearRegression()
# NOTE(review): GaussianNB is a classifier, yet it is fitted on Price and
# scored with r2_score next to the regressors - confirm this is intended.
gb = GaussianNB()
xgb = XGBRegressor()

In [None]:
# Fit every candidate model on the same training split.
rfr.fit(X_train,y_train)
gbr.fit(X_train,y_train)
lr.fit(X_train,y_train)
gb.fit(X_train,y_train)
# Last expression of the cell: its return value is what the cell displays.
xgb.fit(X_train,y_train)

In [None]:
# Report the test-set R² of each fitted model.
for model in (rfr, gbr, lr, gb, xgb):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(model,': ',r2)

# <span style="background-color:thistle;">Learning Curve plot for XGBRegressor</span>

In [None]:
# Candidate n_estimators values for the sweep: 2, 7, 12, ..., 97.
# values = [i for i in range(1, 21)]
values = list(range(2, 100, 5))
#values = [i for i in np.linspace(0.01,0.1,30)]

In [None]:
train_scores, test_scores = [], []

# Fit one XGBRegressor per candidate n_estimators and record its R² on the
# training and test splits.
for i in values:
    model = XGBRegressor(gamma=0, learning_rate=0.094, max_depth=20, n_estimators=i, n_jobs=16)
    model.fit(X_train, y_train)

    # score on the data the model was fitted on
    train_yhat = model.predict(X_train)
    train_acc = r2_score(y_train, train_yhat)

    # score on the held-out data
    test_yhat = model.predict(X_test)
    test_acc = r2_score(y_test, test_yhat)

    train_scores.append(train_acc)
    test_scores.append(test_acc)

    # one progress line per candidate
    print('>%.3f, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

In [None]:
# Train and test R² as a function of n_estimators.
for scores, line_color, line_label in (
    (train_scores, blue, 'Train'),
    (test_scores, red, 'Test'),
):
    plt.plot(values, scores, color=line_color, label=line_label)
plt.title('Learning Curve for N Estimators as a hyperparameter',fontsize=30)
plt.legend();

In [None]:
# Refit with the largest n_estimators of the sweep, named explicitly instead
# of reading whatever value the loop variable `i` was left with.
final_n_estimators = values[-1]
xgb = XGBRegressor(gamma=0,learning_rate=0.094,max_depth=20,n_estimators=final_n_estimators,n_jobs=16)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)

# r2_score expects (y_true, y_pred); R² is not symmetric, so the order matters.
r2_score(y_test,y_pred)

In [None]:
# Test-set targets as a plain NumPy array.
np.array(y_test)

In [None]:
# Predictions for X_test from the most recent predict() call.
y_pred

In [None]:
# One x position per test sample, derived from the data so the plot keeps
# working when the size of the test split changes.
x_axis = list(range(1, len(y_test) + 1))

# Actual prices (line + markers)
plt.plot(x_axis,y_test,color=cyan,label='Actual')
plt.scatter(x_axis,y_test,color=cyan)

# Predicted prices (line + markers)
plt.plot(x_axis,y_pred,color=red,label='Predicted')
plt.scatter(x_axis,y_pred,color=red)

plt.title('Accuracy Predicted vs Actual',fontsize=30)
#ax.set_facecolor('white')
plt.legend();
# NOTE(review): 'fig.png' is also the file name used by the topping violin
# plot cell, so this call overwrites that image.
plt.savefig('fig.png')