## Pizza Pricing (Low Sample Size)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load both versions of the pizza price dataset from the Kaggle input directory.
import1, import2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/pizza-price-prediction/pizza_v1.csv',
        '../input/pizza-price-prediction/pizza_v2.csv',
    )
)

**Data Cleaning**

In [None]:
# Put the two raw tables side by side (column-wise concatenation).
pizza = pd.concat((import1, import2), axis='columns')

In [None]:
# drop_duplicates() removes duplicate rows, so transpose first to remove columns
# whose values are identical, then transpose back.
pizza = pizza.transpose().drop_duplicates().transpose()

In [None]:
# Keep the part of each price string that follows 'Rp' and precedes the first comma,
# and store it as an integer.
rupiah_text = pizza['price_rupiah'].str.split('Rp').str[1]
pizza['price'] = rupiah_text.str.split(',').str[0].astype(int)

# The raw price text and the size column are not used from here on.
pizza = pizza.drop(columns=['price_rupiah', 'size'])

In [None]:
# Rebuild the frame from its raw array with the columns at positions 6 and 7 removed
# (np.delete along axis 1), labelling what is left with the de-duplicated column names.
# NOTE(review): assumes the remaining column count equals the number of unique labels
# and that positions 6/7 are the surplus duplicates -- confirm against the actual data.
pizza = pd.DataFrame(np.delete(np.array(pizza),(7,6),1),columns= pizza.columns.drop_duplicates())

In [None]:
# Cast the two numeric columns to integers.
int_columns = ['diameter', 'price']
pizza[int_columns] = pizza[int_columns].astype(int)

In [None]:
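# Disabled: presumably meant to unify the 'spicy_tuna' / 'spicy tuna' spellings of `variant`.
# Left commented out; the `variant` column is dropped in the next cell.
#pizza[pizza['variant']=='spicy_tuna'] = pizza[pizza['variant']=='spicy tuna']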

In [None]:
# Drop the variant column, remove rows that become exact duplicates, and renumber the index.
pizza = (
    pizza
    .drop(columns='variant')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Preview the first three rows of the cleaned frame.
pizza.head(3)

***Stratifying our Target Variable***

In [None]:
# Distribution of the target variable.
price_values = pizza['price']
sns.displot(price_values, color='blue')

In [None]:
# Bin the price into five labelled bands; the bands are used only for stratified splitting.
price_bin_edges = [0.0, 40.0, 70, 100, 160, np.inf]
price_bin_labels = ['Very Cheap', 'Cheap', 'Average', 'Expensive', 'Very Expensive']
pizza['cost'] = pd.cut(pizza['price'], bins=price_bin_edges, labels=price_bin_labels)

In [None]:
# Horizontal bar chart of how many pizzas fall in each price band, smallest band first.
cost_counts = pizza['cost'].value_counts().sort_values()
cost_counts.plot(kind='barh')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single stratified 70/30 shuffle split on the binned `cost` label, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(pizza, pizza["cost"]):
    # split() yields positional row indices, so select with .iloc rather than the
    # label-based .loc; .copy() makes the subsets independent frames so the later
    # in-place column drops do not act on a slice of `pizza`.
    strat_train_set = pizza.iloc[train_index].copy()
    strat_test_set = pizza.iloc[test_index].copy()

In [None]:
# Share of each price band in the test set.
test_cost_counts = strat_test_set['cost'].value_counts()
test_cost_counts / len(strat_test_set)

In [None]:
# The helper band column has served its purpose; remove it from both subsets in place.
for frame in (strat_test_set, strat_train_set):
    frame.drop(columns='cost', inplace=True)

In [None]:
# Targets: the price column of each subset.
y_train = strat_train_set['price']
y_test = strat_test_set['price']

# Features: everything except the price.
X_train = strat_train_set.drop(columns='price')
X_test = strat_test_set.drop(columns='price')

In [None]:
# Train rows followed by test rows in one frame with a fresh 0..n-1 index.
prices = pd.concat([strat_train_set, strat_test_set], ignore_index=True)

*Setting up Data for Linear Regression*

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [None]:
# Column groups fed to the preprocessing ColumnTransformer.
numerical_variables = ['diameter']
# Every object-dtype column is treated as categorical.
categ_variables = [col for col in prices if prices[col].dtype == 'object']
Ordinal_Vars = ['extra_sauce', 'extra_cheese', 'extra_mushrooms']
# Remaining categoricals are one-hot encoded. An ordered comprehension is used instead of
# list(set(...)) because set iteration order for strings can change between kernel runs,
# which would make the encoded column order non-reproducible.
OneHot_Vars = [col for col in categ_variables if col not in Ordinal_Vars]

In [None]:
# Shared preprocessing / model building blocks.
scaler = StandardScaler(with_mean=False)
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    OH = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn only knows the original `sparse` keyword.
    OH = OneHotEncoder(handle_unknown='ignore', sparse=False)
OE = OrdinalEncoder()
LNR = LinearRegression()
# Bound to a lowercase name: `PCA = PCA()` replaced the imported class with an instance,
# so running the cell a second time failed with "'PCA' object is not callable".
pca = PCA()

***Standard Pipeline***

In [None]:
# Numeric columns: scaling only.
numpipe = Pipeline(steps=[('scaler', scaler)])

# Nominal categoricals: one-hot encode, then scale.
OHtransformer = Pipeline(steps=[('OneHot', OH), ('Scaler', scaler)])

# Extra-topping flags: ordinal encode, then scale.
OEtransformer = Pipeline(steps=[('OrdinalEncoder', OE), ('Scaler', scaler)])

# Route each column group through its own sub-pipeline.
PPipe = ColumnTransformer(
    transformers=[
        ('Numericals', numpipe, numerical_variables),
        ('OneHots', OHtransformer, OneHot_Vars),
        ('Ordinals', OEtransformer, Ordinal_Vars),
    ]
)

# Preprocessing followed by the linear regression model.
FullPipe = Pipeline(steps=[('transforming', PPipe), ('model', LNR)])

In [None]:
from sklearn.model_selection import train_test_split

*Linear Regression*

In [None]:
# Fit the linear-regression pipeline on the training set and predict the test set.
LNRpredictions = FullPipe.fit(X_train, y_train).predict(X_test)

*Stochastic Gradient Descent*

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
# SGD is stochastic; fix the seed (same value as the train/test split) so the
# reported metrics are reproducible across kernel restarts.
SGD = SGDRegressor(alpha=0.03, max_iter=10000, random_state=42)

In [None]:
# Same preprocessing as the linear model, followed by the SGD regressor.
SGDPipe = Pipeline(steps=[('transforming', PPipe), ('model', SGD)])

In [None]:
# Fit the SGD pipeline on the training set and predict the test set.
SGDpreds = SGDPipe.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Both ensembles are randomized; fix the seed (same value as the train/test split)
# so the model comparison is reproducible across kernel restarts.
RFR = RandomForestRegressor(random_state=42)
GBR = GradientBoostingRegressor(random_state=42)

*Random Forest Regressor*

In [None]:
# Shared preprocessing followed by the random forest regressor.
RFRPipe = Pipeline(steps=[('transforming', PPipe), ('model', RFR)])

In [None]:
# Fit the random-forest pipeline on the training set and predict the test set.
RFRpreds = RFRPipe.fit(X_train, y_train).predict(X_test)

*Gradient Boosting Regressor*

In [None]:
# Shared preprocessing followed by the gradient boosting regressor.
GBRPipe = Pipeline(steps=[('transforming', PPipe), ('model', GBR)])

In [None]:
# Fit the gradient-boosting pipeline on the training set and predict the test set.
GBRpreds = GBRPipe.fit(X_train, y_train).predict(X_test)

**Model Comparisons**

In [None]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [None]:
def evaltotable(predictions,y_test):
    """Score a set of predictions against the true targets.

    Parameters
    ----------
    predictions : array-like
        Values predicted by a model.
    y_test : array-like
        True target values, in the same order as ``predictions``.

    Returns
    -------
    tuple
        ``(MAE, RMSE, R2)`` for the predictions.
    """
    # scikit-learn metrics take (y_true, y_pred). MAE and MSE are symmetric, but R2 is
    # not: passing the predictions as y_true normalizes by the variance of the
    # predictions instead of the variance of the real prices.
    MAE = mean_absolute_error(y_test, predictions)
    RMSE = np.sqrt(mean_squared_error(y_test, predictions))
    R2 = r2_score(y_test, predictions)
    return MAE, RMSE, R2

In [None]:
# Test-set predictions of each model, in the order LR, SGD, RFR, GBR.
modelstocompare = [
    LNRpredictions,
    SGDpreds,
    RFRpreds,
    GBRpreds,
]

In [None]:
# One (MAE, RMSE, R2) tuple per model, in the same order as `modelstocompare`.
appended = [evaltotable(model_preds, y_test) for model_preds in modelstocompare]

In [None]:
# Comparison table: one row per model, one column per metric, best R2 first.
metric_names = ['MAE', 'RMSE', 'R2']
model_names = ['LR', 'SGD', 'RFR', 'GBR']
modelcomparison = (
    pd.DataFrame(np.array(appended), index=model_names, columns=metric_names)
    .reset_index()
    .rename(columns={'index': 'Models'})
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Show the comparison table (rows are sorted by R2, highest first).
modelcomparison

In [None]:
# plt.figure (not plt.plot) is what creates a figure of the requested size; with
# plt.plot the figsize was not applied and the chart fell back to the default size.
plt.figure(figsize=(12,6))
sns.barplot(x=modelcomparison['Models'],y=modelcomparison['R2'],palette='cividis')
plt.title('Coefficient of Determination',fontsize=14)
# NOTE(review): fixed y-range; bars outside 0-0.65 (e.g. a negative R2) are clipped.
plt.ylim(0,0.65)

In [None]:
# plt.figure (not plt.plot) is what creates a figure of the requested size.
plt.figure(figsize=(12,6))
# Sort whole rows by MAE so each bar keeps its own model label; sorting only the
# y Series leaves x unsorted and cannot reliably reorder the bars.
mae_sorted = modelcomparison.sort_values(by='MAE')
sns.barplot(x='Models', y='MAE', data=mae_sorted, palette='cividis')
plt.title('MAE',fontsize=14)

# NOTE(review): fixed y-range; bars are truncated below 15 and clipped above 23.
plt.ylim(15,23)

**Best Fit Conclusions**

In [None]:
# First row of the comparison table, i.e. the model with the highest R2
# (the table is sorted by R2 descending and re-indexed when it is built).
modelcomparison.loc[0]

In [None]:
# Overlay the distribution of the gradient-boosting predictions on the real test prices.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(GBRpreds, alpha=0.6, color='blue', bins=15, ax=ax)
sns.histplot(y_test, alpha=0.6, color='green', bins=15, ax=ax)
ax.set_title('Real Pricing (Green) & Predicted Prices (Blue)', fontsize=14)

In [None]:
# Fetch the fitted one-hot encoder by name instead of by position in transformers_.
onehot_encoder = GBRPipe['transforming'].named_transformers_['OneHots'].named_steps['OneHot']
# get_feature_names() was removed from newer scikit-learn in favour of
# get_feature_names_out(); fall back to the old method only when the new one is missing.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    OH_Varnames = onehot_encoder.get_feature_names_out()
else:
    OH_Varnames = onehot_encoder.get_feature_names()

In [None]:
# Labels must follow the ColumnTransformer's output order, which is the order its
# transformers are declared in: Numericals, then OneHots, then Ordinals. Listing the
# ordinal columns before the one-hot columns attached importances to the wrong names.
TransformedVariables = ['diameter'] + OH_Varnames.tolist() + Ordinal_Vars

In [None]:
# One-column table of the gradient-boosting feature importances, largest first.
FeatImportance = pd.DataFrame(
    data=GBR.feature_importances_,
    index=TransformedVariables,
    columns=['Feature Importance'],
).sort_values(by='Feature Importance', ascending=False)

In [None]:
# Annotated heatmap of the feature-importance table.
plt.figure(figsize=(17, 10))
sns.heatmap(data=FeatImportance, cmap='cividis', annot=True, lw=1)

In [None]:
# Display the estimator stored at position [0][0] of the fitted ensemble.
GBR.estimators_[0][0]

In [None]:
from sklearn import tree

In [None]:
# Draw the estimator at position [0][0] of the fitted ensemble on a large canvas.
fig = plt.figure(figsize=(25, 20))
first_stage_tree = GBR.estimators_[0][0]
_ = tree.plot_tree(first_stage_tree, feature_names=TransformedVariables, filled=True)

***The Gradient Boosting Regressor fits the data reasonably well given the small sample size. Most of the variation in pizza prices is explained by diameter, which makes sense, but the model also tends to overfit to that feature.
Without the diameter feature, and because of the small sample size, no model can fit the data (shown below).***


In [None]:
# Remove the diameter feature for the ablation below. Reassigning with errors='ignore'
# keeps the cell safe to re-run; the in-place drop raised KeyError the second time.
X_train = X_train.drop(columns='diameter', errors='ignore')
X_test = X_test.drop(columns='diameter', errors='ignore')

In [None]:
from sklearn.base import clone

# Preprocessing without the numeric (diameter) branch.
NoDiamPrePro = ColumnTransformer(
    transformers=[
        ('OneHots', OHtransformer, OneHot_Vars),
        ('Ordinals', OEtransformer, Ordinal_Vars),
    ]
)

# Use unfitted copies of the estimators (same hyper-parameters). A Pipeline fits the
# very object it is given, so reusing GBR / RFR here would refit the models that
# GBRPipe / RFRPipe hold and silently invalidate the earlier feature-importance and
# tree cells.
GBRnoDiam = Pipeline([
    ('transforming', NoDiamPrePro),
    ('model', clone(GBR))
])

RFRnoDiam = Pipeline([
    ('transforming', NoDiamPrePro),
    ('model', clone(RFR))
])


In [None]:
# Fit the gradient-boosting pipeline that has no diameter branch.
GBRnoDiam.fit(X_train,y_train)

In [None]:
# Fit the random-forest pipeline that has no diameter branch.
RFRnoDiam.fit(X_train,y_train)

In [None]:
# Test-set predictions of the two no-diameter pipelines.
nodiamGBRpreds = GBRnoDiam.predict(X_test)
nodiamRFRpreds = RFRnoDiam.predict(X_test)

In [None]:
# RFR without diameter: (MAE, RMSE, R2) on the test set
evaltotable(nodiamRFRpreds,y_test)

In [None]:
# GBR without diameter: (MAE, RMSE, R2) on the test set
evaltotable(nodiamGBRpreds,y_test)