## Importing Libraries

In [None]:
import pandas as pd                                  
import seaborn as sns                                 
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats                               

from sklearn.preprocessing import LabelEncoder        
from sklearn.model_selection import train_test_split  
from sklearn import metrics                            
from sklearn.linear_model import LinearRegression     
from sklearn.linear_model import Lasso                
from xgboost import XGBRegressor                      

%matplotlib inline
plt.rcParams['figure.figsize'] = (8,5)

df = pd.read_csv("../input/pizza-price-prediction/pizza_v1.csv")

### Analyzing The Dataset

In [None]:
# Show the (rows, columns) shape, then preview the first five records.
print(df.shape)
df.head(n=5)

The dataset has 129 rows and 8 columns.

The dataset shows us the different variables that are used to determine the price of the pizza

Our target variable is **price_rupiah**


In [None]:
# Print column names, dtypes and non-null counts to see which columns need conversion.
df.info()

The **price_rupiah** values need to be converted to **float or int**, and some of the other column values also need to be changed.


In [None]:
# Visual null check: draw the boolean missing-value mask as a heatmap.
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False)
plt.show()

The dataset has **no null values**

### Cleaning the Dataset

**Changing the column price_rupiah value to int or float**

In [None]:
# Strip the "Rp" currency prefix and the "," thousands separators as literal
# text. regex=False is passed explicitly because the default of `regex` differs
# between pandas versions; the patterns are plain substrings, not expressions.
price_text = (
    df["price_rupiah"]
    .str.replace("Rp", "", regex=False)
    .str.replace(",", "", regex=False)
)

#Converting the datatype to int
df["price_rupiah"] = price_text.astype(int)

In [None]:
# At the time of making this notebook the conversion rate of rupiah to dollars
# was about 0.000070 USD per rupiah (the value the calculation actually uses).
RUPIAH_TO_USD = 0.000070
df["price"] = df["price_rupiah"] * RUPIAH_TO_USD

# Drop the rupiah column now that the dollar price exists. Reassigning instead
# of using inplace=True keeps the operation explicit and chain-friendly.
df = df.drop(columns="price_rupiah")

In [None]:
# Cast the diameter column to an integer dtype and preview the result.
# NOTE(review): astype(int) truncates any fractional diameters (e.g. 8.5 -> 8)
# if the column holds floats — confirm that is intended.
df = df.astype({"diameter": int})
df.head()

### Visualizing the Dataset

In [None]:
# One bar chart of price against each feature column (the target itself is skipped).
sns.set_theme(style="darkgrid")
feature_names = [name for name in df.columns if name != "price"]
for feature in feature_names:
    sns.barplot(x=feature, y="price", data=df, ci=None)
    plt.xticks(rotation=90)  # category labels can be long; rotate so they do not overlap
    plt.xlabel(feature)
    plt.ylabel("Price")
    plt.show()

**The charts show how the values of each variable relate to the price.**

### Encoding the Dataset

In [None]:
# Single LabelEncoder instance; the encoding loop below re-fits it on every column it encodes.
le = LabelEncoder()

In [None]:
#Encoding the columns with object datatype as integer labels
object_columns = [name for name in df.columns if df[name].dtype == "O"]
for name in object_columns:
    # fit_transform re-fits the shared encoder, so each column gets its own mapping
    df[name] = le.fit_transform(df[name])
df.head()
        

In [None]:
# Pairwise column correlations drawn as an annotated heatmap.
sns.heatmap(df.corr(), annot=True)
plt.show()

The correlation chart shows that the **diameter** has the **highest correlation with the price**

In [None]:
# Summary statistics for every column of the encoded frame.
df.describe()

### Checking for Outliers

In [None]:
# Draw a box plot for each feature (everything except the target) to spot outliers.
for feature in [name for name in df.columns if name != "price"]:
    sns.boxplot(x=feature, data=df)
    plt.show()

The boxplot shows that the diameter has an outlier

### Removing Outliers using IQR

In [None]:
# Quartiles of diameter and the 1.5 * IQR fences on either side of them.
q1, q3 = (df["diameter"].quantile(level) for level in (0.25, 0.75))
IQR = q3 - q1

Lower_whisk = q1 - 1.5 * IQR
Upper_whisk = q3 + 1.5 * IQR

# Report every intermediate value so the fences can be checked by eye.
for label, value in (
    ("Quantile 1:", q1),
    ("Quantile 3:", q3),
    ("Inter Quartile Range:", IQR),
    ("Lower Whisk:", Lower_whisk),
    ("Upper Whisk:", Upper_whisk),
):
    print(label, value)

*The diameter variable has outliers because there are no values between 9.0 and 12.0, so the values below 9.0 are considered outliers. Removing the values below the lower whisker (9.0) would eliminate the outliers and stabilize the boxplot, but we would also lose too many values.*

In [None]:
# Keep every row on or above the lower fence: only values strictly below the
# fence are outliers, so a diameter exactly equal to it must not be dropped.
temp = df[df["diameter"] >= Lower_whisk]
# Pass the series as x= explicitly; a bare positional argument is interpreted
# differently across seaborn versions and can flip the plot orientation.
sns.boxplot(x=temp["diameter"])
plt.show()

### Zscore 

In [None]:
# Absolute z-score of every diameter value.
z = np.abs(stats.zscore(df["diameter"]))
# Cut-off beyond which a value is treated as an outlier; used in the comparison
# below so the limit is defined in exactly one place.
threshold = 3
# Print the positions whose |z| exceeds the threshold (empty when there are none).
print(np.where(z > threshold))

**The z-score check shows no outliers, so I'll go with the original dataset.**

### Model Evaluation

In [None]:
# Separate the target (price) from the predictor columns.
Y = df["price"]
X = df.drop(columns="price")

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)

### Linear Regression

In [None]:
# Ordinary least-squares regressor created with scikit-learn's default settings.
Linearmodel = LinearRegression()

#### Training 

In [None]:
# Fit on the training split, then score the fit on that same split (R-squared).
Train_prediction = Linearmodel.fit(xtrain, ytrain).predict(xtrain)
Li_Train_error_score = metrics.r2_score(ytrain, Train_prediction)

#Visualizing actual vs. predicted prices with the score shown in a box
ax = sns.regplot(x=ytrain, y=Train_prediction)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.text(
    9, 15, f'Error Score: {Li_Train_error_score}',
    ha='center', va='center',
    bbox={'facecolor': 'red', 'alpha': 0.5},
)

plt.show()

#### Testing

In [None]:
# Evaluate on the held-out split with the model already fitted on the training
# data. The model is deliberately NOT re-fitted on xtest/ytest: doing so would
# score it on the very rows it was trained on and report an inflated R-squared.
Test_prediction = Linearmodel.predict(xtest)
Li_Test_error_score = metrics.r2_score(ytest, Test_prediction)

# Actual vs. predicted prices for the test rows, with the score shown in a box.
sns.regplot(x = ytest, y = Test_prediction)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.text(6.5, 10, f'Error Score: {Li_Test_error_score}', horizontalalignment='center',verticalalignment='center'
         ,bbox=dict(facecolor='red', alpha=0.5))

plt.show()

### Lasso Regression

In [None]:
# Lasso (L1-regularised) regressor created with scikit-learn's default settings.
Lassomodel = Lasso()

#### Train

In [None]:
# Fit on the training split, then score the fit on that same split (R-squared).
Train_prediction = Lassomodel.fit(xtrain, ytrain).predict(xtrain)
Lasso_Train_error_score = metrics.r2_score(ytrain, Train_prediction)

# Actual vs. predicted prices with the score shown in a box.
ax = sns.regplot(x=ytrain, y=Train_prediction)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.text(
    9, 14, f'Error Score: {Lasso_Train_error_score}',
    ha='center', va='center',
    bbox={'facecolor': 'red', 'alpha': 0.5},
)

plt.show()

#### Test

In [None]:
# Evaluate on the held-out split with the model already fitted on the training
# data. The model is deliberately NOT re-fitted on xtest/ytest: doing so would
# score it on the very rows it was trained on and report an inflated R-squared.
Test_prediction = Lassomodel.predict(xtest)
Lasso_Test_error_score = metrics.r2_score(ytest, Test_prediction)

# Actual vs. predicted prices for the test rows, with the score shown in a box.
sns.regplot(x = ytest, y = Test_prediction)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.text(6.5, 9, f'Error Score: {Lasso_Test_error_score}', horizontalalignment='center',verticalalignment='center'
         ,bbox=dict(facecolor='red', alpha=0.5))

plt.show()

### XGBRegressor

In [None]:
# Gradient-boosted tree regressor created with XGBoost's default settings.
XGBReg_model = XGBRegressor()

#### Train

In [None]:
# Fit on the training split, then score the fit on that same split (R-squared).
XGBReg_model.fit(xtrain, ytrain)
Train_prediction = XGBReg_model.predict(xtrain)
XGB_Train_error_score = metrics.r2_score(ytrain, Train_prediction)

# Actual vs. predicted prices with the score shown in a box.
ax = sns.regplot(x=ytrain, y=Train_prediction)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.text(
    9, 17, f'Error Score: {XGB_Train_error_score}',
    ha='center', va='center',
    bbox={'facecolor': 'red', 'alpha': 0.5},
)

plt.show()

#### Test

In [None]:
# Evaluate on the held-out split with the model already fitted on the training
# data. The model is deliberately NOT re-fitted on xtest/ytest: doing so would
# score it on the very rows it was trained on and report an inflated R-squared.
Test_prediction = XGBReg_model.predict(xtest)
XGB_Test_error_score = metrics.r2_score(ytest, Test_prediction)

# Actual vs. predicted prices for the test rows, with the score shown in a box.
sns.regplot(x = ytest, y = Test_prediction)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.text(6.5, 10, f'Error Score: {XGB_Test_error_score}', horizontalalignment='center',verticalalignment='center'
         ,bbox=dict(facecolor='red', alpha=0.5))

plt.show()

In [None]:
# Collect the train/test R-squared lines for all three models and print them as
# one report; the empty strings produce the blank separator lines.
report_lines = [
    "Linear Regression",
    f'The R-Squared Value for Linear Regression Train Model is :{Li_Train_error_score}',
    f'The R-Squared Value for Linear Regression Test Model is :{Li_Test_error_score}',
    "",
    "Lasso Regression",
    f'The R-Squared Value for Lasso Regression Train Model is :{Lasso_Train_error_score}',
    f'The R-Squared Value for Lasso Regression Test Model is :{Lasso_Test_error_score}',
    "",
    "XGBRegressor",
    f'The R-Squared Value for XGBRegressor Train Model is :{XGB_Train_error_score}',
    f'The R-Squared Value for XGBRegressor TestModel is :{XGB_Test_error_score}',
]
print("\n".join(report_lines))

### Conclusion

**XGBRegressor has better R-squared values, so it has a better prediction score than the Linear Regression and Lasso models.**

**Thank You!!!**