In [None]:
import os

# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')

#Modules for ML
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#Regression classes
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

In [None]:
# Read the pizza data set into `df`; the trailing expression displays its shape.
df = pd.read_csv(filepath_or_buffer='../input/pizza-price-prediction/pizza_v1.csv')
df.shape

In [None]:
# Summarise the columns of `df`: dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

# **Exploratory Data Analysis**

# **Companies Involved**

In [None]:
# Pie chart of how many rows each company contributes (labels show percentages).
company_counts = df['company'].value_counts()

plt.figure(figsize=(7, 7))
company_counts.plot.pie(autopct='%.2f')
plt.title('Companies involved')
plt.ylabel('')  # hide the default y-axis label that pandas adds to pie charts
plt.show()

# **Size Types**

In [None]:
# Pie chart of how many rows fall into each pizza size (labels show percentages).
size_counts = df['size'].value_counts()

plt.figure(figsize=(7, 7))
size_counts.plot.pie(autopct='%.2f')
plt.title('Size types')
plt.ylabel('')  # hide the default y-axis label that pandas adds to pie charts
plt.show()

# **Countplots of any two columns**

In [None]:
def countplot_of_2(x,hue,title=None,figsize=(7,7)):
    """Show a count plot of column ``x`` with bars split by column ``hue``.

    The data comes from the notebook-level ``df``; only the two named
    columns are passed on to seaborn.

    Parameters
    ----------
    x : str
        Column of ``df`` placed on the x-axis.
    hue : str
        Column of ``df`` used to colour/group the bars.
    title : str, optional
        Figure title.
    figsize : tuple, optional
        Figure size handed to ``plt.figure``.
    """
    selected = df[[x, hue]]
    plt.figure(figsize=figsize)
    sns.countplot(x=x, hue=hue, data=selected)
    plt.title(title)
    plt.show()

# **Size and Extra Sauce**

In [None]:
# Count of pizzas per size, split by whether extra sauce is included.
countplot_of_2(x='size', hue='extra_sauce', title='Size and Extra sauce')

# **Size and Extra cheese**

In [None]:
# Count of pizzas per size, split by whether extra cheese is included.
countplot_of_2(x='size', hue='extra_cheese', title='Size and Extra Cheese')

# **Which company produces which pizza sizes**

In [None]:
# Count of pizzas per company, split by size.
countplot_of_2(x='company', hue='size', title='Company and their pizza size')

# **Size and its topping**

In [None]:
# Count of pizzas per topping, split by size (wider figure for the many toppings).
countplot_of_2(x='topping', hue='size', title='Size and Topping', figsize=(20, 10))

# **Toppings and extra sauce**

In [None]:
# Count of pizzas per topping, split by whether extra sauce is included.
countplot_of_2(x='topping', hue='extra_sauce', title='Toppings and Extra Sauce', figsize=(20, 10))

# **Toppings and extra cheese**

In [None]:
# Count of pizzas per topping, split by whether extra cheese is included.
countplot_of_2(x='topping', hue='extra_cheese', title='Toppings and Extra Cheese', figsize=(20, 10))

# **Diameter and its size**

In [None]:
# Histogram of pizza diameters, coloured by the size category.
size_diameter = df[['size', 'diameter']]

plt.figure(figsize=(7, 7))
sns.histplot(x='diameter', hue='size', data=size_diameter)
plt.title('Diameter and its size')
plt.show()

# **Feature Engineering**

In [None]:
# Look at the data again before transforming any columns.
df.head()

# **Converting 'price_rupiah' column to numeric values**

In [None]:
# Keep only the digits of each price string, then convert the column to numbers.
# The pattern is written as a raw string so that ``\D`` reaches the regex engine
# unchanged; in a normal string literal ``\D`` is an invalid escape sequence that
# newer Python versions warn about.
# NOTE: this cell cannot be run twice in a row -- once the column is numeric the
# ``.str`` accessor is no longer available.
df['price_rupiah'] = df['price_rupiah'].str.replace(r'\D+','',regex=True)
df['price_rupiah'] = pd.to_numeric(df['price_rupiah'])
df['price_rupiah'].dtype

In [None]:
# Histogram of the now-numeric prices.
price_frame = df[['price_rupiah']]
sns.histplot(x='price_rupiah', data=price_frame)
plt.show()

# **One Hot Encoding**

In [None]:
# Every column except the numeric feature (`diameter`) and `price_rupiah`
# will be one-hot encoded.
cols_to_encode = df.drop(columns=['diameter', 'price_rupiah']).columns
cols_to_encode

In [None]:
# One-hot encode the selected columns; drop_first removes the first level of
# each encoded column so the indicators are not redundant.
dummies = pd.get_dummies(data=df.loc[:, cols_to_encode], drop_first=True)
dummies.shape

In [None]:
# Preview the one-hot encoded columns.
dummies.head()

In [None]:
# Remove the original (un-encoded) columns from `df` in place; their encoded
# replacements live in `dummies`.
df.drop(columns=list(cols_to_encode), inplace=True)
df.head()

# **Feature Scaling**

In [None]:
# Min-max scale the numeric feature column(s).
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test rows influence the scaling parameters.
cols_to_scale = ['diameter']
scale = MinMaxScaler()
scalled = scale.fit(df[cols_to_scale]).transform(df[cols_to_scale])

In [None]:
# Write the scaled values (first and only scaled column) back into `df`.
df['diameter'] = scalled[:,0]
# No need to scale the price_rupiah column: it is the prediction target.

In [None]:
# Summary statistics of the scaled column(s); min and max are expected to be 0 and 1.
df[cols_to_scale].describe()

# **Concat Dummies and DF**

In [None]:
# Place the encoded columns next to the remaining columns of `df`, column-wise.
new_df = pd.concat(objs=[dummies, df], axis='columns')
new_df.shape

In [None]:
# Preview the combined feature/target table.
new_df.head()

# **Splitting the Data**

In [None]:
# Features: every column except the target. Target: the numeric price.
x = new_df.drop(columns='price_rupiah')
y = df['price_rupiah']
x.shape, y.shape

# **Train and Test data split**

In [None]:
# Hold out 30% of the rows for testing.
# random_state is fixed so that the same rows land in train/test on every run;
# without it each execution produces a different split and different scores.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# **Model Building**

In [None]:
# Candidate regressors, all with default hyper-parameters.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    SVR(),
    KNeighborsRegressor(),
]

In [None]:
# 3-fold cross-validation of every candidate model on the full x / y.
# `mean_scores` collects one mean score per model, in the order of `models`.
mean_scores = []

for estimator in models:
    print("Model:",estimator)
    fold_scores = cross_val_score(estimator,x,y,cv=3)
    print("CV scores:",fold_scores)
    fold_mean = fold_scores.mean()
    print("Mean Score:",fold_mean)
    print('\n')
    mean_scores += [fold_mean]

In [None]:
# Turn the collected means into a NumPy array so they can be scaled element-wise.
mean_scores = np.asarray(mean_scores)

In [None]:
# Horizontal bar chart of the mean cross-validation score (in percent) per model.
# The first entry of `models` / `mean_scores` is left out of the chart, exactly as
# before -- presumably because its score distorts the axis; confirm.
# The bar count is derived from the data instead of a hard-coded 6, so the chart
# keeps working when `models` changes, and np.asarray guarantees that `* 100`
# scales the values even if `mean_scores` is still a plain list.
# NOTE(review): the title says "Accuracy", but cross_val_score is called without
# `scoring`, so these are the estimators' default scores (R^2 for scikit-learn
# regressors).
plotted_models = models[1:]
plotted_scores = np.asarray(mean_scores)[1:] * 100
positions = range(len(plotted_models))

plt.figure(figsize=(7,7))
plt.barh(positions, plotted_scores)
plt.title('Regression Models Mean Accuracy')
plt.yticks(positions, [str(m) for m in plotted_models])
plt.show()

In [None]:
# Fit a random forest on the training split and report its score on the test split.
# random_state is fixed because the forest is built with randomness; without it
# the reported score changes from run to run.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(x_train,y_train)
rf_model.score(x_test,y_test)

# **Let's Work on Decision Tree Regression**

In [None]:
# Fit a decision tree on the training split and report its score on the test split.
# random_state is fixed so that tie-breaking between equally good splits is
# deterministic and the reported score is reproducible.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(x_train,y_train)
dt_model.score(x_test,y_test)

# **Predictions**

In [None]:
def predict(model,x):
    """Return the predictions of ``model`` for the inputs ``x``.

    Thin wrapper around ``model.predict(x)``.
    """
    return model.predict(x)

def actual_vs_predicted(label,y_true,y_pred,title=None):
    """Scatter-plot actual values against predicted values.

    Parameters
    ----------
    label : str
        Prefix for the two axis/column names ("<label> Actual",
        "<label> Predicted").
    y_true :
        Actual target values.
    y_pred :
        Predicted values, one per entry of ``y_true``.
    title : str, optional
        Figure title.
    """
    actual_col = f'{label} Actual'
    predicted_col = f'{label} Predicted'
    ap = pd.DataFrame({actual_col: y_true, predicted_col: y_pred})

    plt.figure(figsize=(7,7))
    sns.scatterplot(x=actual_col, y=predicted_col, data=ap)
    plt.title(title)
    plt.show()

# **Test Data Predictions**

In [None]:
# Decision-tree predictions on the held-out test rows, plotted against the truth.
y_test_pred = predict(model=dt_model, x=x_test)

actual_vs_predicted(label='Test', y_true=y_test, y_pred=y_test_pred, title='Test Data Predictions')

# **Train Data Predictions**

In [None]:
# Decision-tree predictions on the rows it was trained on, plotted against the truth.
y_train_pred = predict(model=dt_model, x=x_train)

actual_vs_predicted(label='Train', y_true=y_train, y_pred=y_train_pred, title='Train Data Predictions')