In [None]:
#Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the pizza price CSV into the working DataFrame and preview it

csv_path = '../input/pizzapricepredictedamlrandforeregre-liregres/pizza_price pred.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# columns in dataframe

df.columns

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# Cleaning Price Data
# Only parse while the column is still text, so this cell can be re-run:
# once the column is numeric the `.str` accessor is no longer available.
if not pd.api.types.is_numeric_dtype(df['price_rupiah']):
    # Remove the 'Rp' marker and the comma separators as literal text.
    # regex=False makes the literal match explicit instead of relying on the
    # pandas-version-dependent default of `str.replace`.
    price_text = (
        df['price_rupiah']
        .str.replace('Rp', '', regex=False)
        .str.replace(',', '', regex=False)
    )

    # Convert Price Data to numeric and store it as float64 in one step
    df['price_rupiah'] = pd.to_numeric(price_text).astype('float64')


In [None]:
# Number of missing values in each column

df.isna().sum()

## *EXPLORATORY DATA ANALYSIS*

In [None]:
# How many rows each company contributes

df['company'].value_counts()

In [None]:
# barplot for companies

# Count once and reuse the result for both axes.
company_counts = df.company.value_counts()

plt.figure(figsize=(8,5))

plt.title('Companies',fontsize=20)

# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=company_counts.index, y=company_counts)

In [None]:
# Ten most frequent prices

price_counts = df['price_rupiah'].value_counts()
price_counts.head(10)

In [None]:
# Histogram of the price column

price_frame = df[['price_rupiah']]
sns.histplot(x='price_rupiah', data=price_frame)
plt.show()


In [None]:
# dtype summary again; the cleaning cell above casts price_rupiah to float64
df.info()

In [None]:
# diameter evaluation

df.diameter.max() # Maximum diameter of pizza in dataset

In [None]:
df.diameter.min() # minimum diameter of pizza in given dataset

In [None]:
# Number of rows for each distinct diameter value
df.diameter.value_counts()    

In [None]:
# Summary statistics of the frame
df.describe()

In [None]:
# Histogram of pizza diameters

diameter_bins = np.arange(10, 25, 2)   # bin edges 10, 12, ..., 24

plt.figure(figsize=(10, 4))

plt.title('Diameter of pizza')
plt.xlabel('diameter')
plt.ylabel('number of pizzas')

plt.hist(df['diameter'], bins=diameter_bins, color='cyan', edgecolor='purple')

In [None]:
# How many rows use each topping

df['topping'].value_counts()

In [None]:
# barplot for toppings

# Count once and reuse the result for both axes.
topping_counts = df.topping.value_counts()

plt.figure(figsize=(10,6))

plt.xticks(rotation=75)

plt.title('Toppings of  Pizza',fontsize=20)

# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional slot is `data`).
sns.barplot(x=topping_counts.index, y=topping_counts)

In [None]:
# Pie chart of the share of rows per topping

plt.figure(figsize=(30, 10))

topping_share = df['topping'].value_counts()
topping_share.plot.pie(autopct='%1.1f%%')

plt.title('Toppings of  Pizza', fontsize=18)

In [None]:
# How many rows belong to each pizza variant

df['variant'].value_counts()

In [None]:
# Bar chart of row counts per pizza variant

plt.figure(figsize=(10, 6))

variant_counts = df['variant'].value_counts()
variant_counts.plot.bar()

plt.xticks(rotation=75)

plt.title("Variants of Pizza", fontsize=18)

In [None]:
# Row counts for each extra_sauce value

df['extra_sauce'].value_counts()

In [None]:
# Pie chart of the extra_sauce split; the first wedge is pulled out slightly
sauce_counts = df['extra_sauce'].value_counts()
plt.title('Extra sauce')
sauce_counts.plot.pie(autopct='%1.1f%%', explode=[0.1, 0])

In [None]:
# Companies providing extra sauce-evaluation

# Keep only rows where extra sauce is offered, with the two relevant columns.
# The former `& (df.company)` term and-ed the mask with the raw company column,
# which is not a boolean condition, so it has been dropped.
has_extra_sauce = df.extra_sauce == 'yes'
c_s = df.loc[has_extra_sauce, ['company','extra_sauce']]
c_s

In [None]:
# Row count for each (company, extra_sauce) combination in the filtered frame
c_s.value_counts()

In [None]:
# Count plot of the extra-sauce rows, one bar per company

sns.set(style='darkgrid')

fig, ax = plt.subplots(figsize=(7, 6))

sns.countplot(data=c_s, x='company', palette='Spectral', ax=ax)

ax.set_title('companies providing extra_sauce', fontsize=15)

In [None]:
# Row counts for each extra_cheese value

df['extra_cheese'].value_counts()

In [None]:
# companies providing extra cheese

# Keep only rows where extra cheese is offered, with the two relevant columns.
# The former `& (df.company)` term and-ed the mask with the raw company column,
# which is not a boolean condition, so it has been dropped.
has_extra_cheese = df.extra_cheese == 'yes'
c_c = df.loc[has_extra_cheese, ['company','extra_cheese']]
c_c

In [None]:
# Row count for each (company, extra_cheese) combination in the filtered frame
c_c.value_counts()

In [None]:
# Count plot of the extra-cheese rows, one bar per company

sns.set(style='darkgrid')

fig, ax = plt.subplots(figsize=(7, 6))

sns.countplot(data=c_c, x='company', palette='mako', ax=ax)

ax.set_title('companies providing extra_cheese', fontsize=15)

In [None]:
# companies providing both extra sauce and cheese

# Rows that offer both extras. The former `& (df.company)` term and-ed the
# mask with the raw company column, which is not a boolean condition, so it
# has been dropped.
has_both_extras = (df.extra_sauce == 'yes') & (df.extra_cheese == 'yes')
extras_c = df.loc[has_both_extras, ['extra_sauce','extra_cheese','company']]
extras_c.head()

In [None]:
# Row count for each (extra_sauce, extra_cheese, company) combination
extras_c.value_counts()

In [None]:
# Count plot of rows with both extras, one bar per company

sns.set(style='darkgrid')

fig, ax = plt.subplots(figsize=(7, 6))

sns.countplot(data=extras_c, x='company', palette='mako', ax=ax)

ax.set_title('companies providing both extra cheese and sauce', fontsize=15)

In [None]:
# Company A's price ranging

# All rows of company 'A'. The former `& (df.price_rupiah)` term and-ed the
# boolean mask with the float price column, which is not a valid row filter,
# so it has been dropped.
A_prices = df[df.company == 'A']
A_prices

## *Graphical presentation of more features*

In [None]:
def countplot_of_2(x,hue,title=None,figsize=(7,7)):
    """Draw a grouped count plot of two columns of the notebook-level `df`.

    Parameters
    ----------
    x : str
        Column whose categories are laid out along the x axis.
    hue : str
        Column used to split each x category into coloured bars.
    title : str, optional
        Title shown above the plot.
    figsize : tuple, optional
        Figure size passed to matplotlib; defaults to (7, 7).
    """
    fig, ax = plt.subplots(figsize=figsize)
    two_columns = df[[x, hue]]
    sns.countplot(x=x, hue=hue, data=two_columns, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# Row counts per company, split by pizza size

countplot_of_2('company','size','Companies and their pizza size')


In [None]:
# Row counts per company, split by pizza variant (wider figure)

countplot_of_2('company','variant','Companies and their variants of pizza',(25,12))

In [None]:
# Row counts per company, split by topping
countplot_of_2('company','topping','Companies and their toppings on pizza',(18,8))

In [None]:
# Row counts per topping, split by pizza size

countplot_of_2('topping', 'size', 'size and topping',(18,10))

In [None]:
# Row counts per topping, split by extra_sauce


countplot_of_2('topping','extra_sauce','Toppings and Extra Sauce',(15,7))

# *ML Part*

In [None]:
# Preview the frame before the categorical columns are label-encoded
df.head()

## *Encoding*

In [None]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder; the next cell re-fits it for every column in turn,
# so afterwards it only holds the classes of the last column it was fitted on.
le=LabelEncoder()

In [None]:
# Replace each categorical column with its integer label codes.
# The shared encoder is re-fitted per column, in the order listed here.
for column in ('company', 'topping', 'variant', 'size', 'extra_sauce', 'extra_cheese'):
    df[column] = le.fit_transform(df[column])

df.head()

## *Splitting data for Training and Testing*

In [None]:
# Feature matrix: columns are chosen by position, not by name.
# NOTE(review): positions 1 and 5 are left out — presumably the price and size
# columns; confirm against df.columns before relying on this.
X=df.iloc[:, [0,2,3,4,6,7]]
X.head()

In [None]:
# Target vector: the price column
y=df.price_rupiah
y.head()

In [None]:
# dtypes and non-null counts of the selected feature columns
X.info()

## *Converting all features to floating-point dtype, for convenience*

In [None]:
# Cast the label-encoded columns to float64.
# The 'size' column must be reached with bracket access: `df.size` is the
# DataFrame property holding the element count, not the column, so
# `df.size.astype(...)` fails.
# Note: X was sliced from df in an earlier cell, so this cast does not change
# X unless X is created again afterwards.
encoded_columns = ['size', 'company', 'topping', 'extra_sauce', 'extra_cheese', 'variant']
df[encoded_columns] = df[encoded_columns].astype('float64')

In [None]:
# dtype summary after the cast in the previous cell
df.info()

In [None]:
# Summary statistics of the frame
df.describe()

## *Linear Regression*

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 3% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): with so small a test set the test score is presumably very
# noisy — consider a larger test_size.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.03, random_state=130)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training partition
model= LinearRegression()
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import r2_score

# Predictions of the fitted model on the held-out and the training rows
y_predict = model.predict(X_test)
y_predict_train = model.predict(X_train)

# R^2 on each partition, test first
test_r2 = r2_score(y_test, y_predict)
train_r2 = r2_score(y_train, y_predict_train)

print('Test data', test_r2)
print('Train data', train_r2)

In [None]:
# Predicted prices for the held-out rows
y_predict

In [None]:
# Actual prices of the held-out rows, for comparison
y_test

In [None]:
# Predict the price of one hand-written pizza.
# The model was fitted on the DataFrame X, so the sample is wrapped in a frame
# with the same column names; this ties each value to its feature and avoids
# scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame([[4.0,8.5,10.0,16.0,1.0,1.0]], columns=X.columns)
model.predict(sample)

# *Random Forest Regressor*

In [None]:
from sklearn.model_selection import train_test_split
# Same arguments as the split used for the linear model (3% test rows, random_state=130)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.03, random_state=130)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Small random forest: 11 trees, each at most 4 levels deep.
# `criterion='mse'` is omitted: that spelling was removed in scikit-learn 1.2,
# and the default criterion is already squared error in every version.
# random_state makes the fitted forest, and so the scores below, reproducible.
model=RandomForestRegressor(n_estimators=11,max_depth=4,random_state=130)

In [None]:
# Fit the random forest on the training partition
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import r2_score

# Predictions of the fitted forest on the held-out and the training rows
y_predict = model.predict(X_test)
y_predict_train = model.predict(X_train)

# R^2 on each partition, test first
test_r2 = r2_score(y_test, y_predict)
train_r2 = r2_score(y_train, y_predict_train)

print('Test data', test_r2)
print('Train data', train_r2)

In [None]:
# Predicted prices for the held-out rows
y_predict

In [None]:
# Actual prices of the held-out rows, for comparison
y_test

In [None]:
# Predict the price of one hand-written pizza.
# The model was fitted on the DataFrame X, so the sample is wrapped in a frame
# with the same column names; this ties each value to its feature and avoids
# scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame([[4.0,12.0,3.0,0.0,0.0,0.0]], columns=X.columns)
model.predict(sample)