In [1]:
#open file 
import pandas as pd
train = pd.read_csv('dataset.csv')
train

Unnamed: 0,Sold_On,Size,Ingredients_Cost,Design_Complexity,Time_Taken,Price,Amount,Gender
0,Monday,large,28,complex,6,49,10,female
1,Saturday,medium,65,simple,2,93,9,female
2,Wednesday,large,96,simple,5,128,8,male
3,Tuesday,medium,155,simple,3,198,6,female
4,Saturday,large,20,simple,2,32,5,male
...,...,...,...,...,...,...,...,...
3995,Thursday,medium,200,simple,4,283,10,female
3996,Saturday,medium,152,complex,7,202,1,female
3997,Friday,small,153,complex,10,218,6,male
3998,Tuesday,small,207,simple,4,280,5,male


In [2]:
#check data of file
train.info()
print(train.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sold_On            4000 non-null   object
 1   Size               4000 non-null   object
 2   Ingredients_Cost   4000 non-null   int64 
 3   Design_Complexity  4000 non-null   object
 4   Time_Taken         4000 non-null   int64 
 5   Price              4000 non-null   int64 
 6   Amount             4000 non-null   int64 
 7   Gender             4000 non-null   object
dtypes: int64(4), object(4)
memory usage: 250.1+ KB
Sold_On              0
Size                 0
Ingredients_Cost     0
Design_Complexity    0
Time_Taken           0
Price                0
Amount               0
Gender               0
dtype: int64


In [3]:
#describe the data 
train.describe()


Unnamed: 0,Ingredients_Cost,Time_Taken,Price,Amount
count,4000.0,4000.0,4000.0,4000.0
mean,111.5525,4.683,154.68075,5.455
std,57.863428,2.830285,76.282079,2.871591
min,10.0,1.0,14.0,1.0
25%,62.0,2.75,90.0,3.0
50%,113.0,4.0,154.0,5.0
75%,162.0,6.0,217.0,8.0
max,210.0,11.0,329.0,10.0


In [4]:
#encode categorical data by label encoding
from sklearn.preprocessing import LabelEncoder

label_cols = ['Sold_On', 'Size', 'Design_Complexity', 'Gender']
encoder = LabelEncoder()
for col in label_cols:
    train[col] = encoder.fit_transform(train[col])
train

Unnamed: 0,Sold_On,Size,Ingredients_Cost,Design_Complexity,Time_Taken,Price,Amount,Gender
0,1,0,28,0,6,49,10,0
1,2,1,65,1,2,93,9,0
2,6,0,96,1,5,128,8,1
3,5,1,155,1,3,198,6,0
4,2,0,20,1,2,32,5,1
...,...,...,...,...,...,...,...,...
3995,4,1,200,1,4,283,10,0
3996,2,1,152,0,7,202,1,0
3997,0,2,153,0,10,218,6,1
3998,5,2,207,1,4,280,5,1


In [5]:
y = train["Price"]
X = train.drop(columns=["Price"])


In [6]:
#split data into X and y
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
#create and train an initial Random Forest model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
#evaluate the initial model using MAE and RMSE metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("MAE:", mae)
print("RMSE:", rmse)


MAE: 9.669735416666667
RMSE: 12.494954849688888


In [9]:
#remove outliers to improve training quality
df = train[train['Price'] < train['Price'].quantile(0.95)]


In [10]:
#create an improved Random Forest model with manual hyperparameters
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
    n_estimators=300,      
    max_depth=15,          
    min_samples_split=5,
    random_state=42
)


In [11]:
#use GridSearchCV to find the best model hyperparameters automatically
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15],
    'min_samples_split': [2, 5]
}

grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_mean_absolute_error')
grid.fit(X_train, y_train)

print(grid.best_params_)


{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}


In [12]:
#evaluate the best model obtained from Grid Search
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

best_model = grid.best_estimator_

val_preds = best_model.predict(X_train,)

mae = mean_absolute_error(y_train, val_preds)
rmse = np.sqrt(mean_squared_error(y_train, val_preds))

print("MAE:", mae)
print("RMSE:", rmse)


MAE: 5.6859115778208755
RMSE: 7.399844037902914


In [13]:
#predict the price of a new cake using the best trained model
new_cake = pd.DataFrame({
    'Sold_On': [5],
    'Size': [1],
    'Ingredients_Cost': [50],
    'Design_Complexity': [0],
    'Time_Taken': [3],
    'Amount': [2],
    'Gender': [1]
})

predicted_price = best_model.predict(new_cake)
print("Predicted Price:", predicted_price[0])


Predicted Price: 72.25800700390045
