In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings

In [3]:
df = pd.read_excel("Supermarket-pred-data.xlsx",index_col=0)

In [4]:
df["Quantity"].max()

76

In [5]:
df.head()

Unnamed: 0,Date,Month name,Day name,Time Phase,Weekend,Quantity,cogs,Tax 5%,Total
0,1,1,2,2,0,29,1841.02,92.051,1933.071
1,1,1,2,1,0,24,1583.02,79.151,1662.171
2,1,1,2,3,0,28,1095.18,54.759,1149.939
3,2,1,3,2,0,34,1253.38,62.669,1316.049
4,2,1,3,3,0,14,599.48,29.974,629.454


In [6]:
df.drop(["Tax 5%","Weekend"],inplace=True,axis = 1)

In [7]:
cond1 = (df["Quantity"]>=0)  & (df["Quantity"]<15 )
cond2 = (df["Quantity"]>=15) & (df["Quantity"]<30 )
cond3 = (df["Quantity"]>=30) & (df["Quantity"]<45 )
cond4 = (df["Quantity"]>=45) & (df["Quantity"]<60 )
cond5 = (df["Quantity"]>=60) & (df["Quantity"]<100 )

df["Quantity"] = np.where(cond1,10,df["Quantity"])
df["Quantity"] = np.where(cond2,25,df["Quantity"])
df["Quantity"] = np.where(cond3,40,df["Quantity"])
df["Quantity"] = np.where(cond4,55,df["Quantity"])
df["Quantity"] = np.where(cond5,65,df["Quantity"])
df.head()

Unnamed: 0,Date,Month name,Day name,Time Phase,Quantity,cogs,Total
0,1,1,2,2,25,1841.02,1933.071
1,1,1,2,1,25,1583.02,1662.171
2,1,1,2,3,25,1095.18,1149.939
3,2,1,3,2,40,1253.38,1316.049
4,2,1,3,3,10,599.48,629.454


In [8]:
df["Quantity"].unique()

array([25, 40, 10, 55, 65], dtype=int64)

In [9]:
#Dividing the data into input features and the target
X = df[["Date","Month name","Day name","Time Phase"]]
Y = df[["Quantity","Total"]]

In [10]:
print("Shape of the real data set : ", df.shape)
"""
Splitting the data 
(train) 90:10 (test)
"""
print()
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.1)
print("Shape of X_train is : ",X_train.shape)
print("Shape of X_test is : ",X_test.shape)
print("Shape of y_train is : ",y_train.shape)
print("Shape of y_test is : ",y_test.shape)

Shape of the real data set :  (252, 7)

Shape of X_train is :  (226, 4)
Shape of X_test is :  (26, 4)
Shape of y_train is :  (226, 2)
Shape of y_test is :  (26, 2)


In [11]:
#Making the Random Forest Regressor Object
Rfrg = RandomForestRegressor(n_estimators=200,random_state=42, min_samples_leaf = 4, min_samples_split= 10)

In [12]:
#Fitting on Data points
Rfrg.fit(X_train,y_train)

In [34]:
#We can see the feature importance here.
Rfrg.feature_importances_

array([0.26857168, 0.10727279, 0.13260502, 0.49155051])

In [35]:
y_pred = Rfrg.predict(X_test)

In [36]:
# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 264.32169700718214


In [37]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 216699.53760167505


In [38]:
r2 = r2_score(y_test, y_pred)
print("R-squared (R2) score:", r2)

R-squared (R2) score: 0.38085638413436135


In [39]:
num_features = X_train.shape[1]
adjusted_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_features - 1)
print("R-squared score:", r2)
print("Adjusted R-squared score:", adjusted_r2)

R-squared score: 0.38085638413436135
Adjusted R-squared score: 0.2629242668266206


In [18]:
"""OPTIONAL FOR HYPER-PARAMETER TUNING (GETTING THE BEST PARAM VALUES)"""
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor

# # Define hyperparameters to search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# rf_regressor = RandomForestRegressor(random_state=42)
# grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Getting the best hyperparameters
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

# # Training a new Random Forest regressor with the best hyperparameters
# best_rf_regressor = RandomForestRegressor(random_state=42, **best_params)
# best_rf_regressor.fit(X_train, y_train)

# # Evaluating the model
# y_pred = best_rf_regressor.predict(X_test)
# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Mean Absolute Error (MAE):", mae)
# print("Mean Squared Error (MSE):", mse)
# print("R-squared (R2) score:", r2)
# adjusted_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - num_features - 1)
# print("Adjusted R-squared score:", adjusted_r2)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Mean Absolute Error (MAE): 348.7865077084939
Mean Squared Error (MSE): 389685.6509477533
R-squared (R2) score: 0.39221281533032504
Adjusted R-squared score: 0.2764438277741965


In [40]:
warnings.filterwarnings("ignore",category=UserWarning)
print(pd.DataFrame(Rfrg.predict([[2	,1	,3,	2	]]),columns=["Expected Quantity","Expected Sales"]))

   Expected Quantity  Expected Sales
0          32.570614     1551.459168


In [41]:
df.iloc[3:4,:]

Unnamed: 0,Date,Month name,Day name,Time Phase,Quantity,cogs,Total
3,2,1,3,2,40,1253.38,1316.049
