In [1]:
# with orders also containing store and date & time of order

# importing libraries  
import numpy as nm  
import matplotlib.pyplot as mtp  
import pandas as pd 
from sklearn.utils import shuffle

# Model-specific imports for this cell (kept next to their use so the cell
# stays runnable on its own; ideally they live in the top import block).
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the orders data set. The last column is treated as the regression
# target and every preceding column as a feature.
data_set = pd.read_csv('orders_loc_time.csv')

print(data_set.columns)
print('\n')

# Distinct location ids, in order of first appearance.
locs = data_set['LocationId'].unique()

# Running numbers 1..N, one per distinct location id.
run_num = list(range(1, len(locs) + 1))

# Encode the location ids as running numbers. The mapping is applied to the
# 'LocationId' column only, so a value in another column that happens to equal
# a location id can never be rewritten by accident.
data_set['LocationId'] = data_set['LocationId'].map(dict(zip(locs, run_num)))

# Shuffle the rows before training. The seed is fixed so that the split below
# (and therefore the reported scores) is reproducible across runs.
data_set = shuffle(data_set, random_state=0)

# Separate the independent variables (all but the last column) from the
# dependent variable (the last column). Both use positions relative to the end
# so they cannot drift apart if a feature column is added.
x = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values

# Hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Fit the random forest regressor on the training set.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(x_train, y_train)

# Predict the held-out test set.
y_pred = rf.predict(x_test)

# Evaluate. For a regressor, score() returns the coefficient of determination
# (R^2), shown here scaled by 100.
print(f'Train Score: {rf.score(x_train, y_train)*100:.2f}%')
print(f'Test Score: {rf.score(x_test, y_test)*100:.2f}%')

Index(['LocationId', 'order_year', 'order_month', 'order_date', 'order_hour',
       'order_min', 'order_sec', 'ordersize', 'itemscount',
       'pendingorderscount', 'pendingorderssize', 'pendingorderitems',
       'bumptime'],
      dtype='object')


Train Score: 98.24%
Test Score: 94.90%


In [2]:
# Predict the bump time for new orders and print it as hours:minutes:seconds.
# Each row lists the feature values in the same order as the training columns,
# starting with the raw location id.
newOrders = [['0714a783-6cb8-4d9b-8679-d7924cde08f6', 2023, 5, 18, 16, 45, 50, 5, 1, 1, 3, 1], 
             ['0714a783-6cb8-4d9b-8679-d7924cde08f6', 2023, 3, 12, 4, 23, 33, 5, 1, 1, 3, 1]]
df = pd.DataFrame(newOrders)

# Encode the location id (first column) with the same running numbers used for
# training. Only that column is mapped, so numeric feature values are never
# touched by the replacement.
df[0] = df[0].map(dict(zip(locs, run_num)))
if df[0].isna().any():
    # Fail early with a clear message instead of an obscure conversion error
    # from inside predict().
    raise ValueError("newOrders contains a LocationId that was not seen in the training data")
newOrders = df.values.tolist()

preds = rf.predict(newOrders)

for pred in preds:
    # Split the predicted number of seconds into whole hours, minutes and
    # seconds. divmod handles every magnitude uniformly, so short predictions
    # print 0 for the larger units instead of reading variables that were
    # never assigned (or left over from the previous prediction).
    hour, rem = divmod(int(pred), 3600)
    minute, sec = divmod(rem, 60)
    print("bump time --> %r:%r:%r" %(hour, minute, sec))

bump time --> 19:16:28
bump time --> 7:46:42


In [2]:
# Print the fitted forest's importance for each feature, scaled by 100.
# Importances are matched to column names by position; the feature columns are
# the leading columns of data_set.
importance = rf.feature_importances_
for i in range(len(importance)):
    v = importance[i]
    print(f'Feature: {data_set.columns[i]}, Score: {v*100: .3f}%')

Feature: LocationId, Score:  0.332%
Feature: order_year, Score:  0.002%
Feature: order_month, Score:  34.339%
Feature: order_date, Score:  51.455%
Feature: order_hour, Score:  3.705%
Feature: order_min, Score:  2.058%
Feature: order_sec, Score:  3.156%
Feature: ordersize, Score:  0.032%
Feature: itemscount, Score:  0.000%
Feature: pendingorderscount, Score:  1.395%
Feature: pendingorderssize, Score:  2.049%
Feature: pendingorderitems, Score:  1.478%
