**Data Operations with Polars instead of Pandas**

In [1]:
import polars as pl

In [2]:
df = pl.read_csv("data/train.csv")

In [3]:
df_test = pl.read_csv("data/test.csv")

In [4]:
set(df.columns) - set(df_test.columns)

{'Time_taken(min)'}

In [5]:
df.columns

['ID',
 'Delivery_person_ID',
 'Delivery_person_Age',
 'Delivery_person_Ratings',
 'Restaurant_latitude',
 'Restaurant_longitude',
 'Delivery_location_latitude',
 'Delivery_location_longitude',
 'Order_Date',
 'Time_Orderd',
 'Time_Order_picked',
 'Weatherconditions',
 'Road_traffic_density',
 'Vehicle_condition',
 'Type_of_order',
 'Type_of_vehicle',
 'multiple_deliveries',
 'Festival',
 'City',
 'Time_taken(min)']

Convert latitude and longitude data to H3 based hexagon ids

In [24]:
import h3
# same operations for test.csv data
res = 6
distance_data = []
restaurants_data = []
deliveries_data = []

for row in df.rows(named=True):
    res_del_distance_meters = h3.point_dist((row["Restaurant_latitude"], row["Restaurant_longitude"]),
                               (row["Delivery_location_latitude"], row["Delivery_location_longitude"]),
                               unit="m")
    restaurant = h3.geo_to_h3(lat=row["Restaurant_latitude"], lng=row["Restaurant_longitude"], resolution=res)
    delivery = h3.geo_to_h3(lat=row["Delivery_location_latitude"], lng=row["Delivery_location_longitude"], resolution=res)

    restaurants_data.append(restaurant)
    deliveries_data.append(delivery)
    distance_data.append(res_del_distance_meters)

df = df.with_columns([
    pl.Series(name="Distance(m)", values=distance_data),
    pl.Series(name="Restaurant_hex_id", values=restaurants_data),
    pl.Series(name="Delivery_hex_id", values=deliveries_data)
])

Remove latitude and longitude columns of restaurant and delivery locations

In [25]:
df = df.drop(columns=["Restaurant_latitude", "Restaurant_longitude", "Delivery_location_longitude", "Delivery_location_latitude"])

In [8]:
import re

Convert multiple_deliveries column type from utf8 to integer after removing of "NaN " values

In [9]:
def convert_int_function(r):
    try:
        return int(re.search(r'\d+', r).group())
    except Exception as err:
        print(err)
        return -1

In [10]:
def remove_nan_from_multiple_deliveries(r: str):
    return "-1" if "NaN" in r else r

In [26]:
df["multiple_deliveries"].value_counts()

multiple_deliveries,counts
str,u32
"""1""",7065
"""0""",3491
"""2""",513
"""NaN """,238
"""3""",92


In [28]:
df = df.with_columns(
    [
        pl.col("multiple_deliveries").apply(remove_nan_from_multiple_deliveries).cast(pl.Int64),
        pl.col("Time_taken(min)").apply(convert_int_function)
    ]
)

In [29]:
df["multiple_deliveries"].value_counts()

multiple_deliveries,counts
i64,u32
0,3491
-1,238
3,92
2,513
1,7065


In [30]:
# Apply all operations to test data also
# df_test.write_csv("data/processed_test_data.csv", has_header=True)

In [14]:
# "Time_Orderd", "Time_Order_picked" can be used for preparing time
X = df[["Road_traffic_density", "Type_of_vehicle", "Distance(m)",
        "Weatherconditions", "Vehicle_condition", "multiple_deliveries"]].to_pandas()
y = df["Time_taken(min)"].to_pandas()

In [15]:
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


Duration estimation of delivery with CatBoost Regressor model

In [16]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

cb = CatBoostRegressor(
    n_estimators=200,
    loss_function="MAPE",
    learning_rate=0.3,
    depth=3,
    random_state=1,
    verbose=False
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
train_pool = Pool(data=X_train, label=y_train, cat_features=["Road_traffic_density", "Type_of_vehicle",
                                                             "Weatherconditions", "Vehicle_condition", "multiple_deliveries"])
test_pool = Pool(X_test, cat_features=["Road_traffic_density", "Type_of_vehicle",
                                       "Weatherconditions", "Vehicle_condition", "multiple_deliveries"])
cb.fit(train_pool)

<catboost.core.CatBoostRegressor at 0x7f844a797820>

In [17]:
y_pred = cb.predict(test_pool)

In [18]:
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error as mape

cb_mape = np.sqrt(mape(y_test, y_pred))
print(f"MAPE in y_pred : {np.mean(cb_mape)}")

MAPE in y_pred : 0.4623135193278682


In [19]:
cb.feature_names_

['Road_traffic_density',
 'Type_of_vehicle',
 'Distance(m)',
 'Weatherconditions',
 'Vehicle_condition',
 'multiple_deliveries']

In [20]:
cb.save_model("model/cb.pkl", format="cbm")

In [21]:
df.write_csv("data/processed_data.csv", has_header=True)

Predict with test data

In [22]:
cb.predict(data=["High", "motorcycle", 1465.2041612956887, "conditions Windy", 0, 3])

35.1904627569758

In [23]:
cb.predict(data=["Low ", "scooter", 7763.616358402078, "conditions Sunny", 2, 0])

16.510762611140738