<a href="https://colab.research.google.com/github/sbesinski/kaggle_ML/blob/main/food_delivery_estimated_time_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("denkuznetz/food-delivery-time-prediction")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/denkuznetz/food-delivery-time-prediction?dataset_version_number=1...


100%|██████████| 11.6k/11.6k [00:00<00:00, 15.7MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/denkuznetz/food-delivery-time-prediction/versions/1





In [2]:
import pandas as pd
import os
import matplotlib as plt

csv_path = os.path.join(path,'Food_Delivery_Times.csv')
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Order_ID,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,522,7.93,Windy,Low,Afternoon,Scooter,12,1.0,43
1,738,16.42,Clear,Medium,Evening,Bike,20,2.0,84
2,741,9.52,Foggy,Low,Night,Scooter,28,1.0,59
3,661,7.44,Rainy,Medium,Afternoon,Scooter,5,1.0,37
4,412,19.03,Clear,Low,Morning,Bike,16,5.0,68


In [3]:
df.describe()

Unnamed: 0,Order_ID,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
count,1000.0,1000.0,1000.0,970.0,1000.0
mean,500.5,10.05997,16.982,4.579381,56.732
std,288.819436,5.696656,7.204553,2.914394,22.070915
min,1.0,0.59,5.0,0.0,8.0
25%,250.75,5.105,11.0,2.0,41.0
50%,500.5,10.19,17.0,5.0,55.5
75%,750.25,15.0175,23.0,7.0,71.0
max,1000.0,19.99,29.0,9.0,153.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Order_ID                1000 non-null   int64  
 1   Distance_km             1000 non-null   float64
 2   Weather                 970 non-null    object 
 3   Traffic_Level           970 non-null    object 
 4   Time_of_Day             970 non-null    object 
 5   Vehicle_Type            1000 non-null   object 
 6   Preparation_Time_min    1000 non-null   int64  
 7   Courier_Experience_yrs  970 non-null    float64
 8   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB


In [5]:
print(df.isnull().sum())

Order_ID                   0
Distance_km                0
Weather                   30
Traffic_Level             30
Time_of_Day               30
Vehicle_Type               0
Preparation_Time_min       0
Courier_Experience_yrs    30
Delivery_Time_min          0
dtype: int64


In [6]:
df.tail()

Unnamed: 0,Order_ID,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
995,107,8.5,Clear,High,Evening,Car,13,3.0,54
996,271,16.28,Rainy,Low,Morning,Scooter,8,9.0,71
997,861,15.62,Snowy,High,Evening,Scooter,26,2.0,81
998,436,14.17,Clear,Low,Afternoon,Bike,8,0.0,55
999,103,6.63,Foggy,Low,Night,Scooter,24,3.0,58


In [7]:
df.keys()

Index(['Order_ID', 'Distance_km', 'Weather', 'Traffic_Level', 'Time_of_Day',
       'Vehicle_Type', 'Preparation_Time_min', 'Courier_Experience_yrs',
       'Delivery_Time_min'],
      dtype='object')

In [8]:
missing_columns = df.columns[df.isnull().any()]
print("Columns with missing values:", missing_columns.tolist())

Columns with missing values: ['Weather', 'Traffic_Level', 'Time_of_Day', 'Courier_Experience_yrs']


In [9]:
# Get rows with missing values
rows_with_missing_data = df[df.isnull().any(axis=1)]

# Print the rows
print("Rows with missing data:")
print(rows_with_missing_data)

Rows with missing data:
     Order_ID  Distance_km Weather Traffic_Level Time_of_Day Vehicle_Type  \
6         627         9.52   Clear           Low         NaN         Bike   
14        939         2.80   Clear          High     Morning      Scooter   
24        211        11.20   Clear        Medium     Morning         Bike   
42        313         0.99     NaN        Medium     Evening         Bike   
71        494         4.17     NaN           Low     Evening      Scooter   
..        ...          ...     ...           ...         ...          ...   
974       414        11.68   Clear           NaN   Afternoon      Scooter   
976       344         8.96   Snowy           NaN     Morning          Car   
987       331         7.44   Rainy           Low     Evening         Bike   
988       215        14.39   Rainy        Medium     Morning      Scooter   
989       467         6.07     NaN           Low   Afternoon         Bike   

     Preparation_Time_min  Courier_Experience_yrs  

In [10]:
df.drop(columns=['Order_ID'], inplace=True)

In [11]:
feature_values = df[:-1]
print(feature_values)

     Distance_km Weather Traffic_Level Time_of_Day Vehicle_Type  \
0           7.93   Windy           Low   Afternoon      Scooter   
1          16.42   Clear        Medium     Evening         Bike   
2           9.52   Foggy           Low       Night      Scooter   
3           7.44   Rainy        Medium   Afternoon      Scooter   
4          19.03   Clear           Low     Morning         Bike   
..           ...     ...           ...         ...          ...   
994         4.37   Clear        Medium     Evening      Scooter   
995         8.50   Clear          High     Evening          Car   
996        16.28   Rainy           Low     Morning      Scooter   
997        15.62   Snowy          High     Evening      Scooter   
998        14.17   Clear           Low   Afternoon         Bike   

     Preparation_Time_min  Courier_Experience_yrs  Delivery_Time_min  
0                      12                     1.0                 43  
1                      20                     2.0    

In [12]:
all_columns = df.columns[df.any()]
print("Columns names:", all_columns.tolist())

Columns names: ['Distance_km', 'Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type', 'Preparation_Time_min', 'Courier_Experience_yrs', 'Delivery_Time_min']


handling missing value

In [13]:
# Replace NaN in string (categorical) columns with the most frequent value (mode)
categorical_columns = ['Weather', 'Traffic_Level', 'Time_of_Day']
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True) # Mode[0] gives the most frequent value

# Replace NaN in float column with the mean
df['Courier_Experience_yrs'].fillna(df['Courier_Experience_yrs'].mean(), inplace=True)

print(df.isnull().sum())

Distance_km               0
Weather                   0
Traffic_Level             0
Time_of_Day               0
Vehicle_Type              0
Preparation_Time_min      0
Courier_Experience_yrs    0
Delivery_Time_min         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True) # Mode[0] gives the most frequent value
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Courier_Experience_yrs'].fillna(df['Courier_Experience_yrs'].mean(), inplace=True)


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Distance_km             1000 non-null   float64
 1   Weather                 1000 non-null   object 
 2   Traffic_Level           1000 non-null   object 
 3   Time_of_Day             1000 non-null   object 
 4   Vehicle_Type            1000 non-null   object 
 5   Preparation_Time_min    1000 non-null   int64  
 6   Courier_Experience_yrs  1000 non-null   float64
 7   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 62.6+ KB


In [18]:
from sklearn.preprocessing import LabelEncoder
target_labels = ['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type', 'Courier_Experience_yrs' ]
label_encoder = LabelEncoder()

# Encode each column
for col in target_labels:
    df[col] = label_encoder.fit_transform(df[col])

In [19]:
from sklearn.preprocessing import OneHotEncoder

target_values = ['Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type', 'Courier_Experience_yrs']
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(df[target_values])
encoded_features.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [20]:
from sklearn.preprocessing import StandardScaler
feature_values = ['Distance_km', 'Weather', 'Traffic_Level', 'Time_of_Day', 'Vehicle_Type', 'Preparation_Time_min', 'Courier_Experience_yrs']
scaler = StandardScaler()
df[feature_values] = scaler.fit_transform(df[feature_values])

In [21]:
target = 'Delivery_Time_min'

#df[target] = scaler.fit_transform(df[[target]]). tutaj nie ma potrzeby skalowania ponieważ otrzymamy czas na minusie

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Distance_km             1000 non-null   float64
 1   Weather                 1000 non-null   float64
 2   Traffic_Level           1000 non-null   float64
 3   Time_of_Day             1000 non-null   float64
 4   Vehicle_Type            1000 non-null   float64
 5   Preparation_Time_min    1000 non-null   float64
 6   Courier_Experience_yrs  1000 non-null   float64
 7   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 62.6 KB


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X = df[feature_values]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
X[:-1]

Unnamed: 0,Distance_km,Weather,Traffic_Level,Time_of_Day,Vehicle_Type,Preparation_Time_min,Courier_Experience_yrs
0,-0.374085,2.030087,-0.296080,-1.281817,1.373492,-0.691853,-1.236594
1,1.117008,-0.855609,1.031634,-0.234581,-0.913755,0.419111,-0.934322
2,-0.094835,-0.134185,-0.296080,1.859891,1.373492,1.530076,-1.236594
3,-0.460144,0.587239,1.031634,-1.281817,1.373492,-1.663947,-1.236594
4,1.575401,-0.855609,-0.296080,0.812655,-0.913755,-0.136371,0.274765
...,...,...,...,...,...,...,...
994,-0.999326,-0.855609,1.031634,-0.234581,1.373492,-1.525077,0.879309
995,-0.273977,-0.855609,-1.623795,-0.234581,0.229868,-0.552983,-0.632050
996,1.092420,0.587239,-0.296080,0.812655,1.373492,-1.247335,1.483852
997,0.976505,1.308663,-1.623795,-0.234581,1.373492,1.252335,-0.934322


In [25]:
y[:-1]

Unnamed: 0,Delivery_Time_min
0,43
1,84
2,59
3,37
4,68
...,...
994,25
995,54
996,71
997,81


In [26]:
X_test.shape

(200, 7)

In [27]:
model = LinearRegression()
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7564743649408955

In [33]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
y_pred = model.predict(X_test)
# Regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 99.221636
Mean Absolute Error: 7.0870999999999995
R-squared: 0.7786352862854807


In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7786352862854807

In [35]:
# Regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 99.221636
Mean Absolute Error: 7.0870999999999995
R-squared: 0.7786352862854807


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Distance_km             1000 non-null   float64
 1   Weather                 1000 non-null   float64
 2   Traffic_Level           1000 non-null   float64
 3   Time_of_Day             1000 non-null   float64
 4   Vehicle_Type            1000 non-null   float64
 5   Preparation_Time_min    1000 non-null   float64
 6   Courier_Experience_yrs  1000 non-null   float64
 7   Delivery_Time_min       1000 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 62.6 KB


In [37]:
# Optional: Print a few predictions vs actual values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison.head())

     Actual  Predicted
521      32      37.04
737      68      67.61
740      39      40.27
660      44      44.10
411      85      77.34
