In [322]:
import pathlib
import pandas as pd
import numpy as np

In [323]:
data_dir = pathlib.Path().absolute().parent.as_posix() + '/data/raw/train.csv'

df = pd.read_csv(data_dir)

In [324]:
df.sample(3)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
14109,0xa0de,RANCHIRES010DEL01,22,4.7,23.354422,85.3329,23.424422,85.4029,02-03-2022,18:25:00,18:40:00,conditions Sunny,Medium,1,Snack,scooter,0,,Metropolitian,(min) 13
27980,0x4b30,SURRES15DEL01,30,4.2,21.160522,72.771477,21.230522,72.841477,02-03-2022,20:25:00,20:30:00,conditions Stormy,Jam,0,Snack,motorcycle,3,Yes,Metropolitian,(min) 53
19660,0xb972,VADRES13DEL01,37,4.7,22.310237,73.158921,22.350237,73.198921,26-03-2022,12:40:00,12:50:00,conditions Stormy,High,1,Snack,motorcycle,1,No,Metropolitian,(min) 25


In [325]:
df.shape

(45593, 20)

In [326]:
# drop ID & Delivery_pperson_id

<font color = 'orange'>

#### <b> City
- handle NaN values
- handle extra space at suffix
- change data type

In [327]:
print(f"Unique Valeus: {df['City'].unique()} \n\nNaN Count: {df[df['City'] == 'NaN '].shape[0]} \n\nValue Count: {df['City'].value_counts()}")

Unique Valeus: ['Urban ' 'Metropolitian ' 'Semi-Urban ' 'NaN '] 

NaN Count: 1200 

Value Count: City
Metropolitian     34093
Urban             10136
NaN                1200
Semi-Urban          164
Name: count, dtype: int64


In [328]:
# fix extra spaces
df['City'] = list(map(lambda x: x.strip(), df['City']))

df['City'].unique()

array(['Urban', 'Metropolitian', 'Semi-Urban', 'NaN'], dtype=object)

In [329]:
# fix 'NaN' with np.nan
df['City'] = df['City'].replace('NaN', np.nan)

In [330]:
df['City'].isnull().sum()

np.int64(1200)

<font color = 'orange'>

#### <b> Festival
- handle NaN
- handle extra space at suffix
- change data type

In [331]:
print(f"Unique Valeus: {df['Festival'].unique()} \n\nNaN Count: {df[df['Festival'] == 'NaN '].shape[0]} \n\nValue Count: {df['Festival'].value_counts()}")

Unique Valeus: ['No ' 'Yes ' 'NaN '] 

NaN Count: 228 

Value Count: Festival
No      44469
Yes       896
NaN       228
Name: count, dtype: int64


In [332]:
# fix the extra space
df['Festival'] = list(map(lambda x: x.strip(), df['Festival']))
print(f"Unique value: {df['Festival'].unique()}")

# fix 'NaN' with np.nan
df['Festival'] = df['Festival'].replace('NaN', np.nan)
print(f"Value Count: {df['Festival'].isnull().sum()}")

Unique value: ['No' 'Yes' 'NaN']
Value Count: 228


<font color = 'orange'>

#### <b> multiple_deliveries
- handle NaN
- change data type

In [333]:
df['multiple_deliveries'].unique()

array(['0', '1', '3', 'NaN ', '2'], dtype=object)

In [335]:
df['multiple_deliveries'].value_counts()

multiple_deliveries
1       28159
0       14095
2        1985
NaN       993
3         361
Name: count, dtype: int64

In [337]:
# fix 'NaN ' with np.nan
df['multiple_deliveries'] = df['multiple_deliveries'].replace('NaN ', np.nan)
print(f"Value Count: {df['multiple_deliveries'].isnull().sum()}")

Value Count: 993


In [338]:
# fixing data type
df['multiple_deliveries'] = df['multiple_deliveries'].astype('Int64')

In [339]:
# as we dont have any clear explanation we'll convert 1,2,3 as 1 which means multiple deliveries
# and 0 means single delivery 
df['multiple_deliveries'] = df['multiple_deliveries'].apply(lambda x : x if x == 0 or np.isnan(x) else 1)

In [340]:
df['multiple_deliveries'].unique()

array([ 0.,  1., nan])

<font color = 'orange'>

#### <b> Type_of_vehicle
- handle extra space at suffix
- change data type

In [342]:
df['Type_of_vehicle'].unique()

array(['motorcycle ', 'scooter ', 'electric_scooter ', 'bicycle '],
      dtype=object)

In [343]:
# fix the extra space
df['Type_of_vehicle'] = list(map(lambda x: x.strip(), df['Type_of_vehicle']))
print(f"Unique value: {df['Type_of_vehicle'].unique()}")

Unique value: ['motorcycle' 'scooter' 'electric_scooter' 'bicycle']


<font color = 'orange'>

#### <b> Type_of_order
- handle extra space at suffix
- change data type

In [344]:
df['Type_of_order'].unique()

array(['Snack ', 'Drinks ', 'Buffet ', 'Meal '], dtype=object)

In [345]:
# fix the extra space
df['Type_of_order'] = list(map(lambda x: x.strip(), df['Type_of_order']))
print(f"Unique value: {df['Type_of_order'].unique()}")

Unique value: ['Snack' 'Drinks' 'Buffet' 'Meal']


<font color = 'orange'>

#### <b> Road_traffic_density
- handle NaN
- handle extra space at suffix
- change data type

In [346]:
df['Road_traffic_density'].unique()

array(['High ', 'Jam ', 'Low ', 'Medium ', 'NaN '], dtype=object)

In [347]:
df['Road_traffic_density'].value_counts()

Road_traffic_density
Low        15477
Jam        14143
Medium     10947
High        4425
NaN          601
Name: count, dtype: int64

In [348]:
# fix the extra space
df['Road_traffic_density'] = list(map(lambda x: x.strip(), df['Road_traffic_density']))
print(f"Unique value: {df['Road_traffic_density'].unique()}")

# fix 'NaN' with np.nan
df['Road_traffic_density'] = df['Road_traffic_density'].replace('NaN', np.nan)
print(f"Value Count: {df['Road_traffic_density'].isnull().sum()}")

Unique value: ['High' 'Jam' 'Low' 'Medium' 'NaN']
Value Count: 601


<font color = 'orange'>

#### <b> Delivery_person_Rattings
- handle NaN
- change data type

In [349]:
df['Delivery_person_Ratings'].unique()

array(['4.9', '4.5', '4.4', '4.7', '4.6', '4.8', '4.2', '4.3', '4', '4.1',
       '5', '3.5', 'NaN ', '3.8', '3.9', '3.7', '2.6', '2.5', '3.6',
       '3.1', '2.7', '1', '3.2', '3.3', '6', '3.4', '2.8', '2.9', '3'],
      dtype=object)

In [350]:
df[df['Delivery_person_Ratings'] == 'NaN '].shape[0]

1908

In [351]:
# fixing data type
df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].astype('float')

In [352]:
df['Delivery_person_Ratings'].isna().sum()

np.int64(1908)

<font color = 'orange'>

#### <b> Delivery_person_Age
- handle NaN
- change data type

In [353]:
df['Delivery_person_Age'].unique()

array(['37', '34', '23', '38', '32', '22', '33', '35', '36', '21', '24',
       '29', '25', '31', '27', '26', '20', 'NaN ', '28', '39', '30', '15',
       '50'], dtype=object)

In [354]:
# 'NaN ' is present in Delivery_person_Age column - 1854
df[df['Delivery_person_Age'] == 'NaN '].shape[0]

1854

In [355]:
# fixing NaN
df['Delivery_person_Age'] = df['Delivery_person_Age'].replace('NaN ', np.nan)

# fixing data type
df['Delivery_person_Age'] = df['Delivery_person_Age'].astype('Int64')

In [356]:
df['Delivery_person_Age'].isnull().sum()

np.int64(1854)

<font color = 'orange'>

#### <b> Weatherconditions
- handle Nan
- change data type

In [357]:
df['Weatherconditions'].unique()

array(['conditions Sunny', 'conditions Stormy', 'conditions Sandstorms',
       'conditions Cloudy', 'conditions Fog', 'conditions Windy',
       'conditions NaN'], dtype=object)

In [358]:
df['Weatherconditions'].value_counts()

Weatherconditions
conditions Fog           7654
conditions Stormy        7586
conditions Cloudy        7536
conditions Sandstorms    7495
conditions Windy         7422
conditions Sunny         7284
conditions NaN            616
Name: count, dtype: int64

In [359]:
# fixing NaN
df['Weatherconditions'] = df['Weatherconditions'].replace('conditions NaN', np.nan)

In [360]:
df['Weatherconditions'].isnull().sum()

np.int64(616)

In [361]:
df.isnull().sum()

ID                                0
Delivery_person_ID                0
Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                       0
Time_Order_picked                 0
Weatherconditions               616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken(min)                   0
dtype: int64

In [362]:
# % of NaN values
round(df.isnull().mean() * 100, 2)

ID                             0.00
Delivery_person_ID             0.00
Delivery_person_Age            4.07
Delivery_person_Ratings        4.18
Restaurant_latitude            0.00
Restaurant_longitude           0.00
Delivery_location_latitude     0.00
Delivery_location_longitude    0.00
Order_Date                     0.00
Time_Orderd                    0.00
Time_Order_picked              0.00
Weatherconditions              1.35
Road_traffic_density           1.32
Vehicle_condition              0.00
Type_of_order                  0.00
Type_of_vehicle                0.00
multiple_deliveries            2.18
Festival                       0.50
City                           2.63
Time_taken(min)                0.00
dtype: float64

# Feature Engineering

"restaurant - delivery location" latitude & longitude
- Euclidean dist
- haversine dist
- manhattan dist
- estimate delivery time (assume 30 kmph)

time ordered & time order picked
- time lag
- ordered picked time ratio

order date
- hour
- day of week
- month
- quarter
- year