## Dataset Preparation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from math import radians, cos, sin, sqrt, asin
import seaborn as sns

df = pd.read_csv("data/amazon_delivery.csv")

df

In [None]:
print(df.info())
# 43,739 total entries

In [None]:
print(df.shape)
df.isnull().sum()
# NaN values found in Agent_Rating and Weather

In [None]:
# Imputation: replacing NaN Weather values with the mode "Fog"

print(df['Weather'].mode())

replacement = {"Weather": 'Fog'}
df.fillna(value = replacement, inplace = True)
df.isnull().sum()

In [None]:
# Dropping remaining NaN values from Agent_Rating and setting it to a new dataframe
df_upd = df.dropna().copy().set_index('Order_ID')

print(df_upd.describe())
print("\n", df_upd.isna().sum())

## Identified Targets: **'Delivery_Time** and **'Agent_Rating'**


In [None]:
df_upd.head()

## Feature Engineering

### Creating a column for the distance between store and drop points

In [None]:
# Creating column for Store-Drop Distance values
df_upd.insert(df_upd.columns.get_loc('Drop_Longitude')+1, 'Store-Drop_Distance_km', None)

In [None]:
def haversine_dist(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    earth_radius = 6371
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    dist = (2*earth_radius) * asin(sqrt((sin(dlat/2)**2) + (cos(lat1)*cos(lat2)*(sin(dlon/2)**2))))
    return dist


# Setting Store-Drop Distance values
df_upd['Store-Drop_Distance_km'] = df_upd.apply(lambda r: haversine_dist(r['Store_Latitude'], r['Store_Longitude'], r['Drop_Latitude'], r['Drop_Longitude']), axis = 1)


In [None]:
# Removing outliers
Q1 = df_upd['Store-Drop_Distance_km'].quantile(0.25) # first quantile
Q3 = df_upd['Store-Drop_Distance_km'].quantile(0.75) # third quantile
IQR = Q3 - Q1

lowerbound = Q1 - 1.5*IQR
upperbound = Q3 + 1.5*IQR

outliers_mask = ( (df_upd['Store-Drop_Distance_km'] < lowerbound) | (df_upd['Store-Drop_Distance_km'] > upperbound) )
outliers = df_upd[outliers_mask]

print('Before dropping: \n',df_upd.describe())

df_upd.drop(outliers.index, inplace = True)

# 183 entries with outliers in Store-Drop_Distance_km removed
print('\nAfter dropping: \n', df_upd.describe())

### Setting up dataframe with regards to predictors

In [None]:
# List of columns to test (predictors)
features = ['Agent_Age', 'Store-Drop_Distance_km', 'Weather', 'Traffic', 'Vehicle', 'Area', 'Category']
df_features = df_upd[features].copy()

# Continuous predictors: Agent_Age, Store-Drop_Distance_km
continuous_features = ['Agent_Age', 'Store-Drop_Distance_km']

# Categorical predictors; set as dummies
df_features = pd.get_dummies(df_features, columns=['Weather', 'Traffic', 'Vehicle', 'Area', 'Category'], drop_first = True)
df_features[df_features.columns.difference(['Agent_Age', 'Store-Drop_Distance_km'])] = df_features[df_features.columns.difference(['Agent_Age', 'Store-Drop_Distance_km'])].astype(int)

In [39]:
# SelectKBest and f_regression
from sklearn.feature_selection import SelectKBest, f_regression
X = df_features.values
y_time = df_upd['Delivery_Time'].values
y_rating = df_upd['Agent_Rating'].values

selector = SelectKBest(score_func = f_regression, k = 6)

X_sel_time = selector.fit_transform(X, y_time)
selected_cols = df_features.columns[selector.get_support()]
print('Top 4 for Delivery_Time: ', selected_cols.tolist())

X_sel_rating = selector.fit_transform(X, y_rating)
selected_cols = df_features.columns[selector.get_support()]
print('Top 4 for Agent_Rating: ', selected_cols.tolist())

Top 4 for Delivery_Time:  ['Agent_Age', 'Store-Drop_Distance_km', 'Weather_Sunny', 'Traffic_Jam ', 'Traffic_Low ', 'Category_Grocery']
Top 4 for Agent_Rating:  ['Agent_Age', 'Store-Drop_Distance_km', 'Traffic_Jam ', 'Traffic_Medium ', 'Traffic_NaN ', 'Area_Urban ']


In [None]:
# SelectFromModel using Random Forest Regressor
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)
sfm = SelectFromModel(rf, threshold = 'median')

In [None]:
fig, ax = plt.subplots(2, 4, figsize = (18, 8))

test = ['Agent_Age', 'Store-Drop_Distance_km', 'Delivery_Time', 'Agent_Rating']
for i in range(2):
    for j in range(4):
        if i == 0:
            sns.histplot(df_upd[test[j]], kde = True, ax = ax[i][j], bins = 30, color = 'blue')
            ax[i][j].set_title(test[j])
        if i == 1:
            sns.boxplot(df_upd[test[j]], ax = ax[i][j], width = 0.2, color = '#556bd9')

plt.show()