In [1]:
# Basic Import
import pandas as pd
import warnings

# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV

#preprocessing data 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

#scoring
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#train test split
from sklearn.model_selection import  train_test_split

#column transformer and pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('Notebook/Data/fully_cleaned_data.csv')
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Time_Orderd_Hour,Delivery_city,distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,21,DEH,10.280582
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,14,KOC,6.242319
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,17,PUNE,13.78786
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,9,LUDH,2.930258
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19,KNP,19.396618


### Splitting data into X and y (independent and dependent variable)

In [3]:
X = df.drop(columns=['Time_taken (min)'], axis=1)
y = df['Time_taken (min)']

In [4]:
X.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_Orderd_Hour,Delivery_city,distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,21,DEH,10.280582
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,14,KOC,6.242319
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,17,PUNE,13.78786
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,9,LUDH,2.930258
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,19,KNP,19.396618


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features= X.select_dtypes('object').columns
numerical_features = X.select_dtypes(['float64', 'int64']).columns

In [6]:
df[numerical_features].head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Vehicle_condition,multiple_deliveries,Time_Orderd_Hour,distance
0,36.0,4.2,2,3.0,21,10.280582
1,21.0,4.7,1,1.0,14,6.242319
2,23.0,4.7,1,1.0,17,13.78786
3,34.0,4.3,0,0.0,9,2.930258
4,24.0,4.7,1,1.0,19,19.396618


In [7]:
df[categorical_features].head()

Unnamed: 0,Weather_conditions,Road_traffic_density,Type_of_order,Type_of_vehicle,Festival,City,Delivery_city
0,Fog,Jam,Snack,motorcycle,No,Metropolitian,DEH
1,Stormy,High,Meal,motorcycle,No,Metropolitian,KOC
2,Sandstorms,Medium,Drinks,scooter,No,Metropolitian,PUNE
3,Sandstorms,Low,Buffet,motorcycle,No,Metropolitian,LUDH
4,Fog,Jam,Snack,scooter,No,Metropolitian,KNP


In [8]:
## there are ordinal categorical features
Road_traffic_density = ['Low', 'Medium', 'High', 'Jam']
Weather_conditions = ['Sunny', 'Cloudy', 'Windy', 'Fog', 'Sandstorms', 'Stormy']

In [9]:
categorical_columns = ['Type_of_order',	'Type_of_vehicle', 'Festival', 'City', 'Delivery_city']
encoder_columns = ['Road_traffic_density', 'Weather_conditions']
numerical_columns = ['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',	'multiple_deliveries', 'Time_Orderd_Hour', 'distance']


In [14]:
#numerical pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median', fill_value=0)),
        ('scaler', StandardScaler(with_mean=False))
    ]
)

#categorical pipeline
cat_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
        ('scaler', StandardScaler(with_mean=False))
    ]
)

#ordinal pipeline
ordinal_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=['Road_traffic_density', 'Weather_conditions'])),
    ('scaler', StandardScaler(with_mean=False))
])

In [15]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns),
    ('ordinal_pipeline', ordinal_pipeline, encoder_columns)
])

In [16]:
X_train

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_Orderd_Hour,Delivery_city,distance
8239,30.0,4.7,Sandstorms,Low,2,Drinks,scooter,1.0,No,Metropolitian,8,COIMB,1.558115
17220,25.0,4.8,Sandstorms,Medium,0,Meal,motorcycle,1.0,No,Metropolitian,18,JAP,19.362580
3142,37.0,4.6,Fog,High,1,Meal,scooter,1.0,No,Urban,11,CHEN,1.552233
18277,31.0,4.6,Windy,Medium,1,Snack,motorcycle,1.0,No,Metropolitian,17,JAP,16.381599
37922,23.0,4.6,Sunny,Medium,2,Buffet,scooter,0.0,No,Urban,17,MYS,12.434913
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,28.0,3.3,Sunny,Low,2,Buffet,electric_scooter,2.0,No,Metropolitian,22,MUM,12.243496
11284,23.0,4.9,Cloudy,Low,1,Drinks,scooter,1.0,No,Metropolitian,9,MUM,1.529506
38158,21.0,4.6,Sunny,Jam,1,Drinks,scooter,1.0,No,Metropolitian,20,BANG,9.315069
860,39.0,3.8,Cloudy,Low,1,Drinks,scooter,0.0,No,Metropolitian,23,MUM,19.882988


In [17]:
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

In [21]:
def evaluate_model(true, pred):
    r2score = r2_score(true, pred)
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    return mae, rmse, r2score, mse

In [None]:
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor(),
    "AdaBoostRegressor" : AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #make prediction
    y_test_pred = model.predict(X_test)

        # Evaluate Test dataset
    model_test_mae, model_test_rmse, model_test_r2, model_test_mse = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- Mean Square Error: {:.4f}".format(model_test_mse))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')


In [None]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

### Best Model is Random Forest