Project Title : DeliveryLogistics_Model

Prepared By : Rushikesh patil

In [76]:
# Import Data Manipulation Libraries
import numpy as np
import pandas as pd 

# Import Data Visiualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Import Warnings
import warnings
warnings.filterwarnings(action='ignore')

# Import Scikit-Learn Libraries
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,cross_val_score,KFold

# Import Machine Learning Model 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV


In [77]:

# Function Defination 

# Step 1: Data Inegstion 

filepath = "https://raw.githubusercontent.com/rushikeshpatil-123/DeliveryLogistics_Model/refs/heads/main/data/raw/Delivery_Logistics.csv"

def data_ingestion():
    return pd.read_csv(filepath)

# Step 2: Data Exploration 

from collections import OrderedDict
import pandas as pd

def data_exploration(df):

    numerical_col = df.select_dtypes(exclude='object').columns
    categorical_col = df.select_dtypes(include='object').columns

    num_stats_list = []
    cat_stats_list = []

    # ------------------ Numerical Features ------------------
    for col in numerical_col:

        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        LW = Q1 - 1.5 * IQR
        UW = Q3 + 1.5 * IQR

        outlier_count = ((df[col] < LW) | (df[col] > UW)).sum()

        stats = OrderedDict({
            "Feature": col,
            "Mean": df[col].mean(),
            "Median": df[col].median(),
            "Maximum": df[col].max(),
            "Minimum": df[col].min(),
            "Q1": Q1,
            "Q3": Q3,
            "IQR": IQR,
            "Lower_Limit": LW,
            "Upper_Limit": UW,
            "Outlier_Count": outlier_count,
            "Skewness": df[col].skew(),
            "Kurtosis": df[col].kurt()
        })

        num_stats_list.append(stats)  
    numerical_stats_report = pd.DataFrame(num_stats_list)

    # ------------------ Categorical Features ------------------
    for col in categorical_col:
        cat_stats = OrderedDict({
            "Feature": col,
            "Unique Values": df[col].nunique(),
            "Mode": df[col].mode()[0],
            "Missing values": df[col].isnull().sum(),
            "Value Counts": df[col].value_counts().to_dict()
        })

        cat_stats_list.append(cat_stats)

    categorical_stats_report = pd.DataFrame(cat_stats_list)

    # ------------------ Dataset Info ------------------
    dataset_info = pd.DataFrame({
        "Feature": df.columns,
        "Dtype": df.dtypes.values,
        "Missing Values": df.isnull().sum().values,
        "Unique Values": df.nunique().values
    })

    return numerical_stats_report, categorical_stats_report, dataset_info


# Step 3: Data Preprocessing
def data_preprocessing(df):

    X = df.drop(columns= ['package_weight_kg'], axis =1 )
    y = df['package_weight_kg']

    # Split the Dataset into train and test

    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                        test_size=0.3,
                                                        random_state=0)
    
    # Use Encoding Techinque to convert all catergorical columns into numerical columns 

    from sklearn.preprocessing import LabelEncoder, MinMaxScaler
    categorical_col = X.select_dtypes(include = 'object').columns

    for i in  categorical_col:
        le = LabelEncoder()
        X_train[i] = le.fit_transform(X_train[i])
        X_test[i] = le.transform(X_test[i])

    # Using Normalization Technique

    sc = MinMaxScaler()
    X_train = sc.fit_transform(X_train)   # seen data
    X_test = sc.transform(X_test)    # unseen data 
    return X_train, X_test, y_train,y_test

# Step 4: Model Building 

def model_building(X_train, X_test, y_train, y_test):

    models = {
        "LinearRegression": LinearRegression(),
        "DecisionTreeRegressor":DecisionTreeRegressor(),
        "RandomForestRegressor":RandomForestRegressor(),
        "GradientBoostRegressor":GradientBoostingRegressor(),
        "AdaBoostRegressor":AdaBoostRegressor(),
        "SVR": SVR()
    }

    Regression_models = []
    
    for model_name, model in models.items():

        print(f"Training model: {model_name}")

        # Train
        model.fit(X_train, y_train)
        
        # Predict
        y_pred = model.predict(X_test)

        # Metrics
        r2_score_model = r2_score(y_test, y_pred)
        mse_model = mean_squared_error(y_test, y_pred)
        mae_model = mean_absolute_error(y_test, y_pred)

        Regression_models.append({
            "model name": model_name,
            "R2 Score": r2_score_model,
            "Mse": mse_model,
            "Mae": mae_model
        })

    
    Regression_models_report = pd.DataFrame(Regression_models)
    return Regression_models_report


In [61]:
# Function Calling

df = data_ingestion()

numerical_stats_report, categorical_stats_report, dataset_info = data_exploration(df)

X_train, X_test, y_train, y_test = data_preprocessing(df)

model_report = model_building(X_train, X_test, y_train, y_test)

Training model: LinearRegression
Training model: DecisionTreeRegressor
Training model: RandomForestRegressor
Training model: GradientBoostRegressor
Training model: AdaBoostRegressor
Training model: SVR


EXploratory Data Analysis

In [62]:
df.sample(frac=1)

Unnamed: 0,delivery_id,delivery_partner,package_type,vehicle_type,delivery_mode,region,weather_condition,distance_km,package_weight_kg,delivery_time_hours,expected_time_hours,delayed,delivery_status,delivery_rating,delivery_cost
13404,13405.0,fedex,pharmacy,bike,express,central,hot,133.5,49.24,1970-01-01 00:00:00.000000005,1970-01-01 00:00:00.000000004,yes,delayed,2,865.2200
14668,14669.0,shadowfax,furniture,ev bike,express,south,cold,281.2,36.88,1970-01-01 00:00:00.000000010,1970-01-01 00:00:00.000000007,yes,delayed,2,1566.6400
4181,4182.0,dhl,automobile parts,ev bike,same day,east,hot,244.8,2.74,1970-01-01 00:00:00.000000007,1970-01-01 00:00:00.000000008,no,delivered,5,1332.2200
6335,6336.0,blue dart,automobile parts,van,same day,central,hot,258.9,42.96,1970-01-01 00:00:00.000000010,1970-01-01 00:00:00.000000008,yes,delayed,2,1523.3800
11049,11050.0,ekart,fragile items,bike,standard,south,clear,4.6,36.21,1970-01-01 00:00:00.000000001,1970-01-01 00:00:00.000000024,no,delivered,5,131.6300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1509,1510.0,xpressbees,cosmetics,ev bike,two day,south,hot,13.2,8.27,1970-01-01 00:00:00.000000002,1970-01-01 00:00:00.000000016,no,delivered,3,95.6674
24004,24005.0,ecom express,pharmacy,truck,express,central,cold,109.0,46.78,1970-01-01 00:00:00.000000003,1970-01-01 00:00:00.000000004,no,delivered,5,735.3400
11775,11776.0,blue dart,documents,bike,standard,south,clear,59.0,42.69,1970-01-01 00:00:00.000000003,1970-01-01 00:00:00.000000024,no,delivered,3,423.0700
7076,7077.0,amazon logistics,cosmetics,truck,standard,north,hot,106.5,5.25,1970-01-01 00:00:00.000000002,1970-01-01 00:00:00.000000024,no,delivered,5,548.2500


In [63]:
# Checking Descriptive stats : Numerical columns 
numerical_stats_report

Unnamed: 0,Feature,Mean,Median,Maximum,Minimum,Q1,Q3,IQR,Lower_Limit,Upper_Limit,Outlier_Count,Skewness,Kurtosis
0,delivery_id,12500.5,12500.5,24750.01,250.99,6250.75,18750.25,12499.5,-12498.5,37499.5,0,-1.066136e-16,-1.202673
1,distance_km,150.390436,151.0,297.1,3.6,75.9,224.9,149.0,-147.6,448.4,0,0.001882955,-1.196196
2,package_weight_kg,25.145898,25.145,49.52,0.67,12.68,37.66,24.98,-24.79,75.13,0,-0.002606811,-1.199496
3,delivery_rating,3.666,4.0,5.0,1.0,3.0,5.0,2.0,0.0,8.0,0,-0.4731261,-0.738847
4,delivery_cost,864.944579,867.535,1632.7206,95.6674,490.8,1237.91,747.11,-629.865,2358.575,0,0.001107873,-1.172019


In [64]:
# Checking Descriptive stats : Categorical columns 
categorical_stats_report

Unnamed: 0,Feature,Unique Values,Mode,Missing values,Value Counts
0,delivery_partner,9,xpressbees,0,"{'xpressbees': 2826, 'fedex': 2818, 'dhl': 280..."
1,package_type,9,fragile items,0,"{'fragile items': 2848, 'pharmacy': 2810, 'doc..."
2,vehicle_type,6,ev bike,0,"{'ev bike': 4218, 'van': 4187, 'scooter': 4174..."
3,delivery_mode,4,two day,0,"{'two day': 6302, 'same day': 6279, 'express':..."
4,region,5,west,0,"{'west': 5095, 'central': 5060, 'south': 4977,..."
5,weather_condition,6,foggy,0,"{'foggy': 4219, 'stormy': 4198, 'rainy': 4171,..."
6,delivery_time_hours,20,1970-01-01 00:00:00.000000006,0,"{'1970-01-01 00:00:00.000000006': 3118, '1970-..."
7,expected_time_hours,9,1970-01-01 00:00:00.000000016,0,"{'1970-01-01 00:00:00.000000016': 6302, '1970-..."
8,delayed,2,no,0,"{'no': 18331, 'yes': 6669}"
9,delivery_status,3,delivered,0,"{'delivered': 18331, 'delayed': 5341, 'failed'..."


In [65]:
#Checking Dataset Information
dataset_info

Unnamed: 0,Feature,Dtype,Missing Values,Unique Values
0,delivery_id,float64,0,24502
1,delivery_partner,object,0,9
2,package_type,object,0,9
3,vehicle_type,object,0,6
4,delivery_mode,object,0,4
5,region,object,0,5
6,weather_condition,object,0,6
7,distance_km,float64,0,2935
8,package_weight_kg,float64,0,4853
9,delivery_time_hours,object,0,20


In [66]:
model_report

Unnamed: 0,model name,R2 Score,Mse,Mae
0,LinearRegression,0.662078,69.456584,6.894695
1,DecisionTreeRegressor,0.950174,10.241322,2.304267
2,RandomForestRegressor,0.983946,3.299789,1.173137
3,GradientBoostRegressor,0.492508,104.310035,8.59542
4,AdaBoostRegressor,0.032517,198.856473,12.162167
5,SVR,0.225367,159.218107,10.872322


In [67]:
# Use RandomForest with GridSearch CV

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Build Model with RandomForest 

rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid

param_grid = {
    'n_estimators': [100],
    'max_depth':[None]

}

# GridSearch CV
grid = GridSearchCV(
    estimator = rf,
    param_grid = param_grid,
    cv = 5,
    n_jobs= -1,
    verbose= 1
)
# Fit On training Data 
grid.fit(X_train,y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'max_depth': None, 'n_estimators': 100}
Best score: 0.9799478341721878


In [68]:
df['weight_category'] = pd.cut(
    df['package_weight_kg'],
    bins = [0,10,25,df['package_weight_kg'].max()],
    labels = [
        'Light (0 - 10 kg)',
        'Medium (10 - 25 kg)',
        'Heavy (25+ kg)'

    ],
    include_lowest = True
)

In [69]:
# Crosstab
def crosstab(df):
    crosstab1 = pd.crosstab(df['region'],df['weight_category'],margins=True)
    crosstab2 = pd.crosstab(df['vehicle_type'],df['weight_category'],margins=True)
    crosstab3 = pd.crosstab(df['package_type'],df['weight_category'],margins=True)
    crosstab4 = pd.crosstab(df['delivery_mode'],df['weight_category'],margins=True)
    crosstab5 = pd.crosstab(df['weather_condition'],df['weight_category'],margins=True)
    return crosstab1,crosstab2,crosstab3,crosstab4,crosstab5

In [70]:
crosstab1,crosstab2,crosstab3,crosstab4,crosstab5 = crosstab(df)

In [71]:
#Crosstab reports
crosstab1.to_csv('crosstab1.csv')
crosstab1

weight_category,Light (0 - 10 kg),Medium (10 - 25 kg),Heavy (25+ kg),All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
central,961,1471,2628,5060
east,946,1536,2437,4919
north,982,1483,2484,4949
south,962,1562,2453,4977
west,1035,1491,2569,5095
All,4886,7543,12571,25000


In [72]:
crosstab2.to_csv('crosstab2.csv')
crosstab2

weight_category,Light (0 - 10 kg),Medium (10 - 25 kg),Heavy (25+ kg),All
vehicle_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bike,841,1246,2073,4160
ev bike,818,1218,2182,4218
ev van,811,1270,2035,4116
scooter,836,1326,2012,4174
truck,782,1231,2132,4145
van,798,1252,2137,4187
All,4886,7543,12571,25000


In [73]:
crosstab3.to_csv('crosstab3.csv')
crosstab3

weight_category,Light (0 - 10 kg),Medium (10 - 25 kg),Heavy (25+ kg),All
package_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
automobile parts,565,853,1377,2795
clothing,515,845,1407,2767
cosmetics,554,807,1383,2744
documents,543,907,1355,2805
electronics,540,834,1418,2792
fragile items,586,802,1460,2848
furniture,544,800,1402,2746
groceries,525,835,1333,2693
pharmacy,514,860,1436,2810
All,4886,7543,12571,25000


In [74]:
crosstab4.to_csv('crosstab4.csv')
crosstab4.T

delivery_mode,express,same day,standard,two day,All
weight_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Light (0 - 10 kg),1219,1221,1190,1256,4886
Medium (10 - 25 kg),1819,1913,1880,1931,7543
Heavy (25+ kg),3195,3145,3116,3115,12571
All,6233,6279,6186,6302,25000


In [75]:
crosstab5.to_csv('crosstab5.csv')
crosstab5.T

weather_condition,clear,cold,foggy,hot,rainy,stormy,All
weight_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light (0 - 10 kg),827,801,831,780,807,840,4886
Medium (10 - 25 kg),1236,1293,1234,1255,1281,1244,7543
Heavy (25+ kg),2061,2064,2154,2095,2083,2114,12571
All,4124,4158,4219,4130,4171,4198,25000
