# Package Data Preprocessing

## Utils

In [19]:
import json
from scipy.stats import entropy

def calculate_entropy(col):
    """Return the Shannon entropy, in bits, of the values in a column.

    The entropy is computed over the relative frequencies of the distinct
    values in ``col``. Missing values are left out of the frequency table
    because ``value_counts`` drops them by default.

    Args:
        col (pandas.Series): Column whose value distribution is measured,
            e.g. as passed by ``DataFrame.apply``.

    Returns:
        float: Base-2 entropy of the column's value distribution.
    """
    # normalize=True yields each distinct value's share of the counted rows.
    frequencies = col.value_counts(normalize=True)
    return entropy(frequencies, base=2)

## Preprocessing Phase 1

### Overview

In [13]:
import pandas as pd

In [14]:
# Relative path to the formatted package dataset (JSON).
PACKAGE_DATA_PROCESSED_DATASET_PATH = "../datasets/processed/eval_package_data_formatted.json"

# Load the dataset into a DataFrame.
# NOTE(review): the loaded frame has a column named `weight_cm` next to
# `depth_cm` and `height_cm` — possibly a typo for `width_cm`; confirm upstream.
df_package_data = pd.read_json(PACKAGE_DATA_PROCESSED_DATASET_PATH)

# Display the loaded frame.
df_package_data

Unnamed: 0,RouteID,StopID,PackageID,start_time_utc,end_time_utc,planned_service_time_seconds,depth_cm,height_cm,weight_cm
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,,,31.5,40.6,12.7,30.5
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,,,31.5,25.4,12.7,17.8
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,,,40.0,25.4,12.7,17.8
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,,,47.0,37.1,6.1,21.8
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,,,47.0,44.5,20.3,35.6
...,...,...,...,...,...,...,...,...,...
724908,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,,,38.3,66.0,17.8,40.6
724909,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,,,38.3,61.6,10.8,41.3
724910,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,,,38.3,48.3,30.5,36.8
724911,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,,,102.0,48.3,15.2,33.0


In [15]:
# Column dtypes, non-null counts and memory usage.
df_package_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724913 entries, 0 to 724912
Data columns (total 9 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   RouteID                       724913 non-null  object 
 1   StopID                        724913 non-null  object 
 2   PackageID                     724913 non-null  object 
 3   start_time_utc                53924 non-null   object 
 4   end_time_utc                  53924 non-null   object 
 5   planned_service_time_seconds  724913 non-null  float64
 6   depth_cm                      724913 non-null  float64
 7   height_cm                     724913 non-null  float64
 8   weight_cm                     724913 non-null  float64
dtypes: float64(4), object(5)
memory usage: 49.8+ MB


In [16]:
# Summary statistics of the numeric columns.
df_package_data.describe()

Unnamed: 0,planned_service_time_seconds,depth_cm,height_cm,weight_cm
count,724913.0,724913.0,724913.0,724913.0
mean,70.398475,33.793581,10.157039,24.547123
std,69.63196,9.200135,7.340767,7.33548
min,1.0,0.3,0.0,0.3
25%,37.5,25.7,4.8,17.8
50%,54.3,32.3,7.6,24.1
75%,81.0,38.6,12.7,29.2
max,7325.0,134.6,73.7,95.3


### 1. Handling Missing values

In [17]:
# Number of missing entries per column, and the share of rows they represent,
# ranked from the most to the least incomplete column.
missing_count = df_package_data.isna().sum()
missing_percentage = (
    missing_count / len(df_package_data) * 100
).sort_values(ascending=False)

# Realign the counts so both series list the columns in the same order.
missing_count = missing_count.loc[missing_percentage.index]

# Report both views.
print("Missing values count:\n", missing_count)
print("Missing percentage:\n", missing_percentage)

Missing values count:
 start_time_utc                  670989
end_time_utc                    670989
RouteID                              0
StopID                               0
PackageID                            0
planned_service_time_seconds         0
depth_cm                             0
height_cm                            0
weight_cm                            0
dtype: int64
Missing percentage:
 start_time_utc                  92.561314
end_time_utc                    92.561314
RouteID                          0.000000
StopID                           0.000000
PackageID                        0.000000
planned_service_time_seconds     0.000000
depth_cm                         0.000000
height_cm                        0.000000
weight_cm                        0.000000
dtype: float64


The `start_time_utc` and `end_time_utc` fields are missing for about 92.6% of the rows. Rather than keeping these two sparse fields, it would be better to have a single new field, `duration`, defined as the difference between the end time and the start time; this would also reduce the number of features. However, to populate this new field we would rather combine another data source to infer the time spent, so for now we drop both columns.

In [18]:
# Drop the start/end time columns, which are missing for ~92.6% of the rows.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising KeyError.
df_package_data = df_package_data.drop(
    columns=["start_time_utc", "end_time_utc"], errors="ignore"
)
df_package_data

Unnamed: 0,RouteID,StopID,PackageID,planned_service_time_seconds,depth_cm,height_cm,weight_cm
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,31.5,40.6,12.7,30.5
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,31.5,25.4,12.7,17.8
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,40.0,25.4,12.7,17.8
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,47.0,37.1,6.1,21.8
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,47.0,44.5,20.3,35.6
...,...,...,...,...,...,...,...
724908,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,38.3,66.0,17.8,40.6
724909,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,38.3,61.6,10.8,41.3
724910,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,38.3,48.3,30.5,36.8
724911,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,102.0,48.3,15.2,33.0


### 2. Data Reduction

In [20]:
# Base-2 entropy of every remaining column, ranked from highest to lowest.
entropy_values = (
    df_package_data
    .apply(calculate_entropy)
    .sort_values(ascending=False)
)
entropy_values

PackageID                       19.467448
RouteID                         11.561550
StopID                           9.391584
planned_service_time_seconds     9.080126
depth_cm                         6.180138
weight_cm                        6.027162
height_cm                        5.596605
dtype: float64

We notice that the entropy values of the columns we selected are high, so we keep all of them.