# Combined preprocessing of the datasets

## Utils

In [1]:
import json
from scipy.stats import entropy

def calculate_entropy(col):
    """Return the Shannon entropy, in bits, of the values in one column.

    The function takes the column itself (not a dataframe plus a column
    name), so it can be handed directly to ``DataFrame.apply``.

    Args:
        col (pandas.Series): Column whose value distribution is measured.
            Missing values do not contribute, because ``value_counts``
            drops them by default.

    Returns:
        float: Base-2 entropy of the relative frequencies of the distinct
            values found in ``col``.
    """
    counts = col.value_counts()
    total = counts.sum()
    relative_frequencies = counts / total
    return entropy(relative_frequencies, base=2)

## Preprocessing Phase 2

### Overview

The goal of this second phase is to combine the partially processed data sources into a single dataset.

In [2]:
import pandas as pd

In [3]:
# Package-level records written by the earlier preprocessing phase.
# NOTE(review): the `weight_cm` column carries a length unit and sits next to
# `depth_cm` / `height_cm` -- presumably it is a width; confirm upstream.
PACKAGE_DATA_PREPROCESSED_DATASET_PATH = (
    "../datasets/preprocessed/eval_package_data_formatted.json"
)
df_package_data = pd.read_json(path_or_buf=PACKAGE_DATA_PREPROCESSED_DATASET_PATH)
df_package_data

Unnamed: 0,RouteID,StopID,PackageID,planned_service_time_seconds,depth_cm,height_cm,weight_cm
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,31.5,40.6,12.7,30.5
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,31.5,25.4,12.7,17.8
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,40.0,25.4,12.7,17.8
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,47.0,37.1,6.1,21.8
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,47.0,44.5,20.3,35.6
...,...,...,...,...,...,...,...
724908,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,38.3,66.0,17.8,40.6
724909,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,38.3,61.6,10.8,41.3
724910,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,38.3,48.3,30.5,36.8
724911,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,102.0,48.3,15.2,33.0


In [4]:
# Route-level records written by the earlier preprocessing phase; the file is
# read with the "records" orientation.
ROUTE_DATA_PREPROCESSED_DATASET_PATH = (
    "../datasets/preprocessed/eval_route_data_formatted.json"
)

df_eval_route = pd.read_json(
    path_or_buf=ROUTE_DATA_PREPROCESSED_DATASET_PATH,
    orient='records',
)
df_eval_route

Unnamed: 0,RouteID,station_code,date_YYYY_MM_DD,departure_time_utc,executor_capacity_cm3,Zone,starting_point
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ
1,RouteID_00337a3a-19c6-4821-9350-ad3cf0532047,DLA4,2018-09-03,14:48:09,3313071.00,E-4,AI
2,RouteID_0034b931-97a7-485f-a6b3-b22798489094,DLA4,2018-09-19,15:00:43,3313071.00,D-1,BU
3,RouteID_0085d8a2-71a3-4205-9c46-e62139eca47b,DSE4,2018-07-02,15:18:39,3539605.75,B-10,HI
4,RouteID_008bf177-673c-4647-89dd-bf8c13b7c51d,DLA3,2018-07-11,15:30:00,3313071.00,B-9,HP
...,...,...,...,...,...,...,...
3002,RouteID_fe892b64-84ad-48d5-aede-f65d2e4a3123,DLA7,2018-09-09,14:20:14,3313071.00,E-26,YY
3003,RouteID_feb2d877-9478-474c-8f88-6f93ce2ac6aa,DLA8,2018-06-26,16:35:12,3539605.75,D-23,OX
3004,RouteID_fedf4c2f-2618-4598-bfb5-ec90822c9a29,DBO3,2018-09-17,12:01:18,4247527.00,H-2,YL
3005,RouteID_ff30c100-6266-461d-9c56-939775a1defe,DBO1,2018-09-18,13:05:52,4247527.00,H-7,UI


### Data Integration

In [5]:
# Attach the route-level attributes to every package row by joining on RouteID.
# This is an inner join (the pandas default, spelled out here), so rows whose
# RouteID is missing from either frame are dropped.
# NOTE(review): the outputs saved with this notebook show 724,912 package rows
# before the merge and 714,569 rows after it -- confirm the loss is intended.
df_eval_route_package_data = df_eval_route.merge(
    df_package_data,
    on='RouteID',
    how='inner',
)
df_eval_route_package_data


Unnamed: 0,RouteID,station_code,date_YYYY_MM_DD,departure_time_utc,executor_capacity_cm3,Zone,starting_point,StopID,PackageID,planned_service_time_seconds,depth_cm,height_cm,weight_cm
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,31.5,40.6,12.7,30.5
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,31.5,25.4,12.7,17.8
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,40.0,25.4,12.7,17.8
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,47.0,37.1,6.1,21.8
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,47.0,44.5,20.3,35.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
714564,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,38.3,66.0,17.8,40.6
714565,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,38.3,61.6,10.8,41.3
714566,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,38.3,48.3,30.5,36.8
714567,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,102.0,48.3,15.2,33.0


### Feature Extraction

#### 1. Add time to arrive feature

In [6]:
import json

TRAVEL_TIMES_RAW_DATASET_PATH = "../datasets/raw/eval_travel_times_formatted.json"
ROUTE_DATA_RAW_DATASET_PATH = "../datasets/raw/eval_route_data_formatted.json"

# Read both raw JSON files into plain Python objects. The encoding is given
# explicitly because open() otherwise falls back to the locale's default
# encoding, which is not UTF-8 on every platform.
with open(TRAVEL_TIMES_RAW_DATASET_PATH, "r", encoding="utf-8") as travel_times_file:
    # Indexed later as travel_times_data[route][source][stop].
    travel_times_data = json.load(travel_times_file)

with open(ROUTE_DATA_RAW_DATASET_PATH, "r", encoding="utf-8") as route_data_file:
    # Indexed later as route_data[route]["stops"][stop]["lat" | "lng"].
    route_data = json.load(route_data_file)

In [7]:
def get_time_to_reach_stop_from_source(route, source, stop):
    """Look up the travel time from ``source`` to ``stop`` within ``route``.

    Reads the module-level ``travel_times_data`` object, indexed as
    ``travel_times_data[route][source][stop]``. No fallback is applied, so a
    missing key propagates as an exception.
    """
    times_from_source = travel_times_data[route][source]
    return times_from_source[stop]

def get_route_stop_lat(route, stop):
    """Return the ``"lat"`` entry stored for ``stop`` on ``route``.

    Reads the module-level ``route_data`` object, indexed as
    ``route_data[route]["stops"][stop]``. No fallback is applied, so a
    missing key propagates as an exception.
    """
    stop_record = route_data[route]["stops"][stop]
    return stop_record["lat"]

def get_route_stop_lng(route, stop):
    """Return the ``"lng"`` entry stored for ``stop`` on ``route``.

    Reads the module-level ``route_data`` object, indexed as
    ``route_data[route]["stops"][stop]``. No fallback is applied, so a
    missing key propagates as an exception.
    """
    stop_record = route_data[route]["stops"][stop]
    return stop_record["lng"]

In [8]:
# Add a time_to_arrive column: the travel time from the route's starting point
# to the package's stop, looked up per row from RouteID, starting_point and
# StopID. A zipped comprehension walks the three columns once; the previous
# DataFrame.apply(axis=1) built a full row Series for every one of the rows
# just to perform the same lookup.
df_eval_route_package_data['time_to_arrive'] = [
    get_time_to_reach_stop_from_source(route_id, start_stop, stop_id)
    for route_id, start_stop, stop_id in zip(
        df_eval_route_package_data['RouteID'],
        df_eval_route_package_data['starting_point'],
        df_eval_route_package_data['StopID'],
    )
]

#### 2. Add day of week and month features

In [9]:
# Parse the date_YYYY_MM_DD strings once and derive both calendar features
# from the same datetime Series instead of re-parsing the column per feature.
_route_dates = pd.to_datetime(df_eval_route_package_data['date_YYYY_MM_DD'])

# Day name of the route date (e.g. "Sunday")
df_eval_route_package_data['day_of_week'] = _route_dates.dt.day_name()

# Month number of the route date
df_eval_route_package_data['month'] = _route_dates.dt.month

df_eval_route_package_data

Unnamed: 0,RouteID,station_code,date_YYYY_MM_DD,departure_time_utc,executor_capacity_cm3,Zone,starting_point,StopID,PackageID,planned_service_time_seconds,depth_cm,height_cm,weight_cm,time_to_arrive,day_of_week,month
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,31.5,40.6,12.7,30.5,703.9,Sunday,6
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,31.5,25.4,12.7,17.8,703.9,Sunday,6
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,40.0,25.4,12.7,17.8,670.6,Sunday,6
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,47.0,37.1,6.1,21.8,789.6,Sunday,6
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,47.0,44.5,20.3,35.6,789.6,Sunday,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714564,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,38.3,66.0,17.8,40.6,2386.8,Sunday,7
714565,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,38.3,61.6,10.8,41.3,2386.8,Sunday,7
714566,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,38.3,48.3,30.5,36.8,2386.8,Sunday,7
714567,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,102.0,48.3,15.2,33.0,2568.5,Sunday,7


#### 3. Add coordinates features

In [19]:
# Look up the coordinates of the route's starting point (Start*) and of the
# package's stop (Stop*). Each column is filled by a zipped comprehension over
# the key columns rather than a row-wise DataFrame.apply(axis=1), which would
# build a row Series per record for a plain dictionary lookup.
_route_ids = df_eval_route_package_data['RouteID']
_start_stops = df_eval_route_package_data['starting_point']
_stop_ids = df_eval_route_package_data['StopID']

df_eval_route_package_data['StartLat'] = [
    get_route_stop_lat(route_id, stop) for route_id, stop in zip(_route_ids, _start_stops)
]
df_eval_route_package_data['StartLon'] = [
    get_route_stop_lng(route_id, stop) for route_id, stop in zip(_route_ids, _start_stops)
]
df_eval_route_package_data['StopLat'] = [
    get_route_stop_lat(route_id, stop) for route_id, stop in zip(_route_ids, _stop_ids)
]
df_eval_route_package_data['StopLon'] = [
    get_route_stop_lng(route_id, stop) for route_id, stop in zip(_route_ids, _stop_ids)
]

df_eval_route_package_data

Unnamed: 0,RouteID,station_code,date_YYYY_MM_DD,departure_time_utc,executor_capacity_cm3,Zone,starting_point,StopID,PackageID,planned_service_time_seconds,...,weight_cm,time_to_arrive,day_of_week,month,StartLat,StartLon,StopLat,StopLon,distance_lat,distance_lon
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,31.5,...,30.5,703.9,Sunday,6,33.885480,-118.344553,33.885480,-118.344553,0.0,0.0
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,31.5,...,17.8,703.9,Sunday,6,33.885480,-118.344553,33.885480,-118.344553,0.0,0.0
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,40.0,...,17.8,670.6,Sunday,6,33.902808,-118.355223,33.902808,-118.355223,0.0,0.0
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,47.0,...,21.8,789.6,Sunday,6,33.893616,-118.358108,33.893616,-118.358108,0.0,0.0
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,47.0,...,35.6,789.6,Sunday,6,33.893616,-118.358108,33.893616,-118.358108,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714564,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,38.3,...,40.6,2386.8,Sunday,7,34.152862,-117.502335,34.152862,-117.502335,0.0,0.0
714565,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,38.3,...,41.3,2386.8,Sunday,7,34.152862,-117.502335,34.152862,-117.502335,0.0,0.0
714566,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,38.3,...,36.8,2386.8,Sunday,7,34.152862,-117.502335,34.152862,-117.502335,0.0,0.0
714567,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,102.0,...,33.0,2568.5,Sunday,7,34.155712,-117.528523,34.155712,-117.528523,0.0,0.0


In [18]:
# Absolute per-axis coordinate difference between the starting point and the
# stop. The result is in the same units as the coordinates (presumably
# degrees), i.e. it is not a metric distance.
# Computed with vectorized Series arithmetic instead of a row-wise apply.
df_eval_route_package_data['distance_lat'] = (
    df_eval_route_package_data['StartLat'] - df_eval_route_package_data['StopLat']
).abs()
df_eval_route_package_data['distance_lon'] = (
    df_eval_route_package_data['StartLon'] - df_eval_route_package_data['StopLon']
).abs()
df_eval_route_package_data

Unnamed: 0,RouteID,station_code,date_YYYY_MM_DD,departure_time_utc,executor_capacity_cm3,Zone,starting_point,StopID,PackageID,planned_service_time_seconds,...,weight_cm,time_to_arrive,day_of_week,month,StartLat,StartLon,StopLat,StopLon,distance_lat,distance_lon
0,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_fd17bd67-a3d8-45b9-936b-c7e9d879102e,31.5,...,30.5,703.9,Sunday,6,33.885480,-118.344553,33.885480,-118.344553,0.0,0.0
1,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AH,PackageID_f78261e7-4c8c-4d72-b007-9934a53a700b,31.5,...,17.8,703.9,Sunday,6,33.885480,-118.344553,33.885480,-118.344553,0.0,0.0
2,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AJ,PackageID_a8e394ee-4208-412d-8b34-8cbce880a322,40.0,...,17.8,670.6,Sunday,6,33.902808,-118.355223,33.902808,-118.355223,0.0,0.0
3,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_b7c39ed4-fa0a-497a-ae0c-e3515734f8ae,47.0,...,21.8,789.6,Sunday,6,33.893616,-118.358108,33.893616,-118.358108,0.0,0.0
4,RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,DLA8,2018-06-17,17:23:00,3539605.75,K-21,UZ,AL,PackageID_e980c24b-ab88-4af1-b3f9-1f851823f561,47.0,...,35.6,789.6,Sunday,6,33.893616,-118.358108,33.893616,-118.358108,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714564,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b0bad72b-041e-4b5b-8303-03131c837b8d,38.3,...,40.6,2386.8,Sunday,7,34.152862,-117.502335,34.152862,-117.502335,0.0,0.0
714565,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_b2c6f0de-6fcf-4616-b2cd-ba63ef6d0715,38.3,...,41.3,2386.8,Sunday,7,34.152862,-117.502335,34.152862,-117.502335,0.0,0.0
714566,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,YU,PackageID_3de4b3fd-d80d-4ae7-a519-c82a4e46c48c,38.3,...,36.8,2386.8,Sunday,7,34.152862,-117.502335,34.152862,-117.502335,0.0,0.0
714567,RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,DLA7,2018-07-08,15:42:06,3539605.75,G-18,XL,ZQ,PackageID_59aae7fe-cbb2-47c8-8c2f-59a97154ccd9,102.0,...,33.0,2568.5,Sunday,7,34.155712,-117.528523,34.155712,-117.528523,0.0,0.0


In [11]:
# Print the column names, non-null counts and dtypes of the merged frame.
df_eval_route_package_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714569 entries, 0 to 714568
Data columns (total 17 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   RouteID                       714569 non-null  object 
 1   station_code                  714569 non-null  object 
 2   date_YYYY_MM_DD               714569 non-null  object 
 3   departure_time_utc            714569 non-null  object 
 4   executor_capacity_cm3         714569 non-null  float64
 5   Zone                          714569 non-null  object 
 6   starting_point                714569 non-null  object 
 7   StopID                        714569 non-null  object 
 8   PackageID                     714569 non-null  object 
 9   planned_service_time_seconds  714569 non-null  float64
 10  depth_cm                      714569 non-null  float64
 11  height_cm                     714569 non-null  float64
 12  weight_cm                     714569 non-nul

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_eval_route_package_data.describe()

Unnamed: 0,executor_capacity_cm3,planned_service_time_seconds,depth_cm,height_cm,weight_cm,time_to_arrive,month,StartLat
count,714569.0,714569.0,714569.0,714569.0,714569.0,714569.0,714569.0,714569.0
mean,3668698.0,70.413758,33.795002,10.158491,24.548767,1832.104515,7.649726,37.99392
std,483505.5,69.740068,9.199817,7.339519,7.334135,612.804355,1.33605,5.337329
min,3114853.0,1.0,0.3,0.0,0.3,45.9,6.0,30.120233
25%,3313071.0,37.5,25.7,4.8,17.8,1367.4,6.0,33.821785
50%,3539606.0,54.3,32.3,7.6,24.1,1816.3,7.0,34.1246
75%,4247527.0,81.0,38.6,12.7,29.2,2227.4,9.0,42.212239
max,4672280.0,7325.0,134.6,73.7,95.3,7179.7,10.0,48.072868


## 2. Feature Selection

In [17]:
# Entropy of every column's value distribution, ranked from highest to lowest.
entropy_values = (
    df_eval_route_package_data
    .apply(calculate_entropy)
    .sort_values(ascending=False)
)
entropy_values

PackageID                       19.446714
StopLon                         17.988215
StartLon                        17.988215
StopLat                         17.940088
StartLat                        17.940088
time_to_arrive                  14.480888
RouteID                         11.540118
departure_time_utc              10.507729
StopID                           9.391532
planned_service_time_seconds     9.081874
starting_point                   8.143111
Zone                             7.867785
date_YYYY_MM_DD                  6.331296
depth_cm                         6.178412
weight_cm                        6.025406
height_cm                        5.595292
station_code                     3.826254
day_of_week                      2.790648
executor_capacity_cm3            2.142974
month                            1.892192
dtype: float64

## 3. Handling duplicates

In [14]:
# Count the rows that exactly repeat an earlier row across all columns.
df_eval_route_package_data.duplicated().sum()

0

## 4. Aggregate Data

In [15]:
# Descriptive statistics of the numeric columns, computed separately per route.
grouped_summary = (
    df_eval_route_package_data
    .groupby(by='RouteID')
    .describe()
)
grouped_summary

Unnamed: 0_level_0,executor_capacity_cm3,executor_capacity_cm3,executor_capacity_cm3,executor_capacity_cm3,executor_capacity_cm3,executor_capacity_cm3,executor_capacity_cm3,executor_capacity_cm3,planned_service_time_seconds,planned_service_time_seconds,...,month,month,StartLat,StartLat,StartLat,StartLat,StartLat,StartLat,StartLat,StartLat
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
RouteID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e,287.0,3539605.75,0.0,3539605.75,3539605.75,3539605.75,3539605.75,3539605.75,287.0,73.155052,...,6.0,6.0,287.0,33.892209,0.006025,33.878491,33.886885,33.892955,33.897142,33.902809
RouteID_00337a3a-19c6-4821-9350-ad3cf0532047,163.0,3313071.00,0.0,3313071.00,3313071.00,3313071.00,3313071.00,3313071.00,163.0,96.821472,...,9.0,9.0,163.0,34.416974,0.004693,34.406042,34.413563,34.416744,34.420594,34.425697
RouteID_0034b931-97a7-485f-a6b3-b22798489094,197.0,3313071.00,0.0,3313071.00,3313071.00,3313071.00,3313071.00,3313071.00,197.0,90.736041,...,9.0,9.0,197.0,34.440287,0.005553,34.426003,34.436935,34.440955,34.444450,34.448587
RouteID_0085d8a2-71a3-4205-9c46-e62139eca47b,273.0,3539605.75,0.0,3539605.75,3539605.75,3539605.75,3539605.75,3539605.75,273.0,64.595238,...,7.0,7.0,273.0,47.855703,0.009225,47.837306,47.848480,47.854721,47.860499,47.877920
RouteID_008bf177-673c-4647-89dd-bf8c13b7c51d,171.0,3313071.00,0.0,3313071.00,3313071.00,3313071.00,3313071.00,3313071.00,171.0,98.970760,...,7.0,7.0,171.0,34.068212,0.003448,34.061812,34.065610,34.067838,34.070452,34.075906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RouteID_ff9bc358-52ee-42ff-bb97-f97722d383e0,219.0,4672279.50,0.0,4672279.50,4672279.50,4672279.50,4672279.50,4672279.50,219.0,82.763014,...,6.0,6.0,219.0,42.309157,0.003512,42.299891,42.307060,42.309611,42.311980,42.314824
RouteID_ffdd3b7f-9de4-4a0b-8c07-d186bf7f2dee,228.0,3539605.75,0.0,3539605.75,3539605.75,3539605.75,3539605.75,3539605.75,228.0,75.610526,...,7.0,7.0,228.0,34.153798,0.006482,34.141617,34.148829,34.153027,34.160167,34.164471
RouteID_ffdf3647-285c-4839-ac61-12def72d7d4e,204.0,4247527.00,0.0,4247527.00,4247527.00,4247527.00,4247527.00,4247527.00,204.0,62.871078,...,7.0,7.0,204.0,30.382053,0.010852,30.361980,30.374353,30.380834,30.387341,30.413676
RouteID_ffe04428-b8ea-455d-841a-a1d47f5b3241,243.0,3313071.00,0.0,3313071.00,3313071.00,3313071.00,3313071.00,3313071.00,243.0,68.338272,...,7.0,7.0,243.0,47.542414,0.003323,47.534507,47.539571,47.541654,47.545070,47.548700


## 5. Discretization

For the executor capacity, three bins are used: small, medium, and large.