In [20]:
import pandas as pd
import numpy as np
import vaex
vaex.settings.display.float_format = '%.8f'
import warnings
warnings.filterwarnings('ignore')

import os
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [21]:
folder_path = './cleaned_dataset'
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

# Dictionary to store dataframes
dfs_dict = {}

# Load each CSV into Vaex and store in the dictionary
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    key_name = file.replace(".csv", "")  # Remove the .csv extension for the key
    dfs_dict[key_name] = vaex.open(file_path)
    print(f"Loaded {file} into Vaex DataFrame with key: {key_name}")

Loaded April_2024.csv into Vaex DataFrame with key: April_2024
Loaded August_2024.csv into Vaex DataFrame with key: August_2024
Loaded December_2023.csv into Vaex DataFrame with key: December_2023
Loaded Feb_2024.csv into Vaex DataFrame with key: Feb_2024
Loaded Jan_2024.csv into Vaex DataFrame with key: Jan_2024
Loaded July_2024.csv into Vaex DataFrame with key: July_2024
Loaded June_2024.csv into Vaex DataFrame with key: June_2024
Loaded March_2024.csv into Vaex DataFrame with key: March_2024
Loaded May_2024.csv into Vaex DataFrame with key: May_2024
Loaded November_2023.csv into Vaex DataFrame with key: November_2023
Loaded October_2023.csv into Vaex DataFrame with key: October_2023
Loaded October_2024.csv into Vaex DataFrame with key: October_2024
Loaded September_2024.csv into Vaex DataFrame with key: September_2024


# Cleaning Processes

In [22]:
# Create the mapping dictionary to reduce the string length

route_names = dfs_dict['September_2024']['Route Name'].unique()
# mapping dictionary
route_names_map = {}

for route_name in route_names:
    if "EXPRESS" in route_name:
        route_names_map[route_name] = route_name.split(' ', 1)[0] 
    elif "TUNNEL" in route_name:
        continue
    else:
        route_names_map[route_name] = route_name.split(' ', 2)[-1]

def route_name_map(df):
    df['Route Name'] = df['Route Name'].apply(lambda x: route_names_map.get(x, x))
    return df

for i, (key, df) in enumerate(dfs_dict.items()):
    dfs_dict[key] = route_name_map(df)

In [23]:
# Remove all rows with missing values for each Vaex DataFrame in the dictionary
for i, (key, df) in enumerate(dfs_dict.items()):
    #print(f'Rows before removing missing values: {len(df):,}')
    # Drop missing values (returns a new Vaex DataFrame)
    dfs_dict[key] = df.dropna(column_names=['stop_lon', 'Stop Code'])
    #print(f'Rows after removing missing values: {len(dfs_dict[key]):,}')
    print(f"Removed missing values from {key} ({i+1}/{len(dfs_dict)})")


Removed missing values from April_2024 (1/13)
Removed missing values from August_2024 (2/13)
Removed missing values from December_2023 (3/13)
Removed missing values from Feb_2024 (4/13)
Removed missing values from Jan_2024 (5/13)
Removed missing values from July_2024 (6/13)
Removed missing values from June_2024 (7/13)
Removed missing values from March_2024 (8/13)
Removed missing values from May_2024 (9/13)
Removed missing values from November_2023 (10/13)
Removed missing values from October_2023 (11/13)
Removed missing values from October_2024 (12/13)
Removed missing values from September_2024 (13/13)


In [24]:
def stop_sequence_normalization(df):
    df = df.to_pandas_df()
    # Convert 'Business Day' to datetime format if not already
    df['Business Day'] = pd.to_datetime(df['Business Day'])

    # Normalize stop_sequence within each group
    df['stop_sequence_normalized'] = df.groupby(['Business Day', 'Route Name', 'Direction', 'daily_order_trip_id'])['stop_sequence'].transform(
        lambda x: np.round((x - 1) / (x.max() - 1),3) if x.nunique() > 1 else 0
    )

    df = vaex.from_pandas(df)
    df['Business Day'] = df['Business Day'].astype('datetime64').dt.strftime('%Y-%m-%d')
    #df = df.drop('stop_sequence')
    return df

for i, (key, df) in enumerate(dfs_dict.items()):
    dfs_dict[key] = stop_sequence_normalization(df)
    print(f"Normalized stop_sequence in {key} ({i+1}/13)")

Normalized stop_sequence in April_2024 (1/13)
Normalized stop_sequence in August_2024 (2/13)
Normalized stop_sequence in December_2023 (3/13)
Normalized stop_sequence in Feb_2024 (4/13)
Normalized stop_sequence in Jan_2024 (5/13)
Normalized stop_sequence in July_2024 (6/13)
Normalized stop_sequence in June_2024 (7/13)
Normalized stop_sequence in March_2024 (8/13)
Normalized stop_sequence in May_2024 (9/13)
Normalized stop_sequence in November_2023 (10/13)
Normalized stop_sequence in October_2023 (11/13)
Normalized stop_sequence in October_2024 (12/13)
Normalized stop_sequence in September_2024 (13/13)


In [25]:
# daily_order_trip_id normalization
def daily_order_trip_normalization(df):
    df = df.to_pandas_df()
    # Convert 'Business Day' to datetime format if not already
    df['Business Day'] = pd.to_datetime(df['Business Day'])
    
    # Group by 'Business Day', 'Route Name', 'Direction' and then normalize 'daily_order_trip_id'
    df['daily_order_trip_normalized'] = df.groupby(['Business Day', 'Route Name', 'Direction'])['daily_order_trip_id'].transform(
        lambda x: np.round((x - 1) / (x.max() - 1),3) if x.nunique() > 1 else 0
    )


    df = vaex.from_pandas(df)
    df['Business Day'] = df['Business Day'].astype('datetime64').dt.strftime('%Y-%m-%d')
    #df = df.drop('daily_order_trip_id')
    return df

for i, (key, df) in enumerate(dfs_dict.items()):
    dfs_dict[key] = daily_order_trip_normalization(df)
    print(f"Normalized daily_order_trip_id in {key} ({i+1}/13)")

Normalized daily_order_trip_id in April_2024 (1/13)
Normalized daily_order_trip_id in August_2024 (2/13)
Normalized daily_order_trip_id in December_2023 (3/13)
Normalized daily_order_trip_id in Feb_2024 (4/13)
Normalized daily_order_trip_id in Jan_2024 (5/13)
Normalized daily_order_trip_id in July_2024 (6/13)
Normalized daily_order_trip_id in June_2024 (7/13)
Normalized daily_order_trip_id in March_2024 (8/13)
Normalized daily_order_trip_id in May_2024 (9/13)
Normalized daily_order_trip_id in November_2023 (10/13)
Normalized daily_order_trip_id in October_2023 (11/13)
Normalized daily_order_trip_id in October_2024 (12/13)
Normalized daily_order_trip_id in September_2024 (13/13)


In [26]:
# Remove all rows with 'Opportunity' in the 'TripType' column and remove the column afterwards
for i, (key, df) in enumerate(dfs_dict.items()):
    #print(f"Rows before removing 'Opportunity' rows from {key}: {len(df):,}")
    
    # Remove rows where 'TripType' == 'Opportunity'
    df_filtered = df[df.TripType != 'Opportunity']
    
    # Drop the 'TripType' column
    dfs_dict[key] = df_filtered.drop(columns=['TripType'])
    #print(f"Rows after removing 'Opportunity' rows from {key}: {len(dfs_dict[key]):,}")
    print(f"Removed 'Opportunity' rows from {key} ({i+1}/{len(dfs_dict)})")

Removed 'Opportunity' rows from April_2024 (1/13)
Removed 'Opportunity' rows from August_2024 (2/13)
Removed 'Opportunity' rows from December_2023 (3/13)
Removed 'Opportunity' rows from Feb_2024 (4/13)
Removed 'Opportunity' rows from Jan_2024 (5/13)
Removed 'Opportunity' rows from July_2024 (6/13)
Removed 'Opportunity' rows from June_2024 (7/13)
Removed 'Opportunity' rows from March_2024 (8/13)
Removed 'Opportunity' rows from May_2024 (9/13)
Removed 'Opportunity' rows from November_2023 (10/13)
Removed 'Opportunity' rows from October_2023 (11/13)
Removed 'Opportunity' rows from October_2024 (12/13)
Removed 'Opportunity' rows from September_2024 (13/13)


In [27]:
# Remove 'Stop Code', 'Stop Name', 'Vehicle', 'expected_arrival_time', '__TripType' columns

for i, (key, df) in enumerate(dfs_dict.items()):
    # Drop the columns
    dfs_dict[key] = df.drop(columns=['Stop Code', 'Stop Name', 'Vehicle', 'expected_arrival_time', 'Passengers Transfer'])
    print(f"Removed columns from {key} ({i+1}/{len(dfs_dict)})")

Removed columns from April_2024 (1/13)
Removed columns from August_2024 (2/13)
Removed columns from December_2023 (3/13)
Removed columns from Feb_2024 (4/13)
Removed columns from Jan_2024 (5/13)
Removed columns from July_2024 (6/13)
Removed columns from June_2024 (7/13)
Removed columns from March_2024 (8/13)
Removed columns from May_2024 (9/13)
Removed columns from November_2023 (10/13)
Removed columns from October_2023 (11/13)
Removed columns from October_2024 (12/13)
Removed columns from September_2024 (13/13)


In [28]:
# Split direction column into two columns.
def direction_splitting(df):
    # East = 1, West = -1, other = 0
    df['Horizontal Direction'] = (
        (df.Direction == 'E') * 1 +
        (df.Direction == 'W') * -1
    )
    
    # North = 1, South = -1, other = 0
    df['Vertical Direction'] = (
        (df.Direction == 'N') * 1 +
        (df.Direction == 'S') * -1
    )
    #df = df.drop(columns=["Direction"])
    return df

for i, (key, df) in enumerate(dfs_dict.items()):
    dfs_dict[key] = direction_splitting(df)
    print(f"Split direction column in {key} ({i+1}/{len(dfs_dict)})")

Split direction column in April_2024 (1/13)
Split direction column in August_2024 (2/13)
Split direction column in December_2023 (3/13)
Split direction column in Feb_2024 (4/13)
Split direction column in Jan_2024 (5/13)
Split direction column in July_2024 (6/13)
Split direction column in June_2024 (7/13)
Split direction column in March_2024 (8/13)
Split direction column in May_2024 (9/13)
Split direction column in November_2023 (10/13)
Split direction column in October_2023 (11/13)
Split direction column in October_2024 (12/13)
Split direction column in September_2024 (13/13)


In [29]:
# Cyclical encoding for month
def process_datetime_features(df):
    df = df.to_pandas_df()

    # Ensure the time column is in datetime format
    df['ActualTime'] = pd.to_datetime(df['ActualTime'])
    
    # Extract hour from the time column
    df['hour'] = df['ActualTime'].dt.hour

    # Apply cyclical encoding for hour, create a hour_sin and hour_cos column
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24).round(6).apply(lambda x: 0.0 if x == -0.0 else x)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24).round(6).apply(lambda x: 0.0 if x == -0.0 else x)

    # Remove hour column
    df = df.drop(columns=['hour'])

    # Apply cyclical encoding for minute, create a minute_sin and minute_cos column
    df['minute_sin'] = np.sin(2 * np.pi * df['ActualTime'].dt.minute / 60).round(6).apply(lambda x: 0.0 if x == -0.0 else x)
    df['minute_cos'] = np.cos(2 * np.pi * df['ActualTime'].dt.minute / 60).round(6).apply(lambda x: 0.0 if x == -0.0 else x)


    # Ensure the date column is in datetime format
    df['Business Day'] = pd.to_datetime(df['Business Day'])


    # Extract day of the month and normalized it
    df['day_normalized'] = (df['Business Day'].dt.day / df.apply(
    lambda row: 29 if (row['Business Day'].month == 2 and 
                       ((row['Business Day'].year % 4 == 0 and row['Business Day'].year % 100 != 0) or 
                        (row['Business Day'].year % 400 == 0))) 
    else {1: 31, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}.get(row['Business Day'].month, 30), 
    axis=1
    )).round(3)
    
    # Apply cyclical encoding for day of the month, create a day_sin and day_cos column
    df['day_sin'] = np.sin(2 * np.pi * df['day_normalized']).round(6).apply(lambda x: 0.0 if x == -0.0 else x)
    df['day_cos'] = np.cos(2 * np.pi * df['day_normalized']).round(6).apply(lambda x: 0.0 if x == -0.0 else x)

    # Remove day_normalized column
    df = df.drop(columns=['day_normalized'])

    # Apply cyclical encoding for month, create a month_sin and month_cos column
    df['month_sin'] = np.sin(2 * np.pi * df['Business Day'].dt.month / 12).round(6).apply(lambda x: 0.0 if x == -0.0 else x)
    df['month_cos'] = np.cos(2 * np.pi * df['Business Day'].dt.month / 12).round(6).apply(lambda x: 0.0 if x == -0.0 else x)

    # Remove month column
    df = df.drop(columns=['month', 'Start Trip Time'])

    # Calculate weekday number (Sunday = 1, Monday = 2, ..., Saturday = 7)
    df['weekday_num'] = df['Business Day'].dt.dayofweek + 1


    # Convert back to Vaex DataFrame
    df = vaex.from_pandas(df)

    # Remove the Business Day and ActualTime columns
    #df = df.drop(columns=['Business Day', 'ActualTime'])
    df = df.drop(columns=['ActualTime'])

    # Apply Cyclical Encoding for Weekdays
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday_num'] / 7)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday_num'] / 7)

    # Round small floating-point values and ensure -0.0 becomes 0.0
    df['weekday_sin'] = df['weekday_sin'].round(6).apply(lambda x: 0.0 if x == -0.0 else x)
    df['weekday_cos'] = df['weekday_cos'].round(6).apply(lambda x: 0.0 if x == -0.0 else x)

    # Remove the weekday_num column
    df = df.drop(columns=['weekday_num'])
    #df['ActualTime'] = df['ActualTime'].astype('datetime64').dt.strftime('%Y-%m-%d %H:%M:%S')
    df['Business Day'] = df['Business Day'].astype('datetime64').dt.strftime('%Y-%m-%d')
    return df

for i, (key, df) in enumerate(dfs_dict.items()):
    dfs_dict[key] = process_datetime_features(df)
    print(f"Processed datetime features in {key} ({i+1}/{len(dfs_dict)})")


Processed datetime features in April_2024 (1/13)
Processed datetime features in August_2024 (2/13)
Processed datetime features in December_2023 (3/13)
Processed datetime features in Feb_2024 (4/13)
Processed datetime features in Jan_2024 (5/13)
Processed datetime features in July_2024 (6/13)
Processed datetime features in June_2024 (7/13)
Processed datetime features in March_2024 (8/13)
Processed datetime features in May_2024 (9/13)
Processed datetime features in November_2023 (10/13)
Processed datetime features in October_2023 (11/13)
Processed datetime features in October_2024 (12/13)
Processed datetime features in September_2024 (13/13)


In [30]:
# Stop Lat and Stop Lon Normalization
# Dictionary to store the min and max values for each column
coordinate_mapping = {}

def coordinate_normalization(df):
    lat_min = df['stop_lat'].min()
    lat_max = df['stop_lat'].max()
    lon_min = df['stop_lon'].min()
    lon_max = df['stop_lon'].max()
    print(f'Lat Min: {lat_min}, Lat Max: {lat_max}, Lon Min: {lon_min}, Lon Max: {lon_max}\n\n')

    # Normalize the columns (Min-Max Scaling where min -> 0 and max -> 1)
    df['stop_lat_norm'] = (df['stop_lat'] - lat_min) / (lat_max - lat_min)
    df['stop_lon_norm'] = (df['stop_lon'] - lon_min) / (lon_max - lon_min)

    # Store the min and max values in a dictionary
    normalization_params = {
        "stop_lat": [lat_min, lat_max],
        "stop_lon": [lon_min, lon_max]
    }
    df = df.drop(columns=['stop_lat', 'stop_lon'])
    return df, normalization_params

for i, (key, df) in enumerate(dfs_dict.items()):
    print(f"Processing coordinate normalization in {key} ({i+1}/13)")
    dfs_dict[key], coordinate_mapping[key] = coordinate_normalization(df)

Processing coordinate normalization in April_2024 (1/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processing coordinate normalization in August_2024 (2/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processing coordinate normalization in December_2023 (3/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processing coordinate normalization in Feb_2024 (4/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processing coordinate normalization in Jan_2024 (5/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processing coordinate normalization in July_2024 (6/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processing coordinate normalization in June_2024 (7/13)
Lat Min: 42.042087, Lat Max: 42.34243354, Lon Min: -83.114924, Lon Max: -82.606767


Processi

In [31]:
# Create overcrowded status column (35 or less = 0, more than 35 = 1) from Actual Bus Occupancy column

def overcrowded_status(df):
    df['Overcrowded'] = (df['Actual Bus Occupancy'] > 35) * 1
    return df

for i, (key, df) in enumerate(dfs_dict.items()):
    dfs_dict[key] = overcrowded_status(df)
    print(f"Created Overcrowded status column in {key} ({i+1}/13)")

Created Overcrowded status column in April_2024 (1/13)
Created Overcrowded status column in August_2024 (2/13)
Created Overcrowded status column in December_2023 (3/13)
Created Overcrowded status column in Feb_2024 (4/13)
Created Overcrowded status column in Jan_2024 (5/13)
Created Overcrowded status column in July_2024 (6/13)
Created Overcrowded status column in June_2024 (7/13)
Created Overcrowded status column in March_2024 (8/13)
Created Overcrowded status column in May_2024 (9/13)
Created Overcrowded status column in November_2023 (10/13)
Created Overcrowded status column in October_2023 (11/13)
Created Overcrowded status column in October_2024 (12/13)
Created Overcrowded status column in September_2024 (13/13)


# Data Split (Training, Validation and Testing)
* First for September 2024
* Take all Overcrowded Trips (Group by Route Name, Direction, daily_order_trip_id, Business day)
* Count all trips with at least 1 overcrowded row
* And then count all trip with none overcrowded rows
* Take a sample of all the rows for the overcrowded trips and then take another sample of all  none overcrowded trips of the same size so the final dataset has 50% overcrowded trips and 50% no-overcrowded trips



In [32]:
# Convert th vaex dataframe to pandas dataframe
df = dfs_dict['September_2024'].copy()
df = df.to_pandas_df()

# Group by 'Business Day', 'Route Name', 'Direction' and 'daily_order_trip_id' and make a id for each group
# by combining the elements of the group, similar to adding strings together.

df['group_id'] = pd.to_datetime(df['Business Day']).dt.day.astype(str) + '_' + df['Route Name'].astype(str) + '_' + df['Direction'].astype(str) + '_' + df['daily_order_trip_id'].astype(str)

# Show groud_id column at the beginning of the dataframe
cols = list(df.columns)
cols.remove('group_id')
cols = ['group_id'] + cols
df = df[cols]

df.head(1)

Unnamed: 0,group_id,Direction,Route Name,Passengers In,Passengers Out,Actual Bus Occupancy,Business Day,daily_order_trip_id,stop_sequence,stop_sequence_normalized,...,minute_cos,day_sin,day_cos,month_sin,month_cos,weekday_sin,weekday_cos,stop_lat_norm,stop_lon_norm,Overcrowded
0,1_518_E_1,E,518,8,0,8,2024-09-01,1,1,0.0,...,0.207912,0.205863,0.978581,-1.0,0.0,0.0,1.0,0.679239,0.191049,0


In [33]:
# Step 1: Identify groups with at least one overcrowded bus
overcrowded_groups = set(df[df['Overcrowded'] == 1]['group_id'].unique())

# Step 2: Identify groups that NEVER had an overcrowded bus
all_groups = set(df['group_id'].unique())
non_overcrowded_groups = all_groups - overcrowded_groups  # Set difference ensures no overlap

# Step 3: Print counts to verify separation
print(f"Number of groups with at least one overcrowded bus: {len(overcrowded_groups):,}")
print(f"Number of groups with no overcrowded bus: {len(non_overcrowded_groups):,}")
print(f"Total number of unique groups: {len(all_groups):,}")
assert len(overcrowded_groups) + len(non_overcrowded_groups) == len(all_groups), "Error: Group splitting is incorrect!"

# Step 4: Create a radnom sample from the non_overcrowded_groups that has the same size as overcrowded_groups
random.seed(42)  # Optional: For reproducibility
sampled_non_overcrowded = random.sample(non_overcrowded_groups, len(overcrowded_groups))

# Print sample size to verify
print(f"Sampled non-overcrowded groups: {len(sampled_non_overcrowded):,}")


Number of groups with at least one overcrowded bus: 2,243
Number of groups with no overcrowded bus: 18,616
Total number of unique groups: 20,859
Sampled non-overcrowded groups: 2,243


In [34]:
from sklearn.model_selection import train_test_split

# Split Overcrowded and Non-Overcrowded Groups Separately
# Split overcrowded groups: 70% train, 20% validation, 10% test
train_overcrowded, temp_overcrowded = train_test_split(list(overcrowded_groups), test_size=0.3, random_state=42)
val_overcrowded, test_overcrowded = train_test_split(temp_overcrowded, test_size=1/3, random_state=42)

# Split non-overcrowded groups: 70% train, 20% validation, 10% test
train_non_overcrowded, temp_non_overcrowded = train_test_split(list(sampled_non_overcrowded), test_size=0.3, random_state=42)
val_non_overcrowded, test_non_overcrowded = train_test_split(temp_non_overcrowded, test_size=1/3, random_state=42)

# Combine Train, Validation, and Test Group IDs
train_group_ids = set(train_overcrowded) | set(train_non_overcrowded)
val_group_ids = set(val_overcrowded) | set(val_non_overcrowded)
test_group_ids = set(test_overcrowded) | set(test_non_overcrowded)

# Ensure No Overlap Between Splits
assert len(train_group_ids & val_group_ids) == 0, "Train and Validation sets overlap!"
assert len(train_group_ids & test_group_ids) == 0, "Train and Test sets overlap!"
assert len(val_group_ids & test_group_ids) == 0, "Validation and Test sets overlap!"

print("✅ No group_id duplication across Train, Validation, and Test sets!")

# Step 7: Create Final DataFrames
train_df = df[df['group_id'].isin(train_group_ids)]
val_df = df[df['group_id'].isin(val_group_ids)]
test_df = df[df['group_id'].isin(test_group_ids)]

✅ No group_id duplication across Train, Validation, and Test sets!


In [35]:
# Remove group_id, Business Day, Direction, daily_order_trip_id, stop_sequence columns
train_df = train_df.drop(columns=['group_id', 'Business Day', 'Direction', 'daily_order_trip_id', 'stop_sequence'])
val_df = val_df.drop(columns=['group_id', 'Business Day', 'Direction', 'daily_order_trip_id', 'stop_sequence'])
test_df = test_df.drop(columns=['group_id', 'Business Day', 'Direction', 'daily_order_trip_id', 'stop_sequence'])


In [36]:
# Target Encode the trainig set and apply the same encoding to the validation and test sets
# Compute mean overcrowding per Route Name
target_encodings = train_df.groupby('Route Name')['Overcrowded'].mean()

# Add random noise (different per Route Name)
np.random.seed(42)  # For reproducibility
random_noise = np.round(np.random.uniform(0, 0.00001, size=len(target_encodings)),6)
target_encodings += random_noise

train_df['Route Name'] = train_df['Route Name'].map(target_encodings)
val_df['Route Name'] = val_df['Route Name'].map(target_encodings)
test_df['Route Name'] = test_df['Route Name'].map(target_encodings)

# Check if all values are unique
is_unique = target_encodings.nunique() == len(target_encodings)

print(f"Are all target encoding values unique? {is_unique}")


target_encodings

Are all target encoding values unique? True


Route Name
10        0.012724
115       0.310453
14        0.000007
1A        0.138168
1C        0.112792
2         0.065252
25        0.000001
3         0.047447
4         0.008533
418       0.003650
42        0.000000
518       0.159385
6         0.155231
605       0.000002
7         0.045788
8         0.054954
TUNNEL    0.033642
Name: Overcrowded, dtype: float64

In [37]:
# Create a folder to store the cleaned datasets named 'split_cleaned_dataset'
if not os.path.exists('split_cleaned_dataset'):
    os.makedirs('split_cleaned_dataset')

# Save the cleaned datasets to CSV files
train_df.to_csv('split_cleaned_dataset/train_september.csv', index=False)
val_df.to_csv('split_cleaned_dataset/val_september.csv', index=False)
test_df.to_csv('split_cleaned_dataset/test_september.csv', index=False)