In [None]:
This notebook takes as input a flat event log stored in a CSV file and groups the attribute values (i.e., the order objects)
into a multiset whenever the activity name, timestamp, weight and price attributes have the same values. As output, a new CSV file
called 'orders_case_notion_event_log_grouped.csv' is created. The following query expresses the same idea
as an SQL statement:

SELECT activity_name, timestamp, weight, price,
ARRAY_AGG(DISTINCT numbers) AS orders_object
FROM orders_case_notion_event_log
GROUP BY activity_name, timestamp, weight, price;

In [None]:
!pip install pandas
!pip install pm4py

In [None]:
import pandas as pd
import csv
import pm4py

In [None]:
# Load the flat event log from the CSV file in the working directory.
df_orders = pd.read_csv(filepath_or_buffer='orders_case_notion_event_log.csv')

In [None]:
# Keep only the columns of interest and order the rows by timestamp,
# then weight, then price (the activity name is not part of the sort key).
sort_keys = ["ocel timestamp", "weight_y", "price_y"]
selected_cols = ["number", "ocel activity", *sort_keys]
ordered_results = df_orders.loc[:, selected_cols].sort_values(by=sort_keys)

# Show the ordered selection.
print(ordered_results)

In [None]:
# Partition the rows by activity, timestamp, weight and price; rows that agree
# on all four columns end up in the same group.
grouping_keys = ["ocel activity", "ocel timestamp", "weight_y", "price_y"]
groups = df_orders.groupby(grouping_keys)

# Both counters start at 1.
group_number, row_number = 1, 1

# "row_number" defaults to 0; any row that is not a member of a group keeps it.
df_orders["row_number"] = 0

for group_name, group_df in groups:
    member_index = group_df.index

    # Every row of the group receives the same value: the 1-based position of
    # the group in iteration order (despite the column name, it is per group).
    df_orders.loc[member_index, "row_number"] = row_number
    row_number += 1

    # NOTE(review): group_number only advances when the group has exactly one
    # row, so a multi-row group reuses the number of the group before it --
    # confirm this is the intended numbering.
    is_singleton = len(group_df) == 1
    if is_singleton:
        group_number += 1

    # The "group_number" column is created by the first assignment here.
    df_orders.loc[member_index, "group_number"] = group_number

# Show the data frame including the two new columns.
print(df_orders)


In [None]:
# Persist the data frame (including row_number / group_number) without the index column.
df_orders.to_csv(path_or_buf="orders_case_notion_event_log_with_numbers.csv", index=False)

In [None]:
# Columns that identify a group: source_id, ocel timestamp, ocel activity,
# weight_y and price_y.
event_keys = ["source_id", "ocel timestamp", "ocel activity", "weight_y", "price_y"]

# Step 1: for every combination of the key columns, collect the "number"
# values into a list (duplicates are kept).
# NOTE(review): with pandas' default dropna=True, rows with NaN in any key
# column are left out of the groups -- confirm that this is wanted.
groups = df_orders.groupby(event_keys)["number"].agg(list)

# Step 2: collapse to one row per source_id. Every key column (including
# source_id itself) becomes a list of the values seen for that source_id, and
# the per-group number lists are flattened into a single list.
per_source_aggregations = {key: list for key in event_keys}
per_source_aggregations["number"] = lambda x: [num for group in x for num in group]
df_grouped = groups.reset_index().groupby("source_id").agg(per_source_aggregations)

# Same column order as the aggregation above; only "number" becomes "numbers".
df_grouped.columns = [*event_keys, "numbers"]


def _join_values(values):
    """Render a list of values as one comma-separated string."""
    return ','.join(map(str, values))


# Turn every list-valued column into a comma-separated string before saving.
for column in ["numbers", *event_keys]:
    df_grouped[column] = df_grouped[column].apply(_join_values)

# Write the result; the source_id index is dropped, but the aggregated
# "source_id" column is still part of the output.
df_grouped.to_csv("orders_case_notion_event_log_grouped.csv", index=False)


# Confirm that the file was written.
print("Data frame saved to orders_case_notion_event_log_grouped.csv")