# Event log analysis 

In [1]:
### IMPORT ###
from pathlib import Path
import pandas as pd
# PM4PY useful to Cyclomatic Cycle value
import pm4py
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.obj import EventLog, Trace

### LOCAL IMPORT ###
from config import config_reader
from utilities import df_read_csv_data

In [2]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
log_dir = str(yaml_config["LOG_DIR"])
stats_dir = str(yaml_config["STATS_DIR"])
csv_sep = ","
# INPUT
level_input = "PAGE" # [PAGE, PARA]
log_file = "edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr.csv" # <- INPUT: Set the file name, leaving LEVEL word
id_column = "Case ID"
activity_column = "Activity"
timestamp_column = "Complete Timestamp"

# FUNCTIONS

In [3]:
def calculate_session_count_and_percentage(df:pd.DataFrame, id_col:str) -> pd.DataFrame:
    """
    Calculate the distinct session counts and their percentage based on QuizSessionCount.

    Parameters:
    df (pd.DataFrame): A pandas DataFrame with at least two columns: 
        - 'QuizSessionCount': the number of quiz sessions
        - 'sessionID': unique identifier for each session

    Returns:
    pd.DataFrame: A new DataFrame with three columns:
        - 'QuizSessionCount': distinct values of QuizSessionCount
        - 'SessionCount': count of distinct sessionIDs for each QuizSessionCount
        - 'SessionPerc': percentage of SessionCount with respect to the total, rounded to 2 decimal places
    """

    # Count distinct sessionID for each QuizSessionCount
    session_counts = df.groupby('QuizSessionCount')[id_col].nunique().reset_index(name='SessionCount')
    
    # Calculate the percentage of session counts relative to the total
    total_sessions = session_counts['SessionCount'].sum()
    session_counts['SessionPerc'] = (session_counts['SessionCount'] / total_sessions * 100).round(2)
    
    return session_counts

In [4]:
def calculate_column_statistics(df: pd.DataFrame, id_column: str, calc_column: str) -> pd.DataFrame:
    """
    Calculate statistics on a specific column, considering only distinct IDs, and return the results 
    without grouping by ID.

    Parameters:
    df (pd.DataFrame): The input pandas DataFrame.
    id_column (str): The name of the column representing the unique identifiers (IDs).
    calc_column (str): The name of the column on which to perform the calculations.

    Returns:
    pd.DataFrame: A new DataFrame with the following columns:
        - '{calc_column}_not_na': count of non-empty values in the calc_column
        - '{calc_column}_min': the minimum value in the calc_column
        - '{calc_column}_max': the maximum value in the calc_column
        - '{calc_column}_avg': the average value in the calc_column
        - '{calc_column}_med': the median value in the calc_column
    """
    
    # Remove duplicate IDs
    df_no_duplicates = df.drop_duplicates(subset=[id_column])
    
    # Filter out rows where the calculation column is NaN
    non_empty_df = df_no_duplicates[df_no_duplicates[calc_column].notna()]
    
    # Calculate the statistics for the calc_column
    col_not_na = non_empty_df[calc_column].count()  # Count of non-NaN values
    col_min = non_empty_df[calc_column].min()       # Minimum value
    col_max = non_empty_df[calc_column].max()       # Maximum value
    col_avg = non_empty_df[calc_column].mean().round(2)      # Mean value
    col_med = non_empty_df[calc_column].median()    # Median value
    
    # Create a DataFrame with the results, using calc_column as prefix
    stats_df = pd.DataFrame({
        f'{calc_column}_not_na': [col_not_na],
        f'{calc_column}_min': [col_min],
        f'{calc_column}_max': [col_max],
        f'{calc_column}_avg': [col_avg],
        f'{calc_column}_med': [col_med]
    })
    
    return stats_df

In [5]:
def calculate_cyclomatic_complexity(xes_log):
    """
    Calculate the cyclomatic complexity from an XES event log.

    Parameters:
    xes_log: The event log in XES format (pm4py EventLog object). It should represent a single process execution log.

    Returns:
    float: The cyclomatic complexity of the resulting Directly-Follows Graph (DFG).
    """
    # Obtain the DFG from the XES log
    dfg = dfg_discovery.apply(xes_log)
    
    # Extract nodes from the DFG keys (each key is a tuple (activity1, activity2))
    nodes = set()
    for (s, t) in dfg.keys():
        nodes.add(s)
        nodes.add(t)
    
    # Count the number of nodes and edges
    N = len(nodes)
    E = len(dfg.keys())
    
    # Assume the graph is connected, so P = 1
    P = 1  
    
    # Calculate cyclomatic complexity: V(G) = E - N + 2P
    V = E - N + 2 * P
    
    return V

In [6]:
def event_log_csv_to_xes_and_cc(df_log:pd.DataFrame, activity_column:str, timestamp_column:str):
    # Conversion to XES and computing Cyclomatic Complexity
    df_log = pm4py.format_dataframe(df_log, case_id=id_column, activity_key=activity_column, timestamp_key=timestamp_column)
    print("> Saving the event log to XES")
    file_xes = f"{Path(log_file).stem}_temp.xes"
    path_xes = Path(log_dir) / file_xes
    print("Saving XES file to:", path_xes)
    pm4py.write_xes(df_log, path_xes, case_id_key='case:concept:name')
    path_xes = Path(log_dir) / file_xes
    # Load the XES
    # xes_log = pm4py.read_xes(path_xes.as_posix())
    xes_log = xes_importer.apply(str(path_xes))
    # print("xes_log type:", type(xes_log))
    cc = calculate_cyclomatic_complexity(xes_log)
    print("Cyclomatyc complexity for this event log:", cc)
    # delete the temporary xes file
    if path_xes.exists():
        path_xes.unlink()
    return cc

In [7]:
def extract_distinct_menu_per_session(df: pd.DataFrame, key_column: str, menu_column: str) -> pd.DataFrame:
    """
    Extracts the distinct values of the menu column for each distinct sessionID from a dataframe.

    Parameters:
        df (pd.DataFrame): The dataframe containing the data.
        key_column (str): The column name to group by (typically sessionID).
        menu_column (str): The column name from which to extract distinct values (typically menu).

    Returns:
        pd.DataFrame: A dataframe with each sessionID and the distinct values of the menu column.
    """

    # Group by the key column and aggregate distinct menu values
    grouped_df = df.groupby(key_column)[menu_column].apply(lambda x: list(x.unique())).reset_index()
    
    # Rename the columns for clarity
    grouped_df.columns = [key_column, 'DistinctMenuValues']

    # Sort
    grouped_df = grouped_df.sort_values(by = "DistinctMenuValues")
    
    return grouped_df

# MAIN

In [8]:
print(">> Setings")
log_file_name = log_file.replace("LEVEL", level_input)
print("Input file:", log_file_name)
path_log_file = Path(log_dir) /log_file_name 
print("Path file:", path_log_file)

>> Setings
Input file: edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv
Path file: data_log/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv


In [9]:
print(">> Reading")
df_log = df_read_csv_data(path_log_file, None, csv_sep)

>> Reading
Data preview
                                             Case ID    Activity  \
0  0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...  SURVEY-END   
1  0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...       INTRO   
2  0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...     INTRO-Q   
3  0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...        PROG   
4  0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...      PROG-Q   

    Complete Timestamp   pageTitle    menu  pageOrder  pagePara eventPage  \
0  2024-04-22 13:50:32  SURVEY-END  menu_1          1         0    PageIN   
1  2024-04-22 14:11:42       INTRO  menu_1          1         0    PageIN   
2  2024-04-22 14:20:32     INTRO-Q  menu_1          1         0    PageIN   
3  2024-04-22 14:21:04        PROG  menu_1          2         0    PageIN   
4  2024-04-22 14:24:57      PROG-Q  menu_1          2         0    PageIN   

   click_num  dbclick_num  ...  FN_N  FN_Q  FN_A  FN_C  FN_I  FN_CT  A_Time_s  \
0          1           

In [10]:
print("Distinct cases:", df_log[id_column].nunique())

Distinct cases: 332


In [11]:
df_log.head(5)

Unnamed: 0,Case ID,Activity,Complete Timestamp,pageTitle,menu,pageOrder,pagePara,eventPage,click_num,dbclick_num,...,FN_N,FN_Q,FN_A,FN_C,FN_I,FN_CT,A_Time_s,A_Time_m,Forward_Jumps,Backward_Jumps
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,SURVEY-END,2024-04-22 13:50:32,SURVEY-END,menu_1,1,0,PageIN,1,0,...,0.0,0.0,0.0,0.0,0.0,L,1270.0,21.17,0,0
1,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,INTRO,2024-04-22 14:11:42,INTRO,menu_1,1,0,PageIN,1,0,...,0.0,0.0,1.0,1.0,0.0,FA,530.0,8.83,0,0
2,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,INTRO-Q,2024-04-22 14:20:32,INTRO-Q,menu_1,1,0,PageIN,1,0,...,0.0,0.0,1.0,1.0,0.0,FA,32.0,0.53,0,0
3,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,PROG,2024-04-22 14:21:04,PROG,menu_1,2,0,PageIN,1,0,...,0.0,0.0,1.0,1.0,0.0,FA,233.0,3.88,0,0
4,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,PROG-Q,2024-04-22 14:24:57,PROG-Q,menu_1,2,0,PageIN,1,0,...,0.0,0.0,1.0,1.0,0.0,FA,17.0,0.28,0,0


In [12]:
df_log.columns

Index(['Case ID', 'Activity', 'Complete Timestamp', 'pageTitle', 'menu',
       'pageOrder', 'pagePara', 'eventPage', 'click_num', 'dbclick_num',
       'QuizSessionCount', 'QuizAnswerCorrectTotal', 'QuizAnswerWrongTotal',
       'QuizAnswerCorrectRatioOverCount', 'QuizAnswerCorrectRatioOverAll',
       'QuizSessionCount_P3', 'QuizAnswerCorrectTotal_P3',
       'QuizAnswerWrongTotal_P3', 'QuizAnswerCorrectRatioOverCount_P3',
       'QuizAnswerCorrectRatioOverAll_P3', 'Q_1', 'Q_2', 'Q_3', 'Q_4', 'Q_5',
       'Q_6', 'Q_7', 'Q_8', 'Q_9', 'Q_10', 'Q_11', 'Q_12', 'Q_13', 'Q_14',
       'Q_15', 'Q_16', 'Q_17', 'Q_18', 'Q_19', 'Q_20', 'Q_21', 'Q_22', 'Q_23',
       'Q_24', 'Q_25', 'Q_26', 'Q_27', 'Q_28', 'SUS',
       'Apprendimento percepito', 'UEQ - Pragmatic', 'UEQ - Hedonic',
       'UEQ - Overall', 'TotalTimeHH', 'TotalTimeMM', 'TotalTimeMM.1',
       'TotalTimeDD', 'CaseLength', 'Class', 'SUS_Tercile',
       'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile',
       'QuizAnswe

In [13]:
df_log.shape

(8092, 74)

In [14]:
null_counts = df_log.isnull().sum()
null_counts

Case ID                0
Activity               0
Complete Timestamp     0
pageTitle              0
menu                   0
                      ..
FN_CT                 24
A_Time_s               0
A_Time_m               0
Forward_Jumps          0
Backward_Jumps         0
Length: 74, dtype: int64

## Menu

In [15]:
print(">> Getting path menu by sessionID")
df_menu = extract_distinct_menu_per_session(df_log, id_column, "menu")
path_out = Path(stats_dir) / "menu_stats.csv"
print("Saving menu stats (CSV):", path_out)
df_menu.to_csv(path_out, sep=";", index=False)
path_out = Path(stats_dir) / "menu_stats.xlsx"
print("Saving menu stats (XLSX):", path_out)
df_menu.to_excel(path_out, sheet_name="menu_stats", index=False)
print()

>> Getting path menu by sessionID
Saving menu stats (CSV): stats/menu_stats.csv
Saving menu stats (XLSX): stats/menu_stats.xlsx



## Classes

In [16]:
df_log["Class"].unique()

array(['SMCN1', 'SMCN2', 'SAA', 'ECO', 'SMTO1', nan, 'SMTO2', 'SMTO3'],
      dtype=object)

## Gender

In [17]:
# Calculating value counts (distribution)
df_distinct_cases = df_log.drop_duplicates(subset=[id_column])
# Calcola la frequenza dei valori nella colonna 'Q_28'
value_counts = df_distinct_cases['Q_28'].value_counts()
df_value_counts = value_counts.reset_index()
df_value_counts.columns = ['Q_28', 'Frequency']
df_value_counts.insert(0, 'File Name', log_file_name)

In [18]:
df_value_counts

Unnamed: 0,File Name,Q_28,Frequency
0,edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv,MASCHIO,164
1,edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv,FEMMINA,147
2,edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv,PREFERISCO NON RISPONDERE,9
3,edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv,-1,7
4,edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv,ALTRO ...,5


In [19]:
print(df_value_counts["Frequency"].sum())

332


In [20]:
path_out = Path(stats_dir) / f"gender_log_{level_input}.csv"
print("Path:", path_out)
df_value_counts.to_csv(path_out, sep=";", index=False)

Path: stats/gender_log_PAGE.csv


In [21]:
# Gender by class
result_df_class = df_distinct_cases.groupby(['Class', 'Q_28']).size().reset_index(name='Count')

In [22]:
result_df_class

Unnamed: 0,Class,Q_28,Count
0,ECO,-1,4
1,ECO,ALTRO ...,1
2,ECO,FEMMINA,53
3,ECO,MASCHIO,61
4,ECO,PREFERISCO NON RISPONDERE,6
5,SAA,-1,1
6,SAA,FEMMINA,68
7,SAA,MASCHIO,27
8,SAA,PREFERISCO NON RISPONDERE,1
9,SMCN1,ALTRO ...,2


In [23]:
path_out = Path(stats_dir) / f"gender_by_class_log_{level_input}.csv"
print("Path:", path_out)
result_df_class.to_csv(path_out, sep=";", index=False)

Path: stats/gender_by_class_log_PAGE.csv


In [24]:
total_sum = result_df_class.groupby('Class')['Count'].sum().reset_index(name='Total_Sum')

In [535]:
total_sum

Unnamed: 0,Class,Total_Sum
0,ECO,125
1,SAA,97
2,SMCN1,27
3,SMCN2,5
4,SMTO1,24
5,SMTO2,18
6,SMTO3,13


In [25]:
total_sum["Total_Sum"].sum()

309

## Quizzes

In [537]:
print(">> Checking quiz sessions")
df_log["QuizSessionCount"].unique()

>> Checking quiz sessions


array([10,  8,  9,  6, 11,  3,  7,  5,  0])

In [538]:
result_df = calculate_session_count_and_percentage(df_log, id_column)

In [539]:
result_df

Unnamed: 0,QuizSessionCount,SessionCount,SessionPerc
0,0,1,0.3
1,3,1,0.3
2,5,1,0.3
3,6,4,1.2
4,7,2,0.6
5,8,11,3.31
6,9,26,7.83
7,10,285,85.84
8,11,1,0.3


In [540]:
print("> Saving quiz counts")
path_out = Path(stats_dir) / "quiz_count.csv"
print("File out:", path_out)
result_df.to_csv(path_out, index=False, sep=";")

> Saving quiz counts
File out: stats/quiz_count.csv


In [30]:
# Take only one QuizAnswerCorrectRatioOverAll per id_column (e.g., the first occurrence)
unique_values = df_log.drop_duplicates(subset=id_column)['QuizAnswerCorrectRatioOverAll']

# Calculate statistics
mean_value = unique_values.mean()
median_value = unique_values.median()
standard_deviation = unique_values.std()

# Print results
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Standard deviation: {standard_deviation}")

Mean: 0.6924471299093655
Median: 0.7
Standard deviation: 0.1731273644387137


## SURVEY timing

In [541]:
df_log_enr_survey = df_log[df_log["Activity"].str.contains(r"SURVEY-")]

In [542]:
# Calculating mean and median of SURVEY-START
df_log_survey_time = df_log[df_log["Activity"].str.contains(r"SURVEY-START")]
mean_value = round(df_log_survey_time["A_Time_m"].mean(),2)
median_value = round(df_log_survey_time["A_Time_m"].median(),2)
print("Mean:", mean_value)
print("Median:", median_value)

Mean: 2.5
Median: 2.32


In [543]:
grouped_stats = df_log_survey_time.groupby("Q_28")["A_Time_m"].agg(['mean', 'median']).round(2)
grouped_stats = grouped_stats.rename(columns={"mean": "Mean", "median": "Median"})
grouped_stats

Unnamed: 0_level_0,Mean,Median
Q_28,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,1.66,0.67
ALTRO ...,1.62,0.08
FEMMINA,3.1,2.95
MASCHIO,2.02,1.31
PREFERISCO NON RISPONDERE,2.3,1.95


## Stats on experience

In [544]:
print(">> Stats on experiences")
list_col = ["SUS", "Apprendimento percepito", "UEQ - Pragmatic", "UEQ - Hedonic", "UEQ - Overall"]
print("Columns:", list_col)

>> Stats on experiences
Columns: ['SUS', 'Apprendimento percepito', 'UEQ - Pragmatic', 'UEQ - Hedonic', 'UEQ - Overall']


In [545]:
for col_name in list_col:
    print("Stats on column:", col_name)
    exp_df = calculate_column_statistics(df_log, id_column, col_name)
    file_put = f"{col_name}_stats.csv"
    path_out = Path(stats_dir) / file_put
    print("Saving stats to:", path_out)
    exp_df.to_csv(path_out, index=False, sep=csv_sep)
    print()

Stats on column: SUS
Saving stats to: stats/SUS_stats.csv

Stats on column: Apprendimento percepito
Saving stats to: stats/Apprendimento percepito_stats.csv

Stats on column: UEQ - Pragmatic
Saving stats to: stats/UEQ - Pragmatic_stats.csv

Stats on column: UEQ - Hedonic
Saving stats to: stats/UEQ - Hedonic_stats.csv

Stats on column: UEQ - Overall
Saving stats to: stats/UEQ - Overall_stats.csv



## Mergin Stats on a single file

In [546]:
# Directory containing the CSV files
csv_directory = Path(stats_dir)

# Output Excel file
output_excel = Path(stats_dir) / "_all_stats.xlsx"

# Initialise an Excel writer
i = 0
with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
    # Iterate over all CSV files in the directory
    for csv_file in csv_directory.glob('*.csv'):
        i+=1
        # Read the CSV file
        print(f"[{i}] Adding file: {csv_file}")
        df = pd.read_csv(csv_file, dtype={"sessionID":object}, sep=";", low_memory=False)
    
        # Use the filename without extension for the sheet name
        sheet_name = str(csv_file.stem).replace(" ","_")
        # Ensure the sheet name is at most 30 characters
        if len(sheet_name) > 30:
            sheet_name = sheet_name[:30]
        # Write the DataFrame to a new sheet in the Excel file
        print("Adding sheet name:", sheet_name)
        df.to_excel(writer, sheet_name=sheet_name, index=False)
print()
print(f"Excel file with all stats created successfully: {output_excel}")

[1] Adding file: stats/Apprendimento percepito_stats.csv
Adding sheet name: Apprendimento_percepito_stats
[2] Adding file: stats/SUS_stats.csv
Adding sheet name: SUS_stats
[3] Adding file: stats/UEQ - Hedonic_stats.csv
Adding sheet name: UEQ_-_Hedonic_stats
[4] Adding file: stats/UEQ - Overall_stats.csv
Adding sheet name: UEQ_-_Overall_stats
[5] Adding file: stats/UEQ - Pragmatic_stats.csv
Adding sheet name: UEQ_-_Pragmatic_stats
[6] Adding file: stats/_edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_stats_attributes.csv
Adding sheet name: _edu_event_log_PAGE_raw_filter
[7] Adding file: stats/class_distinct_session_counts.csv
Adding sheet name: class_distinct_session_counts
[8] Adding file: stats/cyclomatic_complexity_by_menu.csv
Adding sheet name: cyclomatic_complexity_by_menu
[9] Adding file: stats/cyclomatic_complexity_by_menu_usability.csv
Adding sheet name: cyclomatic_complexity_by_menu_
[10] Adding file: stats/distinct_event_timestamps_na_class.csv
Adding sheet name: distinct_event

## Creating a stats file with the attrbutes of the event-log for every CaseID 

In [547]:
# First, add 'TotalTimeMM' to the original DataFrame by converting 'TotalTimeHH' to minutes
if 'TotalTimeMM' not in df_log.columns:
    df_log['TotalTimeMM'] = round(df_log['TotalTimeHH'] * 60 , 2)

# Group by
df_log_attributes = df_log.groupby(id_column).agg({
    'Class': 'first',
    'TotalTimeHH': 'first',
    'TotalTimeMM': 'first',
    'CaseLength': 'first',
    'SUS': 'first',
    'Apprendimento percepito': 'first',
    'UEQ - Pragmatic': 'first',
    'UEQ - Hedonic': 'first',
    'UEQ - Overall': 'first',
    'SUS_Tercile': 'first',
    'Apprendimento percepito_Tercile': 'first',
    'UEQ - Overall_Tercile': 'first',
    'QuizSessionCount': 'first',
    'QuizAnswerCorrectTotal': 'first',
    'QuizAnswerWrongTotal': 'first',
    'QuizAnswerCorrectRatioOverCount': 'first',
    'QuizAnswerCorrectRatioOverAll': 'first',
    'QuizAnswerCorrectRatioOverAll_Tercile': 'first',
    'menu': lambda x: list(set(x)),  # Creating a list of unique menu values per sessionID,
    'pageTitle': lambda x: list(set(x)),
    'click_num': 'sum',  # Summing the values of 'click_num'
    'dbclick_num': 'sum',  # Summing the values of 'dbclick_num'
}).reset_index()

In [548]:
df_log_attributes.head(5)

Unnamed: 0,Case ID,Class,TotalTimeHH,TotalTimeMM,CaseLength,SUS,Apprendimento percepito,UEQ - Pragmatic,UEQ - Hedonic,UEQ - Overall,...,QuizSessionCount,QuizAnswerCorrectTotal,QuizAnswerWrongTotal,QuizAnswerCorrectRatioOverCount,QuizAnswerCorrectRatioOverAll,QuizAnswerCorrectRatioOverAll_Tercile,menu,pageTitle,click_num,dbclick_num
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,SMCN1,0.92,55.2,22,42.5,3.0,-1.25,-0.75,-1.0,...,10,6,4,0.6,0.6,2,"[menu_2, menu_1]","[FOR-Q, CONV, SURVEY-END, LISTS-Q, IF_ELSE, SU...",22,0
1,0fEI3C7ZVbHNPQ3hgqWpnF97S5Crv15NcK8WakL64pyRIO...,SAA,0.73,43.8,26,77.5,3.0,2.0,1.5,1.75,...,10,7,3,0.7,0.7,2,"[menu_4, menu_1]","[FOR-Q, CONV, SURVEY-END, LISTS-Q, IF_ELSE, SU...",260,0
2,0yvCRzdOzkoVYbnkbkS1oIVWv8auRHmPUJ7ck1uYsO9nCd...,ECO,1.33,79.8,22,60.0,3.2,1.0,1.5,1.25,...,10,7,3,0.7,0.7,2,"[menu_4, menu_1]","[FOR-Q, CONV, SURVEY-END, LISTS-Q, IF_ELSE, SU...",22,22
3,1JRSaFuQOcATjahn7vxyD1JzbiRCR14OLT0Jd2GVOgGT7T...,SMCN1,0.94,56.4,22,30.0,2.0,-1.25,-2.0,-1.625,...,10,8,2,0.8,0.8,3,"[menu_4, menu_1]","[FOR-Q, CONV, SURVEY-END, LISTS-Q, IF_ELSE, SU...",0,0
4,1JolpU4UO7I6WIHBXiGghjI7OEentU5EtlsRXzZoxZUDz5...,SMTO1,0.86,51.6,22,70.0,3.4,1.0,-1.0,0.0,...,10,10,0,1.0,1.0,3,"[menu_2, menu_1]","[FOR-Q, CONV, SURVEY-END, LISTS-Q, IF_ELSE, SU...",0,0


## Coefficient of Variation (CV)

In [549]:
# List of pages (activities)
quiz_list = ['INTRO-Q','PROG-Q','VARS-Q','TYPES-Q','CONV-Q','IF_ELSE-Q','FOR-Q','LISTS-Q','DICTS-Q','FUNCT-Q']

In [550]:
df_log_quiz = df_log[df_log['Activity'].isin(quiz_list)]

In [551]:
grouped_quiz_cv = df_log_quiz.groupby('Case ID')['A_Time_s'].agg(
    Mean='mean',
    Median='median',
    STD='std'
).reset_index()

grouped_quiz_cv['CV'] = grouped_quiz_cv.apply(
    lambda row: row['STD'] / row['Mean'] if row['Mean'] != 0 else None,
    axis=1
)
df_grouped_quiz_cv = grouped_quiz_cv.round(3)


In [552]:
df_grouped_quiz_cv

Unnamed: 0,Case ID,Mean,Median,STD,CV
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,50.300,39.0,35.522,0.706
1,0fEI3C7ZVbHNPQ3hgqWpnF97S5Crv15NcK8WakL64pyRIO...,73.900,61.0,40.184,0.544
2,0yvCRzdOzkoVYbnkbkS1oIVWv8auRHmPUJ7ck1uYsO9nCd...,70.800,37.5,74.602,1.054
3,1JRSaFuQOcATjahn7vxyD1JzbiRCR14OLT0Jd2GVOgGT7T...,35.800,27.5,28.955,0.809
4,1JolpU4UO7I6WIHBXiGghjI7OEentU5EtlsRXzZoxZUDz5...,170.100,146.0,126.834,0.746
...,...,...,...,...,...
326,zG5WbHqUjgo7FRvlJ9i7bqU49njZ01sswvAHpgGLL2D2hw...,202.800,182.0,178.675,0.881
327,zIDdTe8pSq83Df4PvzzJWbvW02BLtB9i0hLLSlhwUT6yE5...,71.357,65.0,60.676,0.850
328,zMOI7bbGN5daa6MQgW6MW2QBOUUmPWM4ZYXY6HVinRtZSu...,96.700,93.0,56.682,0.586
329,zO8034BWt9Eaw7tI6TN64EqUxHFEyG7RaYFubdGfCJ0ptK...,8.500,7.0,4.353,0.512


## Cyclomatic Complexity measure

In [553]:
# Conversion to XES and computing Cyclomatic Complexity
df_log = pm4py.format_dataframe(df_log, case_id=id_column, activity_key=activity_column, timestamp_key=timestamp_column)
print("> Saving the event log to XES")
file_xes = f"{Path(log_file).stem}.xes"
path_xes = Path(log_dir) / file_xes
print("Saving XES file to:", path_xes)
pm4py.write_xes(df_log, path_xes, case_id_key='case:concept:name')
path_xes = Path(log_dir) / file_xes
# Load the XES
# xes_log = pm4py.read_xes(path_xes.as_posix())
xes_log = xes_importer.apply(str(path_xes))
print("xes_log type:", type(xes_log))
list_cc_res = []
for trace in xes_log:
    case_id = trace.attributes.get("concept:name", "Unknown_Case_ID")
    # single_case_log = pm4py.objects.log.log.EventLog([trace])
    
    # Create an event log with the single trace
    single_case_log = EventLog()  
    single_case_log.append(trace)  

    cc = calculate_cyclomatic_complexity(single_case_log)
    print(f"Cyclomatyc complexity fir case {case_id}: {cc}")

    dic_res = {"Case ID":case_id, "CC":cc}
    list_cc_res.append(dic_res)

> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr.xes


exporting log, completed traces :: 100%|██████████| 332/332 [00:01<00:00, 278.27it/s]
parsing log, completed traces :: 100%|██████████| 332/332 [00:01<00:00, 195.66it/s]


xes_log type: <class 'pm4py.objects.log.obj.EventLog'>
Cyclomatyc complexity fir case 0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha07qlYyVXp05yBRq3T1LB7TN132xzXdUOFNs99MTOLk92ht7zXuB6fYCXOfiLk73iPu5cEDH6YCqcKwmudVczqeigWENnQfkcOAiBMm3r: 1
Cyclomatyc complexity fir case 0fEI3C7ZVbHNPQ3hgqWpnF97S5Crv15NcK8WakL64pyRIO5i1kfJW3cGpTQeUTRvcnmIk6LKrk8dBRkxh6hVWbvgIji6aXIum7tAJzOsqMzbKy8Ibf61SBHh8BIDHPT5JIugf0MXIxEAFJ1DkZXvCY: 5
Cyclomatyc complexity fir case 0yvCRzdOzkoVYbnkbkS1oIVWv8auRHmPUJ7ck1uYsO9nCdY8Meo7XKFtGUe3XmzyrQnOxBkTwIptgXceu1qxLKkNuGqvQMiBnZSAgZItRxCWnN9qv4L5uHCcC9elRDq1zZnJMxBiHe3L4J0vXIUqN0: 1
Cyclomatyc complexity fir case 1JRSaFuQOcATjahn7vxyD1JzbiRCR14OLT0Jd2GVOgGT7TvuJLHpjbIE1DYZwTSDnuapDBUIrPktYfbIhBxlaDJkEPO0WQVoBOOiuuMZcVMlJDqraG7fe5yzKBx5Sf4DA0UL0ILBOaAVQObfrU7rgG: 1
Cyclomatyc complexity fir case 1JolpU4UO7I6WIHBXiGghjI7OEentU5EtlsRXzZoxZUDz5drXj8ecfz4S0wIJeNGhWs0vVHBEG8jOzKaWwtwfCwrsgZeS8xD0UmMKsZ7wYJBHyN1GKMFHGOWae1NW48AZpAoiB8ZXT3nEiOoRA5x7f: 1
Cyclomatyc complexit

In [554]:
df_cc_results = pd.DataFrame(list_cc_res)
df_cc_results

Unnamed: 0,Case ID,CC
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,1
1,0fEI3C7ZVbHNPQ3hgqWpnF97S5Crv15NcK8WakL64pyRIO...,5
2,0yvCRzdOzkoVYbnkbkS1oIVWv8auRHmPUJ7ck1uYsO9nCd...,1
3,1JRSaFuQOcATjahn7vxyD1JzbiRCR14OLT0Jd2GVOgGT7T...,1
4,1JolpU4UO7I6WIHBXiGghjI7OEentU5EtlsRXzZoxZUDz5...,1
...,...,...
327,zG5WbHqUjgo7FRvlJ9i7bqU49njZ01sswvAHpgGLL2D2hw...,1
328,zIDdTe8pSq83Df4PvzzJWbvW02BLtB9i0hLLSlhwUT6yE5...,5
329,zMOI7bbGN5daa6MQgW6MW2QBOUUmPWM4ZYXY6HVinRtZSu...,1
330,zO8034BWt9Eaw7tI6TN64EqUxHFEyG7RaYFubdGfCJ0ptK...,2


In [555]:
df_cc_results["CC"].mean()

2.9759036144578315

## Merge the results

In [556]:
merged_df_1 = pd.merge(df_log_attributes, df_grouped_quiz_cv, on=id_column, how='outer')
merged_df_2 = pd.merge(merged_df_1, df_cc_results, on=id_column, how='outer')
merged_df_2.shape

(332, 28)

### Analyse the results by tercile

In [571]:
# Split by SUS_Tercile, Apprendimento percepito_Tercile, UEQ - Overall_Tercile
ux_list = ['SUS_Tercile', 'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile']
ter_list = [1, 3]
list_res = []
for ux_name in ux_list:
    print("UX:", ux_name)
    for ter_value in ter_list:
        print("Tercile:", ter_value)
        df_stats_temp = merged_df_2[merged_df_2[ux_name] == ter_value] # stats dataframe
        df_log_temp = df_log[df_log[ux_name] == ter_value] # event log dataframe
        cc = event_log_csv_to_xes_and_cc(df_log_temp, activity_column, timestamp_column)
        dic_r = {'UX': ux_name, 'Tercile': ter_value, 'Cases': len(df_stats_temp), 
                'TotalTimeMM_mean': df_stats_temp["TotalTimeMM"].mean().round(3),
                'CC_mean': cc, 'CV_mean':  df_stats_temp["CV"].mean().round(3),
                'QUIZ_mean':  df_stats_temp["QuizAnswerCorrectRatioOverAll"].mean().round(3),
                'CLICK_mean': df_stats_temp["click_num"].mean().round(3), 'DBCLICK_mean': df_stats_temp["dbclick_num"].mean().round(3)
                }
        list_res.append(dic_r)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[case_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_TIMESTAMP_KEY] = df[timestamp_key]
A value is trying to be set on a copy of a sl

UX: SUS_Tercile
Tercile: 1
> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr_temp.xes


exporting log, completed traces :: 100%|██████████| 112/112 [00:00<00:00, 291.51it/s]
parsing log, completed traces :: 100%|██████████| 112/112 [00:00<00:00, 200.40it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[case_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Cyclomatyc complexity for this event log: 105
Tercile: 3
> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr_temp.xes


exporting log, completed traces :: 100%|██████████| 106/106 [00:00<00:00, 295.49it/s]
parsing log, completed traces :: 100%|██████████| 106/106 [00:00<00:00, 200.65it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[case_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Cyclomatyc complexity for this event log: 99
UX: Apprendimento percepito_Tercile
Tercile: 1
> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr_temp.xes


exporting log, completed traces :: 100%|██████████| 157/157 [00:00<00:00, 294.66it/s]
parsing log, completed traces :: 100%|██████████| 157/157 [00:00<00:00, 184.59it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[case_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Cyclomatyc complexity for this event log: 118
Tercile: 3
> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr_temp.xes


exporting log, completed traces :: 100%|██████████| 106/106 [00:00<00:00, 292.35it/s]
parsing log, completed traces :: 100%|██████████| 106/106 [00:00<00:00, 177.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[case_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Cyclomatyc complexity for this event log: 90
UX: UEQ - Overall_Tercile
Tercile: 1
> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr_temp.xes


exporting log, completed traces :: 100%|██████████| 117/117 [00:00<00:00, 292.23it/s]
parsing log, completed traces :: 100%|██████████| 117/117 [00:00<00:00, 194.20it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[constants.CASE_CONCEPT_NAME] = df[case_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[xes_constants.DEFAULT_NAME_KEY] = df[activity_key]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Cyclomatyc complexity for this event log: 108
Tercile: 3
> Saving the event log to XES
Saving XES file to: data_log/edu_event_log_LEVEL_raw_filtered_DISCO_ter_enr_temp.xes


exporting log, completed traces :: 100%|██████████| 93/93 [00:00<00:00, 227.26it/s]
parsing log, completed traces :: 100%|██████████| 93/93 [00:00<00:00, 198.42it/s]

Cyclomatyc complexity for this event log: 82





In [None]:
df_res_ux = pd.DataFrame(list_res)
df_res_ux = df_res_ux.sort_values(by=["UX", "Tercile"])
df_res_ux

Unnamed: 0,UX,Tercile,Cases,TotalTimeMM_mean,CC_mean,CV_mean,QUIZ_mean,CLICK_mean,DBCLICK_mean
0,SUS_Tercile,1,112,54.889,105,0.927,0.657,86.705,8.652
1,SUS_Tercile,3,106,57.091,99,0.901,0.719,113.17,11.434
2,Apprendimento percepito_Tercile,1,157,54.592,118,0.915,0.679,86.433,12.236
3,Apprendimento percepito_Tercile,3,106,57.623,90,0.877,0.694,99.642,14.613
4,UEQ - Overall_Tercile,1,117,55.851,108,0.902,0.683,90.838,13.615
5,UEQ - Overall_Tercile,3,93,55.819,82,0.906,0.719,105.914,7.462


In [573]:
# Saving event log attributes
print(">> Saving event log stats")
file_name = Path(log_file_name).stem
path_out = Path(stats_dir) / f"_{file_name}_stats_UX.csv"
print("Saving event log attributes to:", path_out)
df_res_ux.to_csv(path_out, sep=csv_sep, index=False)

>> Saving event log stats
Saving event log attributes to: stats/_edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_stats_UX.csv


In [570]:
# Saving event log attributes
print(">> Saving event log attributes")
file_name = Path(log_file_name).stem
path_out = Path(stats_dir) / f"_{file_name}_stats_ALL.csv"
print("Saving event log attributes to:", path_out)
merged_df_2.to_csv(path_out, sep=csv_sep, index=False)

>> Saving event log attributes
Saving event log attributes to: stats/_edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_stats_ALL.csv
