In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from matplotlib import cm
from scipy.stats import wilcoxon

from constants import diffMappingToScore, questions, labelsToElements

In [2]:
#####################
# This notebook computes the following measures
#
#  - Fixations at different thresholds
#  - Scan-path precision
#  - Average revisits to relevant elements
#  - Average fixation duration
#
#######################

In [3]:
#load data
# The CSV location can be overridden with the EVENTS_DATA_CSV environment variable so the
# notebook is not tied to one machine; without it, the original path is used unchanged.
DATA_CSV_PATH = os.environ.get(
    "EVENTS_DATA_CSV",
    "/Users/amineabbad-andaloussi/Desktop/Postdoc 2022/modularization/Cle/analysis/out/eventsDataWithAois.csv",
)
data = pd.read_csv(DATA_CSV_PATH)

In [4]:
# Distinct participant identifiers present in the loaded event data.
data.loc[:, 'participant'].unique()

array(['KP1-no', 'KP10-no', 'KP11-no', 'KP12-no', 'KP13-no', 'KP14-no',
       'KP15-no', 'KP16-no', 'KP17-no', 'KP18-no', 'KP19-no', 'KP2-no',
       'KP20-no', 'KP21-no', 'KP22-no', 'KP23-no', 'KP24-no', 'KP3-no',
       'KP4-no', 'KP5-no', 'KP6-no', 'KP7-no', 'KP8-no', 'KP9-no',
       'SP1-no', 'SP10-no', 'SP11-no', 'SP12-no', 'SP13-no', 'SP14-no',
       'SP15-no', 'SP16-no', 'SP17-no', 'SP18-no', 'SP19-no', 'SP2-no',
       'SP20-no', 'SP3-no', 'SP4-no', 'SP5-no', 'SP6-no', 'SP7-no',
       'SP8-no', 'SP9-no'], dtype=object)

In [5]:
#enrich questions with relevant elements
# A quoted label that starts with a bracketed qualifier ("[...]") is appended to the
# label quoted just before it, so that both form a single relevant-element label.
_BRACKET_QUALIFIER = re.compile(r"\[(.+?)\]")


def _relevantLabels(questionText):
    """Return the quoted labels of a question text, folding bracketed qualifiers into the preceding label.

    The result is built as a new list instead of editing the list being iterated,
    so no label is skipped after a merge and a leading qualifier (with no label
    before it) is kept as-is rather than being merged into the last label.
    """
    merged = []
    for label in re.findall('"(.+?)"', questionText):
        if merged and _BRACKET_QUALIFIER.match(label):
            merged[-1] = f'{merged[-1]} {label}'
        else:
            merged.append(label)
    return merged


questions = [ {**question, 'Relevant elements labels': _relevantLabels(question["question"])} for question in questions ]

#number of relevant elements per question
questions = [ {**question, 'Relevant elements count': len(question["Relevant elements labels"])} for question in questions ]

In [6]:
#get activities labels
# Translate every relevant label into its model element name via labelsToElements.
questions = [
    {
        **question,
        'Relevant elements names': [
            labelsToElements[label] for label in question["Relevant elements labels"]
        ],
    }
    for question in questions
]

In [7]:
#preview questions
# Each entry now also carries 'Relevant elements labels', 'Relevant elements count'
# and 'Relevant elements names', added by the enrichment cells above.
questions

[{'id': 7,
  'question': ' The activity "Enter container information from documents" needs to be executed before the activity "Create new erp system entry"?',
  'type': 'multiple-choice',
  'options': "True;False;I don't know",
  'model-group': '1',
  'ExpectedAnswer': 'False',
  'Type1': 'Local',
  'Type2': 'Control-flow',
  'Type3': 'Ordering',
  'Relevant elements labels': ['Enter container information from documents',
   'Create new erp system entry'],
  'Relevant elements count': 2,
  'Relevant elements names': ['Activity_1gyhik7', 'Activity_0yjbdub']},
 {'id': 10,
  'question': ' The activity "Inform customs about temporary storage" can be executed in parallel with the activity "Inform shipping company about temporary storage"?',
  'type': 'multiple-choice',
  'options': "True;False;I don't know",
  'model-group': '1',
  'ExpectedAnswer': 'False',
  'Type1': 'Local',
  'Type2': 'Control-flow',
  'Type3': 'Concurrency',
  'Relevant elements labels': ['Inform customs about temporar

In [8]:
#preview data
# Show the first five rows of the raw event data.
data.head(5)

Unnamed: 0,participant,FixID,Fixation X,Fixation Y,Fixation Start,Fixation End,Fixation Duration,Fixation Dispersion,SacID,Saccade Start,...,Saccade Amplitude,Saccade Peak Velocity,Saccade Peak Acceleration,Saccade Peak Deceleration,Saccade Direction,currentQuestion,tabName_element,Timestamp,tabName,element
0,KP1-no,,,,,,,,1.0,5307491.0,...,4.0922,168.9663,10177.6287,-10967.1418,93.4893,1.0,F1_Offload_Container.bpmn;nan,5307494.855,F1_Offload_Container.bpmn,
1,KP1-no,,,,,,,,2.0,5307616.0,...,0.7608,60.1521,4027.4507,-318.0821,122.1063,1.0,F1_Offload_Container.bpmn;title-for-questionID_1,5307619.849,F1_Offload_Container.bpmn,title-for-questionID_1
2,KP1-no,,,,,,,,3.0,5307674.0,...,7.0458,228.8872,13968.0601,-12587.615,93.0802,1.0,F1_Offload_Container.bpmn;Process_1jt1th5,5307686.524,F1_Offload_Container.bpmn,Process_1jt1th5
3,KP1-no,,,,,,,,4.0,5307741.0,...,0.7417,56.6251,4725.8834,-2591.7757,23.9625,1.0,F1_Offload_Container.bpmn;Process_1jt1th5,5307744.836,F1_Offload_Container.bpmn,Process_1jt1th5
4,KP1-no,1.0,988.9248,375.77,5307757.0,5307874.0,116.695,0.2819,,,...,,,,,,1.0,F1_Offload_Container.bpmn;Process_1jt1th5,5307761.489,F1_Offload_Container.bpmn,Process_1jt1th5


In [9]:
def addQuestionInfo(allData,questions):
    """Attach the question metadata to every row of ``allData``.

    Parameters
    ----------
    allData : pandas.DataFrame
        Event data with a ``currentQuestion`` column. Callers are expected to have
        dropped rows with a missing ``currentQuestion``, because the column is cast
        to integer.
    questions : list of dict
        Question records; each must contain an ``id`` key matching ``currentQuestion``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``allData`` plus the columns of
        ``pd.DataFrame(questions)``, joined on the question id (pandas' default
        inner join). ``allData`` itself is left unmodified.
    """
    #change the type of questionID to integer (on a new frame, so the caller's data is not altered)
    typedData = allData.assign(currentQuestion=allData['currentQuestion'].astype('int'))

    #extend the columns of the data with those in DataFrame(questions) based on the common question ID
    return typedData.merge(pd.DataFrame(questions), left_on=['currentQuestion'], right_on=['id'])

In [10]:
def fixationThresholdAnalysis(allData,grouper,threshold_min,threshold_max ):
    """Count fixations whose duration lies in ``[threshold_min, threshold_max)``.

    The count is reported per participant, question and the extra ``grouper``
    columns, in a column named ``Fixations_in_range``.
    """
    durations = allData['Fixation Duration']
    # Lower bound is inclusive, upper bound is exclusive.
    withinRange = (durations >= threshold_min) & (durations < threshold_max)

    groupKeys = ['participant', 'currentQuestion'] + grouper
    groupedFixations = allData.loc[withinRange].groupby(groupKeys, as_index=False)

    return groupedFixations.agg(Fixations_in_range=('Fixation Duration', 'count'))

def dwellRegressionOnRelevantElements(dwells,grouper):
    """Count visits and revisits to the relevant elements of each question.

    Only dwells whose ``element_`` is listed in the row's
    ``Relevant elements names`` are kept. They are counted per participant,
    question, tab, element and the extra ``grouper`` columns (``visits``);
    ``revisits`` is the number of visits minus the first one.
    """
    def _onRelevantElement(row):
        return row["element_"] in row["Relevant elements names"]

    relevantDwells = dwells[dwells.apply(_onRelevantElement, axis=1)]

    groupKeys = ['participant', 'currentQuestion', 'tabName', 'element_'] + grouper
    visitCounts = relevantDwells.groupby(groupKeys, as_index=False).agg(visits=('id', 'count'))

    # The first visit is not a return, hence the -1.
    return visitCounts.assign(revisits=visitCounts["visits"] - 1)



def scanPathPrecision(allData,grouper):
    """Compute the share of fixations that land on relevant elements.

    Parameters
    ----------
    allData : pandas.DataFrame
        Fixation rows with an ``element`` column and a ``Relevant elements names``
        column holding, per row, the collection of relevant element names.
    grouper : list of str
        Extra columns to group by, in addition to participant and question.

    Returns
    -------
    pandas.DataFrame
        One row per participant/question/``grouper`` combination with the column
        ``scan_path_precision`` (mean of the 0/1 relevance flag).

    The relevance flag is computed on a new frame: ``allData`` is not modified,
    which also avoids pandas' SettingWithCopyWarning when a slice is passed in.
    """
    #label fixations on relevant Elements (1 = relevant, 0 = not relevant)
    relevanceFlags = [
        1 if element in relevantNames else 0
        for element, relevantNames in zip(allData["element"], allData["Relevant elements names"])
    ]
    labelled = allData.assign(relevant=relevanceFlags)

    groupKeys = ['participant', 'currentQuestion'] + grouper
    return labelled.groupby(groupKeys, as_index=False).agg(scan_path_precision=('relevant', 'mean'))


def averageFixationDuration(fixationData,grouper):
    """Return the mean ``Fixation Duration`` per participant, question and ``grouper`` columns.

    The mean is stored in a column named ``Average_Fixation_Duration``.
    """
    groupKeys = ['participant', 'currentQuestion'] + grouper
    groupedFixations = fixationData.groupby(groupKeys, as_index=False)
    return groupedFixations.agg(Average_Fixation_Duration=('Fixation Duration', 'mean'))


In [11]:
#######################
#
# Fixation threshold analysis
# (means are calculated for each participant/task e.g., mean fixation count)
# adapted from "Business process and rule integration approaches—An empirical analysis of model understanding"
#
#######################

In [12]:
# Fixation-duration intervals to analyse; fixationThresholdAnalysis treats each
# one as half-open, i.e. min <= duration < max.
thresholds = [dict(min=0, max=250)]

In [13]:
#measure specific pre-processing

In [14]:
#drop na
# Keep only fixation rows (FixID present) recorded while a question was active.
isFixation = data['FixID'].notna()
hasQuestion = data['currentQuestion'].notna()
fixationData = data.loc[isFixation & hasQuestion].copy(deep=True)

#add question info
fixationData = addQuestionInfo(allData=fixationData, questions=questions)

In [15]:
"""Q13 (local) and Q25 (global) need to be removed for SP11"""
# The exclusion is expressed through the question type: all 'Exclusiveness' questions of SP11-no are dropped.
isExcludedRow = (fixationData['participant'] == 'SP11-no') & (fixationData['Type3'] == 'Exclusiveness')
fixationData_full = fixationData.drop(fixationData.index[isExcludedRow])

#Select control-flow question type
isControlFlow = fixationData_full['Type2'] == 'Control-flow'
fixationData = fixationData_full.loc[isControlFlow]

In [16]:
#Example query: count of fixations with a duration in [0,250) for each participant and question
fixationThresholdAnalysis(fixationData,['Type1'],threshold_min=0,threshold_max=250)

Unnamed: 0,participant,currentQuestion,Type1,Fixations_in_range
0,KP1-no,7,Local,190
1,KP1-no,10,Local,138
2,KP1-no,13,Local,249
3,KP1-no,16,Local,47
4,KP1-no,19,Global,47
...,...,...,...,...
345,SP9-no,16,Local,103
346,SP9-no,19,Global,265
347,SP9-no,22,Global,195
348,SP9-no,25,Global,798


In [17]:
#Descriptive

In [18]:
for threshold in thresholds:
    threshold_min, threshold_max = threshold["min"], threshold["max"]
    print(f'Threshold_min: {threshold_min}, Threshold_max{threshold_max}')

    # One fixation count per participant and task (question).
    perTaskParticipantFxAnalysis = fixationThresholdAnalysis(
        fixationData,
        ['Type1','Type2','Type3'],
        threshold_min=threshold_min,
        threshold_max=threshold_max,
    )

    # Average the per-task counts within each participant and question scope.
    fixPerPart = (
        perTaskParticipantFxAnalysis
        .groupby(['participant','Type1','Type2'], as_index=False)
        .agg({'Fixations_in_range':'mean'})
    )

    # Descriptives: mean of the participant means per question scope (Type1).
    display(fixPerPart.groupby(['Type1']).agg({'Fixations_in_range':'mean'}))

Threshold_min: 0, Threshold_max250


Unnamed: 0_level_0,Fixations_in_range
Type1,Unnamed: 1_level_1
Global,298.882576
Local,142.153409


In [19]:
#Inferential

In [20]:
measures = ['Fixations_in_range']
flows = ['Control-flow']

for threshold in thresholds:
    threshold_min, threshold_max = threshold["min"], threshold["max"]
    print()
    print(f'------Threshold_min: {threshold_min}, Threshold_max{threshold_max}')

    # One fixation count per participant and task (question).
    perTaskParticipantFxAnalysis = fixationThresholdAnalysis(
        fixationData,
        ['Type1','Type2','Type3'],
        threshold_min=threshold_min,
        threshold_max=threshold_max,
    )

    for flow in flows:
        print(f'---{flow}')
        for measure in measures:
            print(f'--{measure}')

            # Mean of the measure per participant and question scope.
            fixHigh_new_part = (
                perTaskParticipantFxAnalysis
                .groupby(['participant','Type1','Type2'], as_index=False)
                .agg({measure:'mean'})
            )

            isFlow = fixHigh_new_part["Type2"] == flow
            isGlobal = fixHigh_new_part["Type1"] == 'Global'
            isLocal = fixHigh_new_part["Type1"] == 'Local'
            measure_Global = fixHigh_new_part.loc[isGlobal & isFlow, ['participant', measure]]
            measure_Local = fixHigh_new_part.loc[isLocal & isFlow, ['participant', measure]]

            print(len(measure_Global), len(measure_Local))

            # Pair the global and local values of the same participant.
            measure_GL_merge = measure_Global.merge(
                measure_Local,
                on=['participant'],
                suffixes=('_global', '_local'),
                how='inner',
            )

            # Paired, one-sided Wilcoxon test of global against local values.
            print(wilcoxon(
                measure_GL_merge[f'{measure}_global'],
                measure_GL_merge[f'{measure}_local'],
                alternative='greater',
            ))


------Threshold_min: 0, Threshold_max250
---Control-flow
--Fixations_in_range
44 44
WilcoxonResult(statistic=981.0, pvalue=1.8758328224066645e-12)


In [21]:
#######################
#
# ScanPath Precision (Petrusel and Mendling)
# the percentage [or ratio] of fixations on relevant elements in relation to all fixations. 
# adapted from "How visual cognition influences process model comprehension"
#
#######################

In [22]:
#measure specific pre-processing

In [23]:
#drop na
# Keep only fixation rows (FixID present) recorded while a question was active.
isFixation = data['FixID'].notna()
hasQuestion = data['currentQuestion'].notna()
scanPathPrecisionData = data.loc[isFixation & hasQuestion].copy(deep=True)

#add question info
scanPathPrecisionData = addQuestionInfo(allData=scanPathPrecisionData, questions=questions)

In [24]:
"""Q13 (local) and Q25 (global) need to be removed for SP11"""
# The exclusion is expressed through the question type: all 'Exclusiveness' questions of SP11-no are dropped.
scanPathPrecisionData_full = scanPathPrecisionData.drop(scanPathPrecisionData[(scanPathPrecisionData['participant'] == 'SP11-no') & (scanPathPrecisionData['Type3'] == 'Exclusiveness')].index)

#Select control-flow question type
# .copy() makes the selection an independent frame: scanPathPrecision adds a column to the
# frame it receives, which otherwise triggers pandas' SettingWithCopyWarning on this slice.
scanPathPrecisionData = scanPathPrecisionData_full.loc[(scanPathPrecisionData_full['Type2'] == 'Control-flow')].copy()

In [25]:
#Example query: scanPathPrecision for each participant and question
# Type1 (Local/Global) is carried along as an extra grouping column.
scanPathPrecision(allData=scanPathPrecisionData, grouper=['Type1'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allData["relevant"] = allData.apply(lambda x: 1 if x["element"] in x["Relevant elements names"] else 0, axis=1)


Unnamed: 0,participant,currentQuestion,Type1,scan_path_precision
0,KP1-no,7,Local,0.018018
1,KP1-no,10,Local,0.052023
2,KP1-no,13,Local,0.066202
3,KP1-no,16,Local,0.057692
4,KP1-no,19,Global,0.072727
...,...,...,...,...
345,SP9-no,16,Local,0.087838
346,SP9-no,19,Global,0.074935
347,SP9-no,22,Global,0.211838
348,SP9-no,25,Global,0.100694


In [26]:
#required groupby to have one measure per participant, task
perTaskParticipantScanPathPrecision = scanPathPrecision(
    allData=scanPathPrecisionData,
    grouper=['Type1', 'Type2', 'Type3'],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  allData["relevant"] = allData.apply(lambda x: 1 if x["element"] in x["Relevant elements names"] else 0, axis=1)


In [27]:
#Descriptives

In [28]:
#calculate mean per participant
SRPart = (
    perTaskParticipantScanPathPrecision
    .groupby(['participant','Type1','Type2'], as_index=False)
    .agg({'scan_path_precision':'mean'})
)
#calculate descriptives
# Mean of the participant means per question scope (Type1).
SRPart.groupby(['Type1']).agg({'scan_path_precision':'mean'})

Unnamed: 0_level_0,scan_path_precision
Type1,Unnamed: 1_level_1
Global,0.077369
Local,0.110821


In [29]:
#Inferentials

In [30]:
measures = ['scan_path_precision']
flows = ['Control-flow']

for flow in flows:
    print(f'---{flow}')
    for measure in measures:
        print(f'--{measure}')

        # Mean of the measure per participant and question scope.
        fixHigh_new_part = (
            perTaskParticipantScanPathPrecision
            .groupby(['participant','Type1','Type2'], as_index=False)
            .agg({measure:'mean'})
        )

        isFlow = fixHigh_new_part["Type2"] == flow
        isGlobal = fixHigh_new_part["Type1"] == 'Global'
        isLocal = fixHigh_new_part["Type1"] == 'Local'
        measure_Global = fixHigh_new_part.loc[isGlobal & isFlow, ['participant', measure]]
        measure_Local = fixHigh_new_part.loc[isLocal & isFlow, ['participant', measure]]

        print(len(measure_Global), len(measure_Local))

        # Pair the global and local values of the same participant.
        measure_GL_merge = measure_Global.merge(
            measure_Local,
            on=['participant'],
            suffixes=('_global', '_local'),
            how='inner',
        )

        # Paired, one-sided Wilcoxon test of global against local values.
        print(wilcoxon(
            measure_GL_merge[f'{measure}_global'],
            measure_GL_merge[f'{measure}_local'],
            alternative='less',
        ))

---Control-flow
--scan_path_precision
44 44
WilcoxonResult(statistic=82.0, pvalue=5.481501830217894e-08)


In [31]:
#######################
#
# (Participant) Average returns to relevant regions
#
#######################

In [32]:
#measure specific pre-processing

In [33]:
#drop na to keep fixation data only
isFixation = data['FixID'].notna()
dwellRegData = data.loc[isFixation].copy(deep=True)

In [34]:
#find dwells
# 'element_' duplicates 'element' so it can be used as a grouping column next to the run identifier.
dwellRegData["element_"] = dwellRegData["element"]
# A new run starts on every row whose element differs from the element of the previous row;
# the cumulative sum of these starts numbers the runs.
startsNewRun = dwellRegData['element'].shift() != dwellRegData['element']
runId = startsNewRun.cumsum()
dwells = dwellRegData.groupby(
    [runId, 'element_', 'participant', 'currentQuestion', 'tabName'],
    as_index=False,
).agg(fixations=('FixID', 'count'))
#remove dwells with empty current question
hasQuestion = dwells['currentQuestion'].notna()
dwells = dwells.loc[hasQuestion].copy(deep=True)

In [35]:
#add question info
dwells = addQuestionInfo(allData=dwells, questions=questions)

In [36]:
"""Q13 (local) and Q25 (global) need to be removed for SP11"""
# The exclusion is expressed through the question type: all 'Exclusiveness' questions of SP11-no are dropped.
isExcludedRow = (dwells['participant'] == 'SP11-no') & (dwells['Type3'] == 'Exclusiveness')
dwells = dwells.drop(dwells.index[isExcludedRow])

#Select control-flow question type
isControlFlow = dwells['Type2'] == 'Control-flow'
dwells = dwells.loc[isControlFlow]

In [37]:
#Example query: get the number of revisits for each participant, question, tabName, element
# Type1 (Local/Global) is carried along as an extra grouping column.
dwellRegressionOnRelevantElements(dwells=dwells, grouper=['Type1'])

Unnamed: 0,participant,currentQuestion,tabName,element_,Type1,visits,revisits
0,KP1-no,7,F5_Manage_File_in_ERP.bpmn,Activity_0yjbdub,Local,2,1
1,KP1-no,10,F3_Store_Container_Temporarily.bpmn,Activity_043rk4n,Local,5,4
2,KP1-no,10,F3_Store_Container_Temporarily.bpmn,Activity_06wwcqs,Local,2,1
3,KP1-no,13,F4_Load_Container_Onward_Carriage.bpmn,Activity_0v934ow,Local,4,3
4,KP1-no,13,F4_Load_Container_Onward_Carriage.bpmn,Activity_1fk1rvd,Local,5,4
...,...,...,...,...,...,...,...
666,SP9-no,22,F1_Offload_Container.bpmn,Activity_1gumzzz,Global,28,27
667,SP9-no,22,F6_Analyse_Shake_Event.bpmn,Activity_0p2t8c9,Global,6,5
668,SP9-no,25,F3_Store_Container_Temporarily.bpmn,Activity_00lcg2a,Global,37,36
669,SP9-no,25,F4_Load_Container_Onward_Carriage.bpmn,Activity_0quien2,Global,46,45


In [38]:
#required groupby to have one measure per participant, task
# Revisits are first counted per relevant element, then averaged over the elements of each task.
revisitsPerElement = dwellRegressionOnRelevantElements(dwells, ['Type1','Type2','Type3'])
perTaskParticipantDwellRegressionOnRelevantElements = (
    revisitsPerElement
    .groupby(['participant','currentQuestion','Type1','Type2','Type3'], as_index=False)
    .agg({'revisits':'mean'})
)

In [39]:
#Descriptive

In [40]:
#calculate mean per participant
RVPart = (
    perTaskParticipantDwellRegressionOnRelevantElements
    .groupby(['participant','Type1','Type2'], as_index=False)
    .agg({'revisits':'mean'})
)
#calculate descriptives
# Mean of the participant means per question scope (Type1).
RVPart.groupby(['Type1']).agg({'revisits':'mean'})

Unnamed: 0_level_0,revisits
Type1,Unnamed: 1_level_1
Global,7.844697
Local,4.107955


In [41]:
#Inferential

In [42]:
measures = ['revisits']
flows = ['Control-flow']


for flow in flows:
    print(f'---{flow}')
    for measure in measures:
        print(f'--{measure}')

        # Mean of the measure per participant and question scope.
        fixHigh_new_part = (
            perTaskParticipantDwellRegressionOnRelevantElements
            .groupby(['participant','Type1','Type2'], as_index=False)
            .agg({measure:'mean'})
        )

        isFlow = fixHigh_new_part["Type2"] == flow
        isGlobal = fixHigh_new_part["Type1"] == 'Global'
        isLocal = fixHigh_new_part["Type1"] == 'Local'
        measure_Global = fixHigh_new_part.loc[isGlobal & isFlow, ['participant', measure]]
        measure_Local = fixHigh_new_part.loc[isLocal & isFlow, ['participant', measure]]

        print(len(measure_Global), len(measure_Local))

        # Pair the global and local values of the same participant.
        measure_GL_merge = measure_Global.merge(
            measure_Local,
            on=['participant'],
            suffixes=('_global', '_local'),
            how='inner',
        )

        # Paired, one-sided Wilcoxon test of global against local values.
        print(wilcoxon(
            measure_GL_merge[f'{measure}_global'],
            measure_GL_merge[f'{measure}_local'],
            alternative='greater',
        ))

---Control-flow
--revisits
44 44
WilcoxonResult(statistic=923.5, pvalue=2.6549372839935103e-08)




In [43]:
#######################
#
# Average fixation duration
# (means are calculated for each participant/task)
#
#######################

In [44]:
#######################
#
# #Average fixation Duration
#
#######################

In [45]:
#drop na
# Keep only fixation rows (FixID present) recorded while a question was active.
isFixation = data['FixID'].notna()
hasQuestion = data['currentQuestion'].notna()
fixationData = data.loc[isFixation & hasQuestion].copy(deep=True)

#add question info
fixationData = addQuestionInfo(allData=fixationData, questions=questions)

#Select control-flow question type
isControlFlow = fixationData['Type2'] == 'Control-flow'
fixationData = fixationData.loc[isControlFlow]

In [46]:
"""Q13 (local) and Q25 (global) need to be removed for SP11"""
# The exclusion is expressed through the question type: all 'Exclusiveness' questions of SP11-no are dropped.
isExcludedRow = (fixationData['participant'] == 'SP11-no') & (fixationData['Type3'] == 'Exclusiveness')
fixationData = fixationData.drop(fixationData.index[isExcludedRow])

In [47]:
#Example query: get the average fixation duration for each task and participant
# Type1 (Local/Global) is carried along as an extra grouping column.
avFDPT = averageFixationDuration(fixationData=fixationData, grouper=['Type1'])
avFDPT

Unnamed: 0,participant,currentQuestion,Type1,Average_Fixation_Duration
0,KP1-no,7,Local,161.955273
1,KP1-no,10,Local,165.348968
2,KP1-no,13,Local,161.568267
3,KP1-no,16,Local,162.155827
4,KP1-no,19,Global,163.008736
...,...,...,...,...
345,SP9-no,16,Local,230.043824
346,SP9-no,19,Global,228.205987
347,SP9-no,22,Global,259.134956
348,SP9-no,25,Global,245.842304


In [48]:
#Descriptive

In [49]:
# One average fixation duration per participant and task (question).
avFDPT = averageFixationDuration(fixationData=fixationData, grouper=['Type1','Type2'])
#calculate mean per participant
AVPart = (
    avFDPT
    .groupby(['participant','Type1','Type2'], as_index=False)
    .agg({'Average_Fixation_Duration':'mean'})
)
#calculate descriptives
# Mean of the participant means per question scope (Type1).
AVPart.groupby(['Type1']).agg({'Average_Fixation_Duration':'mean'})

Unnamed: 0_level_0,Average_Fixation_Duration
Type1,Unnamed: 1_level_1
Global,200.96631
Local,194.322859


In [50]:
#Inferential

In [51]:
measures = ['Average_Fixation_Duration']
flows = ['Control-flow']

# One average fixation duration per participant and task (question).
avFDPT = averageFixationDuration(fixationData=fixationData, grouper=['Type1','Type2'])



for flow in flows:
    print(f'---{flow}')
    for measure in measures:
        print(f'--{measure}')

        # Mean of the measure per participant and question scope.
        fix_new = (
            avFDPT
            .groupby(['participant','Type1','Type2'], as_index=False)
            .agg({measure:'mean'})
        )

        isFlow = fix_new["Type2"] == flow
        isGlobal = fix_new["Type1"] == 'Global'
        isLocal = fix_new["Type1"] == 'Local'
        measure_Global = fix_new.loc[isGlobal & isFlow, ['participant', measure]]
        measure_Local = fix_new.loc[isLocal & isFlow, ['participant', measure]]

        print(len(measure_Global), len(measure_Local))

        # Pair the global and local values of the same participant.
        measure_GL_merge = measure_Global.merge(
            measure_Local,
            on=['participant'],
            suffixes=('_global', '_local'),
            how='inner',
        )

        # Paired, one-sided Wilcoxon test of global against local values.
        print(wilcoxon(
            measure_GL_merge[f'{measure}_global'],
            measure_GL_merge[f'{measure}_local'],
            alternative='greater',
        ))

---Control-flow
--Average_Fixation_Duration
44 44
WilcoxonResult(statistic=801.0, pvalue=0.0001052383949513569)
