In [2]:
import os
import re
from functools import reduce

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import wilcoxon

from constants import diffMappingToScore, questions, labelsToElements
from utils import fixationProportionThresholdAnalysis, phaseDetection, dwellRegressionOnRelevantElements, periodCalculation, scanPathPrecision, averageFixationDuration, averageSaccadeAmplitudeForPhases, addQuestionInfo

In [3]:
#load data
# The CSV location defaults to the path used when the notebook was authored.
# Set the EVENTS_DATA_CSV environment variable to point at the file on another
# machine instead of editing this cell.
EVENTS_DATA_CSV = os.environ.get(
    "EVENTS_DATA_CSV",
    "/Users/amineabbad-andaloussi/Desktop/Postdoc 2022/modularization/Cle/analysis/out/eventsDataWithAois.csv",
)
data = pd.read_csv(EVENTS_DATA_CSV)

In [4]:
#enrich questions with relevant elements
# Labels are the double-quoted substrings of the question text. A label that
# starts with a bracketed part ("[...]") is merged into the label quoted just
# before it.
bracketLabel = re.compile(r"\[(.+?)\]")


def mergeBracketLabels(labels):
    """Return a new list in which each bracketed label is joined to its predecessor.

    Parameters
    ----------
    labels : list of str
        Quoted labels in the order they appear in the question text.

    Returns
    -------
    list of str
        The labels, where every entry whose start matches ``[...]`` has been
        appended to the previous entry as ``'<previous> <label>'``. A bracketed
        label with no predecessor is kept unchanged.
    """
    merged = []
    for label in labels:
        if merged and bracketLabel.match(label):
            merged[-1] = f'{merged[-1]} {label}'
        else:
            merged.append(label)
    return merged


# The merged list is built from scratch rather than by removing items from the
# list being iterated: removal during iteration skips the element that follows
# each removed one, and indexing with idx-1 wraps around for the first element.
questions = [
    {**question, 'Relevant elements labels': mergeBracketLabels(re.findall('"(.+?)"', question["question"]))}
    for question in questions
]

questions = [
    {**question, 'Relevant elements count': len(question["Relevant elements labels"])}
    for question in questions
]

In [5]:
#get activities labels
# Look every relevant-element label up in labelsToElements and store the
# results, in the same order, under 'Relevant elements names'.
questions = [
    {
        **question,
        'Relevant elements names': [
            labelsToElements[label] for label in question["Relevant elements labels"]
        ],
    }
    for question in questions
]

In [6]:
#################
#
# Phase detection
#
#################

In [7]:
#drop na
# keep only the rows that carry both a fixation id and a current question
hasFixation = data['FixID'].notna()
hasQuestion = data['currentQuestion'].notna()
fixationData = data.loc[hasFixation & hasQuestion].copy(deep=True)
#add question info
fixationData = addQuestionInfo(fixationData, questions)

"""Q13 (local) and Q25 (global) need to be removed for SP11 due to low data quality"""
# rows of participant 'SP11-no' whose Type3 is 'Exclusiveness' are dropped by index label
isExcluded = (fixationData['participant'] == 'SP11-no') & (fixationData['Type3'] == 'Exclusiveness')
fixationData = fixationData.drop(index=fixationData.loc[isExcluded].index)

In [8]:
#detect phases (phase 1: pre/post to the point when all relevant activities identified)
# phaseDetection receives the filtered fixations and the enriched questions
phDectFix = phaseDetection(
    fixationData,
    questions,
)

In [9]:
#add Timestamp_formatted column
# 'Fixation Start' is converted to datetimes with the values read as milliseconds (unit='ms')
fixationStart = phDectFix['Fixation Start']
phDectFix["timestamp_formatted"] = pd.to_datetime(fixationStart, unit='ms')

In [10]:
#######################
#
# Phase duration
#
#######################

In [11]:
"""
2 rows are removed for participant SP11. This concerns the questions local and global (control-flow) Exclusiveness, since the participant 
skipped the answer for the local Exclusiveness question by mistake. Hence the duration and accuracy were biased!
"""
# drop (by index label) every row of participant 'SP11-no' whose Type3 is 'Exclusiveness'
isSp11Exclusiveness = (phDectFix['participant'] == 'SP11-no') & (phDectFix['Type3'] == 'Exclusiveness')
resTimeData = phDectFix.drop(index=phDectFix.loc[isSp11Exclusiveness].index)
# report the (rows, columns) that remain
print(resTimeData.shape)

(174701, 37)


In [12]:
#Example of query: time interval taken to answer each question and phase by each participant
# the returned frame is shown as the cell output
periodsPreview = periodCalculation(resTimeData)
periodsPreview

Unnamed: 0,currentQuestion,participant,Type1,Type2,Type3,Phase,timeInterval
0,7,KP1-no,Local,Control-flow,Ordering,,75912.797
1,7,KP10-no,Local,Control-flow,Ordering,inference,15914.914
2,7,KP10-no,Local,Control-flow,Ordering,search,13181.871
3,7,KP11-no,Local,Control-flow,Ordering,inference,55194.007
4,7,KP11-no,Local,Control-flow,Ordering,search,29780.136
...,...,...,...,...,...,...,...
1028,46,SP6-no,Global,Data-flow,Data-based Routing,search,74863.285
1029,46,SP7-no,Global,Data-flow,Data-based Routing,inference,73650.420
1030,46,SP7-no,Global,Data-flow,Data-based Routing,search,51252.836
1031,46,SP8-no,Global,Data-flow,Data-based Routing,,80591.306


In [13]:
#filter out those with N/A
periods = periodCalculation(resTimeData)
# rows whose Phase is the literal string "N/A" are discarded
hasPhase = periods["Phase"].ne("N/A")
periods = periods.loc[hasPhase].copy(deep=True)
periods

Unnamed: 0,currentQuestion,participant,Type1,Type2,Type3,Phase,timeInterval
1,7,KP10-no,Local,Control-flow,Ordering,inference,15914.914
2,7,KP10-no,Local,Control-flow,Ordering,search,13181.871
3,7,KP11-no,Local,Control-flow,Ordering,inference,55194.007
4,7,KP11-no,Local,Control-flow,Ordering,search,29780.136
5,7,KP12-no,Local,Control-flow,Ordering,inference,24174.483
...,...,...,...,...,...,...,...
1026,46,SP5-no,Global,Data-flow,Data-based Routing,search,58585.158
1027,46,SP6-no,Global,Data-flow,Data-based Routing,inference,85947.712
1028,46,SP6-no,Global,Data-flow,Data-based Routing,search,74863.285
1029,46,SP7-no,Global,Data-flow,Data-based Routing,inference,73650.420


In [14]:
# keep only control-flow questions

In [15]:
# restrict the periods to the rows whose Type2 is 'Control-flow'
isControlFlow = periods['Type2'].eq('Control-flow')
periods = periods[isControlFlow]

In [16]:
#Descriptives (Local/Global)

In [41]:
#horizontal analysis
# mean timeInterval for every (Type1, Phase) combination
periods.groupby(['Type1','Phase'])[["timeInterval"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,timeInterval
Type1,Phase,Unnamed: 2_level_1
Global,inference,77837.355705
Global,search,45084.496859
Local,inference,24856.041461
Local,search,28074.683844


In [40]:
#vertical analysis
# mean timeInterval for every (Phase, Type1) combination
periods.groupby(['Phase','Type1'])[["timeInterval"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,timeInterval
Phase,Type1,Unnamed: 2_level_1
inference,Global,77837.355705
inference,Local,24856.041461
search,Global,45084.496859
search,Local,28074.683844


In [18]:
#Inferentials

In [19]:
#horizontal analysis
# Within each task type (Local/Global), compare the per-participant mean of the
# inference phase against the search phase with a paired Wilcoxon signed-rank test.
types = ['Local','Global']
measures = ['timeInterval']

#get one measure per participant/Type1/Phase
# This aggregation does not depend on the loop variables, so it is computed
# once here instead of once per loop iteration.
periods_part = periods.groupby(['participant','Type1','Phase'], as_index=False).agg({m: "mean" for m in measures})

for typeQ in types:
    print(f'----{typeQ}')
    periods_type = periods_part.loc[periods_part["Type1"] == typeQ]
    for measure in measures:
        print(f'--{measure}')

        measure_a = periods_type.loc[periods_type["Phase"] == 'inference', ['participant', measure]]
        measure_b = periods_type.loc[periods_type["Phase"] == 'search', ['participant', measure]]
        print(len(measure_a), len(measure_b))

        # pair the two phases per participant; the inner join drops participants
        # that lack one of the two phases
        measure_merge = measure_a.merge(measure_b, on=['participant'], suffixes=('_a', '_b'), how='inner')
        print(len(measure_merge))
        print(stats.wilcoxon(measure_merge[f'{measure}_a'], measure_merge[f'{measure}_b']))




----Local
--timeInterval
44 44
44
WilcoxonResult(statistic=338.0, pvalue=0.06747004004739665)
----Global
--timeInterval
44 44
44
WilcoxonResult(statistic=119.0, pvalue=2.4714264554859255e-06)


In [46]:
#vertical analysis
# Within each phase (search/inference), compare the per-participant mean of
# Local tasks against Global tasks with a paired Wilcoxon signed-rank test.
phases = ['search','inference']
measures = ['timeInterval']

#get one measure per participant/Type1/Phase
# This aggregation does not depend on the loop variables, so it is computed
# once here instead of once per loop iteration.
periods_part = periods.groupby(['participant','Type1','Phase'], as_index=False).agg({m: "mean" for m in measures})

for phaseQ in phases:
    print(f'----{phaseQ}')
    periods_phase = periods_part.loc[periods_part["Phase"] == phaseQ]
    for measure in measures:
        print(f'--{measure}')

        measure_a = periods_phase.loc[periods_phase["Type1"] == 'Local', ['participant', measure]]
        measure_b = periods_phase.loc[periods_phase["Type1"] == 'Global', ['participant', measure]]
        print(len(measure_a), len(measure_b))

        # pair the two task types per participant; the inner join drops
        # participants that lack one of the two task types
        measure_merge = measure_a.merge(measure_b, on=['participant'], suffixes=('_a', '_b'), how='inner')
        print(len(measure_merge))
        print(stats.wilcoxon(measure_merge[f'{measure}_a'], measure_merge[f'{measure}_b']))

----search
--timeInterval
44 44
44
WilcoxonResult(statistic=52.0, pvalue=4.570324563246686e-09)
----inference
--timeInterval
44 44
44
WilcoxonResult(statistic=1.0, pvalue=2.2737367544323206e-13)


In [20]:
#Interpretation
# The duration of the phases differs between local and global tasks
#    1. Local tasks (easy from BPM paper): the search phase is slightly longer than the inference phase (mean 28 sec vs 25 sec), but the difference is not significant at the 0.05 level (p = 0.067) -> Balanced search and inference
#    2. Global tasks (difficult from BPM paper): the inference phase is significantly longer than the search phase (mean 78 sec vs 45 sec, p < 0.001) -> More need for inference than search

# In local tasks, search and inference take more or less the same time
# In global tasks, inference takes far more time than search, which suggests that the inference process is more time-consuming in global tasks



In [21]:
#######################
#
# comparison between search and inference at the level of local and global tasks 
#
#######################

In [47]:
#######################
#
# Average fixation duration
#
#######################
avFDPT = averageFixationDuration(phDectFix, ['Type1','Type2','Type3','Phase'])
#filter out those with N/A and keep only control-flow (one combined mask)
avFDPT_keep = avFDPT["Phase"].ne("N/A") & avFDPT["Type2"].eq("Control-flow")
avFDPT = (
    avFDPT.loc[avFDPT_keep]
    .copy(deep=True)
    #sorting (extra)
    .sort_values(by=['participant','currentQuestion','timestamp'])
)

In [48]:
#######################
#
# Fixation threshold proportion analysis
#
#######################
fxThresholdsData = fixationProportionThresholdAnalysis(phDectFix, ['Type1','Type2','Type3','Phase'])
#filter out those with N/A and keep only control-flow (one combined mask)
fxThresholds_keep = fxThresholdsData["Phase"].ne("N/A") & fxThresholdsData["Type2"].eq("Control-flow")
fxThresholdsData = (
    fxThresholdsData.loc[fxThresholds_keep]
    .copy(deep=True)
    #sorting (extra)
    .sort_values(by=['participant','currentQuestion','timestamp'])
)

In [49]:
####################
#
# Scan-path precision
#
####################
scanPathPrecisionData = scanPathPrecision(phDectFix, ['Type1','Type2','Type3','Phase'])
#filter out those with N/A and keep only control-flow (one combined mask)
scanPath_keep = scanPathPrecisionData["Phase"].ne("N/A") & scanPathPrecisionData["Type2"].eq("Control-flow")
scanPathPrecisionData = (
    scanPathPrecisionData.loc[scanPath_keep]
    .copy(deep=True)
    #sorting (extra)
    .sort_values(by=['participant','currentQuestion','timestamp'])
)

In [50]:
#merge all dataframes (computed previously)
dfs = [avFDPT,scanPathPrecisionData,fxThresholdsData]
# columns every frame is joined on
mergeKeys = ['participant', 'currentQuestion', 'Type1', 'Type2', 'Type3', 'Phase','timestamp']

# inner-join the frames one after the other, starting from the first
all_measures = dfs[0]
for nextFrame in dfs[1:]:
    all_measures = pd.merge(all_measures, nextFrame, on=mergeKeys, how='inner')
all_measures.columns

Index(['participant', 'currentQuestion', 'Type1', 'Type2', 'Type3', 'Phase',
       'Average_Fixation_Duration', 'timestamp', 'scan_path_precision',
       'timeInterval', 'shortFixationsProp', 'longFixationsProp'],
      dtype='object')

In [51]:
#horizontal analysis
# Mean of each measure for every (Type1, Phase) combination; the grouping keys
# stay in the index. as_index is an option of groupby(), not of agg(), where it
# had no effect, so it is no longer passed to agg().
allstats = all_measures.groupby(['Type1','Phase']).agg({
    'Average_Fixation_Duration':'mean',
    'scan_path_precision':'mean',
    'shortFixationsProp':'mean',
    'longFixationsProp':'mean',
})
allstats.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Average_Fixation_Duration,scan_path_precision,shortFixationsProp,longFixationsProp
Type1,Phase,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Global,inference,212.211,0.108,0.761,0.056
Global,search,188.246,0.039,0.805,0.025
Local,inference,211.806,0.222,0.763,0.054
Local,search,184.461,0.037,0.813,0.02


In [52]:
#vertical analysis
# Mean of each measure for every (Phase, Type1) combination; the grouping keys
# stay in the index. as_index is an option of groupby(), not of agg(), where it
# had no effect, so it is no longer passed to agg().
allstats = all_measures.groupby(['Phase','Type1']).agg({
    'Average_Fixation_Duration':'mean',
    'scan_path_precision':'mean',
    'shortFixationsProp':'mean',
    'longFixationsProp':'mean',
})
allstats.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Average_Fixation_Duration,scan_path_precision,shortFixationsProp,longFixationsProp
Phase,Type1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
inference,Global,212.211,0.108,0.761,0.056
inference,Local,211.806,0.222,0.763,0.054
search,Global,188.246,0.039,0.805,0.025
search,Local,184.461,0.037,0.813,0.02


In [36]:
#horizontal analysis
# For every measure and task type, compare the per-participant means of the
# inference and search phases with a paired Wilcoxon signed-rank test.
measures = [
    'Average_Fixation_Duration',
    'scan_path_precision',
    'shortFixationsProp',
    'longFixationsProp',
]

# list collecting the p-values in the order the tests are run
values = []

for measure in measures:
    for typee in ['Local','Global']:
        print(f'--{measure}, {typee}')

        #get one measure per participant/phase, restricted to the current task type
        all_measures_forType = all_measures.loc[all_measures['Type1'].eq(typee)]
        all_measures_part = (
            all_measures_forType
            .groupby(['participant','Phase'], as_index=False)
            .agg({measure: "mean"})
        )

        isInference = all_measures_part["Phase"].eq('inference')
        isSearch = all_measures_part["Phase"].eq('search')
        measure_a = all_measures_part.loc[isInference, ['participant', measure]]
        measure_b = all_measures_part.loc[isSearch, ['participant', measure]]
        print(len(measure_a), len(measure_b))

        # pair the two phases per participant (inner join on participant)
        measure_merge = measure_a.merge(measure_b, on=['participant'], suffixes=('_a', '_b'), how='inner')
        print(len(measure_merge))

        pvalue = stats.wilcoxon(measure_merge[f'{measure}_a'], measure_merge[f'{measure}_b']).pvalue
        print(pvalue)
        values.append(pvalue)

--Average_Fixation_Duration, Local
44 44
44
7.50560502638109e-10
--Average_Fixation_Duration, Global
44 44
44
7.275957614183426e-11
--scan_path_precision, Local
44 44
44
1.1368683772161603e-13
--scan_path_precision, Global
44 44
44
5.684341886080801e-13
--shortFixationsProp, Local
44 44
44
1.1377991313565872e-06
--shortFixationsProp, Global
44 44
44
1.6860146843100665e-06
--longFixationsProp, Local
44 44
44
1.921307557495311e-11
--longFixationsProp, Global
44 44
44
1.1368683772161603e-12


In [54]:
#vertical analysis
# For every measure and phase, compare the per-participant means of Local and
# Global tasks with a paired Wilcoxon signed-rank test.
measures = [
    'Average_Fixation_Duration',
    'scan_path_precision',
    'shortFixationsProp',
    'longFixationsProp',
]

# list collecting the p-values in the order the tests are run
values = []

for measure in measures:
    for phasee in ['search','inference']:
        print(f'--{measure}, {phasee}')

        #get one measure per participant/task type, restricted to the current phase
        all_measures_forPhase = all_measures.loc[all_measures['Phase'].eq(phasee)]
        all_measures_part = (
            all_measures_forPhase
            .groupby(['participant','Type1'], as_index=False)
            .agg({measure: "mean"})
        )

        isLocal = all_measures_part["Type1"].eq('Local')
        isGlobal = all_measures_part["Type1"].eq('Global')
        measure_a = all_measures_part.loc[isLocal, ['participant', measure]]
        measure_b = all_measures_part.loc[isGlobal, ['participant', measure]]
        print(len(measure_a), len(measure_b))

        # pair the two task types per participant (inner join on participant)
        measure_merge = measure_a.merge(measure_b, on=['participant'], suffixes=('_a', '_b'), how='inner')
        print(len(measure_merge))

        pvalue = stats.wilcoxon(measure_merge[f'{measure}_a'], measure_merge[f'{measure}_b']).pvalue
        print(pvalue)
        values.append(pvalue)

--Average_Fixation_Duration, search
44 44
44
0.016498111235137003
--Average_Fixation_Duration, inference
44 44
44
0.8217495229185943
--scan_path_precision, search
44 44
44
0.3217145050847421
--scan_path_precision, inference
44 44
44
1.1368683772161603e-13
--shortFixationsProp, search
44 44
44
0.07692739918388725
--shortFixationsProp, inference
44 44
44
0.4346026045950566
--longFixationsProp, search
44 44
44
0.04587502898289131
--longFixationsProp, inference
44 44
44
0.19928661531321268
