In [1]:
import os
import json
import re
import pandas as pd
import matplotlib.pyplot as plt


def load_all_data(folder_path, ignore_completed=False):
    """
    Load and normalise every participant ``.json`` file in ``folder_path``.

    For each file (processed in sorted filename order so that the result, and
    anything built from it, is reproducible across machines):
      - entries that are not regular files, cannot be decoded as UTF-8,
        cannot be parsed as JSON, or whose top-level JSON value is not an
        object are skipped silently (best-effort loading);
      - unless ``ignore_completed`` is True, files whose ``completed`` flag is
        missing or falsy are skipped;
      - if ``answers`` is a dict, its keys are renamed by
        ``_rename_answer_keys``; otherwise the quiz is kept untouched.

    Parameters:
      folder_path (str): Directory containing the exported ``.json`` files.
      ignore_completed (bool): When True, the ``completed`` flag is not
        checked and every parsable file is included.

    Returns:
      dict: Maps each filename (without extension) to its processed quiz dict.
    """
    all_data = {}
    for fn in sorted(os.listdir(folder_path)):
        if not fn.lower().endswith('.json'):
            continue
        path = os.path.join(folder_path, fn)
        # A directory (or broken link) may also be named "*.json".
        if not os.path.isfile(path):
            continue
        try:
            with open(path, encoding='utf-8') as f:
                quiz = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Unreadable file: skip it rather than abort the whole load.
            continue

        # Valid JSON can still be a list/number/string; only objects carry
        # the 'completed' / 'answers' fields read below.
        if not isinstance(quiz, dict):
            continue

        if not ignore_completed and not quiz.get('completed', False):
            continue

        all_data[os.path.splitext(fn)[0]] = quiz

        answers = quiz.get('answers', {})
        if isinstance(answers, dict):
            quiz['answers'] = _rename_answer_keys(answers)

    return all_data


def _rename_answer_keys(answers):
    """
    Return a new dict with the answer keys of one quiz renamed.

    Keys are visited in ascending order of their trailing ``_<number>``
    suffix (keys without one count as 0 and keep their relative order), the
    suffix is stripped, and then:
      - ``post-task-question`` becomes ``<last task>_post-task-question``,
        where the last task is the most recent key that was neither a
        post-task question nor a post-task survey (unchanged if there is none);
      - keys starting with ``post-task-survey`` get the last ``-...`` segment
        of the previous key's base name appended, and reset the last task;
      - every other key keeps its base name and becomes the last task.

    If two keys map to the same new name, the later one wins.
    """
    suffix_re = re.compile(r'_(\d+)$')

    def suffix_of(key):
        m = suffix_re.search(key)
        return int(m.group(1)) if m else 0

    renamed = {}
    last_task = None
    prev_base = None  # base name of the key visited just before this one
    for old in sorted(answers, key=suffix_of):
        base = suffix_re.sub('', old)

        if base == 'post-task-question':
            new_key = f"{last_task}_post-task-question" if last_task else base
        elif base.startswith('post-task-survey'):
            if prev_base is not None and '-' in prev_base:
                new_key = base + prev_base[prev_base.rfind('-'):]
            else:
                new_key = base
            last_task = None
        else:
            new_key = base
            last_task = base

        renamed[new_key] = answers[old]
        prev_base = base

    return renamed

def time_analysis(all_data, metric='total_duration_min'):
    """
    Build per-task and per-participant timing tables from the loaded quizzes.

    Parameters:
      all_data (dict): The dictionary returned by load_all_data, mapping a
        file name to its quiz dict. Entries that are not dicts, or whose
        'answers' value is not a dict, are treated as having no answers.
      metric (str): Currently unused; kept so existing callers keep working.

    Returns:
      df_task_level (pandas.DataFrame):
        One row per answered task, with columns participantId, task,
        startTime, endTime, duration_sec, duration_min and format. The
        "-<format>" fragment is removed from task names when a format is known.
      df_participant_level (pandas.DataFrame):
        One row per entry of all_data, with columns participantId, format,
        duration_sec and duration_min (durations summed over that entry's tasks).
      format_stat (pandas.Series):
        Number of participants per format (value_counts of the format column).

    Side effects:
      Prints the number of rows in the participant-level table.
    """
    # Explicit column lists keep the frames well-formed (and the 'format'
    # lookup below valid) even when there is nothing to report.
    task_columns = ['participantId', 'task', 'startTime', 'endTime',
                    'duration_sec', 'duration_min', 'format']
    part_columns = ['participantId', 'format', 'duration_sec', 'duration_min']

    task_rows = []
    part_rows = []

    for fn, quiz in all_data.items():
        answers = quiz.get('answers', {}) if isinstance(quiz, dict) else {}
        if not isinstance(answers, dict):
            # load_all_data keeps quizzes whose 'answers' is not a dict.
            answers = {}

        # Participant id: the first 'prolificId' found in any answer payload,
        # falling back to the file name.
        pid = fn
        for info in answers.values():
            if isinstance(info, dict):
                ans = info.get('answer', {})
                if isinstance(ans, dict) and 'prolificId' in ans:
                    pid = ans['prolificId']
                    break

        current_format = None
        total_sec = 0
        participant_rows = []
        for name, info in answers.items():
            if not isinstance(info, dict):
                continue
            st = info.get('startTime')
            ed = info.get('endTime')
            # The difference is divided by 1000 to obtain duration_sec.
            if st is not None and ed is not None:
                dur = (ed - st) / 1000.0
            else:
                dur = None

            # The format is taken from the first key shaped like
            # "tutorial-<fmt>-part1".
            if current_format is None:
                m = re.match(r'tutorial-(\w+)-part1', name)
                if m:
                    current_format = m.group(1).lower()

            participant_rows.append({
                'participantId': pid,
                'task': name,
                'startTime': st,
                'endTime': ed,
                'duration_sec': dur,
                'duration_min': dur / 60 if dur is not None else None
            })
            if dur:
                total_sec += dur

        fmt = current_format or 'unknown'
        for row in participant_rows:
            # Strip the format fragment so task names are comparable across
            # formats; the format is only known after the whole pass above.
            if fmt != 'unknown':
                row['task'] = row['task'].replace(f"-{fmt}", "")
            row['format'] = fmt
            task_rows.append(row)

        part_rows.append({
            'participantId': pid,
            'format': fmt,
            'duration_sec': round(total_sec, 3),
            'duration_min': round(total_sec / 60, 2)
        })

    df_task = pd.DataFrame(task_rows, columns=task_columns)
    df_part = pd.DataFrame(part_rows, columns=part_columns)

    format_stat = df_part['format'].value_counts()
    total_participant = len(df_part)
    print(f"Total number of valid participants: {total_participant}")

    return df_task, df_part, format_stat



In [2]:
import sys

# Add the project directory to the import search path.
sys.path.append('/Users/shiyihe/Desktop/USABILITY_ANALYSIS')

# Load every participant file; answer keys are renamed while loading.
# ignore_completed=True also keeps sessions not flagged as completed.
folder = '/Users/shiyihe/Desktop/USABILITY_ANALYSIS/tabular'
all_data = load_all_data(folder_path=folder, ignore_completed=True)

# Per-task table, per-participant table and participant counts per format.
(df_task, df_part, format_stat) = time_analysis(all_data=all_data)
df_task

Total number of valid participants: 106


Unnamed: 0,participantId,task,startTime,endTime,duration_sec,duration_min,format
0,64457bc906c125cebd4bf66b,introduction,1746559492770,1746559515399,22.629,0.377150,json5
1,64457bc906c125cebd4bf66b,qualifications,1746559515600,1746559523566,7.966,0.132767,json5
2,64457bc906c125cebd4bf66b,consent,1746559523735,1746559534847,11.112,0.185200,json5
3,64457bc906c125cebd4bf66b,task,1746559535031,1746559541217,6.186,0.103100,json5
4,64457bc906c125cebd4bf66b,writing-task-NL,1746559541303,1746559656705,115.402,1.923367,json5
...,...,...,...,...,...,...,...
3281,6675c40cdc52b37294f0514e,modifying-task-tabular-4,1746235729585,1746235939094,209.509,3.491817,hjson
3282,6675c40cdc52b37294f0514e,modifying-task-tabular-4_post-task-question,1746235939448,1746235944746,5.298,0.088300,hjson
3283,6675c40cdc52b37294f0514e,$nasa-tlx.co.nasa-tlx,1746235945130,1746235958400,13.270,0.221167,hjson
3284,6675c40cdc52b37294f0514e,post-task-survey-tlx,1746235958794,1746236001871,43.077,0.717950,hjson


In [None]:
def help_analysis(all_data):
    """
    Placeholder for the help analysis; not implemented yet.

    Parameters:
      all_data (dict): presumably the dictionary returned by load_all_data
        (TODO confirm once the analysis is written).

    Raises:
      NotImplementedError: always, until the analysis is implemented.
    """
    raise NotImplementedError("help_analysis is not implemented yet")


In [None]:
1.任务名字聚合summary
2.不同任务每个format的点击次数显著性差异以及箱形图可视化

In [None]:
quiz analysis 
1. 提取答案数据
2. 计算答案正确率
3. 聚合数据 
4. 可视化不同format之间的正确率显著性差异

survey analysis
1. 提取数据
2. 聚合任务平均值
3. 可视化不同任务在format之间的con+diff差异

NASA-TLX analysis
1. 提取数据
2. 可视化format之间的con+diff差异

post-task-survey: todo

reading 
1. 提取答案数据
2. 计算答案正确率
3. 聚合数据
4. 可视化不同任务在format之间的正确率显著性差异


