In [5]:
from pathlib import Path
from pprint import pprint

from tqdm import tqdm

### Convert WebGazer dataset

In [6]:
# Ask for the location of the extracted WebGazer dataset at run time.
WEBGAZER_DATASET_INPUT_DIR = Path(input("webgazer dataset root dir:"))
# Directory the converted per-participant CSV files are written to.
WEBGAZER_DATASET_OUTPUT_DIR = Path("./data/webgazer/")
# Bare last expression: shows the entered path as the cell output.
WEBGAZER_DATASET_INPUT_DIR

WindowsPath('E:/webgazer-dataset/WebGazerETRA2018Dataset_Release20180420')

#### Load metadata

In [7]:
def load_participants(root: Path) -> list[dict]:
    """Load the participant metadata table, ordered by participant number.

    Reads ``participant_characteristics.csv`` from ``root`` and returns one
    dict per CSV row, keyed by the column headers. Every value is kept as the
    string found in the file.

    Args:
        root: Directory containing ``participant_characteristics.csv``.

    Returns:
        The rows sorted by the integer that follows the first underscore in
        the ``"Participant ID"`` column, so ``P_2`` comes before ``P_10``.

    Raises:
        FileNotFoundError: If the CSV file does not exist under ``root``.
        KeyError: If a row has no ``"Participant ID"`` value.
        ValueError, IndexError: If an ID is not of the form ``<prefix>_<int>``.
    """
    from csv import DictReader

    path = root.joinpath("participant_characteristics.csv")
    # newline="" leaves newline handling to the csv module, so line breaks
    # inside quoted fields (free-text columns) are read back unchanged.
    with path.open("r", newline="") as f:
        participants = list(DictReader(f))

    # Numeric sort: a plain string sort would place "P_10" before "P_2".
    participants.sort(key=lambda p: int(p["Participant ID"].split("_")[1]))
    return participants

# Load the metadata once; the conversion cell further down reuses `participants`.
participants = load_participants(WEBGAZER_DATASET_INPUT_DIR)
# Preview the first record to show which columns are available.
pprint(participants[0])

{'Age': '25',
 'Date': '4/5/2017',
 'Display Height (pixels)': '900',
 'Display Width (pixels)': '1440',
 'Distance From Screen (cm)': '60',
 'Duration': '16:40:00',
 'Facial Hair': 'None',
 'Gender': 'Male',
 'Notes': 'Did not see the button at the bottom of the Google page. Had to '
          'briefly intervene to show.',
 'Participant ID': 'P_01',
 'Participant Log ID': '1491423217564',
 'Pointing Device': 'Trackpad',
 'Screen Height (cm)': '20.73',
 'Screen Recording Start Time (Unix milliseconds)': '1491423552200',
 'Screen Recording Start Time (Wall Clock UTC)': '4/5/2017 20:36',
 'Screen Width (cm)': '33.17',
 'Self-Reported Eye Color': 'Dark Brown to Brown',
 'Self-Reported Handedness': 'Right',
 'Self-Reported Race': 'Asian',
 'Self-Reported Skin Color': '1',
 'Self-Reported Vision': 'Normal',
 'Setting': 'Laptop',
 'Time of day': '16:00',
 'Touch Typer': 'Yes',
 'Weather': 'Cloudy'}


#### Processing

In [8]:
def load_snapshots(root: Path, participant: dict) -> list[dict]:
    """Read one participant's snapshot log and return it in time order.

    The log is read from ``<root>/<Participant ID>/<Participant ID>.txt`` and
    is decoded one line at a time as JSON. Lines that are not valid JSON are
    skipped without raising.

    Args:
        root: Dataset root directory.
        participant: Metadata row; only ``"Participant ID"`` is read.

    Returns:
        The decoded lines, sorted ascending by their ``"true_time"`` value.
    """
    import json

    participant_id = participant["Participant ID"]
    log_path = (root / participant_id / participant_id).with_suffix(".txt")

    records = []
    with log_path.open("rb") as log_file:
        for raw_line in log_file:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Best effort: ignore lines that do not parse.
                continue
            records.append(record)

    return sorted(records, key=lambda record: record["true_time"])


def convert_and_write_snapshots(participant: dict, snapshots: list[dict], dest: Path):
    """Write a participant's valid gaze samples to ``<dest>/<Participant ID>.csv``.

    Each output row has the columns ``time``, ``x`` and ``y``. ``time`` is the
    snapshot's ``"true_time"`` minus the ``"true_time"`` of ``snapshots[0]``
    (whether or not that first snapshot is itself valid), so callers should
    pass snapshots already sorted by time. ``x`` and ``y`` are the gaze point
    multiplied by the display width/height in pixels and truncated with
    ``int``.

    For every snapshot the right eye is used when its validity flag is 1,
    otherwise the left eye when its flag is 1; snapshots where neither eye is
    valid are dropped.

    Args:
        participant: Metadata row; ``"Participant ID"``,
            ``"Display Width (pixels)"`` and ``"Display Height (pixels)"``
            are read.
        snapshots: Decoded snapshot dicts. May be empty, in which case a
            header-only CSV is written.
        dest: Output directory. Created (with parents) if it does not exist.
    """
    from csv import DictWriter

    pid = participant["Participant ID"]

    display_width = int(participant["Display Width (pixels)"])
    display_height = int(participant["Display Height (pixels)"])

    # An empty log has no time origin; the loop below then never runs and only
    # the header is written instead of failing on snapshots[0].
    timestamp_origin = snapshots[0]["true_time"] if snapshots else None

    # Opening a file for writing does not create missing parent directories.
    # exist_ok=True keeps this safe when several workers get here at once.
    dest.mkdir(parents=True, exist_ok=True)
    path = dest.joinpath(pid).with_suffix(".csv")

    # newline="" stops text mode from translating the explicit "\n" line
    # terminator into "\r\n" on Windows.
    with path.open("w", newline="") as f:
        writer = DictWriter(f, ["time", "x", "y"], lineterminator="\n")
        writer.writeheader()

        for snapshot in snapshots:
            # Prefer the right eye; fall back to the left; skip if neither is valid.
            if snapshot["right_gaze_point_validity"] == 1:
                side = "right"
            elif snapshot["left_gaze_point_validity"] == 1:
                side = "left"
            else:
                continue

            x, y = snapshot[f"{side}_gaze_point_on_display_area"]

            writer.writerow(
                {
                    "time": snapshot["true_time"] - timestamp_origin,
                    "x": int(x * display_width),
                    "y": int(y * display_height),
                }
            )


def process_participant(participant: dict, input: Path, output: Path):
    """Convert one participant: load their snapshots, then write the CSV.

    Args:
        participant: Metadata row for the participant.
        input: Dataset root handed to ``load_snapshots``.
        output: Destination directory handed to ``convert_and_write_snapshots``.
    """
    loaded = load_snapshots(root=input, participant=participant)
    convert_and_write_snapshots(participant=participant, snapshots=loaded, dest=output)


In [25]:
from concurrent.futures import Executor, as_completed

def run(executor: Executor, participants: list[dict], input: Path, output: Path):
    """Convert every participant on ``executor`` while showing a progress bar.

    One ``process_participant`` task is submitted per participant. The bar
    advances as each task finishes, whether it succeeded or failed, so all
    participants are attempted before any error is reported.

    Args:
        executor: Executor the tasks are submitted to; not shut down here.
        participants: Metadata rows, one per participant.
        input: Dataset root forwarded to ``process_participant``.
        output: Destination directory forwarded to ``process_participant``.

    Raises:
        RuntimeError: After all tasks have finished, if at least one raised.
            The message lists the failed participant IDs and the first
            failure is attached as ``__cause__``.
    """
    with tqdm(total=len(participants), desc="participants") as pbar:
        # Map each future back to its participant so failures can be named.
        futures = {
            executor.submit(
                process_participant,
                participant=participant,
                input=input,
                output=output,
            ): participant
            for participant in participants
        }

        failures = []
        for future in as_completed(futures):
            # A future stores its exception until asked; without this check a
            # failed conversion would be indistinguishable from a successful one.
            error = future.exception()
            if error is not None:
                pid = str(futures[future].get("Participant ID", "<unknown>"))
                failures.append((pid, error))
            pbar.update(1)

    if failures:
        failed_ids = ", ".join(sorted(pid for pid, _ in failures))
        raise RuntimeError(
            f"{len(failures)} of {len(participants)} participants failed: {failed_ids}"
        ) from failures[0][1]

In [27]:
from concurrent.futures import ThreadPoolExecutor

# Convert all participants on a pool of four worker threads; leaving the
# `with` block waits for outstanding tasks and shuts the pool down.
with ThreadPoolExecutor(max_workers=4) as executor:
    run(
        executor=executor,
        participants=participants,
        input=WEBGAZER_DATASET_INPUT_DIR,
        output=WEBGAZER_DATASET_OUTPUT_DIR,
    )

participants: 100%|██████████| 51/51 [04:54<00:00,  5.77s/it]


### Convert Kaggle dataset

In [9]:
# Ask for the location of the Kaggle dataset at run time.
KAGGLE_DATASET_INPUT_DIR = Path(input("kaggle dataset root dir:"))
# Directory the converted CSV files are written to.
KAGGLE_DATASET_OUTPUT_DIR = Path("./data/kaggle/")
# Bare last expression: shows the entered path as the cell output.
KAGGLE_DATASET_INPUT_DIR

WindowsPath('E:/dataset_normalised_5mins')

In [19]:
import csv

# Opening a file for writing does not create missing directories, so make sure
# the output directory exists before the first file is written.
KAGGLE_DATASET_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Sorting materialises the glob so tqdm can show a total and the processing
# order is the same on every run.
for path in tqdm(sorted(KAGGLE_DATASET_INPUT_DIR.glob("*.csv"))):
    # The file name is reused unchanged for the output, so it is not parsed
    # here; an unexpected name no longer aborts the whole conversion.
    snapshots = []
    # newline="" on both files leaves newline handling to the csv module.
    with path.open("r", newline="") as f:
        for row in csv.DictReader(f):
            # NOTE(review): dividing by 1000 presumably converts ms to s - confirm.
            time = int(row["timestamp"]) / 1000
            x, y = int(row["x"]), int(row["y"])
            snapshots.append({"time": time, "x": x, "y": y})

    out = KAGGLE_DATASET_OUTPUT_DIR / path.name

    # Without newline="" the explicit "\n" terminator would become "\r\n" on Windows.
    with out.open("w", newline="") as f:
        writer = csv.DictWriter(f, ["time", "x", "y"], lineterminator="\n")
        writer.writeheader()
        writer.writerows(snapshots)

192it [00:10, 18.59it/s]
