## Data Processing

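This notebook merges the raw JSON recordings in `data/raw/paths-only/` and `data/raw/paths-plus-speed/` and prepares them for training. For each recording it keeps the initial position, the final mouse position, and the `x`/`y` coordinates of every movement step, then exports the result as a timestamped JSON file in `data/processed/`.  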

### Current Scope  
- Merges and cleans raw data for training.  
- Extracts and processes mouse cursor path data.  
- Exports the structured dataset for model input.  


In [1]:
import os
import json
import time
from typing import List, Dict, Any

In [2]:
# Working directory of the running notebook, and its parent directory,
# which serves as the base when building the data paths.
current_path = os.getcwd()
bumblebee_path = os.path.split(current_path)[0]

In [3]:
# Raw-data sub-folders (under data/raw/) whose JSON files are merged.
training_data_folder_name = ["paths-only", "paths-plus-speed"]
training_data_folder_path = [
    os.path.join(bumblebee_path, "data", "raw", folder)
    for folder in training_data_folder_name
]
# Timestamped output file name. The time fields are separated with "-"
# rather than ":" because ":" is not allowed in Windows file names.
prepared_data_file_name = (
    f"merged-prepared-data-{time.strftime('%Y-%m-%d-%H-%M-%S')}.json"
)

In [4]:
# Collect the path of every ``.json`` file found directly inside each
# raw-data folder; anything else is reported and skipped.
files_to_merge = []
for folder in training_data_folder_path:
    # os.listdir returns entries in arbitrary order; sort them so the
    # merged dataset comes out in the same order on every run.
    files = sorted(os.listdir(folder))
    for file in files:
        if file.endswith(".json"):
            files_to_merge.append(os.path.join(folder, file))
        else:
            print(f"Skipping file {file} as it is not a json file")

In [5]:
def merge_json_data(files_to_merge: List[str]) -> List[Dict[str, Any]]:
    """Load each JSON file and concatenate their contents into one list.

    Args:
        files_to_merge: Paths of the JSON files to read, in merge order.
            Each file is expected to hold a top-level JSON array.

    Returns:
        A single list with the items of every file, in input order.
    """
    merged_data: List[Dict[str, Any]] = []
    for file in files_to_merge:
        # Decode explicitly as UTF-8 so the result does not depend on
        # the platform's default locale encoding.
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        # The file is already closed here; only the parsed items are kept.
        merged_data.extend(data)
    return merged_data


# Load every collected file and concatenate their items into one list.
merged_json_data = merge_json_data(files_to_merge)

In [14]:
def prepare_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert raw recordings into the reduced structure that is exported.

    Every output entry has three keys: ``initial`` (the record's
    ``initialPosition``), ``final`` (its ``finalMousePosition``) and
    ``path`` (one ``{"x": ..., "y": ...}`` dict per ``movementData``
    step; any other fields of a step are left out).

    Args:
        data: Raw records, each providing ``initialPosition``,
            ``finalMousePosition`` and ``movementData``.

    Returns:
        One reduced dict per input record, in the same order.

    Raises:
        KeyError: If a record or a movement step lacks a required key.
    """
    return [
        {
            "initial": record["initialPosition"],
            "final": record["finalMousePosition"],
            "path": [
                {"x": point["x"], "y": point["y"]}
                for point in record["movementData"]
            ],
        }
        for record in data
    ]


# Reduce each merged record to its initial/final positions and x/y path.
prepared_data = prepare_data(merged_json_data)

In [15]:
# Write the prepared dataset to data/processed/ as indented JSON.
processed_data_dir = os.path.join(bumblebee_path, "data", "processed")
# Create the output folder if it is missing instead of failing with
# FileNotFoundError on a fresh checkout.
os.makedirs(processed_data_dir, exist_ok=True)
# The context manager guarantees the file is flushed and closed once the
# dump finishes, rather than relying on garbage collection.
with open(
    os.path.join(processed_data_dir, prepared_data_file_name),
    "w",
    encoding="utf-8",
) as output_file:
    json.dump(prepared_data, output_file, indent=2)