In [4]:
import tensorflow as tf
from android_env.proto.a11y import android_accessibility_forest_pb2
import json

In [50]:
# Open one GZIP-compressed TFRecord shard of the android_control dataset.
dataset = tf.data.TFRecordDataset(
    "./dataset/android_control/android_control-00011-of-00020",
    compression_type="GZIP",
)

In [51]:
# Only the first record of the shard is decoded (take(1)).
for raw_record in dataset.take(1):
    # Deserialize the raw bytes into a tf.train.Example message.
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())

    # Extract fields: the feature map, then the first serialized
    # accessibility forest stored under "accessibility_trees".
    features = example.features.feature
    accessibility_bytes = features["accessibility_trees"].bytes_list.value[0]
    forest = android_accessibility_forest_pb2.AndroidAccessibilityForest.FromString(
        accessibility_bytes
    )
    

In [None]:
from android_env.proto.a11y import android_accessibility_forest_pb2

def collect_all_nodes(windows):
    """Gather the nodes of every window into one flat list.

    Args:
        windows: Iterable of window objects, each exposing ``tree.nodes``.

    Returns:
        A new list holding every node, in window order and, within a
        window, in the order the nodes appear in its tree.
    """
    return [node for window in windows for node in window.tree.nodes]

def is_point_inside_bounds(x, y, bounds):
    """Report whether the point (x, y) falls within a rectangle.

    All four edges are inclusive: a point lying exactly on ``left``,
    ``right``, ``top`` or ``bottom`` counts as inside.

    Args:
        x: Horizontal coordinate of the point.
        y: Vertical coordinate of the point.
        bounds: Object exposing ``left``, ``top``, ``right`` and ``bottom``.

    Returns:
        True when the point is inside or on an edge of ``bounds``.
    """
    # Reject on the horizontal axis first; the vertical edges are only
    # consulted when the x coordinate is in range.
    if not (bounds.left <= x <= bounds.right):
        return False
    return bounds.top <= y <= bounds.bottom

def find_nodes_containing_point(nodes, x, y):
    """Select every node whose on-screen bounds contain the point (x, y).

    Args:
        nodes: Iterable of nodes, each exposing ``bounds_in_screen``.
        x: Horizontal coordinate of the point.
        y: Vertical coordinate of the point.

    Returns:
        A list of the matching nodes, in their original order.
    """
    hits = []
    for candidate in nodes:
        if is_point_inside_bounds(x, y, candidate.bounds_in_screen):
            hits.append(candidate)
    return hits

def has_click_action(node):
    """Report whether the node advertises a click or long-click action.

    Action ids are compared against the standard Android
    ``AccessibilityNodeInfo`` constants ``ACTION_CLICK`` (16) and
    ``ACTION_LONG_CLICK`` (32). This assumes the ``id`` values stored on
    the node's actions follow those Android constants.

    Args:
        node: Object that may expose an ``actions`` iterable whose items
            carry an integer ``id``.

    Returns:
        True if at least one action id is a click or long-click id; False
        otherwise, including when the node has no ``actions`` attribute.
    """
    # In the Android constants 1 is ACTION_FOCUS, so matching on it would
    # report merely focusable nodes as clickable while missing long clicks.
    action_click = 16
    action_long_click = 32
    click_action_ids = (action_click, action_long_click)
    return any(
        action.id in click_action_ids
        for action in getattr(node, "actions", ())
    )

def has_semantic_info(node):
    """Report whether the node carries meaningful identifying information.

    A node qualifies when its ``view_id_resource_name`` or its
    ``content_description`` is non-empty; a missing attribute is treated
    as an empty string.

    Args:
        node: Object that may expose the two attributes above.

    Returns:
        True if either attribute is truthy, otherwise False.
    """
    identifying_fields = ("view_id_resource_name", "content_description")
    return any(getattr(node, name, "") for name in identifying_fields)

def get_best_target_node(nodes):
    """Pick the most suitable target among candidate nodes.

    Preference order:
      1. nodes with ``is_clickable`` set that also carry semantic info;
      2. otherwise, nodes with ``is_clickable`` set;
      3. otherwise, any node.
    Within the first non-empty tier the node with the greatest ``depth``
    wins (a missing ``depth`` counts as 0; the earliest node wins a tie).

    Args:
        nodes: Candidate nodes to choose from.

    Returns:
        The selected node, or None when ``nodes`` is empty.
    """
    def depth_of(candidate):
        return getattr(candidate, "depth", 0)

    clickable = [n for n in nodes if getattr(n, "is_clickable", False)]
    clickable_with_semantics = [n for n in clickable if has_semantic_info(n)]

    # Walk the tiers from most to least preferred and stop at the first
    # one that has any candidates.
    for tier in (clickable_with_semantics, clickable):
        if tier:
            return max(tier, key=depth_of)

    # Fallback: the deepest node overall.
    return max(nodes, key=depth_of, default=None)


def extract_node_info(node):
    """Summarize the target element as a plain dictionary.

    ``class_name`` and ``package_name`` are read directly from the node;
    the remaining attributes fall back to an empty string (or False for
    ``is_clickable``) when absent.

    Args:
        node: The node to describe.

    Returns:
        A dict with the keys "class", "content description",
        "is_clickable", "resource name" and "package".
    """
    info = {"class": node.class_name}
    info["content description"] = getattr(node, "content_description", "")
    info["is_clickable"] = getattr(node, "is_clickable", False)
    info["resource name"] = getattr(node, "view_id_resource_name", "")
    info["package"] = node.package_name
    return info

def convert_click_to_element_action(forest_bytes, x, y):
    """Turn a tap at (x, y) into a structured click on a UI element.

    The serialized accessibility forest is parsed, the nodes of all its
    windows are collected, and the nodes whose bounds contain the point
    are narrowed down to a single target.

    Args:
        forest_bytes: Serialized ``AndroidAccessibilityForest`` message.
        x: Horizontal coordinate of the tap.
        y: Vertical coordinate of the tap.

    Returns:
        A dict with "action type" set to "click" and "target element"
        describing the chosen node, or None (after printing a warning)
        when no node could be selected.
    """
    forest = android_accessibility_forest_pb2.AndroidAccessibilityForest.FromString(
        forest_bytes
    )

    candidates = find_nodes_containing_point(
        collect_all_nodes(forest.windows), x, y
    )
    if not candidates:
        # Warning text: no element contains the point.
        print(f"⚠️ 没有找到包含点 ({x}, {y}) 的元素")
        return None

    chosen = get_best_target_node(candidates)
    if chosen is None:
        # Warning text: matching failed, no suitable element at the point.
        print(f"⚠️ 匹配失败：点 ({x}, {y}) 没有合适的元素")
        return None

    return {
        "action type": "click",
        "target element": extract_node_info(chosen),
    }


In [67]:
# Assumed tap coordinates used for this demonstration.
x = 544
y = 369
features = example.features.feature

# Resolve the tap against the accessibility forest stored at index 1 of
# the "accessibility_trees" feature.
result = convert_click_to_element_action(
    features["accessibility_trees"].bytes_list.value[1], x, y
)
print(result)

{'action type': 'click', 'target element': {'class': 'android.widget.TextView', 'content description': '', 'is_clickable': False, 'resource name': 'com.ticktick.task:id/title', 'package': 'com.ticktick.task'}}
