In [1]:
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath('../')))

from tqdm import tqdm
import pandas as pd
from utils.file_utils import read_image, read_json, save_image, save_json, read_pkl
from utils.draw_utils import draw_box
from utils.helper_utils import float2_0_1000

# make val

In [5]:
img_root = '/home/shaotao/DATA/AMEX/screenshot'
ann_root = '/home/shaotao/DATA/AMEX/element_anno'
shape_pkl_p = './out/amex_img_shapes.pkl'
val_df_p = './out/amex_val.xlsx'
choice = 'text'
assert choice in {'func', 'text'}
out_json_p = f'amex_val_{choice}.json'

df = pd.read_excel(val_df_p)
# get ori img shape dict
img_shape_dict = read_pkl(shape_pkl_p)
all_ann_p = sorted(os.listdir(ann_root))

INIT_PROMPT = """<image>Output the location of target element according to the given instruction.
## Instruction
{instruction}"""

import random
random.seed(42)
all_datas = []
for img_idx in tqdm(range(len(df))):
    ann_p = os.path.join(ann_root, df.iloc[img_idx]['filename'])
    ann = read_json(ann_p)
    
    img_p = ann['image_path']
    click_ele_lst = ann['clickable_elements']
    h, w = img_shape_dict[img_p]
    img_p = os.path.join(img_root, img_p)
    if img_idx % 100 == 0:
        img = read_image(img_p)
        
    random.shuffle(click_ele_lst)
    conversation = []
    if choice == 'func':
        for ele_idx, ele in enumerate(click_ele_lst):
            box = ele['bbox']
            func_ann = ele.get('functionality', '').strip()
            has_func = func_ann != ''
            if has_func:
                if func_ann.startswith('Click to '):
                    func_ann = func_ann.lower().replace('click to ', '')
                prompt = INIT_PROMPT.format(instruction=func_ann)
                break
            else:
                continue
    else:
        for ele_idx, ele in enumerate(click_ele_lst):
            box = ele['bbox']
            text_ann = ele.get('xml_desc', [])
            has_text = len(text_ann) > 0
            if has_text:
                text_ann = text_ann[0]
                text_ann = text_ann.replace('\n', ',')
                prompt = INIT_PROMPT.format(instruction=text_ann)
                break
            else:
                continue
            
    x1, y1, x2, y2 = box
    gt_box = x1 / w, y1 / h, x2 / w, y2 / h
    if img_idx % 100 == 0:
        img = draw_box(img, gt_box)
        save_image(img, f'tmp_{img_idx}.jpg')
    try:
        gt_box = list(map(float2_0_1000, gt_box))
    except Exception as e:
        print(e)
        continue
    ans = f'({gt_box[0]},{gt_box[1]},{gt_box[2]},{gt_box[3]})'
    conversation.append({'from': 'human', 'value': prompt})
    conversation.append({'from': 'gt', 'value': ans, 'type': 'func' if choice == 0 else 'text'})

    line = {'conversation': conversation, 'image_lst': [os.path.join(img_root, img_p)]}
    all_datas.append(line)
print('total data num: ', len(all_datas))
save_json(all_datas, out_json_p)

100%|██████████| 1000/1000 [00:00<00:00, 2597.33it/s]

total data num:  1000





# make train

In [18]:
img_root = '/home/shaotao/DATA/AMEX/screenshot'
ann_root = '/home/shaotao/DATA/AMEX/element_anno'
shape_pkl_p = './out/amex_img_shapes.pkl'
df_p = './out/amex_func_20k.xlsx'
ele_per_diag = 10
inst_type = 'func'
out_json_p = f'amex_20k_{inst_type}.json'
assert inst_type in ['func', 'text']

df = pd.read_excel(df_p)
# get ori img shape dict
img_shape_dict = read_pkl(shape_pkl_p)

all_ann_p = sorted(os.listdir(ann_root))

INIT_PROMPT = """<image>Output the location of target element according to the given instruction.
## Instruction
{instruction}"""

CONTINUE_PROMPT = """## Instruction
{instruction}"""

import random
random.seed(42)
all_datas = []
for img_idx in tqdm(range(df.shape[0])):
    ann_p = os.path.join(ann_root, df.iloc[img_idx]['filename'])
    ann = read_json(ann_p)
    
    img_p = ann['image_path']
    click_ele_lst = ann['clickable_elements']
    h, w = img_shape_dict[img_p]
    img_p = os.path.join(img_root, img_p)
    if img_idx % 500 == 0:
        img = read_image(img_p)
    random.shuffle(click_ele_lst)
    
    # find specific elements
    final_ele_lst = []
    if inst_type == 'func':
        for ele in click_ele_lst:
            if len(final_ele_lst) == ele_per_diag:
                break
            func_ann = ele.get('functionality', '').strip()
            has_func = func_ann != ''
            if has_func:
                final_ele_lst.append(ele)
    else:
        for ele in click_ele_lst:
            if len(final_ele_lst) == ele_per_diag:
                break
            text_ann = ele.get('xml_desc', [])
            has_text = len(text_ann) > 0
            if has_text:
                final_ele_lst.append(ele)
        
    conversation = []
    for ele_idx, ele in enumerate(final_ele_lst):
        box = ele['bbox']
        if inst_type == 'func':
            func_ann = ele.get('functionality', '').strip()
            has_func = func_ann != ''
            if has_func:
                if func_ann.startswith('Click to '):
                    func_ann = func_ann.lower().replace('click to ', '')
                    
                if ele_idx == 0:   
                    prompt = INIT_PROMPT.format(instruction=func_ann)
                else:
                    prompt = CONTINUE_PROMPT.format(instruction=func_ann)
            else:
                print(f'no func ann in idx: {img_idx}')
                continue
        else:
            text_ann = ele.get('xml_desc', [])
            has_text = len(text_ann) > 0
            if has_text:
                text_ann = text_ann[0]
                text_ann = text_ann.replace('\n', ',')
                if ele_idx == 0:   
                    prompt = INIT_PROMPT.format(instruction=text_ann)
                else:
                    prompt = CONTINUE_PROMPT.format(instruction=text_ann)
            else:
                print(f'no text ann in idx: {img_idx}')
                continue
            
        x1, y1, x2, y2 = box
        x1, y1, x2, y2 = x1 / w, y1 / h, x2 / w, y2 / h
        cent_x, cent_y = (x1 + x2) / 2, (y1 + y2) / 2
        pt = [cent_x, cent_y]
        try:
            pt = list(map(float2_0_1000, pt))
        except Exception as e:
            print('idx: ', img_idx, e)
            continue
        ans = f'({pt[0]},{pt[1]})'
        conversation.append({'from': 'human', 'value': prompt})
        conversation.append({'from': 'gpt', 'value': ans})
        if img_idx % 500 == 0:
            img = draw_box(img, (x1, y1, x2, y2))
    if img_idx % 500 == 0:
        save_image(img, f'tmp_{img_idx}.jpg')
    line = {'conversation': conversation, 'image_lst': [os.path.join(img_root, img_p)]}
    all_datas.append(line)
    if len(all_datas) % 500 == 0:
        if inst_type == 'func':
            print(f'IDX: {len(all_datas)},  sample func_ann: {func_ann}')
        else:
            print(f'IDX: {len(all_datas)},  sample text_ann: {text_ann}')
print('total data num: ', len(all_datas))
save_json(all_datas, out_json_p)

  3%|▎         | 501/20000 [00:00<00:08, 2415.61it/s]

idx:  242 get input: 1.1854636591478696
idx:  242 get input: 1.06265664160401
idx:  242 get input: 1.799498746867168
idx:  242 get input: 1.5538847117794488
idx:  242 get input: 1.431077694235589
idx:  242 get input: 1.6766917293233083
idx:  242 get input: 1.3082706766917294
IDX: 500,  sample func_ann: view details about 'the international americana music show'.


  7%|▋         | 1323/20000 [00:00<00:05, 3196.11it/s]

IDX: 1000,  sample func_ann: enter a destination to search for travel options.
IDX: 1500,  sample func_ann: select economy class for the flight search.


 12%|█▏        | 2494/20000 [00:00<00:04, 3537.06it/s]

IDX: 2000,  sample func_ann: open the messaging app.
IDX: 2500,  sample func_ann: view details or emails related to 'amazon'


 18%|█▊        | 3533/20000 [00:01<00:05, 3189.48it/s]

IDX: 3000,  sample func_ann: mark the first workout as complete or to start the first workout.
idx:  3161 get input: 1.8028846153846154
idx:  3161 get input: 1.5513784461152882
idx:  3161 get input: 1.3399122807017543
idx:  3345 get input: 1.2735969387755102
idx:  3345 get input: 1.4546703296703296
idx:  3345 get input: 1.6936813186813187
idx:  3345 get input: 1.6483516483516483
idx:  3345 get input: 1.8459821428571428
IDX: 3500,  sample func_ann: go to podcasts.


 20%|██        | 4001/20000 [00:01<00:05, 3199.02it/s]

idx:  3610 get input: 1.7123376623376623
idx:  3610 get input: 1.275974025974026
idx:  3610 get input: 1.5253246753246752
idx:  3610 get input: 1.7746753246753246
idx:  3906 get input: 1.4237882653061225
idx:  3906 get input: 1.7802197802197801
idx:  3906 get input: 1.5556318681318682
idx:  3906 get input: 1.8762755102040816
idx:  3906 get input: 1.527423469387755
idx:  3906 get input: 1.2740384615384617
idx:  3906 get input: 1.3846153846153846
idx:  3906 get input: 1.8762755102040816
IDX: 4000,  sample func_ann: view details of this recent search.
idx:  4298 get input: 1.4623376623376623
idx:  4298 get input: 1.6987012987012986
idx:  4298 get input: 1.2155844155844155
idx:  4298 get input: 1.630952380952381
idx:  4298 get input: 1.844155844155844
idx:  4298 get input: 1.4623376623376623
idx:  4298 get input: 1.1785714285714286
idx:  4332 get input: 1.3968253968253967
idx:  4332 get input: 1.3968253968253967
idx:  4332 get input: 1.6666666666666665
idx:  4332 get input: 1.3968253968253

 24%|██▍       | 4863/20000 [00:01<00:04, 3498.08it/s]

IDX: 4500,  sample func_ann: learn more about the songkran event.
idx:  4765 get input: 1.8577922077922078
idx:  4765 get input: 1.8577922077922078
idx:  4765 get input: 1.029100529100529
idx:  4765 get input: 1.7288359788359788
idx:  4765 get input: 1.3333333333333333
idx:  4765 get input: 1.7142857142857142
idx:  4765 get input: 1.8577922077922078
IDX: 5000,  sample func_ann: navigate to the home tab.


 30%|███       | 6001/20000 [00:01<00:04, 3299.39it/s]

IDX: 5500,  sample func_ann: select 'personal o profesor' (staff or teacher).
IDX: 6000,  sample func_ann: confirm the selected dates for booking.


 34%|███▍      | 6873/20000 [00:02<00:03, 3557.45it/s]

IDX: 6500,  sample func_ann: downvote.
idx:  6542 get input: 1.469298245614035
idx:  6542 get input: 1.3289473684210527
idx:  6542 get input: 1.8408521303258145
idx:  6542 get input: 1.0482456140350878
idx:  6542 get input: 1.6096491228070176
idx:  6542 get input: 1.8408521303258145
idx:  6542 get input: 1.1885964912280702
IDX: 7000,  sample func_ann: claim your artist profile.


 40%|████      | 8033/20000 [00:02<00:03, 3522.27it/s]

IDX: 7500,  sample func_ann: filter search results to show items made of brass.
IDX: 8000,  sample func_ann: view details and purchase the running shoes priced at $25.37.


 45%|████▍     | 8992/20000 [00:02<00:02, 3812.25it/s]

IDX: 8500,  sample func_ann: enter a keyword for search.
IDX: 9000,  sample func_ann: filter products by 'beverages' category.


 51%|█████     | 10140/20000 [00:02<00:02, 3462.61it/s]

IDX: 9500,  sample func_ann: manage tv provider settings.
idx:  9649 get input: 1.728021978021978
IDX: 10000,  sample func_ann: filter events that are online
idx:  10083 get input: 1.045112781954887


 55%|█████▍    | 10998/20000 [00:03<00:02, 3759.16it/s]

IDX: 10500,  sample func_ann: start an article search.
IDX: 11000,  sample func_ann: sign out.


 59%|█████▊    | 11744/20000 [00:03<00:02, 3379.85it/s]

IDX: 11500,  sample func_ann: View album
IDX: 12000,  sample func_ann: read the full article about the class of 2024


 65%|██████▌   | 13001/20000 [00:03<00:02, 3217.30it/s]

IDX: 12500,  sample func_ann: view your profile and notifications.
IDX: 13000,  sample func_ann: Opens the 'Explore' section where the user can discover new music, genres, and playlists.


 69%|██████▉   | 13876/20000 [00:04<00:01, 3458.78it/s]

IDX: 13500,  sample func_ann: favorite indie rpg night @ dovetail brewery.
idx:  13939 get input: 1.1068239795918369
idx:  13939 get input: 1.7388392857142856
idx:  13939 get input: 1.4524872448979593
idx:  13939 get input: 1.2796556122448979
IDX: 14000,  sample func_ann: select the 1er set priced at hk$24.39.


 75%|███████▌  | 15001/20000 [00:04<00:01, 3157.39it/s]

IDX: 14500,  sample func_ann: open settings.
idx:  14818 get input: 1.085164835164835
idx:  14818 get input: 1.1152882205513786
idx:  14818 get input: 1.1152882205513786
idx:  14818 get input: 1.8145363408521304
idx:  14818 get input: 1.771291208791209
idx:  14818 get input: 1.717032967032967
idx:  14818 get input: 1.4627192982456139
IDX: 15000,  sample func_ann: Initiates a search for events or other content.


 79%|███████▉  | 15853/20000 [00:04<00:01, 3490.72it/s]

idx:  15210 get input: 1.510204081632653
idx:  15210 get input: 1.7244897959183674
idx:  15210 get input: 1.2959183673469388
idx:  15210 get input: 1.403061224489796
idx:  15210 get input: 1.8316326530612246
idx:  15408 get input: 1.6105889724310778
idx:  15408 get input: 1.6483516483516483
idx:  15408 get input: 1.4075814536340852
idx:  15408 get input: 1.8434065934065935
idx:  15408 get input: 1.7077067669172932
idx:  15408 get input: 1.513471177944862
idx:  15408 get input: 1.8408521303258145
IDX: 15500,  sample func_ann: navigate to the live page.
idx:  15518 get input: 1.6719924812030076
idx:  15518 get input: 1.0974310776942355
idx:  15518 get input: 1.7816416040100251
idx:  15518 get input: 1.4946741854636592
idx:  15518 get input: 1.3850250626566416
IDX: 16000,  sample func_ann: confirm the selection and proceed with the chosen departure airport.


 83%|████████▎ | 16555/20000 [00:04<00:01, 3089.86it/s]

IDX: 16500,  sample func_ann: confirm the date selection and proceed with the chosen date.
idx:  16886 get input: 1.3743131868131868
idx:  16886 get input: 1.8829719387755102
idx:  16886 get input: 1.5164835164835164
idx:  16886 get input: 1.442920918367347
idx:  16886 get input: 1.1128826530612246
IDX: 17000,  sample func_ann: set or change the start time of the event.


 88%|████████▊ | 17501/20000 [00:05<00:00, 3363.37it/s]

idx:  17043 get input: 1.7448979591836735
idx:  17408 get input: 1.3925324675324675
idx:  17408 get input: 1.35978835978836
idx:  17408 get input: 1.7777777777777777
idx:  17408 get input: 1.6527777777777777
idx:  17408 get input: 1.6693121693121693
idx:  17408 get input: 1.6666666666666665
idx:  17408 get input: 1.503968253968254
idx:  17408 get input: 1.8636363636363638
IDX: 17500,  sample func_ann: read more of the post.


 92%|█████████▏| 18373/20000 [00:05<00:00, 3508.52it/s]

IDX: 18000,  sample func_ann: open appium settings.
IDX: 18500,  sample func_ann: view or edit the search summary for car rentals in los angeles.


 96%|█████████▌| 19131/20000 [00:05<00:00, 3627.52it/s]

idx:  18786 get input: 1.7392857142857143
idx:  18786 get input: 1.7142857142857142
idx:  18786 get input: 1.8675324675324676
idx:  18786 get input: 1.0061688311688313
idx:  18786 get input: 1.8675324675324676
idx:  18786 get input: 1.3840909090909093
IDX: 19000,  sample func_ann: edit the device name.
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19061 get input: 1.360576923076923
idx:  19199 get input: 1.7301587301587302
idx:  19199 get input: 1.6964285714285716
idx:  19199 get input: 1.7301587301587302
idx:  19199 get input: 1.753968253968254
idx:  19199 get input: 1.3624338624338623
idx:  19199 get input: 1.8662337662337662
IDX: 19500,  sample func_ann: clear

100%|██████████| 20000/20000 [00:05<00:00, 3433.09it/s]


IDX: 20000,  sample func_ann: navigate to the live tab.
total data num:  20000
