In [2]:
import numpy as np
import pandas as pd

In [3]:
import xml.etree.ElementTree as ET
import os
from tqdm import tqdm

def parse_annotation(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    filename = root.findtext("filename")
    image_id = os.path.splitext(filename)[0] if filename else None
    width = int(root.findtext("size/width", default="0"))
    height = int(root.findtext("size/height", default="0"))
    
    objects = []
    for obj in root.findall("object"):
        name = obj.findtext("name")
        phrase = obj.findtext("phrase", default=name)
        
        bbox = obj.find("bndbox")
        if bbox is None:
            # Some entities don't have bounding boxes (e.g., sky, background)
            continue

        try:
            xmin = int(float(bbox.findtext("xmin")))
            ymin = int(float(bbox.findtext("ymin")))
            xmax = int(float(bbox.findtext("xmax")))
            ymax = int(float(bbox.findtext("ymax")))
        except (TypeError, ValueError):
            # Skip malformed entries
            continue

        objects.append({
            "image_id": image_id,
            "filename": filename,
            "phrase": phrase,
            "object_name": name,
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
            "width": width,
            "height": height,
        })
    
    return objects



def load_all_annotations(xml_dir):
    """Parse all XML files into a single DataFrame."""
    all_objects = []
    for xml_file in tqdm(os.listdir(xml_dir)):
        if xml_file.endswith(".xml"):
            xml_path = os.path.join(xml_dir, xml_file)
            all_objects.extend(parse_annotation(xml_path))
    return pd.DataFrame(all_objects)

In [4]:
res = parse_annotation("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/Annotations/1000092795.xml")
pd.DataFrame(res)

Unnamed: 0,image_id,filename,phrase,object_name,xmin,ymin,xmax,ymax,width,height
0,1000092795,1000092795.jpg,1,1,159,125,219,335,333,500
1,1000092795,1000092795.jpg,4,4,1,220,211,473,333,500
2,1000092795,1000092795.jpg,4,4,276,215,332,337,333,500
3,1000092795,1000092795.jpg,1,1,197,110,261,373,333,500
4,1000092795,1000092795.jpg,5,5,207,144,257,244,333,500
5,1000092795,1000092795.jpg,5,5,173,156,217,236,333,500
6,1000092795,1000092795.jpg,2,2,180,125,206,156,333,500
7,1000092795,1000092795.jpg,2,2,198,114,240,146,333,500
8,1000092795,1000092795.jpg,3,3,173,184,198,203,333,500
9,1000092795,1000092795.jpg,3,3,158,198,191,225,333,500


In [5]:

xml_dir = "/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/Annotations"
df = load_all_annotations(xml_dir)

  0%|          | 0/31783 [00:00<?, ?it/s]

100%|██████████| 31783/31783 [00:03<00:00, 9665.46it/s]


In [6]:
df.to_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/flickr30k_annotations.csv", index=False)

In [6]:
all_sen_files = os.listdir("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/Sentences")

In [7]:
path = "/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/Sentences/"

In [8]:
all_sens = []
all_ids = []
for i in all_sen_files:
    file_path = os.path.join(path, i)
    with open(file_path, "r") as f:
        sentences = f.readlines()
    all_ids.append(i.strip(".txt"))
    all_sens.append([s.strip() for s in sentences])

In [9]:
sens_df = pd.DataFrame({
    "image_id": all_ids,
    "sentences": all_sens
})

In [10]:
sens_df.head(2)

Unnamed: 0,image_id,sentences
0,6340723441,"[[/EN#253166/people A muscled , shirtless man]..."
1,270864951,[[/EN#67746/people A woman] wears [/EN#67748/c...


In [11]:
total_df = pd.merge(df, sens_df, on="image_id", how="left")

In [12]:
total_df.head(2)

Unnamed: 0,image_id,filename,phrase,object_name,xmin,ymin,xmax,ymax,width,height,sentences
0,2960759328,2960759328.jpg,84277,84277,214,104,260,190,500,333,[[/EN#84277/people Two cowboys] riding [/EN#84...
1,2960759328,2960759328.jpg,84277,84277,287,106,330,191,500,333,[[/EN#84277/people Two cowboys] riding [/EN#84...


In [13]:
import re

text = "[/EN#84277/people Two cowboys] on [/EN#84280/animals horses] chase [/EN#84278/animals a young cow] with [/EN#84282/other lassos] ' ."

def get_labels(text):
    pattern = r"\[(/EN#[0-9]+/[a-zA-Z_]+)\s+([^\]]+)\]"
    matches = re.findall(pattern, text)

    cleaned_labels = {}
    for label, phrase in matches:
        cleaned_labels[phrase.strip()] = label.split("/")[-2].strip("EN#")
    return cleaned_labels

def clean_annotated_text(text: str) -> str:
    cleaned = re.sub(r'\[/EN#[0-9]+/[a-zA-Z_]+\s(.*?)\]', r'\1', text)
    
    # Clean up multiple spaces, stray punctuation
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    # Fix punctuation spacing
    cleaned = re.sub(r'\s([?.!,"])', r'\1', cleaned)
    
    return cleaned

In [14]:
import random

all_dfs = []

for name, df in total_df.groupby("image_id"):
    sentences = df["sentences"].iloc[0]
    # sam_sen = random.choice(sentences)
    clean_sens = [clean_annotated_text(i) for i in sentences]
    req_df = pd.DataFrame({"caption": clean_sens, "image_id": name, "bb_df": df.to_json(orient="records")})
    all_dfs.append(req_df)

In [15]:
req_total_df = pd.concat(all_dfs)

In [17]:
req_total_df.head(2)

Unnamed: 0,caption,image_id,bb_df
0,Two young guys with shaggy hair look at their ...,1000092795,"[{""image_id"":""1000092795"",""filename"":""10000927..."
1,"Two young, White males are outside near many b...",1000092795,"[{""image_id"":""1000092795"",""filename"":""10000927..."


In [18]:
processed_df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/train_processed_flickr_30k.csv")

In [19]:
processed_df.head(2)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df
0,0,Three people are on a sidewalk.,How many people are on the sidewalk?,"{'Three': '4934', 'people': '4934', 'sidewalk'...",1191423753,image_id filename phrase obje...
1,1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,"{'Beautiful': '9911', 'brunette': '9911', 'wom...",1360456780,image_id filename phrase objec...


In [20]:
from tqdm import tqdm

all_img_ids = []
all_bb_info = []
error = 0
for cap in tqdm(processed_df["caption"]):
    try:
        filt_df = req_total_df[req_total_df["caption"] == cap].iloc[0]
        img_id = filt_df["image_id"]
        bb_info = filt_df["bb_df"]
        
        all_img_ids.append(img_id)
        all_bb_info.append(bb_info)
    except:
        error += 1
        all_img_ids.append(None)
        all_bb_info.append(None)

100%|██████████| 31713/31713 [08:39<00:00, 61.04it/s]


In [21]:
processed_df["db_image_id"] = all_img_ids
processed_df["bb_info"] = all_bb_info

In [22]:
processed_df.head(2)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df,db_image_id,bb_info
0,0,Three people are on a sidewalk.,How many people are on the sidewalk?,"{'Three': '4934', 'people': '4934', 'sidewalk'...",1191423753,image_id filename phrase obje...,1191423753,"[{""image_id"":""1191423753"",""filename"":""11914237..."
1,1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,"{'Beautiful': '9911', 'brunette': '9911', 'wom...",1360456780,image_id filename phrase objec...,1360456780,"[{""image_id"":""1360456780"",""filename"":""13604567..."


In [23]:
processed_df["image_id"] = processed_df["image_id"].apply(lambda x: str(int(x)))

In [24]:
processed_df_clean = processed_df[processed_df["image_id"] == processed_df["db_image_id"]]

In [25]:
processed_df_clean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df_clean.dropna(inplace=True)


In [26]:
processed_df_clean.head(4)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df,db_image_id,bb_info
0,0,Three people are on a sidewalk.,How many people are on the sidewalk?,"{'Three': '4934', 'people': '4934', 'sidewalk'...",1191423753,image_id filename phrase obje...,1191423753,"[{""image_id"":""1191423753"",""filename"":""11914237..."
1,1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,"{'Beautiful': '9911', 'brunette': '9911', 'wom...",1360456780,image_id filename phrase objec...,1360456780,"[{""image_id"":""1360456780"",""filename"":""13604567..."
2,2,A woman is welding metal at a work table.,What is the woman doing at the work table?,"{'woman': '22853', 'metal': '22855', 'work': '...",1897067588,image_id filename phrase obje...,1897067588,"[{""image_id"":""1897067588"",""filename"":""18970675..."
3,3,A girl in a white shirt is holding a ball and ...,What is the girl pointing at?,"{'girl': '27660', 'white': '27663', 'shirt': '...",208472767,image_id filename phrase object...,208472767,"[{""image_id"":""208472767"",""filename"":""208472767..."


In [29]:
all_token_bb_dict = []
for inx, row in processed_df_clean.iterrows():
    bb_info = row["bb_info"]
    bb_df = pd.DataFrame(eval(bb_info))
    labels = eval(row["expanded_labels"])
    token_bb_dict = {}
    for token, label in labels.items():
        f_bb_df = bb_df[bb_df["phrase"] == label]
        if f_bb_df.shape[0] == 0:
            token_bb_dict[token] = []
        else:
            token_bb_dict[token] = f_bb_df[["xmin", "ymin", "xmax", "ymax"]].to_dict(orient="records")
    
    all_token_bb_dict.append(token_bb_dict)

In [30]:
processed_df_clean.head(2)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df,db_image_id,bb_info
0,0,Three people are on a sidewalk.,How many people are on the sidewalk?,"{'Three': '4934', 'people': '4934', 'sidewalk'...",1191423753,image_id filename phrase obje...,1191423753,"[{""image_id"":""1191423753"",""filename"":""11914237..."
1,1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,"{'Beautiful': '9911', 'brunette': '9911', 'wom...",1360456780,image_id filename phrase objec...,1360456780,"[{""image_id"":""1360456780"",""filename"":""13604567..."


In [31]:
processed_df_clean["candidate_bb_info"] = all_token_bb_dict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_df_clean["candidate_bb_info"] = all_token_bb_dict


In [33]:
processed_df_clean.tail(2)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df,db_image_id,bb_info,candidate_bb_info
31711,31711,A little boys is enjoying a blue lollipop.,What is the boy eating?,"{'little': '249282', 'boys': '249282', 'blue':...",6178994097,image_id filename phrase obj...,6178994097,"[{""image_id"":""6178994097"",""filename"":""61789940...","{'little': [{'xmin': 4, 'ymin': 2, 'xmax': 333..."
31712,31712,A woman kneading dough on a table,What is the woman doing on the table?,"{'woman': '283159', 'dough': '283162', 'table'...",89779839,image_id filename phrase object_na...,89779839,"[{""image_id"":""89779839"",""filename"":""89779839.j...","{'woman': [{'xmin': 2, 'ymin': 28, 'xmax': 343..."


In [38]:
final_target_df = processed_df_clean.drop(columns=["db_image_id", "bb_info", "bb_df", "expanded_labels", "Unnamed: 0"])

In [39]:
final_target_df.head(2)

Unnamed: 0,caption,question,image_id,candidate_bb_info
0,Three people are on a sidewalk.,How many people are on the sidewalk?,1191423753,"{'Three': [{'xmin': 351, 'ymin': 177, 'xmax': ..."
1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,1360456780,"{'Beautiful': [{'xmin': 106, 'ymin': 38, 'xmax..."


In [41]:
final_target_df.to_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/final_flicker_30k_bb_annot.csv", index=False)