In [1]:
import numpy as np
import pandas as pd
import json
from google import genai

from google.oauth2.service_account import Credentials
from google.cloud import storage
import os


# OAuth scope requested for the service-account credentials.
scopes = ["https://www.googleapis.com/auth/cloud-platform"]
# NOTE(review): absolute, machine-specific path to a service-account key file;
# consider reading it from an environment variable instead of hardcoding it.
SERVICE_ACCOUNT_FILE = "/Data2/Arun-UAV/NLP/new_cloud_coount.json"
credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=scopes)

# Cloud Storage client; the upload/download helpers later in this notebook read it as a global.
client = storage.Client(credentials=credentials)

# GenAI client in Vertex AI mode, sharing the same credentials; used to create and poll the batch job.
gen_client = genai.Client(
    vertexai=True,
    project='third-apex-476512-a7',   # GCP project ID
    location='us-central1',    # Vertex AI region
    credentials=credentials
)

In [2]:
from pycocotools.coco import COCO
import os

def load_coco(annotation_json_path):
    """
    Build a pycocotools ``COCO`` object from an annotation JSON file.

    Raises:
        FileNotFoundError: if ``annotation_json_path`` does not exist.
    """
    annotation_exists = os.path.exists(annotation_json_path)
    if annotation_exists:
        return COCO(annotation_json_path)
    raise FileNotFoundError(f"Annotation file not found: {annotation_json_path}")

def xywh_to_xyxy(bxy):
    """
    Turn a COCO-style box [x, y, w, h] into corner form
    [xmin, ymin, xmax, ymax], each coordinate rounded to an int.
    """
    left, top, box_w, box_h = bxy
    corners = (left, top, left + box_w, top + box_h)
    return [int(round(value)) for value in corners]

def get_objects_for_images(coco, image_ids=None, file_names=None, include_segmentation=False):
    """
    Return object annotations for the given images (by image_ids or file_names).

    Args:
        coco: A loaded pycocotools ``COCO`` object.
        image_ids: Optional iterable of COCO image ids.
        file_names: Optional iterable of image file names, matched exactly
            against the ``file_name`` field of the image records.
        include_segmentation: When True, each annotation also carries its
            ``segmentation`` entry.

    Returns:
        A dict keyed by image id:
          {
            <image_id>: {
                "file_name": <str>,
                "width": <int>,
                "height": <int>,
                "annotations": [
                    {
                      "ann_id": <annotation_id>,
                      "category_id": <int>,
                      "category_name": <str>,
                      "bbox_xywh": [x,y,w,h],
                      "bbox_xyxy": [xmin,ymin,xmax,ymax],
                      "area": <float>,
                      "iscrowd": <0/1>,
                      "segmentation": <...> (optional)
                    }, ...
                ]
            }, ...
          }

    Raises:
        ValueError: if a file name matches no image, or if neither
            ``image_ids`` nor ``file_names`` yields any id.
    """
    ids = []
    if file_names:
        # COCO.getImgIds() only filters by image ids / category ids and has no
        # file_name argument, so build the file_name -> ids lookup from the
        # image records themselves.
        name_to_ids = {}
        for img in coco.loadImgs(coco.getImgIds()):
            name_to_ids.setdefault(img.get("file_name"), []).append(img["id"])
        for fn in file_names:
            matches = name_to_ids.get(fn)
            if not matches:
                raise ValueError(f"No image found with file_name '{fn}' in COCO annotations")
            ids.extend(matches)
    if image_ids:
        ids.extend(image_ids)
    if not ids:
        raise ValueError("Provide image_ids or file_names (or both)")

    out = {}
    cat_map = {c['id']: c['name'] for c in coco.loadCats(coco.getCatIds())}

    for img_id in ids:
        if img_id in out:
            # The same image was requested more than once (e.g. by id and by
            # file name); the result is keyed by id, so process it only once.
            continue
        img_info = coco.loadImgs(img_id)[0]
        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        ann_list = []
        for a in anns:
            obj = {
                "ann_id": a.get("id"),
                "category_id": a.get("category_id"),
                "category_name": cat_map.get(a.get("category_id"), "unknown"),
                "bbox_xywh": a.get("bbox"),  # [x,y,w,h]
                "bbox_xyxy": xywh_to_xyxy(a.get("bbox")),
                "area": a.get("area"),
                "iscrowd": a.get("iscrowd", 0),
            }
            if include_segmentation:
                obj["segmentation"] = a.get("segmentation")
            ann_list.append(obj)

        out[img_id] = {
            "file_name": img_info.get("file_name"),
            "width": img_info.get("width"),
            "height": img_info.get("height"),
            "annotations": ann_list
        }
    return out

# ----------------------------
# Example usage:
# ----------------------------

In [None]:

# Path to the COCO instance-annotation JSON (machine-specific absolute path).
ANNO = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/annotations/instances_train2017.json"   # change to your path
# Load the annotations once; later cells query this object per image.
coco = load_coco(ANNO)

In [50]:
# Load the Gemini-labelled rows; later cells read the image_id, question, answer,
# candidates and image_path columns from this frame.
final_df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/hal_detection_head_train_datasets/coco/gemini_labeld_15k.csv")

In [53]:
final_df.index

RangeIndex(start=0, stop=14574, step=1)

In [54]:
from tqdm import tqdm

# Per-row COCO annotations (all_object_info) and the distinct
# (object_id, object) category pairs later given to the prompt (all_req_info).
# Both lists hold one entry per successfully resolved row, in row order.
all_object_info = []
all_req_info = []
failed = []         # index labels of rows that could not be resolved
failed_errors = {}  # index label -> repr of the exception, for inspection
for inx, row in tqdm(final_df.iterrows(), total=final_df.shape[0]):
    try:
        # e.g. "COCO_train2014_000000321493.jpg" -> 321493. splitext removes
        # the extension as a suffix; str.strip(".jpg") would instead strip any
        # of the characters '.', 'j', 'p', 'g' from both ends of the name.
        stem = os.path.splitext(os.path.basename(row["image_id"]))[0]
        image_id = int(stem.split("_")[-1])
        results = get_objects_for_images(coco, image_ids=[image_id])
        object_df = pd.DataFrame(results[image_id]["annotations"])
        # An image without annotations yields a frame with no columns, so this
        # selection raises and the row is recorded as failed.
        req_df = object_df[["category_id", "category_name"]].drop_duplicates()
        req_df.columns = ["object_id", "object"]
        object_records = object_df.to_dict(orient="records")
        req_records = req_df.to_dict(orient="records")
    except Exception as e:
        failed.append(inx)
        failed_errors[inx] = repr(e)
        continue
    # Append only after everything succeeded so both lists stay in lockstep.
    all_object_info.append(object_records)
    all_req_info.append(req_records)

100%|██████████| 14574/14574 [00:27<00:00, 532.03it/s]


In [57]:
# Drop the rows whose COCO lookup failed so final_df lines up positionally with
# all_object_info / all_req_info (which only contain the successful rows).
# NOTE(review): drop() keeps the surviving index labels, yet the recorded output
# of the next cell shows a fresh RangeIndex - the saved outputs appear to come
# from a different execution order; confirm before relying on them.
final_df = final_df.drop(index=failed)

In [59]:
final_df.index 

RangeIndex(start=0, stop=14435, step=1)

In [60]:
# Attach the per-row annotation records and category summaries. List assignment
# is positional, so this requires the failed rows to have been dropped already.
final_df["object_info"] = all_object_info
final_df["req_info"] = all_req_info

In [85]:
final_df["answer"].nunique()

14435

# Batch request creation

In [62]:
from tqdm import tqdm

def get_prompt(question, answer, objects_info, candidate_tokens):
    """
    Render the grounding prompt sent to the model for one example.

    The four arguments are interpolated verbatim into the "Inputs" section at
    the end of the template. The "- Answer:" and "- Objects in Image:" markers
    are load-bearing: the result-parsing cell splits the echoed request text on
    them to recover the answer, so they must not be reworded.
    """
    return f"""
You are given:

* A **question** about an image
* Its **answer**
* A list of **candidate tokens** extracted from the answer.
  Each candidate token belongs to one of the following classes:
  **objects, attributes, relations, actions, count, decision tokens.**
* A set of **objects present in the image** with their unique **object IDs**.

Your task is to **assign a list of relevant object IDs** to each candidate token based on semantic meaning and contextual relevance in the answer.
Follow these rules carefully:

1. **Objects:** link to their exact object ID.
2. **Attributes:** assign the ID of the object that expresses this attribute (e.g., “red” → object that is red).
3. **Relations:** assign IDs of all objects involved (e.g., “on” → [table_id, cup_id]).
4. **Actions:** assign IDs of the object(s) performing or receiving the action.
5. **Count:** assign IDs of all objects being counted.
6. **Decision tokens (yes, no, true, false, exist, not, visible, etc.):** assign all relevant IDs if contextually grounded, else leave as `[-1]`.
7. If a candidate’s corresponding object class **does not appear** in the given object list, label it as `[-1]`.

Note: while matching objects, consider symantical meaning no need to be exact match

### 🧾 **Expected Output Format**

```json
[
  {{"word": "red", "class": "attribute", "object_ids": [12]}},
  {{"word": "on", "class": "relation", "object_ids": [5, 8]}},
  {{"word": "dog", "class": "object", "object_ids": [7]}},
  {{"word": "cat", "class": "object", "object_ids": [-1]}},
  {{"word": "three", "class": "count", "object_ids": [2, 3, 4]}},
  {{"word": "visible", "class": "decision_token", "object_ids": [-1]}}
]
```
<image_id>: 

Inputs:
- Question: {question}

- Answer: {answer}

- Objects in Image: {objects_info}

- Candidate Tokens: {candidate_tokens}
"""

# Build one batch request per row of final_df, in row order. Each request wraps
# the rendered prompt as a single user turn plus the generation settings.
all_res = []
for inx, row in tqdm(final_df.iterrows(), total=len(final_df)):
    question = row["question"]
    answer  = row["answer"]
    candidate_tokens = row["candidates"]
    object_info = row["req_info"]

    PROMPT = get_prompt(question, answer, object_info, candidate_tokens)

    # NOTE(review): "thinking_config"/"thinking_budget" are snake_case while the
    # sibling keys are camelCase; the recorded job succeeded with this payload,
    # but confirm the thinking budget is actually honoured by the service.
    res = {"request":{"contents": [{"role": "user", "parts": [{"text": PROMPT}]}], 
                      "generationConfig": {"temperature": 0.8, "topP": 1, "maxOutputTokens": 5000,"thinking_config":{"thinking_budget":1000}}}}
    all_res.append(res)

14435it [00:00, 15128.46it/s]


In [64]:
# Serialise the requests as JSON Lines: one object per line, each with a single
# "request" key. This file is the input of the batch job created further down.
df = pd.DataFrame(all_res)
df.to_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch.jsonl", orient="records", lines=True)

# Uploading the batch file to Google Cloud Storage (GCS)

In [65]:
def upload_to_gcs(local_path: str, gcs_uri: str):
    """
    Upload a local file to a target GCS URI.

    Uses the notebook-level ``client`` (google.cloud.storage.Client) created in
    the setup cell.

    Args:
        local_path (str): Local file path to upload.
        gcs_uri (str): Target GCS URI like 'gs://my-bucket/path/to/upload.txt'

    Raises:
        ValueError: if ``gcs_uri`` does not start with 'gs://' or lacks a
            bucket name or an object path.
    """
    scheme = "gs://"
    if not gcs_uri.startswith(scheme):
        raise ValueError("Invalid GCS URI. Must start with gs://")

    # Split "bucket/path/to/object" at the first slash. partition() never
    # raises, so a URI such as "gs://bucket" is reported as a ValueError below
    # instead of surfacing as an IndexError.
    bucket_name, _, blob_name = gcs_uri[len(scheme):].partition("/")
    if not bucket_name or not blob_name:
        raise ValueError(f"Invalid GCS URI. Expected gs://<bucket>/<object>, got: {gcs_uri}")

    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.upload_from_filename(local_path)

    print(f"✅ Uploaded {local_path} → {gcs_uri}")


In [66]:
# Upload the JSONL request file written above to the bucket path the batch job reads from.
upload_to_gcs(local_path="/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch.jsonl", gcs_uri = "gs://train_data_vision_1/gemini_batch_info/gemini_batch.jsonl")

✅ Uploaded /Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch.jsonl → gs://train_data_vision_1/gemini_batch_info/gemini_batch.jsonl


# start batch processing

In [68]:
import time

from google import genai
from google.genai.types import CreateBatchJobConfig, JobState, HttpOptions
# GCS prefix under which the batch job writes its results.
output_uri = "gs://train_data_vision_1/gemini_batch_info/"

# See the documentation: https://googleapis.github.io/python-genai/genai.html#genai.batches.Batches.create
# Submit a batch job that runs the model over every request in the uploaded JSONL file.
# NOTE(review): `src` repeats the URI used in the upload cell; keep the two in sync.
job = gen_client.batches.create(
    model="gemini-2.5-flash",
    src="gs://train_data_vision_1/gemini_batch_info/gemini_batch.jsonl",
    config=CreateBatchJobConfig(dest=output_uri),
)
# Show the job resource name (needed to poll it later) and its initial state.
print(f"Job name: {job.name}")
print(f"Job state: {job.state}")

Job name: projects/358874265041/locations/us-central1/batchPredictionJobs/8230525080061345792
Job state: JOB_STATE_PENDING


In [73]:
# Re-fetch the job by name and display its current state; re-run this cell
# manually until the state is terminal (the recorded output shows SUCCEEDED).
job_info = gen_client.batches.get(name=job.name)
job_info.state

<JobState.JOB_STATE_SUCCEEDED: 'JOB_STATE_SUCCEEDED'>

# download batch results

In [74]:
def download_from_gcs(gcs_uri: str, local_path: str):
    """
    Download a file from GCS based on its gs:// URI.

    Uses the notebook-level ``client`` (google.cloud.storage.Client) created in
    the setup cell. Missing parent directories of ``local_path`` are created.

    Args:
        gcs_uri (str): GCS URI like 'gs://my-bucket/path/to/file.txt'
        local_path (str): Path to store the downloaded file locally.

    Raises:
        ValueError: if ``gcs_uri`` does not start with 'gs://' or lacks a
            bucket name or an object path.
    """
    scheme = "gs://"
    if not gcs_uri.startswith(scheme):
        raise ValueError("Invalid GCS URI. Must start with gs://")

    # Split "bucket/path/to/object" at the first slash. partition() never
    # raises, so a URI such as "gs://bucket" is reported as a ValueError below
    # instead of surfacing as an IndexError.
    bucket_name, _, blob_name = gcs_uri[len(scheme):].partition("/")
    if not bucket_name or not blob_name:
        raise ValueError(f"Invalid GCS URI. Expected gs://<bucket>/<object>, got: {gcs_uri}")

    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # dirname() is "" for a bare file name, and os.makedirs("") raises, so only
    # create the directory when the path actually has one.
    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    blob.download_to_filename(local_path)

    print(f"✅ Downloaded {gcs_uri} → {local_path}")


In [75]:
# Fetch the job's predictions file. The timestamped "prediction-model-..." folder is
# specific to this particular job run and must be updated for any new job.
# NOTE(review): the local file name is spelled "gemini_btach_res"; the read cell uses the same spelling.
download_from_gcs(gcs_uri="gs://train_data_vision_1/gemini_batch_info/prediction-model-2025-10-30T11:36:37.902607Z/predictions.jsonl", local_path ="/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_btach_res.jsonl")

✅ Downloaded gs://train_data_vision_1/gemini_batch_info/prediction-model-2025-10-30T11:36:37.902607Z/predictions.jsonl → /Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_btach_res.jsonl


In [76]:
# Load the downloaded predictions (JSON Lines); per the displayed head, each row carries
# the echoed request, a status, the model response and a processed_time.
pred_data = pd.read_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_btach_res.jsonl", lines=True)

In [208]:
pred_data.head(5)

Unnamed: 0,request,status,response,processed_time
0,{'contents': [{'parts': [{'text': '\nYou are g...,,{'candidates': [{'avgLogprobs': -0.05581432005...,2025-10-30 11:43:00.463000+00:00
1,{'contents': [{'parts': [{'text': '\nYou are g...,,{'candidates': [{'avgLogprobs': -0.04954319887...,2025-10-30 11:43:00.482000+00:00
2,{'contents': [{'parts': [{'text': '\nYou are g...,,{'candidates': [{'avgLogprobs': -0.06469375922...,2025-10-30 11:43:00.492000+00:00
3,{'contents': [{'parts': [{'text': '\nYou are g...,,{'candidates': [{'avgLogprobs': -0.05808878598...,2025-10-30 11:43:00.488000+00:00
4,{'contents': [{'parts': [{'text': '\nYou are g...,,{'candidates': [{'avgLogprobs': -0.05786834847...,2025-10-30 11:43:00.531000+00:00


In [156]:
def get_word_2_bbox(res, objects_df):
    """
    Attach bounding boxes to each labelled word.

    For every entry of ``res`` (a dict with "word", "class" and "object_ids"),
    select the rows of ``objects_df`` whose "object_id" is listed in the entry
    and return their "h", "w", "y", "x" columns as a list of records under
    the "bbox" key.
    """
    box_columns = ["h", "w", "y", "x"]
    grounded = []
    for entry in res:
        wanted = objects_df["object_id"].isin(entry["object_ids"])
        boxes = objects_df.loc[wanted, box_columns].to_dict(orient="records")
        grounded.append(dict(
            [("word", entry["word"]), ("class", entry["class"]), ("bbox", boxes)]
        ))
    return grounded

In [89]:
final_df.head(2)

Unnamed: 0,image_id,question,answer,image_path,candidates,hallucination_candidates,object_info,req_info
0,COCO_train2014_000000321493.jpg,Please describe this image in detail.,"This outdoor scene captures a bright, clear da...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...,"['outdoor', 'bright', 'clear', 'day', 'sprawli...","['concentration', 'effort', 'overhead', 'behin...","[{'ann_id': 74336, 'category_id': 21, 'categor...","[{'object_id': 21, 'object': 'cow'}, {'object_..."
1,COCO_train2014_000000405541.jpg,Please describe this image in detail.,The indoor scene captures a striking Siamese c...,/Data2/Arun-UAV/NLP/vision_halu/train_datasets...,"['indoor', 'siamese', 'cat', 'snowshoe', 'loun...","['caramel-colored', 'fluffy', 'pink', 'art', '...","[{'ann_id': 19215, 'category_id': 64, 'categor...","[{'object_id': 64, 'object': 'potted plant'}, ..."


In [106]:
# Scratch cell: tabulate one parsed model response.
# NOTE(review): `res` is not defined by any cell above in reading order except as the
# last batch request dict; this cell depends on kernel state from an out-of-order run.
res_df = pd.DataFrame(res)

In [109]:
# Keep only words that were grounded, i.e. whose object_ids is not exactly [-1].
res_df = res_df[res_df["object_ids"].apply(lambda x: False if x == [-1] else True)]

In [190]:
def get_bb_info (res_df, object_df):
    """
    Collect, for every row of ``res_df``, the boxes of the objects it refers to.

    Each id in the row's "object_ids" is compared, as a string, with the
    "category_id" column of ``object_df``; the "bbox_xyxy" values of all
    matching rows are gathered in id order. Returns one list of boxes per row.
    """
    boxes_per_word = []
    for _, word_row in res_df.iterrows():
        boxes_per_word.append([
            box
            for obj_id in word_row["object_ids"]
            for box in object_df.loc[object_df["category_id"] == str(obj_id), "bbox_xyxy"].to_list()
        ])
    return boxes_per_word


In [191]:
from tqdm import tqdm

import ast
import re


def _parse_model_json(text):
    """Return the object encoded in a model reply, tolerating a Markdown code fence."""
    payload = text.strip()
    # Take the contents of a ```json ... ``` (or bare ``` ... ```) fence when
    # present. Only the fence is removed; the payload itself is left intact,
    # so a word that happens to contain "json" is not corrupted.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", payload, flags=re.DOTALL)
    if fenced:
        payload = fenced.group(1).strip()
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        # Fallback for Python-literal style output (e.g. single quotes).
        # literal_eval only accepts literals, unlike eval(), which would
        # execute arbitrary expressions found in the model output.
        return ast.literal_eval(payload)


# Map each answer text to the position of its first occurrence in final_df so
# every prediction is matched with a dict lookup instead of a full-frame scan.
# Keys are stripped because the answer recovered from the prompt is stripped.
answer_to_pos = {}
for pos, ans in enumerate(final_df["answer"]):
    key = ans.strip() if isinstance(ans, str) else ans
    answer_to_pos.setdefault(key, pos)

all_res = []
error = 0            # number of predictions that could not be processed
error_details = []   # (index label, repr of exception) for each failure
for inx, row in tqdm(pred_data.iterrows(), total=len(pred_data)):
    try:
        res = _parse_model_json(row["response"]["candidates"][0]["content"]["parts"][0]["text"])
        # Recover the answer from the echoed prompt; it sits between the
        # "- Answer:" and "- Objects in Image:" markers of the template.
        answer = row["request"]["contents"][0]["parts"][0]["text"].split("- Answer:")[-1].split("- Objects in Image:")[0].strip()
        req_info = final_df.iloc[answer_to_pos[answer]]
        question = req_info["question"]
        object_info = req_info["object_info"]
        object_df = pd.DataFrame(object_info)
        # get_bb_info compares category ids as strings.
        object_df["category_id"] = object_df["category_id"].apply(lambda x: str(int(x)))
        image_id = req_info["image_id"]
        image_path = req_info["image_path"]

        res_df = pd.DataFrame(res)
        # Drop ungrounded words ([-1]); copy so the column assignment below
        # writes to an independent frame rather than a view of the original.
        res_df = res_df[res_df["object_ids"].apply(lambda ids: ids != [-1])].copy()
        res_df["bb_info"] = get_bb_info(res_df, object_df)

        res_df = res_df[["word", "class", "bb_info"]]
        target_words = res_df.to_dict(orient="records")

        all_res.append({
            "image_id": image_id,
            "question": question,
            "answer": answer,
            "target_words": target_words,
            "image_path": image_path
        })
    except Exception as e:
        # Best-effort: skip this prediction, but keep the reason for inspection.
        error += 1
        error_details.append((inx, repr(e)))

14435it [03:31, 68.27it/s]


In [193]:
# Tabulate the grounded results for a quick look.
# NOTE(review): `df` was previously the batch-request frame; the name is reused here.
df = pd.DataFrame(all_res)

In [194]:
df.head(2)

Unnamed: 0,image_id,question,answer,target_words,image_path
0,COCO_train2014_000000557315.jpg,Please describe this image in detail.,This outdoor scene captures a black bear in wh...,"[{'word': 'black', 'class': 'attribute', 'bb_i...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...
1,COCO_train2014_000000106639.jpg,Please describe this image in detail.,"The image presents an inviting indoor scene, l...","[{'word': 'dining', 'class': 'attribute', 'bb_...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...


In [195]:
# Working copy of the grounded results that is cleaned and saved below.
total_res_df = pd.DataFrame(all_res)

In [196]:
# Remove rows with any missing field.
total_res_df = total_res_df.dropna()

In [202]:
def convert_bb_info_to_xyxy(data):
    """
    Rewrite each item's 'bb_info' boxes as explicit corner dicts.

    Args:
        data (list[dict]): Items shaped like
            {'word': str, 'class': str, 'bb_info': [[x1, y1, x2, y2], ...]}.
            Missing 'word'/'class' default to "" and missing 'bb_info' to [].

    Returns:
        list[dict]: One entry per item:
            {'word': str,
             'class': str,
             'bbox': [{'xmin':, 'ymin':, 'xmax':, 'ymax':}, ...]}
        Coordinates are passed through int(); a box that does not have exactly
        four values is replaced by an all-zero box.
    """
    corner_keys = ("xmin", "ymin", "xmax", "ymax")

    def _as_corner_dict(box):
        # Malformed boxes become a zero placeholder instead of raising.
        if len(box) != 4:
            return dict.fromkeys(corner_keys, 0)
        return dict(zip(corner_keys, map(int, box)))

    return [
        {
            "word": item.get("word", ""),
            "class": item.get("class", ""),
            "bbox": [_as_corner_dict(box) for box in item.get("bb_info", [])],
        }
        for item in data
    ]

In [203]:
from tqdm import tqdm
# Convert every row's target_words from [x1, y1, x2, y2] lists to xmin/ymin/xmax/ymax dicts.
converted_data = []
for i in tqdm(total_res_df["target_words"]):
    converted_data.append(convert_bb_info_to_xyxy(i))

100%|██████████| 13943/13943 [00:04<00:00, 3418.02it/s]


In [210]:
len(converted_data)

13943

In [211]:
# Replace the column with the converted boxes (same row order as the loop above).
total_res_df["target_words"] = converted_data

In [214]:
total_res_df.head(2)

Unnamed: 0,image_id,question,answer,target_words,image_path
0,COCO_train2014_000000557315.jpg,Please describe this image in detail.,This outdoor scene captures a black bear in wh...,"[{'word': 'black', 'class': 'attribute', 'bbox...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...
1,COCO_train2014_000000106639.jpg,Please describe this image in detail.,"The image presents an inviting indoor scene, l...","[{'word': 'dining', 'class': 'attribute', 'bbo...",/Data2/Arun-UAV/NLP/vision_halu/train_datasets...


In [215]:
# Persist the final table. Nested target_words values are written to CSV as their
# string representation, so readers must parse that column back into Python objects.
total_res_df.to_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/coco_long_captions/coco_img_des_10k_bb_annot.csv", index=False)

In [None]:
total_res_df

In [4]:
import pandas as pd
# Separate exploration: load the processed Flickr30k training table.
# NOTE(review): this rebinds `df`, which earlier cells used for the COCO results.
df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/train_processed_flickr_30k.csv")

In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df
0,0,Three people are on a sidewalk.,How many people are on the sidewalk?,"{'Three': '4934', 'people': '4934', 'sidewalk'...",1191423753,image_id filename phrase obje...
1,1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,"{'Beautiful': '9911', 'brunette': '9911', 'wom...",1360456780,image_id filename phrase objec...
2,2,A woman is welding metal at a work table.,What is the woman doing at the work table?,"{'woman': '22853', 'metal': '22855', 'work': '...",1897067588,image_id filename phrase obje...
3,3,A girl in a white shirt is holding a ball and ...,What is the girl pointing at?,"{'girl': '27660', 'white': '27663', 'shirt': '...",208472767,image_id filename phrase object...
4,4,"The large, dark colored dog is doing a climbin...","What is the large, dark-colored dog doing?","{'large': '32430', 'dark': '32430', 'colored':...",2176364472,image_id filename phrase obje...


In [None]:
# Spot-check one example (row position 10): its question, caption and expanded_labels mapping.
i = 10
print(df["question"].iloc[i])
print(df["caption"].iloc[i])
print(df["expanded_labels"].iloc[i])

Is there a person behind the street lamp?
The street lamp is the focal point, but there is a person bent down directly behind the lamp.
{'street': '218332', 'lamp': '218332', 'focal': '218334', 'point': '218334', 'person': '218336'}


In [None]:
df["bb_df"].iloc[i]

'          image_id        filename  phrase object_name  xmin  ymin  xmax  \\\n228801  4919450790  4919450790.jpg  218328      218328     2   208   500   \n228802  4919450790  4919450790.jpg  218329      218329     5   191   497   \n228803  4919450790  4919450790.jpg  218330      218330   299    29   500   \n228804  4919450790  4919450790.jpg  218332      218332   260    79   288   \n228805  4919450790  4919450790.jpg  218335      218335   262    81   282   \n228806  4919450790  4919450790.jpg  218336      218336   217   182   240   \n\n        ymax  width  height                                          sentences  \n228801   229    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228802   258    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228803   225    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228804   193    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228805   225    500     271  [[/EN#218326/scene

In [None]:
from io import StringIO
import pandas as pd

In [None]:
# bb_df is stored in the CSV as the printed text of a DataFrame; try to parse it
# back as fixed-width text.
# NOTE(review): the stored text wraps wide frames into several column blocks (see the
# "\" continuation in the displayed value) and truncates long cells with "...", so
# read_fwf may not recover the original columns - verify the parsed frame.
nested_str = df["bb_df"].iloc[i]
nested_df = pd.read_fwf(StringIO(nested_str))