In [3]:
import numpy as np
import pandas as pd
import json
from google import genai

from google.oauth2.service_account import Credentials
from google.cloud import storage
import os


# Authentication and client configuration.
# Every value can be overridden through the standard Google environment
# variables; the literals are the fallbacks used when a variable is unset, so
# the notebook behaves as before on the original machine.
scopes = ["https://www.googleapis.com/auth/cloud-platform"]
SERVICE_ACCOUNT_FILE = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Data2/Arun-UAV/NLP/new_cloud_coount.json",
)
credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=scopes)

# Cloud Storage client; the GCS upload/download helpers in this notebook read
# this module-level name.
client = storage.Client(credentials=credentials)

# Gen AI client routed through Vertex AI; used to submit and poll batch jobs.
gen_client = genai.Client(
    vertexai=True,
    project=os.environ.get("GOOGLE_CLOUD_PROJECT", "third-apex-476512-a7"),
    location=os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1"),
    credentials=credentials,
)

In [4]:
# Load the processed FineCops-Ref training captions; the preview below shows the
# columns image_id, answer (the caption text) and objects_info (stringified list of dicts).
final_df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/finecops_ref/train_processed_finecopes_ref_30k.csv")

In [5]:
final_df.head(5)

Unnamed: 0,image_id,answer,objects_info
0,2373626,"The dish, positioned to the right of the gray ...","[{'object_id': '3725798', 'names': ['dish'], '..."
1,285719,"On the left side of the metal and black fence,...","[{'object_id': '4401457', 'names': ['bike'], '..."
2,2383125,The post that is to the left of the car that i...,"[{'object_id': '534946', 'names': ['car'], 'h'..."
3,2349520,The clock that is to the left of the tower tha...,"[{'object_id': '3903072', 'names': ['roof'], '..."
4,2352548,The white keyboard positioned to the right of ...,"[{'object_id': '1807596', 'names': ['computer'..."


In [6]:
final_df.head(2)

Unnamed: 0,image_id,answer,objects_info
0,2373626,"The dish, positioned to the right of the gray ...","[{'object_id': '3725798', 'names': ['dish'], '..."
1,285719,"On the left side of the metal and black fence,...","[{'object_id': '4401457', 'names': ['bike'], '..."


In [7]:
# Keep one row per distinct caption. The post-processing loop later in this
# notebook looks rows up by their `answer` text, so captions must be unique.
# NOTE: this rebinds `final_df`; re-run the load cell to get the raw frame back.
final_df = final_df.drop_duplicates(subset=["answer"])

In [None]:
final_df["answer"]

29839

# Batch request creation

In [48]:
from tqdm import tqdm

def get_prompt(caption, labels):
  """Build the instruction prompt sent to the model for one caption.

  The prompt asks for two things in a single JSON reply: one natural question
  about the image described by the caption, and the caption's semantically
  important words grouped by category, each linked to object IDs.

  Args:
      caption: Caption text, interpolated verbatim after "Caption:".
      labels: Object descriptors, interpolated via their string form after
          "Objects with IDs:". The caller in this notebook passes a list of
          dicts with the keys 'names' and 'object_id'.

  Returns:
      str: The complete prompt text.

  Note:
      The example JSON in the prompt only shows the keys question, attributes,
      relations, actions, count and decision_tokens, although the category
      list above it also names "objects" and "scene/context".
  """
  prompt = f"""
You are an expert visual reasoning model analyzing captions and scene descriptions.

---

#### **Task 1 — Question Generation**

Given a **caption** that describes an image, generate **one natural, human-like question** that someone might ask about that image based on the caption.
The question must:

* Be **relevant**, **contextual**, and **grammatically natural**.
* Avoid generic phrasing (“What is shown here?”).
* Focus on salient visual content, relationships, or actions implied in the caption.

---

#### **Task 2 — Semantic Word Extraction**

You are also given a list of **objects with their IDs** that exist in the image.

Analyze the same caption and extract all **meaningful, semantically important words** that can be linked to the given object IDs.
Each extracted word must be categorized into one of the following classes:

* **objects** — tangible entities (e.g., car, tree, bowl)
* **attributes** — visual or descriptive traits (e.g., red, shiny, tall)
* **relations** — spatial or functional links (e.g., on, behind, holding)
* **actions** — verbs implying activity (e.g., running, sitting)
* **count** — numerical or quantity words (e.g., two, many)
* **scene/context** — setting or environment (e.g., street, kitchen)
* **decision tokens** — logical or binary cues (yes, no, visible, present, absent)

Only include words that:

* **Appear in or are implied by the caption**, and
* **Can be associated with at least one given object ID.**

---

### 🧾 **Expected JSON Output Format**

```json
{{
  "question": "What is the man holding on the table?",
  "attributes": [
    {{"word": "gray", "objects_involved": ["2428018"]}},
    {{"word": "red", "objects_involved": ["2235351"]}}
  ],
  "relations": [
    {{"word": "on", "objects_involved": ["2428018", "3725798"]}}
  ],
  "actions": [],
  "count": [],
  "decision_tokens": []
}}
```

Inputs:
Caption: {caption}
Objects with IDs: {labels}
"""
  return prompt

import ast  # imported in-cell so this cell still runs on its own after a kernel restart

# Build one batch-prediction request per caption. Each entry follows the
# {"request": {...}} layout that is written to the batch JSONL file.
all_res = []
for inx, row in tqdm(final_df.iterrows(), total=len(final_df)):
    answer = row["answer"]
    # `objects_info` is a stringified Python list of dicts. literal_eval parses
    # literals only, so unlike eval() it cannot execute code embedded in the CSV.
    objects = ast.literal_eval(row["objects_info"])
    # Only the object names and IDs are exposed to the model.
    labels = pd.DataFrame(objects)[["names", "object_id"]].to_dict(orient="records")
    PROMPT = get_prompt(answer, labels=labels)
    res = {
        "request": {
            "contents": [{"role": "user", "parts": [{"text": PROMPT}]}],
            "generationConfig": {
                "temperature": 0.8,
                "topP": 1,
                "maxOutputTokens": 2000,
                "thinking_config": {"thinking_budget": 1000},
            },
        }
    }
    all_res.append(res)

29839it [00:23, 1252.05it/s]


In [49]:
len(all_res)

29839

In [51]:
# Serialise the requests as JSON Lines: one {"request": ...} object per line.
df = pd.DataFrame(all_res)
df.to_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch.jsonl", orient="records", lines=True)

# Uploading files to GCP (Cloud Storage)

In [55]:
def upload_to_gcs(local_path: str, gcs_uri: str):
    """
    Upload a local file to a target GCS URI.

    Uses the module-level ``client`` (Cloud Storage client) for the transfer.

    Args:
        local_path (str): Local file path to upload.
        gcs_uri (str): Target GCS URI like 'gs://my-bucket/path/to/upload.txt'

    Raises:
        ValueError: If ``gcs_uri`` does not start with 'gs://' or lacks a
            bucket name or an object path.
    """
    if not gcs_uri.startswith("gs://"):
        raise ValueError("Invalid GCS URI. Must start with gs://")

    # Split "bucket/path/to/object" at the first slash only; the object name
    # may itself contain slashes.
    bucket_name, _, blob_name = gcs_uri[len("gs://"):].partition("/")
    if not bucket_name or not blob_name:
        raise ValueError(
            f"Invalid GCS URI {gcs_uri!r}. Expected gs://<bucket>/<object path>"
        )

    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.upload_from_filename(local_path)

    print(f"✅ Uploaded {local_path} → {gcs_uri}")


In [56]:
# Push the request file to the bucket path that the batch job reads as its source.
upload_to_gcs(local_path="/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch.jsonl", gcs_uri = "gs://train_data_vision_1/gemini_batch_info/gemini_batch.jsonl")

✅ Uploaded /Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch.jsonl → gs://train_data_vision_1/gemini_batch_info/gemini_batch.jsonl


In [None]:
# start batch processing

# start batch processing

In [57]:
import time

from google import genai
from google.genai.types import CreateBatchJobConfig, JobState, HttpOptions
# Destination prefix for the batch results (the same folder that holds the input JSONL).
output_uri = "gs://train_data_vision_1/gemini_batch_info/"

# See the documentation: https://googleapis.github.io/python-genai/genai.html#genai.batches.Batches.create
# Submit the uploaded request file as a batch job. The call returns as soon as
# the job is registered (the recorded state below is JOB_STATE_PENDING), so the
# job has to be polled separately. Re-running this cell submits a new job.
job = gen_client.batches.create(
    model="gemini-2.5-flash",
    src="gs://train_data_vision_1/gemini_batch_info/gemini_batch.jsonl",
    config=CreateBatchJobConfig(dest=output_uri),
)
print(f"Job name: {job.name}")
print(f"Job state: {job.state}")

Job name: projects/358874265041/locations/us-central1/batchPredictionJobs/4931218339570647040
Job state: JOB_STATE_PENDING


In [1]:
# Fetch the current state of the submitted batch job (single poll, no waiting).
# Requires `gen_client` (setup cell) and `job` (submission cell) in the current
# kernel; the NameError recorded below comes from running this cell without them
# (note the In [1] execution count).
job_info = gen_client.batches.get(name=job.name)
job_info.state

NameError: name 'gen_client' is not defined

# Download batch results

In [11]:
def download_from_gcs(gcs_uri: str, local_path: str):
    """
    Download a file from GCS based on its gs:// URI.

    Uses the module-level ``client`` (Cloud Storage client) for the transfer.
    The parent directory of ``local_path`` is created when it does not exist.

    Args:
        gcs_uri (str): GCS URI like 'gs://my-bucket/path/to/file.txt'
        local_path (str): Path to store the downloaded file locally.

    Raises:
        ValueError: If ``gcs_uri`` does not start with 'gs://' or lacks a
            bucket name or an object path.
    """
    if not gcs_uri.startswith("gs://"):
        raise ValueError("Invalid GCS URI. Must start with gs://")

    # Split "bucket/path/to/object" at the first slash only; the object name
    # may itself contain slashes.
    bucket_name, _, blob_name = gcs_uri[len("gs://"):].partition("/")
    if not bucket_name or not blob_name:
        raise ValueError(
            f"Invalid GCS URI {gcs_uri!r}. Expected gs://<bucket>/<object path>"
        )

    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # dirname is "" for a bare file name, and os.makedirs("") raises, so only
    # create the directory when the path actually has one.
    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    blob.download_to_filename(local_path)

    print(f"✅ Downloaded {gcs_uri} → {local_path}")


In [12]:
# Fetch the predictions file of one finished batch run. The timestamped
# "prediction-model-..." folder in the URI is specific to that run — presumably
# it is generated per job, so update it when downloading results of another job.
download_from_gcs(gcs_uri="gs://train_data_vision_1/gemini_batch_info/prediction-model-2025-10-28T18:37:13.007798Z/predictions.jsonl", local_path ="/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_btach_res.jsonl")

✅ Downloaded gs://train_data_vision_1/gemini_batch_info/prediction-model-2025-10-28T18:37:13.007798Z/predictions.jsonl → /Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_btach_res.jsonl


In [13]:
# Load the downloaded predictions (JSON Lines). The preview below shows the
# columns request, status, response and processed_time.
pred_data = pd.read_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_btach_res.jsonl", lines=True)

In [14]:
pred_data.head(5)

Unnamed: 0,request,status,response,processed_time
0,{'contents': [{'parts': [{'text': '\nYou are a...,,{'candidates': [{'avgLogprobs': -1.20831563849...,2025-10-28 18:43:47.966000+00:00
1,{'contents': [{'parts': [{'text': '\nYou are a...,,{'candidates': [{'avgLogprobs': -1.15532089992...,2025-10-28 18:43:47.980000+00:00
2,{'contents': [{'parts': [{'text': '\nYou are a...,,{'candidates': [{'avgLogprobs': -0.88970524249...,2025-10-28 18:43:47.973000+00:00
3,{'contents': [{'parts': [{'text': '\nYou are a...,,{'candidates': [{'avgLogprobs': -0.90098913092...,2025-10-28 18:43:47.983000+00:00
4,{'contents': [{'parts': [{'text': '\nYou are a...,,{'candidates': [{'avgLogprobs': -0.67929188992...,2025-10-28 18:43:47.985000+00:00


In [18]:
import ast   # imported in-cell so this cell runs on its own after a kernel restart
import json

# Parse the model reply of one prediction row (`row` must already hold a row of
# `pred_data`). The reply is JSON, optionally wrapped in a ```json ... ``` fence.
# eval() is avoided on purpose: the text is model output, and eval() also fails
# on the JSON literals true / false / null.
raw_text = row["response"]["candidates"][0]["content"]["parts"][0]["text"]
payload = raw_text.strip().strip("`").strip()
if payload.startswith("json"):
    # Drop only the fence's language tag, not every "json" substring in the reply.
    payload = payload[len("json"):]
try:
    res = json.loads(payload)
except json.JSONDecodeError:
    # Fall back for replies written as Python literals (single-quoted strings).
    res = ast.literal_eval(payload.strip())

In [19]:
res

{'question': 'Where is the woman located in relation to the man?',
 'attributes': [{'word': 'walking', 'objects_involved': ['150916']}],
 'relations': [{'word': 'left of', 'objects_involved': ['150910', '150924']},
  {'word': 'left of', 'objects_involved': ['150924', '150916']}],
 'actions': [],
 'count': [],
 'decision_tokens': []}

In [27]:
res

{'question': 'Where is the woman located in relation to the man?',
 'attributes': [{'word': 'walking', 'objects_involved': ['150916']}],
 'relations': [{'word': 'left of', 'objects_involved': ['150910', '150924']},
  {'word': 'left of', 'objects_involved': ['150924', '150916']}],
 'actions': [],
 'count': [],
 'decision_tokens': []}

In [28]:
question

'Where is the woman located in relation to the man?'

In [29]:
caption

'Situated to the left of the building, which is to the left of the walking man, is the woman.'

In [42]:
objects_df

Unnamed: 0,object_id,names,h,w,y,x
0,150924,[building],343,80,7,57
1,150916,[man],91,44,246,220
2,150910,[woman],208,88,258,0


In [45]:
# One row per extracted attribute word, with the columns word and objects_involved.
attr_df = pd.DataFrame(res["attributes"])

In [48]:
objects_df

Unnamed: 0,object_id,names,h,w,y,x
0,150924,[building],343,80,7,57
1,150916,[man],91,44,246,220
2,150910,[woman],208,88,258,0


In [49]:
attr_df

Unnamed: 0,word,objects_involved
0,walking,[150916]


In [62]:
def get_word_2_bbox(target_df, objects_df):
    """Attach a bounding box to every word in ``target_df``.

    For each row, the objects listed in ``objects_involved`` are looked up in
    ``objects_df`` by ``object_id`` and the box of the first match is used.
    "First" means first in ``objects_df`` row order, not in the order of the
    IDs, so a word linked to several objects still yields a single box.

    Words whose IDs match no row of ``objects_df`` are skipped instead of
    raising, so one unknown ID does not discard the remaining words.

    Args:
        target_df: DataFrame with the columns 'word' and 'objects_involved'
            (a list of object IDs; a single ID given as a bare string is
            also accepted).
        objects_df: DataFrame with the columns 'object_id', 'h', 'w', 'y', 'x'.

    Returns:
        list[dict]: One dict per matched word with the keys
        'word', 'h', 'w', 'y', 'x'.
    """
    bbox_columns = ["h", "w", "y", "x"]
    all_bbox_map = []
    for _, entry in target_df.iterrows():
        obj_ids = entry["objects_involved"]
        if isinstance(obj_ids, str):
            # isin() only accepts list-likes; treat a bare string as a single ID.
            obj_ids = [obj_ids]

        matches = objects_df.loc[objects_df["object_id"].isin(obj_ids), bbox_columns]
        if matches.empty:
            # The word references IDs that are not among the image's objects.
            continue

        bbox_map = {"word": entry["word"]}
        bbox_map.update(matches.to_dict(orient="records")[0])
        all_bbox_map.append(bbox_map)
    return all_bbox_map

In [66]:
target_words

[[{'word': 'walking', 'h': 91, 'w': 44, 'y': 246, 'x': 220}],
 [{'word': 'left of', 'h': 343, 'w': 80, 'y': 7, 'x': 57},
  {'word': 'left of', 'h': 343, 'w': 80, 'y': 7, 'x': 57}]]

In [83]:
target_words

[{'word': 'building', 'h': 343, 'w': 80, 'y': 7, 'x': 57},
 {'word': 'man', 'h': 91, 'w': 44, 'y': 246, 'x': 220},
 {'word': 'woman', 'h': 208, 'w': 88, 'y': 258, 'x': 0},
 {'word': 'walking', 'h': 91, 'w': 44, 'y': 246, 'x': 220},
 {'word': 'left of', 'h': 343, 'w': 80, 'y': 7, 'x': 57},
 {'word': 'left of', 'h': 343, 'w': 80, 'y': 7, 'x': 57}]

In [85]:
from tqdm import tqdm

import ast   # imported in-cell so this cell runs on its own after a kernel restart
import json


def _parse_model_json(text):
    """Parse a model reply that is JSON, optionally wrapped in a ```json fence."""
    payload = text.strip().strip("`").strip()
    if payload.startswith("json"):
        # Drop only the fence's language tag, not every "json" substring.
        payload = payload[len("json"):]
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        # Fall back for replies written as Python literals (single-quoted strings).
        return ast.literal_eval(payload.strip())


# Response categories whose words are mapped to bounding boxes.
WORD_CATEGORIES = ("attributes", "relations", "count", "decision_tokens")

# caption -> {"image_id", "objects_info"}; built once so each prediction does a
# dictionary lookup instead of scanning `final_df`. The first row wins when a
# caption occurs more than once.
caption_lookup = (
    final_df.drop_duplicates(subset=["answer"])
    .set_index("answer")[["image_id", "objects_info"]]
    .to_dict(orient="index")
)

all_res = []
error = 0  # counts skipped predictions and skipped word categories
for inx, row in tqdm(pred_data.iterrows(), total=len(pred_data)):

    try:
        res = _parse_model_json(row["response"]["candidates"][0]["content"]["parts"][0]["text"])
        question = res["question"]
        # Recover the caption from the prompt text that was sent with the request.
        caption = row["request"]["contents"][0]["parts"][0]["text"].split("Caption:")[-1].split("Objects with IDs:")[0].strip()
        req_info = caption_lookup[caption]  # KeyError (-> skipped) for an unknown caption
        img_id = str(int(req_info["image_id"]))
        # Stored as a stringified Python list of dicts; literal_eval parses it
        # without executing code the way eval() would.
        objects_df = pd.DataFrame(ast.literal_eval(req_info["objects_info"]))
        objects_df["names"] = objects_df["names"].apply(lambda x: x[0])

        # Every object contributes its own name as a target word.
        target_words = (
            objects_df[["names", "h", "w", "y", "x"]]
            .rename(columns={"names": "word"})
            .to_dict(orient="records")
        )

        for category in WORD_CATEGORIES:
            # A malformed category is counted and skipped without discarding
            # the rest of the sample.
            try:
                category_df = pd.DataFrame(res[category])
                if category_df.shape[0] > 0:
                    category_df = category_df[category_df["objects_involved"].apply(lambda x: len(x) != 0)]
                    # Extend inside the `if` so words computed for a previous
                    # row can never be appended to this one.
                    target_words.extend(get_word_2_bbox(category_df, objects_df))
            except Exception:
                error += 1

        all_res.append({
            "image_id": img_id,
            "question": question,
            "caption": caption,
            "target_words": target_words
        })

    except Exception:
        error += 1

29839it [03:59, 124.84it/s]


In [87]:
len(all_res)

29831

In [88]:
# One row per successfully processed prediction: image_id, question, caption, target_words.
total_res_df = pd.DataFrame(all_res)

In [91]:
# Drop rows with any missing value. This rebinds `total_res_df`, so re-running
# the cell works on the already-filtered frame.
total_res_df = total_res_df.dropna()

In [94]:
# NOTE: this replaces the in-memory `total_res_df` built from `all_res` with the
# contents of a previously saved annotation file.
total_res_df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/finecops_ref/final_finecopes_ref_30k_bb_annot.csv")

In [None]:
len(all_res)

31713

In [None]:
result_df.iloc[0]["bb_df"]

'          image_id        filename phrase object_name  xmin  ymin  xmax  ymax  \\\n194344  1191423753  1191423753.jpg   4934        4934   351   177   405   298   \n194345  1191423753  1191423753.jpg   4934        4934    63   223   103   349   \n194346  1191423753  1191423753.jpg   4934        4934    11   215    78   360   \n194347  1191423753  1191423753.jpg   4935        4935     1     1   346   344   \n194348  1191423753  1191423753.jpg   4936        4936   268   165   317   276   \n194349  1191423753  1191423753.jpg   4937        4937   222   179   263   307   \n194350  1191423753  1191423753.jpg   4937        4937   117   179   178   320   \n194351  1191423753  1191423753.jpg   4937        4937   161   187   198   309   \n194352  1191423753  1191423753.jpg   4937        4937   192   177   228   313   \n194353  1191423753  1191423753.jpg   4937        4937   264   154   322   284   \n194354  1191423753  1191423753.jpg   4938        4938   312   196   348   249   \n194355  119142

In [4]:
import pandas as pd
# Load the processed Flickr30k training set. This reuses the name `df`, which an
# earlier cell bound to the batch-request frame.
df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/evidence_head_train_datasets/flicker/train_processed_flickr_30k.csv")

In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,caption,question,expanded_labels,image_id,bb_df
0,0,Three people are on a sidewalk.,How many people are on the sidewalk?,"{'Three': '4934', 'people': '4934', 'sidewalk'...",1191423753,image_id filename phrase obje...
1,1,"Beautiful brunette woman, draped in purple, bl...",What colors are the scarves draped around the ...,"{'Beautiful': '9911', 'brunette': '9911', 'wom...",1360456780,image_id filename phrase objec...
2,2,A woman is welding metal at a work table.,What is the woman doing at the work table?,"{'woman': '22853', 'metal': '22855', 'work': '...",1897067588,image_id filename phrase obje...
3,3,A girl in a white shirt is holding a ball and ...,What is the girl pointing at?,"{'girl': '27660', 'white': '27663', 'shirt': '...",208472767,image_id filename phrase object...
4,4,"The large, dark colored dog is doing a climbin...","What is the large, dark-colored dog doing?","{'large': '32430', 'dark': '32430', 'colored':...",2176364472,image_id filename phrase obje...


In [None]:
# Spot-check one sample: print its question, its caption and its word -> id
# mapping (the ids look like values of the `phrase` column in `bb_df` — TODO confirm).
i = 10
print(df["question"].iloc[i])
print(df["caption"].iloc[i])
print(df["expanded_labels"].iloc[i])

Is there a person behind the street lamp?
The street lamp is the focal point, but there is a person bent down directly behind the lamp.
{'street': '218332', 'lamp': '218332', 'focal': '218334', 'point': '218334', 'person': '218336'}


In [None]:
df["bb_df"].iloc[i]

'          image_id        filename  phrase object_name  xmin  ymin  xmax  \\\n228801  4919450790  4919450790.jpg  218328      218328     2   208   500   \n228802  4919450790  4919450790.jpg  218329      218329     5   191   497   \n228803  4919450790  4919450790.jpg  218330      218330   299    29   500   \n228804  4919450790  4919450790.jpg  218332      218332   260    79   288   \n228805  4919450790  4919450790.jpg  218335      218335   262    81   282   \n228806  4919450790  4919450790.jpg  218336      218336   217   182   240   \n\n        ymax  width  height                                          sentences  \n228801   229    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228802   258    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228803   225    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228804   193    500     271  [[/EN#218326/scene A city] with [/EN#218330/sc...  \n228805   225    500     271  [[/EN#218326/scene

In [None]:
from io import StringIO
import pandas as pd

In [None]:
# `bb_df` holds the printed text of a DataFrame, so it is re-parsed here as
# fixed-width text. Uses the sample index `i` set in an earlier cell.
# NOTE(review): the stored text wraps wide frames into several column chunks
# (see the "\\" line continuations in the output above) — confirm that read_fwf
# recovers every column before relying on `nested_df`.
nested_str = df["bb_df"].iloc[i]
nested_df = pd.read_fwf(StringIO(nested_str))