In [34]:
import numpy as np
import pandas as pd
from typing import List
from pydantic import BaseModel, Field
import os
from google import genai
from google.oauth2.service_account import Credentials
from google.genai import types
import cv2
from PIL import Image
import io


In [2]:
scopes = ["https://www.googleapis.com/auth/cloud-platform"]

SERVICE_ACCOUNT_FILE = "/Data2/Arun-UAV/NLP/self-halu-detection/vertix_ai.json"

credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=scopes)

client = genai.Client(
    vertexai=True,
    project='hazel-math-472314-h9',   # or set directly
    location='us-central1',    # or set directly, e.g. "us-central1"
    credentials=credentials
)


class ImageDescription(BaseModel):
    image_description: str = Field(
        ...,
        description="Detailed description of the given image."
    )


prompt = """
You are specialist in rich and precise scene understanding.
Given an input image, generate a comprehensive, contextually aware, and fluent description that captures all key visual elements, their relationships, emotions, and possible context or story.

Your description should go beyond short captions — it must resemble a paragraph of visual storytelling that includes:

Scene type: indoor/outdoor, environment, lighting, time of day

Objects and entities: names, counts, shapes, colors, materials

Actions and interactions: what the people or objects are doing

Spatial layout: foreground, background, relative positions

Emotions or atmosphere: tone, mood, aesthetics

Possible context: what might be happening or implied by the scene

Avoid generic or repetitive statements. Be vivid, factual, and coherent. Use natural language instead of bullet points.

Output json Format:

{image_description: <full attached image description>}

"""


def get_image_des(image_path):
    
    image = cv2.imread(image_path)
    
    _, total_encoded_img = cv2.imencode(".jpg", image)
    total_img_bytes = io.BytesIO(total_encoded_img.tobytes()).getvalue()

    image_contents = [PROMPT, types.Part.from_bytes(data=total_img_bytes, mime_type="image/jpeg")]

    structured_generation_config = types.GenerateContentConfig(
        temperature=0.6,
        top_p=1.0,
        top_k=32,
        candidate_count=1,
        max_output_tokens=65535,
        response_schema=ImageDescription,
        response_mime_type="application/json",
        thinking_config=types. ThinkingConfig(thinking_budget=0),
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=image_contents,
        config=structured_generation_config,
    )
    
    return response.parsed.model_dump()


coco_annot_data = pd.read_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/coco_sample_5000.json")
img_list = os.listdir("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images")


all_img_des = []
for inx, row in coco_annot_data.iterrows():
    img_file_name = row["file_name"]
    img_path = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images"+ "/" + img_file_name
    img_des = get_image_des(img_path)
    all_img_des.append(img_des)

NameError: name 'PROMPT' is not defined

In [None]:
all_img_des = []
for inx, row in coco_annot_data.iterrows():
    img_file_name = row["file_name"]
    if img_file_name in img_list:
        img_path = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images"+ "/" + img_file_name
        img_des = get_image_des(img_path)
        all_img_des.append(img_des)

In [44]:

img_file_name in img_list

True

In [8]:
scopes = ["https://www.googleapis.com/auth/cloud-platform"]

SERVICE_ACCOUNT_FILE = "/Data2/Arun-UAV/NLP/self-halu-detection/vertix_ai.json"

credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=scopes)

client = genai.Client(
    vertexai=True,
    project='hazel-math-472314-h9',   # or set directly
    location='us-central1',    # or set directly, e.g. "us-central1"
    credentials=credentials
)

In [12]:
class ImagePatchDescription(BaseModel):
    total_image_description: str = Field(
        ...,
        description="Detailed description of the entire image (global context)."
    )
    top_left_description: str = Field(
        ...,
        description="Detailed description of the top-left patch."
    )
    top_right_description: str = Field(
        ...,
        description="Detailed description of the top-right patch."
    )
    bottom_left_description: str = Field(
        ...,
        description="Detailed description of the bottom-left patch."
    )
    bottom_right_description: str = Field(
        ...,
        description="Detailed description of the bottom-right patch."
    )
    
class ImageDescription(BaseModel):
    image_description: str = Field(
        ...,
        description="Detailed description of the given image."
    )

In [13]:
PROMPT = """
You are a multimodal language model that receives one **total image** and its four **quadrant patches** (non-overlapping, obtained by bisecting the image horizontally and vertically). Your task is to generate **detailed, human-readable descriptions** of:

1. The **entire image** (holistic/global view).
2. Each **patch individually**, while using the total image only for context.

**Rules:**

* **Global Description** → capture layout, objects, relationships, background etc.. like every detail
* **Patch Descriptions** → focus **only on that patch**.

  * Be precise: mention objects, colors, textures, shapes, activities, etc without missing any key details.
  * Optionally relate to global context (e.g., “this is the left side of the building visible in the full image”).
* Output must be **strictly valid JSON**.

**Output Format (strict JSON):**

```json
{{
   "total_image_description": "<detailed description of the entire image>",
   "top_left_description": "<detailed description of the top-left patch>",
   "top_right_description": "<detailed description of the top-right patch>",
   "bottom_left_description": "<detailed description of the bottom-left patch>",
   "bottom_right_description": "<detailed description of the bottom-right patch>"
}}

```
"""

In [15]:
prompt = """
You are specialist in rich and precise scene understanding.
Given an input image, generate a comprehensive, contextually aware, and fluent description that captures all key visual elements, their relationships, emotions, and possible context or story.

Your description should go beyond short captions — it must resemble a paragraph of visual storytelling that includes:

Scene type: indoor/outdoor, environment, lighting, time of day

Objects and entities: names, counts, shapes, colors, materials

Actions and interactions: what the people or objects are doing

Spatial layout: foreground, background, relative positions

Emotions or atmosphere: tone, mood, aesthetics

Possible context: what might be happening or implied by the scene

Avoid generic or repetitive statements. Be vivid, factual, and coherent. Use natural language instead of bullet points.

Output json Format:

{image_description: <full attached image description>}

"""

In [16]:
def get_image_des_patches(image_path):
    
    image = cv2.imread(image_path)
    h, w, c = image.shape

    # Compute mid points
    mid_h, mid_w = h // 2, w // 2

    # Split into 4 sub-images
    top_left     = image[0:mid_h, 0:mid_w]
    top_right    = image[0:mid_h, mid_w:w]
    bottom_left  = image[mid_h:h, 0:mid_w]
    bottom_right = image[mid_h:h, mid_w:w]

    # Function to convert patch into JPEG bytes
    def to_jpeg_bytes(patch):
        success, encoded_img = cv2.imencode(".jpg", patch)
        if not success:
            raise ValueError("Encoding to JPEG failed!")
        img_bytes = io.BytesIO(encoded_img.tobytes()).getvalue()
        return img_bytes

    _, total_encoded_img = cv2.imencode(".jpg", image)
    total_img_bytes = io.BytesIO(total_encoded_img.tobytes()).getvalue()

    image_contents = [types.Part.from_bytes(data=total_img_bytes, mime_type="image/jpeg")]
    patch_bytes = {
        "top_left": to_jpeg_bytes(top_left),
        "top_right": to_jpeg_bytes(top_right),
        "bottom_left": to_jpeg_bytes(bottom_left),
        "bottom_right": to_jpeg_bytes(bottom_right)
    }

    for key,value in patch_bytes.items():
        image_contents.append(types.Part.from_bytes(data=value, mime_type="image/jpeg"))

    image_contents.insert(0, PROMPT)

    structured_generation_config = types.GenerateContentConfig(
        temperature=0.6,
        top_p=1.0,
        top_k=32,
        candidate_count=1,
        max_output_tokens=65535,
        response_schema=ImagePatchDescription,
        response_mime_type="application/json",
        thinking_config=types.ThinkingConfig(thinking_budget=0),
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=image_contents,
        config=structured_generation_config,
    )
    
    return response.parsed.model_dump()


def get_image_des(image_path):
    
    image = cv2.imread(image_path)
    
    _, total_encoded_img = cv2.imencode(".jpg", image)
    total_img_bytes = io.BytesIO(total_encoded_img.tobytes()).getvalue()

    image_contents = [PROMPT, types.Part.from_bytes(data=total_img_bytes, mime_type="image/jpeg")]

    structured_generation_config = types.GenerateContentConfig(
        temperature=0.6,
        top_p=1.0,
        top_k=32,
        candidate_count=1,
        max_output_tokens=65535,
        response_schema=ImageDescription,
        response_mime_type="application/json",
        thinking_config=types. ThinkingConfig(thinking_budget=0),
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=image_contents,
        config=structured_generation_config,
    )
    
    return response.parsed.model_dump()

In [58]:
from tqdm import tqdm 
import json

# all_des = []
all_lines = []
with open("/Data2/Arun-UAV/NLP/vision_halu/Deco/opera_log/llava-1.5/ours.jsonl", "r", encoding="utf-8") as f:
    for _, data_line in tqdm(enumerate(f.readlines()), total=500):
        line = json.loads(data_line)
        all_lines.append(line) 
        # idx = line["image_id"]
        # image_file = "/Data2/Arun-UAV/NLP/vision_halu/benchmarks/coco2024/val2014/COCO_val2014_" + str(idx).zfill(12) + ".jpg"
        # all_des.append(get_image_des(image_file))

100%|██████████| 500/500 [00:00<00:00, 133135.60it/s]


In [17]:
coco_data = pd.read_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/coco_sample_5000.json")

In [18]:
coco_data.head(2)

Unnamed: 0,image_id,file_name,coco_url,captions
0,73262,COCO_train2014_000000073262.jpg,http://images.cocodataset.org/train2014/COCO_t...,[A skateboarder performing a trick next to a b...
1,130534,COCO_train2014_000000130534.jpg,http://images.cocodataset.org/train2014/COCO_t...,"[A woman standing in a room in front of a TV.,..."


In [None]:
img_list = os.listdir("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images")
img_path = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images"+ "/" + img_list[0]

In [7]:
des = pd.read_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/coco_img_descriptions_parallel.json")

In [9]:
des["error"].iloc[0]

'HTTPSConnectionPool(host=\'oauth2.googleapis.com\', port=443): Max retries exceeded with url: /token (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f5953bf7760>: Failed to resolve \'oauth2.googleapis.com\' ([Errno -3] Temporary failure in name resolution)"))'

In [10]:
import os
import io
import cv2
import json
import pandas as pd
from tqdm import tqdm
from typing import List
from pydantic import BaseModel, Field
from concurrent.futures import ProcessPoolExecutor, as_completed

from google import genai
from google.oauth2.service_account import Credentials
from google.genai import types

# ================== CONFIG ==================
SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
SERVICE_ACCOUNT_FILE = "/Data2/Arun-UAV/NLP/self-halu-detection/vertix_ai.json"
PROJECT_ID = "hazel-math-472314-h9"
LOCATION = "us-central1"
MODEL_NAME = "gemini-2.5-flash"

COCO_JSON = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/coco_sample_5000.json"
IMAGE_DIR = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images"
OUTPUT_JSON = "/Data2/Arun-UAV/NLP/vision_halu/train_datasets/coco_img_descriptions_parallel.json"

MAX_WORKERS = 64  # Adjust based on CPU count
# ============================================

# ---------- GEMINI CLIENT SETUP -------------
credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
    credentials=credentials,
)
# --------------------------------------------

# ---------- PROMPT & RESPONSE SCHEMA --------
class ImageDescription(BaseModel):
    image_description: str = Field(
        ..., description="Detailed description of the given image."
    )

PROMPT = """
You are a specialist in rich and precise scene understanding.
Given an input image, generate a comprehensive, contextually aware, and fluent description that captures all key visual elements, their relationships, emotions, and possible context or story.

Your description should go beyond short captions — it must resemble a paragraph of visual storytelling that includes:

Scene type: indoor/outdoor, environment, lighting, time of day
Objects and entities: names, counts, shapes, colors, materials
Actions and interactions: what the people or objects are doing
Spatial layout: foreground, background, relative positions
Emotions or atmosphere: tone, mood, aesthetics
Possible context: what might be happening or implied by the scene

Avoid generic or repetitive statements. Be vivid, factual, and coherent. Use natural language instead of bullet points.

Output JSON format:
{image_description: <full attached image description>}
"""
# --------------------------------------------


def get_image_description(img_path: str):
    """Worker function: generate detailed image description."""
    try:
        image = cv2.imread(img_path)
        if image is None:
            return {"file_name": os.path.basename(img_path), "error": "Image not found"}

        _, encoded_img = cv2.imencode(".jpg", image)
        img_bytes = io.BytesIO(encoded_img.tobytes()).getvalue()

        contents = [PROMPT, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")]

        structured_config = types.GenerateContentConfig(
            temperature=0.6,
            top_p=1.0,
            top_k=32,
            candidate_count=1,
            max_output_tokens=65535,
            response_schema=ImageDescription,
            response_mime_type="application/json",
            thinking_config=types.ThinkingConfig(thinking_budget=0),
        )

        response = client.models.generate_content(
            model=MODEL_NAME,
            contents=contents,
            config=structured_config,
        )

        parsed = response.parsed.model_dump()
        parsed["file_name"] = os.path.basename(img_path)
        return parsed

    except Exception as e:
        return {"file_name": os.path.basename(img_path), "error": str(e)}


In [12]:
coco_data = pd.read_json(COCO_JSON)
img_files = coco_data["file_name"].tolist()
img_list = os.listdir("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/poc_5000_coco_images")

In [None]:
results = []
with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {
        executor.submit(get_image_description, os.path.join(IMAGE_DIR, f)): f
        for f in img_files if f in img_list
    }

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing images"):
        result = future.result()
        results.append(result)

# Save all results to JSON
with open(OUTPUT_JSON, "w") as f_out:
    json.dump(results, f_out, indent=2)

print(f"✅ Done! Saved {len(results)} image descriptions to {OUTPUT_JSON}")

In [15]:
for f in img_files:
    if f in img_list:
        path = os.path.join(IMAGE_DIR, f)
        des = get_image_description(path)
        break

In [16]:
des

{'image_description': 'The image captures an exhilarating outdoor scene at a skate park on a bright, sunny day with a vibrant blue sky dotted with fluffy white clouds. In the foreground, a young man in a red t-shirt, blue jeans, and a grey beanie is intensely focused while riding a bright neon green BMX bike on the edge of a concrete ramp. His body is slightly hunched over the handlebars, conveying speed and control. Directly above and slightly behind him, another young man, dressed in a yellow t-shirt, dark blue jeans, and a light blue baseball cap, is captured mid-air performing a trick on a skateboard. He is suspended high above the ramp, arms outstretched for balance, with his skateboard perfectly aligned beneath his feet. The dynamic composition creates a sense of simultaneous action and youthful energy. In the mid-ground, the concrete skate park features various ramps and structures, including a white wall with some graffiti and a concrete bench. Further back, several lush green 

In [25]:
df = pd.read_csv("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/coco_5000_gcp_upload_urs.csv")

In [26]:

PROMPT = """
You are a specialist in rich and precise scene understanding.
Given an input image, generate a comprehensive, contextually aware, and fluent description that captures all key visual elements, their relationships, emotions, and possible context or story.

Your description should go beyond short captions — it must resemble a paragraph of visual storytelling that includes:

Scene type: indoor/outdoor, environment, lighting, time of day
Objects and entities: names, counts, shapes, colors, materials
Actions and interactions: what the people or objects are doing
Spatial layout: foreground, background, relative positions
Emotions or atmosphere: tone, mood, aesthetics
Possible context: what might be happening or implied by the scene

Avoid generic or repetitive statements. Be vivid, factual, and coherent. Use natural language instead of bullet points.

Output JSON format:
{image_description: <full attached image description>}
"""

all_res = []
for uri in df["gcs_uri"].tolist():
    res = {"request":{"contents": [{"role": "user", "parts": [{"text": PROMPT}, {"fileData": {"fileUri": uri, "mimeType": "image/jpeg"}}]}], 
                      "generationConfig": {"temperature": 0.6, "topP": 1, "maxOutputTokens": 1000,"thinking_config":{"thinking_budget":0}}}}
    all_res.append(res)

In [28]:
df = pd.DataFrame(all_res)

In [29]:
df.head(2)

Unnamed: 0,request
0,"{'contents': [{'role': 'user', 'parts': [{'tex..."
1,"{'contents': [{'role': 'user', 'parts': [{'tex..."


In [33]:
df.iloc[:10].to_json("/Data2/Arun-UAV/NLP/vision_halu/train_datasets/gemini_batch_10_testing.jsonl", orient="records", lines=True)