In [1]:
import cv2
import re
import math
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import html
import os
import torch
import matplotlib.pyplot as plt
import torchvision.transforms as T
from garuda.od import ConfusionMatrix
from typing import List
from garuda.core import obb_iou
from dataclasses import dataclass

from geochat.model.builder import load_pretrained_model
from geochat.mm_utils import  get_model_name_from_path
from geochat.conversation import conv_templates, Chat

from PIL import Image
from glob import glob

# Index of the GPU this notebook is meant to use. It is exported through
# CUDA_VISIBLE_DEVICES and reused later when the device string is built.
# NOTE(review): torch and geochat are already imported above this line; confirm
# the environment variable is still picked up when it is set this late.
gpu_id = 3
os.environ['CUDA_VISIBLE_DEVICES'] = f'{gpu_id}'



[2025-01-27 13:53:28,842] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Side length of the normalised grid in which the model expresses box
# coordinates (pixel = value / bounding_box_size * image size).
bounding_box_size = 100

# Region under evaluation; uncomment exactly one.
# region = 'lucknow_airshed_most_15'
# region = 'uttar_pradesh_most_15'
region = 'west_bengal_most_15'

# Every per-region artefact lives below this directory.
vlm_region_dir = '/home/rishabh.mondal/Brick-Kilns-project/ijcai_2025_kilns/data/vlm_data/' + region

# planet imagery
# gr_imgs_path = vlm_region_dir + '/images/*'

# superresolution imagery
gr_imgs_path = vlm_region_dir + '/swinir_images/*'

# labels path
gr_labels = vlm_region_dir + '/labels'

# geochat demo images
# gr_imgs_path = '/home/shataxi.dubey/shataxi_work/GeoChat/demo_images/*'


In [3]:
# Prompt sent to the model for every image. Alternative prompts for the other
# task types are kept below as commented-out examples; enable exactly one.

# user_message = 'Give the bounding box coordinates of all the brick kilns present in the image separated by newline'
# user_message = 'Draw bounding box around the brick kiln with chimney in the image'

# Visual question answering - given the image and a question, it generates the answer.

# Scene classification - given the image, it generates the scene category.
# user_message = 'Classify the image in one word. The classes are Church, School, Bareland, Beach, Forest'

# Region-level caption - given a bounding box on the image, it generates a brief description of the object.
# user_message = '[identify] What is this object?'

# Grounded description - describe the object and give the bounding box.
# user_message = 'describe the image in detail'

# Referring expressions - refer to the object by some of its attributes (large, top, close, etc.); the model answers with the bounding box.
user_message = '[refer] Where is the brick kiln with chimney in the image? Give its oriented bounding box'

In [4]:
def escape_markdown(text):
    """Return ``text`` with every ``<`` and ``>`` preceded by a backslash."""
    return text.replace('<', '\\<').replace('>', '\\>')

def reverse_escape(text):
    """Undo ``escape_markdown``: turn ``\\<`` back into ``<`` and ``\\>`` into ``>``."""
    return text.replace('\\<', '<').replace('\\>', '>')

def bbox_and_angle_to_polygon(x1, y1, x2, y2, a):
    """Convert an axis-aligned box plus a rotation angle into four corners.

    Args:
        x1, y1, x2, y2: Two opposite corners of the unrotated box.
        a: Rotation angle in degrees, applied about the box centre.

    Returns:
        A 1-D numpy array ``[xa, ya, xb, yb, xc, yc, xd, yd]`` holding the
        rotated corners, starting from the (-w/2, -h/2) corner and walking
        around the rectangle.
    """
    centre_x = (x1 + x2) / 2
    centre_y = (y1 + y2) / 2
    half_w = abs(x2 - x1) / 2
    half_h = abs(y2 - y1) / 2

    theta = math.radians(a)
    cos_t, sin_t = math.cos(theta), math.sin(theta)

    # Corner offsets relative to the centre, in the order they are emitted.
    offsets = (
        (-half_w, -half_h),
        (half_w, -half_h),
        (half_w, half_h),
        (-half_w, half_h),
    )

    flat = []
    for dx, dy in offsets:
        # Standard 2-D rotation of the offset, then translation to the centre.
        flat.append(cos_t * dx - sin_t * dy + centre_x)
        flat.append(sin_t * dx + cos_t * dy + centre_y)

    return np.array(flat)

def rotate_bbox(top_right, bottom_left, angle_degrees):
    """Rotate the rectangle spanned by two opposite corners about its centre.

    Args:
        top_right: ``(x, y)`` of one corner of the axis-aligned rectangle.
        bottom_left: ``(x, y)`` of the opposite corner.
        angle_degrees: Rotation angle in degrees, forwarded unchanged to
            ``cv2.getRotationMatrix2D`` (scale fixed to 1).

    Returns:
        A ``(4, 2)`` float32 array with the rotated corners, ordered as
        (bl.x, bl.y), (tr.x, bl.y), (tr.x, tr.y), (bl.x, tr.y) before rotation.
    """
    tr_x, tr_y = top_right[0], top_right[1]
    bl_x, bl_y = bottom_left[0], bottom_left[1]

    # The rotation pivots on the midpoint of the two given corners.
    pivot = ((tr_x + bl_x) / 2, (tr_y + bl_y) / 2)

    corners = np.array(
        [[bl_x, bl_y], [tr_x, bl_y], [tr_x, tr_y], [bl_x, tr_y]],
        dtype=np.float32,
    )

    affine = cv2.getRotationMatrix2D(pivot, angle_degrees, 1)

    # cv2.transform expects a batch dimension; strip it again on the way out.
    return cv2.transform(np.array([corners]), affine)[0]
def extract_substrings(string):
    """Collect every ``<p>...}`` group from a model response.

    Text after the last ``}`` is discarded first so that an unfinished
    trailing box does not take part in the matching. Each returned item is
    the text between ``<p>`` and the closing ``}`` of its box list (a ``}``
    directly followed by ``<`` does not end a group).
    """
    last_close = string.rfind('}')
    if last_close >= 0:
        string = string[:last_close + 1]

    return [found.group(1) for found in re.finditer(r'<p>(.*?)\}(?!<)', string)]


def is_overlapping(rect1, rect2):
    """Return True when two ``(x1, y1, x2, y2)`` rectangles touch or intersect."""
    a_left, a_top, a_right, a_bottom = rect1
    b_left, b_top, b_right, b_bottom = rect2

    apart_horizontally = a_right < b_left or a_left > b_right
    apart_vertically = a_bottom < b_top or a_top > b_bottom
    return not (apart_horizontally or apart_vertically)


def computeIoU(bbox1, bbox2):
    """Intersection-over-union of two axis-aligned ``(x1, y1, x2, y2)`` boxes.

    Coordinates are treated as inclusive pixel indices, hence the ``+ 1`` in
    every width and height.
    """
    ax1, ay1, ax2, ay2 = bbox1
    bx1, by1, bx2, by2 = bbox2

    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1) + 1)
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1) + 1)
    overlap = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)

    return overlap / (area_a + area_b - overlap)


def save_tmp_img(visual_img, img_name, out_dir="/tmp/gradio"):
    """Save an annotated image under ``out_dir`` and return the file path.

    The previous implementation concatenated ``"/tmp/gradio"`` and the file
    name without a separator, so files were written next to (not inside) the
    intended directory as ``/tmp/gradio<name>``.

    Args:
        visual_img: Object exposing ``save(path)`` (a PIL image in this notebook).
        img_name: File name to use; only its base name is kept.
        out_dir: Destination directory, created when missing.

    Returns:
        The path the image was written to.
    """
    os.makedirs(out_dir, exist_ok=True)
    # basename() keeps the file inside out_dir even if a full path is passed.
    file_path = os.path.join(out_dir, os.path.basename(img_name))
    visual_img.save(file_path)
    return file_path


def mask2bbox(mask):
    """Turn a drawn mask image into a ``{<x0><y0><x1><y1>}`` box string.

    The mask is resized to 100x100 with nearest-neighbour sampling and only
    its first channel is inspected. Returns ``''`` when ``mask`` is None or
    contains no non-zero pixel.
    """
    if mask is None:
        return ''

    grid = np.array(mask.resize([100, 100], resample=Image.NEAREST))[:, :, 0]

    occupied_rows = np.where(np.any(grid, axis=1))[0]
    if occupied_rows.size == 0:
        return ''
    occupied_cols = np.where(np.any(grid, axis=0))[0]

    # First and last occupied row/column delimit the tight bounding box.
    rmin, rmax = occupied_rows[0], occupied_rows[-1]
    cmin, cmax = occupied_cols[0], occupied_cols[-1]
    return '{{<{}><{}><{}><{}>}}'.format(cmin, rmin, cmax, rmax)


def escape_markdown(text):
    """Prefix every ``<`` and ``>`` in ``text`` with a backslash.

    NOTE(review): a function with this name and the same behaviour is already
    defined earlier in this cell; this later definition is the one that stays
    bound. One of the two copies can be removed.
    """
    for plain in '<>':
        text = text.replace(plain, f'\\{plain}')
    return text


def reverse_escape(text):
    """Remove the backslash in front of every escaped ``<`` and ``>``.

    NOTE(review): a function with this name and the same behaviour is already
    defined earlier in this cell; this later definition is the one that stays
    bound. One of the two copies can be removed.
    """
    for escaped, plain in (('\\<', '<'), ('\\>', '>')):
        text = text.replace(escaped, plain)
    return text


# Palette used for drawing; entity k is drawn with colors[k % len(colors)].
colors = [
    (255, 0, 0), (0, 255, 0), (0, 0, 255), (210, 210, 0),
    (255, 0, 255), (0, 255, 255), (114, 128, 250), (0, 165, 255),
    (0, 128, 0), (144, 238, 144), (238, 238, 175), (255, 191, 0),
    (0, 128, 0), (226, 43, 138), (255, 0, 255), (0, 215, 255),
]

# Palette index (as a string) -> "#xxyyzz" hex code. The channels are written
# in reversed order (index 2, then 1, then 0) relative to the tuples above.
color_map = {
    str(palette_idx): '#{:02x}{:02x}{:02x}'.format(triple[2], triple[1], triple[0])
    for palette_idx, triple in enumerate(colors)
}

# Alias of the full palette (same list object).
used_colors = colors


def visualize_all_bbox_together(image, generation):
    """Parse the boxes in a model response and draw them on ``image``.

    The image is first resized to a width of 500 px (aspect ratio kept); all
    returned coordinates refer to that resized image. Box values in the
    response are interpreted on a ``0..bounding_box_size`` grid.

    Two response shapes are handled:

    * ``<p>phrase</p>{box}{box}...`` groups ("all" mode): every phrase becomes
      an entity with one or more boxes, each box optionally carrying a fifth
      integer that is used as rotation angle in degrees.
    * a bare box without ``<p>`` tags ("single" mode, e.g. ``[refer]``).

    Args:
        image: PIL image, or None.
        generation: Raw text produced by the model.

    Returns:
        ``(pil_image, generation_colored, bbox_coords)``. ``bbox_coords`` is a
        list of ``(4, 2)`` corner arrays in single mode and a dict
        ``phrase -> list of (4, 2) arrays`` in "all" mode.
        ``(None, '', None)`` is returned when there is no image or no usable
        box.

    Raises:
        ValueError: If the resized image is not a PIL image.
    """
    if image is None:
        # Same arity as every other exit, so callers can always unpack three values.
        return None, '', None

    generation = html.unescape(generation)
    image_width, image_height = image.size
    # If width and height are equal the working image is 500x500.
    image = image.resize([500, int(500 / image_width * image_height)])
    image_width, image_height = image.size

    string_list = extract_substrings(generation)
    if string_list:  # grounding or detection
        mode = 'all'
        entities = defaultdict(list)
        for string in string_list:
            try:
                obj, string = string.split('</p>')
            except ValueError:
                print('wrong string: ', string)
                continue
            if "}{" in string:
                string = string.replace("}{", "}<delim>{")
            for bbox_string in string.split('<delim>'):
                parsed = _parse_box_integers(bbox_string)
                if parsed is None:
                    # Neither 4 nor 5 integers: not a box, skip instead of crashing.
                    continue
                (x0, y0, x1, y1), angle = parsed
                left = x0 / bounding_box_size * image_width
                bottom = y0 / bounding_box_size * image_height
                right = x1 / bounding_box_size * image_width
                top = y1 / bounding_box_size * image_height
                entities[obj].append([left, bottom, right, top, angle])
    else:  # it is refer
        parsed = _parse_box_integers(generation)
        if parsed is None:
            # don't detect any valid bbox to visualize
            return None, '', None
        mode = 'single'
        (x0, y0, x1, y1), _ = parsed
        # NOTE(review): the angle of a single box was already ignored (fixed to 0)
        # before this rewrite and that is kept here; confirm this is intended when
        # the result feeds an oriented-box evaluation.
        angle = 0
        left = x0 / bounding_box_size * image_width
        bottom = y0 / bounding_box_size * image_height
        right = x1 / bounding_box_size * image_width
        top = y1 / bounding_box_size * image_height
        entities = [[left, bottom, right, top, angle]]

    if len(entities) == 0:
        return None, '', None

    if not isinstance(image, Image.Image):
        raise ValueError(f"invalid image format, {type(image)} for {image}")

    # np.array() copies the pixel data, so drawing never touches `image`.
    new_image = np.array(image)

    # size of text
    text_size = 0.4
    # thickness of text
    text_line = 1
    box_line = 2
    (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
    base_height = int(text_height * 0.675)
    text_offset_original = text_height - base_height
    text_spaces = 2

    single_mode = mode == 'single'
    # Created once, outside the entity loop, so boxes of every entity are returned
    # (it used to be re-created per entity, keeping only the last one).
    bbox_coords = [] if single_mode else defaultdict(list)

    for entity_idx, entity_name in enumerate(entities):
        # In single mode each item of `entities` is itself one box.
        bboxes = [entity_name] if single_mode else entities[entity_name]
        color = colors[entity_idx % len(colors)]

        for x1_norm, y1_norm, x2_norm, y2_norm, angle in bboxes:
            orig_x1, orig_y1 = int(x1_norm), int(y1_norm)
            orig_x2, orig_y2 = int(x2_norm), int(y2_norm)

            rotated_bbox = rotate_bbox((orig_x1, orig_y1), (orig_x2, orig_y2), int(angle))
            if single_mode:
                bbox_coords.append(rotated_bbox)
            else:
                bbox_coords[entity_name].append(rotated_bbox)
            new_image = cv2.polylines(new_image, [rotated_bbox.astype(np.int32)], isClosed=True,
                                      thickness=box_line, color=color)

            if single_mode:
                continue

            # Place the label just above the box, or inside it when there is no room above.
            l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1
            label_x = orig_x1 - l_o
            label_y = orig_y1 - l_o
            if label_y < text_height + text_offset_original + 2 * text_spaces:
                label_y = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
                label_x = orig_x1 + r_o

            # add text background
            (text_width, text_height), _ = cv2.getTextSize(f"  {entity_name}", cv2.FONT_HERSHEY_COMPLEX,
                                                           text_size, text_line)
            text_bg = (
                label_x,
                label_y - (text_height + text_offset_original + 2 * text_spaces),
                label_x + text_width,
                label_y,
            )
            _blend_label_background(new_image, text_bg, color, c_width)

            cv2.putText(
                new_image, f"  {entity_name}", (label_x, label_y - text_offset_original - 1 * text_spaces),
                cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
            )

    if mode == 'all':
        def color_iterator(palette):
            while True:
                for color in palette:
                    yield color

        color_gen = color_iterator(colors)

        # Add colors to phrases and remove <p></p>
        def colored_phrases(match):
            phrase = match.group(1)
            color = next(color_gen)
            return f'<span style="color:rgb{color}">{phrase}</span>'

        # Strip the box markup, including an optional "|<angle>" suffix.
        generation = re.sub(r'\{<\d+><\d+><\d+><\d+>(?:\|<-?\d+>)?\}|<delim>', '', generation)
        generation_colored = re.sub(r'<p>(.*?)</p>', colored_phrases, generation)
    else:
        generation_colored = ''

    pil_image = Image.fromarray(new_image)
    return pil_image, generation_colored, bbox_coords


def _parse_box_integers(text):
    """Return ``((x0, y0, x1, y1), angle)`` as ints, or None if ``text`` holds no box.

    Exactly four integers form a box with angle 0; exactly five integers form a
    box whose last value is the angle. Any other count is rejected.
    """
    integers = [int(value) for value in re.findall(r'-?\d+', text)]
    if len(integers) == 4:
        return tuple(integers), 0
    if len(integers) == 5:
        return tuple(integers[:4]), integers[4]
    return None


def _blend_label_background(canvas, box, color, swatch_width, alpha=0.5):
    """Blend a label background into ``canvas`` in place.

    The first ``1.35 * swatch_width`` pixels of the ``(x1, y1, x2, y2)`` box are
    mixed with ``color`` and the rest with white. The box is clipped to the
    canvas, so negative coordinates no longer wrap around to the opposite edge.
    """
    x1, y1, x2, y2 = box
    height, width = canvas.shape[:2]
    top, bottom = max(y1, 0), min(y2, height)
    left, right = max(x1, 0), min(x2, width)
    if top >= bottom or left >= right:
        return

    # Integer column j satisfies j < x exactly when j < ceil(x).
    split = min(max(math.ceil(x1 + 1.35 * swatch_width), left), right)
    for start, stop, bg_color in ((left, split, color), (split, right, (255, 255, 255))):
        if start < stop:
            region = canvas[top:bottom, start:stop]
            canvas[top:bottom, start:stop] = (
                alpha * region + (1 - alpha) * np.array(bg_color)
            ).astype(np.uint8)

In [5]:
# Checkpoint identifier passed to load_pretrained_model in a later cell.
model_path = 'MBZUAI/geochat-7B'
# Model name derived from the path by GeoChat's helper; also passed to the loader.
model_name = get_model_name_from_path(model_path)
# Display the derived name as the cell output.
model_name

'geochat-7B'

In [6]:
# Device string built from the GPU index chosen in the first cell.
# NOTE(review): the same index is exported through CUDA_VISIBLE_DEVICES. If that
# restriction is in effect only one device is visible and it would presumably be
# addressed as 'cuda:0' — confirm 'cuda:{gpu_id}' resolves to the intended GPU.
device = 'cuda:{}'.format(gpu_id)
# Set device_map = None to use a single GPU; use 'auto' to shard the model (layers and weights) across all GPUs for better memory efficiency.
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name, device_map = None, device = device)

Loading GeoChat......




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Put the loaded model into evaluation mode before running inference.
model = model.eval()

In [8]:
# Wrap model, image processor and tokenizer in GeoChat's Chat helper; the model is moved to `device` first.
chat = Chat(model.to(device), image_processor,tokenizer, device=device)

In [9]:
# conv_templates

In [10]:
# Pristine conversation template; the inference loop copies it again for every image.
CONV_VISION = conv_templates['llava_v1'].copy()
# Working copy, shown below to inspect the template contents.
chat_state = CONV_VISION.copy()
chat_state

Conversation(system="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.", roles=('USER', 'ASSISTANT'), messages=[], offset=0, sep_style=<SeparatorStyle.TWO: 2>, sep=' ', sep2='</s>', version='v1', skip_next=False)

In [50]:
# Paths of the annotated images that were written to disk.
image_path_with_detected_objects = []

# One entry per image, in the order of `gr_img_files`.
target_results = []
predicted_results = []

# glob() returns files in arbitrary order; sorting makes the i-th entry of every
# result list refer to the same image on every run. Reuse `gr_img_files` (not a
# fresh glob) whenever results are looked up by index.
gr_img_files = sorted(glob(gr_imgs_path))

for img_path in gr_img_files:
    # Close the file handle as soon as the pixels are converted.
    with Image.open(img_path) as img_file:
        gr_img = img_file.convert('RGB')

    # Fresh conversation per image so earlier answers never leak into the prompt.
    img_list = []
    chat_state = CONV_VISION.copy()
    chat.upload_img(gr_img, chat_state, img_list = img_list)
    chat.ask(user_message, chat_state) # ask the question grounding, refer, expression, scene classification etc

    if len(img_list) > 0 and not isinstance(img_list[0], torch.Tensor):
        chat.encode_img(img_list)
    streamer = chat.stream_answer(conv=chat_state,
                                    img_list=img_list,
                                    temperature=0.5,
                                    max_new_tokens=500,
                                    max_length=2000)

    # Collect the streamed chunks into the full answer.
    output = ''.join(streamer)

    chat_state.messages[-1][1] = '</s>'

    # Returns (None, '', None) when the answer contains no usable box.
    visual_img, generation_color, bbox = visualize_all_bbox_together(gr_img, output)
    img_name = os.path.basename(img_path)
    if visual_img is not None:
        file_path = save_tmp_img(visual_img, img_name)
        image_path_with_detected_objects.append(file_path)

    if bbox is None:
        # NOTE(review): a single all-zero box stands in for "no detection". Later
        # cells give it confidence 1, so it presumably counts as a false positive —
        # confirm whether an empty (0, 8) array is what the metrics should receive.
        predicted_results.append(np.zeros((1, 8)))
    else:
        # Grounding returns {phrase: [boxes]}, [refer] returns a plain list of boxes.
        if isinstance(bbox, dict):
            boxes = [box for group in bbox.values() for box in group]
        else:
            boxes = bbox
        predicted_results.append(np.array([np.array(box).reshape(-1) for box in boxes]))

    # The label file shares the image's base name; splitext handles any extension
    # (the previous '.tif'/'.png' replace broke on e.g. '.tiff' or '.jpg').
    target_path = os.path.join(gr_labels, os.path.splitext(img_name)[0] + '.txt')
    target_results.append(np.loadtxt(target_path, ndmin=2))

    

In [51]:
# Inspect the raw predictions: one array per image, eight corner values per box.
predicted_results

[array([[465., 310., 445., 310., 445., 290., 465., 290.]], dtype=float32),
 array([[ 50., 310.,  30., 310.,  30., 290.,  50., 290.]], dtype=float32),
 array([[160., 460., 150., 460., 150., 440., 160., 440.]], dtype=float32),
 array([[145., 465., 125., 465., 125., 445., 145., 445.]], dtype=float32),
 array([[450., 355., 430., 355., 430., 335., 450., 335.]], dtype=float32),
 array([[460.,  60., 440.,  60., 440.,  50., 460.,  50.]], dtype=float32),
 array([[215., 370., 195., 370., 195., 350., 215., 350.]], dtype=float32),
 array([[465., 355., 445., 355., 445., 335., 465., 335.]], dtype=float32),
 array([[ 50., 425.,  30., 425.,  30., 405.,  50., 405.]], dtype=float32),
 array([[210., 145., 190., 145., 190., 125., 210., 125.]], dtype=float32),
 array([[415.,  90., 395.,  90., 395.,  70., 415.,  70.]], dtype=float32),
 array([[70., 70., 50., 70., 50., 50., 70., 50.]], dtype=float32),
 array([[ 20., 335.,  10., 335.,  10., 315.,  20., 315.]], dtype=float32),
 array([[210., 355., 200., 355., 

In [52]:
# Per image: prepend class label 0, divide the corner coordinates by 500 and
# append confidence 1, giving rows of [class, 8 corner values, confidence].
new_predicted_results = [
    np.hstack(
        [np.zeros((len(pred), 1)), pred / 500, np.ones((len(pred), 1))],
        dtype=np.float32,
    )
    for pred in predicted_results
]
new_predicted_results

[array([[0.  , 0.93, 0.62, 0.89, 0.62, 0.89, 0.58, 0.93, 0.58, 1.  ]],
       dtype=float32),
 array([[0.  , 0.1 , 0.62, 0.06, 0.62, 0.06, 0.58, 0.1 , 0.58, 1.  ]],
       dtype=float32),
 array([[0.  , 0.32, 0.92, 0.3 , 0.92, 0.3 , 0.88, 0.32, 0.88, 1.  ]],
       dtype=float32),
 array([[0.  , 0.29, 0.93, 0.25, 0.93, 0.25, 0.89, 0.29, 0.89, 1.  ]],
       dtype=float32),
 array([[0.  , 0.9 , 0.71, 0.86, 0.71, 0.86, 0.67, 0.9 , 0.67, 1.  ]],
       dtype=float32),
 array([[0.  , 0.92, 0.12, 0.88, 0.12, 0.88, 0.1 , 0.92, 0.1 , 1.  ]],
       dtype=float32),
 array([[0.  , 0.43, 0.74, 0.39, 0.74, 0.39, 0.7 , 0.43, 0.7 , 1.  ]],
       dtype=float32),
 array([[0.  , 0.93, 0.71, 0.89, 0.71, 0.89, 0.67, 0.93, 0.67, 1.  ]],
       dtype=float32),
 array([[0.  , 0.1 , 0.85, 0.06, 0.85, 0.06, 0.81, 0.1 , 0.81, 1.  ]],
       dtype=float32),
 array([[0.  , 0.42, 0.29, 0.38, 0.29, 0.38, 0.25, 0.42, 0.25, 1.  ]],
       dtype=float32),
 array([[0.  , 0.83, 0.18, 0.79, 0.18, 0.79, 0.14, 0.83, 0.1

In [53]:
# Build float32 copies of the ground-truth rows with every class label set to 0.
# The cast happens first because astype() copies: the raw arrays in
# `target_results` (and their original class labels) are left untouched.
new_target_results = []
for raw_target in target_results:
    target = raw_target.astype(np.float32)
    target[:, 0] = 0 # convert class labels to 0
    new_target_results.append(target)
new_target_results

[array([[0.      , 0.3145  , 0.85608 , 0.336543, 0.866424, 0.351943,
         0.833606, 0.3299  , 0.823262],
        [0.      , 0.389164, 0.834785, 0.398912, 0.852498, 0.432382,
         0.834079, 0.422634, 0.816366],
        [0.      , 0.23758 , 0.835607, 0.257196, 0.835577, 0.257137,
         0.797977, 0.237521, 0.798008],
        [0.      , 0.275743, 0.783986, 0.290829, 0.78384 , 0.290504,
         0.750332, 0.275418, 0.750479],
        [0.      , 0.171065, 0.738056, 0.189455, 0.734245, 0.180705,
         0.69201 , 0.162315, 0.69582 ],
        [0.      , 0.23165 , 0.728279, 0.246543, 0.728238, 0.246443,
         0.691921, 0.231549, 0.691962],
        [0.      , 0.082585, 0.709764, 0.100519, 0.712639, 0.1056  ,
         0.680946, 0.087666, 0.67807 ],
        [0.      , 0.045182, 0.711299, 0.053272, 0.690878, 0.015928,
         0.676085, 0.007838, 0.696506],
        [0.      , 0.169869, 0.566286, 0.18776 , 0.568874, 0.193464,
         0.529437, 0.175573, 0.526849],
        [0.      , 

In [54]:
# Spot check: oriented-box IoU between targets and predictions of one image
# (the third from the end), with both sides scaled back by 500.
for i, (targets, predictions) in enumerate(zip(new_target_results[-3:-2], new_predicted_results[-3:-2])):
    target_polygons = targets[:, 1:9].reshape(-1, 4, 2) * 500
    predicted_polygons = predictions[:, 1:9].reshape(-1, 4, 2) * 500
    iou = obb_iou(target_polygons, predicted_polygons)
    print(i, iou)

0 [[-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99993253e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]
 [-4.99999988e-08]]


In [55]:
# fig, ax = plt.subplots()
# img = Image.open(image_path_with_detected_objects[0])
# # img = Image.open(glob(gr_imgs_path)[0])
# print(img.size[0])
# ax.imshow(img)
# for target in new_target_results[0][17:18]:
#     rect = plt.Polygon(target[1:9].reshape(4,2)*img.size[0], fill=None, edgecolor='green', linewidth=5)
#     ax.add_patch(rect)

# # target = np.array([0, 361.9584 , 598.90176, 399.42656, 577.16992, 351.51872, 494.57152, 314.05056, 516.1984 ])
# # predicted = np.array([0, 358.4, 409.6, 307.2, 409.6, 307.2, 307.2, 358.4, 307.2, 1])

# target = np.array([0, 70.695 , 116.973 ,  78.013 , 112.7285,  68.656 ,  96.596 ,
#         61.338 , 100.82  ])
# predicted = np.array([0, 70., 80., 60., 80., 60., 60., 70., 60., 1])
# rect1 = plt.Polygon(target[1:9].reshape(4,2), fill=None, edgecolor='blue')
# rect2 = plt.Polygon(predicted[1:9].reshape(4,2), fill=None, edgecolor='orange')
# ax.add_patch(rect1)
# ax.add_patch(rect2)

In [66]:
# # n = len(image_path_with_detected_objects)
# gr_img_path = glob(gr_imgs_path)
# n = len(gr_img_path)
# fig, ax = plt.subplots(nrows = n, ncols = 1 ,figsize=(120, 120))
# ax = ax.flatten()
# for i in range(n):
#     # img = Image.open(image_path_with_detected_objects[i]).convert('RGB') # predicted image
#     img = Image.open(gr_img_path[i]).convert('RGB') # planet image
#     w, h = img.size
#     ax[i].imshow(img) 
#     for bbox in new_target_results[i]:
#         classvalue, x1, y1, x2, y2, x3, y3, x4, y4 = bbox*w 
#         ax[i].plot([x1, x2, x3, x4, x1], [y1, y2, y3, y4, y1], color = 'green')
#     for bbox in new_predicted_results[i]:
#         classvalue, x1, y1, x2, y2, x3, y3, x4, y4, conf = bbox*w 
#         ax[i].plot([x1, x2, x3, x4, x1], [y1, y2, y3, y4, y1], color = 'red')
#     ax[i].set_axis_off()
# plt.savefig('geochat_output_refer_wb_planet_superresolution.png')


# fig, ax = plt.subplots(figsize = (10,10))
# idx = 0
# # img = Image.open(image_path_with_detected_objects[idx])
# img = Image.open(glob(gr_imgs_path)[idx])
# print(img.size)

# for bbox in new_target_results[idx]:
#     # print(bbox)
#     classvalue, x1, y1, x2, y2, x3, y3, x4, y4 = bbox*img.size[0] 
#     ax.plot([x1, x2, x3, x4, x1], [y1, y2, y3, y4, y1], color = 'green')

# for bbox in new_predicted_results[idx]:
#     # print(bbox)
#     classvalue, x1, y1, x2, y2, x3, y3, x4, y4, conf = bbox*img.size[0] 
#     ax.plot([x1, x2, x3, x4, x1], [y1, y2, y3, y4, y1], color = 'blue', linewidth = 2)
# plt.imshow(img)


In [61]:
# Scale the corner coordinates by 500 for the confusion matrix.
# Every array is copied first: scaling in place would also change the arrays held
# by `new_predicted_results` / `new_target_results`, and re-running this cell
# would multiply by 500 again.
cm_predicted_results = []
for res in new_predicted_results:
    scaled = res.copy()
    scaled[:, 1:9] = scaled[:, 1:9] * 500
    cm_predicted_results.append(scaled)


cm_target_results = []
for res in new_target_results:
    scaled = res.astype(np.float32)  # astype() returns a copy
    scaled[:, 0] = 0 # convert class labels to 0
    scaled[:, 1:9] = scaled[:, 1:9] * 500
    cm_target_results.append(scaled)

In [62]:
# Single-class evaluation: class list plus the confidence and IoU thresholds
# handed to the confusion matrix.
classes, conf_threshold, iou_threshold = ['brick_kilns'], 0.25, 0.1
# Built from the scaled prediction/target arrays prepared in the previous cell.
cm = ConfusionMatrix.from_obb_tensors(cm_predicted_results, cm_target_results, classes, conf_threshold, iou_threshold)
# cm = ConfusionMatrix.from_obb_tensors(new_predicted_results, new_target_results, classes, conf_threshold, iou_threshold)
# Label the matrix for display (presumably rows are ground truth and columns are predictions — confirm against garuda).
df = pd.DataFrame(cm.matrix, columns = ['predicted kilns','predicted_bg'], index=['true kilns','true_bg'])
print(f'conf_threshold = {conf_threshold}, iou_threshold = {iou_threshold}')
# print(df.to_markdown())
df

conf_threshold = 0.25, iou_threshold = 0.1


Unnamed: 0,predicted kilns,predicted_bg
true kilns,1.0,192.0
true_bg,14.0,0.0


In [63]:
# Display the summary exposed by the confusion-matrix object.
cm.summary

Unnamed: 0,brick_kilns
Actual Positives,193.0
Predicted Positives,15.0
True Positives,1.0
False Positives,14.0
False Negatives,192.0
Precision,0.066667
Recall,0.005181
F1 Score,0.009615


In [64]:
@dataclass
class MeanAveragePrecision:
    """
    Mean Average Precision for oriented-bounding-box (OBB) detection.

    Row layout used throughout this class (class id first, the four corner
    points next, confidence last):

        prediction row: `(class, x1, y1, x2, y2, x3, y3, x4, y4, conf)`
        target row:     `(class, x1, y1, x2, y2, x3, y3, x4, y4)`

    Attributes:
        map50_95 (float): Mean Average Precision (mAP) calculated over IoU thresholds
            ranging from `0.50` to `0.95` with a step size of `0.05`.
        map50 (float): Mean Average Precision (mAP) calculated specifically at
            an IoU threshold of `0.50`.
        map75 (float): Mean Average Precision (mAP) calculated specifically at
            an IoU threshold of `0.75`.
        per_class_ap50_95 (np.ndarray): Average Precision (AP) values of
            `shape = (num_classes, 10)`, one row per ground-truth class and one
            column per IoU threshold from `0.50` to `0.95` (step `0.05`).
    """

    map50_95: float
    map50: float
    map75: float
    per_class_ap50_95: np.ndarray

    @classmethod
    def from_tensors(
        cls,
        predictions: List[np.ndarray],
        targets: List[np.ndarray],
    ):
        """
        Calculate Mean Average Precision based on predicted and ground-truth
            oriented boxes at different IoU thresholds.

        Images are paired positionally (`predictions[i]` with `targets[i]`).
        Predictions on an image without any ground-truth object are counted as
        false positives; an image with neither predictions nor ground truth
        contributes nothing.

        Args:
            predictions (List[np.ndarray]): Each element of the list describes
                a single image and has `shape = (M, 10)` where `M` is
                the number of detected objects. Each row is expected to be
                in `(class, x1, y1, x2, y2, x3, y3, x4, y4, conf)` format.
            targets (List[np.ndarray]): Each element of the list describes a single
                image and has `shape = (N, 9)` where `N` is the
                number of ground-truth objects. Each row is expected to be in
                `(class, x1, y1, x2, y2, x3, y3, x4, y4)` format.
        Returns:
            MeanAveragePrecision: New instance of MeanAveragePrecision. When
                there is nothing to evaluate (no ground-truth object at all),
                every mAP value is `0.0` and `per_class_ap50_95` has zero rows.

        Example:
            ```python
            import numpy as np

            # one image, one kiln, one prediction (class 0, confidence 0.9)
            targets = [
                np.array([[0, 10.0, 10.0, 50.0, 10.0, 50.0, 40.0, 10.0, 40.0]]),
            ]
            predictions = [
                np.array([[0, 10.0, 10.0, 50.0, 10.0, 50.0, 40.0, 10.0, 40.0, 0.9]]),
            ]

            result = MeanAveragePrecision.from_tensors(
                predictions=predictions,
                targets=targets,
            )
            print(result.map50, result.map50_95)
            ```
        """
        # validate_input_tensors(predictions, targets)
        iou_thresholds = np.linspace(0.5, 0.95, 10)
        num_iou_levels = iou_thresholds.size
        stats = []

        # Gather matching stats for predictions and targets, one tuple per image:
        # (match matrix, prediction confidences, prediction classes, true classes)
        for true_objs, predicted_objs in zip(targets, predictions):
            num_true = true_objs.shape[0]
            num_predicted = predicted_objs.shape[0]
            # index 0 is class; avoid indexing an empty (possibly 1-D) target array
            true_classes = true_objs[:, 0] if num_true else np.zeros(0)

            if num_predicted == 0:
                if num_true:
                    # Every ground-truth object on this image is a miss.
                    stats.append(
                        (
                            np.zeros((0, num_iou_levels), dtype=bool),
                            *np.zeros((2, 0)),
                            true_classes,
                        )
                    )
                continue

            if num_true:
                matches = cls._match_detection_batch(
                    predicted_objs, true_objs, iou_thresholds
                )
            else:
                # No ground truth on this image: every prediction is a false
                # positive at every IoU level. Dropping these rows (as the code
                # previously did) would overstate precision.
                matches = np.zeros((num_predicted, num_iou_levels), dtype=bool)

            stats.append(
                (
                    matches,
                    predicted_objs[:, -1],  # last column is confidence
                    predicted_objs[:, 0],
                    true_classes,
                )
            )

        average_precisions = np.zeros((0, num_iou_levels))
        if stats:
            concatenated_stats = [np.concatenate(items, 0) for items in zip(*stats)]
            average_precisions = cls._average_precisions_per_class(*concatenated_stats)

        # Zero rows means no ground-truth class was seen; taking the mean of an
        # empty array would produce NaN, so report 0.0 instead.
        if average_precisions.shape[0]:
            map50 = float(average_precisions[:, 0].mean())
            map75 = float(average_precisions[:, 5].mean())  # index 5 <-> IoU 0.75
            map50_95 = float(average_precisions.mean())
        else:
            map50, map75, map50_95 = 0.0, 0.0, 0.0

        return cls(
            map50_95=map50_95,
            map50=map50,
            map75=map75,
            per_class_ap50_95=average_precisions,
        )

    @staticmethod
    def _match_detection_batch(
        predictions: np.ndarray, targets: np.ndarray, iou_thresholds: np.ndarray
    ) -> np.ndarray:
        """
        Match predictions with target labels based on IoU levels.

        Args:
            predictions (np.ndarray): Batch prediction. Describes a single image and
                has `shape = (M, 10)` where `M` is the number of detected objects.
                Each row is expected to be in
                `(class, x1, y1, x2, y2, x3, y3, x4, y4, conf)` format.
            targets (np.ndarray): Batch target labels. Describes a single image and
                has `shape = (N, 9)` where `N` is the number of ground-truth objects.
                Each row is expected to be in
                `(class, x1, y1, x2, y2, x3, y3, x4, y4)` format.
            iou_thresholds (np.ndarray): Array contains different IoU thresholds.

        Returns:
            np.ndarray: Boolean array of `shape = (M, len(iou_thresholds))`;
                entry `[m, i]` is True when prediction `m` is matched to a target
                at IoU level `i`.
        """
        num_predictions, num_iou_levels = predictions.shape[0], iou_thresholds.shape[0]
        correct = np.zeros((num_predictions, num_iou_levels), dtype=bool)
        # NOTE(review): obb_iou is assumed to return an (N, M) target-by-prediction
        # IoU matrix, which is what the masking with `correct_class` below needs.
        iou = obb_iou(targets[:, 1:9].reshape(-1,4,2), predictions[:, 1:9].reshape(-1,4,2))
        correct_class = targets[:, 0:1] == predictions[:, 0]  # (N, M) via broadcasting

        for i, iou_level in enumerate(iou_thresholds):
            matched_indices = np.where((iou >= iou_level) & correct_class)

            if matched_indices[0].shape[0]:
                # Rows of `matches`: (target index, prediction index, IoU)
                combined_indices = np.stack(matched_indices, axis=1)
                iou_values = iou[matched_indices][:, None]
                matches = np.hstack([combined_indices, iou_values])

                if matched_indices[0].shape[0] > 1:
                    # Sort by IoU (highest first), then keep one row per prediction
                    # and finally one row per target, so the assignment is one-to-one.
                    matches = matches[matches[:, 2].argsort()[::-1]]
                    matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
                    matches = matches[np.unique(matches[:, 0], return_index=True)[1]]

                correct[matches[:, 1].astype(int), i] = True

        return correct

    @staticmethod
    def compute_average_precision(recall: np.ndarray, precision: np.ndarray) -> float:
        """
        Compute the average precision using 101-point interpolation (COCO), given
            the recall and precision curves.

        Args:
            recall (np.ndarray): The recall curve.
            precision (np.ndarray): The precision curve.

        Returns:
            float: Average precision.
        """
        # Pad the curves so they span recall 0..1.
        extended_recall = np.concatenate(([0.0], recall, [1.0]))
        extended_precision = np.concatenate(([1.0], precision, [0.0]))
        # Precision envelope: running maximum taken from the right.
        max_accumulated_precision = np.flip(
            np.maximum.accumulate(np.flip(extended_precision))
        )
        interpolated_recall_levels = np.linspace(0, 1, 101)
        interpolated_precision = np.interp(
            interpolated_recall_levels, extended_recall, max_accumulated_precision
        )
        # `np.trapz` is deprecated in NumPy 2.x in favour of `np.trapezoid`;
        # use whichever this NumPy version provides.
        integrate = getattr(np, "trapezoid", None)
        if integrate is None:
            integrate = np.trapz
        return float(integrate(interpolated_precision, interpolated_recall_levels))

    @staticmethod
    def _average_precisions_per_class(
        matches: np.ndarray,
        prediction_confidence: np.ndarray,
        prediction_class_ids: np.ndarray,
        true_class_ids: np.ndarray,
        eps: float = 1e-16,
    ) -> np.ndarray:
        """
        Compute the average precision per class and IoU level.
        Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.

        Args:
            matches (np.ndarray): True positives, `shape = (num_predictions,
                num_iou_levels)`.
            prediction_confidence (np.ndarray): Confidence of every prediction.
            prediction_class_ids (np.ndarray): Predicted object classes.
            true_class_ids (np.ndarray): True object classes.
            eps (float): Small value to prevent division by zero.

        Returns:
            np.ndarray: Average precision of `shape = (num_true_classes,
                num_iou_levels)`. Classes without any prediction keep AP 0.
        """
        # Rank all predictions by descending confidence.
        sorted_indices = np.argsort(-prediction_confidence)
        matches = matches[sorted_indices]
        prediction_class_ids = prediction_class_ids[sorted_indices]

        unique_classes, class_counts = np.unique(true_class_ids, return_counts=True)
        num_classes = unique_classes.shape[0]

        average_precisions = np.zeros((num_classes, matches.shape[1]))

        for class_idx, class_id in enumerate(unique_classes):
            is_class = prediction_class_ids == class_id
            total_true = class_counts[class_idx]
            total_prediction = is_class.sum()

            if total_prediction == 0 or total_true == 0:
                continue

            # Cumulative counts down the ranked list, one column per IoU level.
            false_positives = (1 - matches[is_class]).cumsum(0)
            true_positives = matches[is_class].cumsum(0)
            recall = true_positives / (total_true + eps)
            # The denominator is the rank of the prediction (>= 1), never zero.
            precision = true_positives / (true_positives + false_positives)

            for iou_level_idx in range(matches.shape[1]):
                average_precisions[class_idx, iou_level_idx] = (
                    MeanAveragePrecision.compute_average_precision(
                        recall[:, iou_level_idx], precision[:, iou_level_idx]
                    )
                )

        return average_precisions

In [65]:
# mAP averaged over IoU thresholds 0.50-0.95 for the scaled predicted vs. target boxes.
MeanAveragePrecision.from_tensors(
    predictions=cm_predicted_results,
    targets=cm_target_results,
).map50_95

0.0