## Testing Semantic Segmentation Models
- In this notebook I test several semantic segmentation models. It is best run on Colab.

In [None]:
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
from PIL import Image
import requests
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from torch import nn
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt

import sys

sys.path.append("/src/")

from visualization.visualize import get_seg_img, get_logits

In [None]:
from transformers import (
    AutoImageProcessor,
    Mask2FormerForUniversalSegmentation,
    ViTModel,
    BeitFeatureExtractor,
)
from transformers import (
    BeitForSemanticSegmentation,
    AutoProcessor,
    CLIPSegForImageSegmentation,
)

In [None]:
# Location of the test image; the cells below that call Image.open(image_path)
# read this value, so it must be set before they are run.
image_path = ""  # path to the image

In [None]:
# Smoke test: run a single checkpoint on one image and display the result.
# NOTE(review): this pairs a SegformerImageProcessor with a plain ViTModel
# checkpoint — confirm the combination is intended.
i = 1  # not read anywhere in this cell
model_name = "google/vit-base-patch16-224-in21k"
print(model_name)
# Load the model and its preprocessor
processor = SegformerImageProcessor.from_pretrained(model_name)
model = ViTModel.from_pretrained(model_name)

image = Image.open(image_path)  # Load an image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():  # inference only, no gradient tracking
    outputs = model(**inputs)
# NOTE(review): last_hidden_state is presumably a (batch, tokens, hidden)
# sequence rather than a (batch, classes, H, W) logit map, in which case the
# bilinear interpolate below would reject it — TODO confirm.
logits = outputs.last_hidden_state

upsampled_logits = torch.nn.functional.interpolate(
    logits,
    size=image.size[::-1],  # (height, width)
    mode="bilinear",
    align_corners=False,
)

img_seg, seg = get_seg_img(
    image, upsampled_logits.argmax(dim=1)[0]
)  # get_seg_img is imported from visualization.visualize
plt.imshow(seg)
plt.axis("off")  # Optional: to hide axes for a cleaner visualization
plt.show()

In [None]:
from transformers import AutoModelForSemanticSegmentation, AutoFeatureExtractor
from PIL import Image
import torch

# SegFormer checkpoints to compare (ADE20K and Cityscapes fine-tunes).
model_names = [
    "nvidia/segformer-b3-finetuned-ade-512-512",
    "nvidia/segformer-b1-finetuned-cityscapes-1024-1024",
    "nvidia/segformer-b3-finetuned-cityscapes-1024-1024",
    "nvidia/segformer-b5-finetuned-cityscapes-1024-1024",
]

# logits_list[m][k] holds the raw (not yet upsampled) logits of model m for
# image k; later cells index it in exactly this order.
logits_list = []

for model_name in model_names:
    print(model_name)
    # Load the model and its preprocessor
    processor = SegformerImageProcessor.from_pretrained(model_name)
    try:
        model = SegformerForSemanticSegmentation.from_pretrained(model_name)
    except Exception:
        # `except Exception` instead of a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) is not mistaken for a load failure.
        model = Mask2FormerForUniversalSegmentation.from_pretrained(model_name)

    log_per_model = []

    # Only a single image (image_path) is evaluated per model.
    for i in range(1):
        image = Image.open(image_path)  # Load an image
        inputs = processor(images=image, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        log_per_model.append(logits)

    logits_list.append(log_per_model)

    # Show the prediction for the last processed image. The logits are taken
    # from the per-model list rather than from the loop variable that happens
    # to survive the inner loop.
    upsampled_logits = torch.nn.functional.interpolate(
        log_per_model[-1],
        size=image.size[::-1],  # (height, width)
        mode="bilinear",
        align_corners=False,
    )
    img_seg, seg = get_seg_img(image, upsampled_logits.argmax(dim=1)[0])
    plt.imshow(seg)
    plt.axis("off")  # Optional: to hide axes for a cleaner visualization
    # Flush the figure now; otherwise every model draws onto the same axes
    # and only the last model's result remains visible.
    plt.show()

# Process or save logits_list as needed

In [None]:
# Mask2Former checkpoints to evaluate.
model_names2 = [
    "facebook/mask2former-swin-large-cityscapes-instance",
    "facebook/mask2former-swin-large-ade-semantic",
]

logits_list2 = []

for model_name in model_names2:
    print(model_name)
    # Load the Mask2Former classes explicitly instead of probing other model
    # classes with nested bare `except:` blocks: every checkpoint in
    # model_names2 is a Mask2Former one, and relying on a mismatched
    # from_pretrained call to raise is fragile.
    # NOTE(review): a mismatched load may merely warn and return a model with
    # freshly initialised weights instead of raising — confirm.
    processor = AutoImageProcessor.from_pretrained(model_name)
    model = Mask2FormerForUniversalSegmentation.from_pretrained(model_name)

    log_per_model = []

    # image_path is re-opened on every pass, so all ten entries come from the
    # same image; the count matches the ten rows of the comparison grid.
    for i in range(10):
        image = Image.open(image_path)  # Load an image
        inputs = processor(images=image, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**inputs)

        # Mask2Former exposes per-query mask logits instead of a dense
        # `logits` tensor.
        logits = outputs.masks_queries_logits
        log_per_model.append(logits)

    # Fill the list declared for this cell, and keep extending logits_list as
    # before because the comparison grid reads all models from it.
    logits_list2.append(log_per_model)
    logits_list.append(log_per_model)

    # show the image
    # NOTE(review): argmax over dim 1 of masks_queries_logits picks a query
    # index, not a class id; processor.post_process_semantic_segmentation may
    # be the better way to obtain a class map — confirm.
    upsampled_logits = torch.nn.functional.interpolate(
        logits,
        size=image.size[::-1],  # (height, width)
        mode="bilinear",
        align_corners=False,
    )
    img_seg, seg = get_seg_img(image, upsampled_logits.argmax(dim=1)[0])
    plt.imshow(seg)
    plt.axis("off")  # Optional: to hide axes for a cleaner visualization
    # Flush the figure per model so earlier results are not overdrawn.
    plt.show()

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-segmentation", model="facebook/maskformer-swin-base-coco")

# show the image
image = Image.open(image_path)  # Load an image

# The image-segmentation pipeline returns a list with one dict per segment
# (keys "label", "score" and "mask"), not a single dict with a
# "segmentation" entry, so each segment's mask is plotted in its own panel.
result = pipe(image)

if not result:
    print("The pipeline returned no segments.")
else:
    _, axes = plt.subplots(
        1, len(result), figsize=(4 * len(result), 4), squeeze=False
    )
    for ax, segment in zip(axes[0], result):
        ax.imshow(segment["mask"])
        ax.set_title(segment["label"])
        ax.axis("off")  # hide axes for a cleaner visualization
    plt.show()

In [None]:
# Column titles, listed in the order in which the models were appended to
# logits_list: the four SegFormer checkpoints first, then the Mask2Former
# ones (cityscapes-instance before ade-semantic).
model_short_names = [
    "Seg-B3-ADE",
    "Seg-B1-CS",
    "Seg-B3-CS",
    "Seg-B5-CS",
    "M2F-SL-CS",
    "M2F-SL-ADE",
]

n_models = len(model_short_names)
# Plot only as many rows as every model actually has logits for; a fixed row
# count of 10 overruns the lists of models that were run on fewer images.
n_rows = min(len(per_model) for per_model in logits_list[:n_models])

# squeeze=False keeps axs two-dimensional even when there is a single row.
fig, axs = plt.subplots(
    n_rows, n_models, figsize=(20, 2.7 * n_rows), squeeze=False
)

# Every row uses the same source image, so it is opened once.
image = Image.open(image_path)

for img_idx in range(n_rows):
    for model_idx, model_name in enumerate(model_short_names):
        upsampled_logits = nn.functional.interpolate(
            logits_list[model_idx][img_idx],
            size=image.size[::-1],  # (height, width)
            mode="bilinear",
            align_corners=False,
        )

        # Convert upsampled logits to segmentation image
        img_seg, _ = get_seg_img(image, upsampled_logits.argmax(dim=1)[0])

        ax = axs[img_idx, model_idx]
        ax.imshow(img_seg)
        # Remove only the ticks: ax.axis("off") would also hide the row
        # label set below.
        ax.set_xticks([])
        ax.set_yticks([])

        if model_idx == 0:
            # Label the row with the frame name
            ax.set_ylabel("frame_X.jpg", rotation=90, size="large", labelpad=20)

        if img_idx == 0:
            ax.set_title(model_name, size="large", pad=20)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to accommodate titles
plt.suptitle("Comparison Segmentation", size="x-large")
plt.show()

In [None]:
import torch
import torch.nn as nn

def load_logits(img_shape, seg_path, expected_shape=(1, 19, 128, 128)):
    """Load saved logits, upsample them and return the per-pixel winner.

    Args:
        img_shape: Target ``(height, width)`` passed to the bilinear upsampling.
        seg_path: Path of a file written with ``torch.save`` that contains the
            logits tensor.
        expected_shape: Shape the stored tensor must have to be processed.
            Defaults to ``(1, 19, 128, 128)``. Pass ``None`` to skip the check
            and accept any tensor the bilinear upsampling can handle.

    Returns:
        ``(label, prob)`` for the first batch element, or ``None`` when the
        stored tensor does not have ``expected_shape``. ``label`` holds the
        index of the largest channel per pixel and ``prob`` the corresponding
        maximum value. No softmax is applied, so ``prob`` contains raw logit
        values rather than normalised probabilities.
    """
    # NOTE: torch.load unpickles the file; only use it on files you trust.
    logits = torch.load(seg_path)

    if expected_shape is not None and tuple(logits.shape) != tuple(expected_shape):
        # Explicit instead of falling off the end of the function.
        return None

    # Interpolation needs a floating-point tensor.
    upsampled_logits = nn.functional.interpolate(
        logits.float(), size=img_shape, mode="bilinear", align_corners=False
    )

    # Maximum over the channel dimension: values and the winning channel index.
    max_probs, labels = torch.max(upsampled_logits, dim=1)

    print("Upsampled!")
    return labels[0], max_probs[0]


In [None]:
# Load model directly
from transformers import AutoImageProcessor, MaskFormerForInstanceSegmentation

# MaskFormer (Swin-base, COCO): preprocessor and model share one checkpoint id.
maskformer_ckpt = "facebook/maskformer-swin-base-coco"
processor = AutoImageProcessor.from_pretrained(maskformer_ckpt)
model = MaskFormerForInstanceSegmentation.from_pretrained(maskformer_ckpt)

In [None]:
# Load model directly
from transformers import AutoProcessor, CLIPSegForImageSegmentation

# CLIPSeg: preprocessor and model share one checkpoint id.
clipseg_ckpt = "CIDAS/clipseg-rd64-refined"
processor = AutoProcessor.from_pretrained(clipseg_ckpt)
model = CLIPSegForImageSegmentation.from_pretrained(clipseg_ckpt)

In [None]:
# Load model directly
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation

# UperNet (Swin-small): preprocessor and model share one checkpoint id.
upernet_ckpt = "openmmlab/upernet-swin-small"
processor = AutoImageProcessor.from_pretrained(upernet_ckpt)
model = UperNetForSemanticSegmentation.from_pretrained(upernet_ckpt)

In [None]:
# Load model directly
from transformers import AutoProcessor, OneFormerForUniversalSegmentation

# OneFormer (ADE20K, Swin-large): preprocessor and model share one checkpoint id.
oneformer_ckpt = "shi-labs/oneformer_ade20k_swin_large"
processor = AutoProcessor.from_pretrained(oneformer_ckpt)
model = OneFormerForUniversalSegmentation.from_pretrained(oneformer_ckpt)

In [None]:
# Run the currently loaded processor/model pair on ten video frames. The
# `task_inputs` argument is specific to the OneFormer processor, so this cell
# expects the OneFormer checkpoint to be the one that is loaded.

# Fresh per-model list: without it, the stale `log_per_model` left over from
# an earlier cell would be appended to logits_list at the end.
log_per_model = []

for i in range(10):  # frames frame_25000.jpg ... frame_25900.jpg
    image = Image.open(
        f"/home/pvondrlik/Desktop/BA_Thesis/repo-movie-analysis/data/Expl_2_ET_1_2023-09-06_10-36-37_ET/video_frames_img/frame_25{i}00.jpg"
    )  # Load an image
    # task_inputs takes one task string per image, and return_tensors="pt"
    # is required so that the model receives tensors. The task is "semantic"
    # because the result is rendered as a per-pixel class map below.
    inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # OneFormer exposes per-query mask logits instead of a dense `logits`
    # tensor.
    logits = outputs.masks_queries_logits
    log_per_model.append(logits)

    # Let the processor combine class and mask queries into a class-id map
    # at the original (height, width) of the frame.
    seg_map = processor.post_process_semantic_segmentation(
        outputs, target_sizes=[image.size[::-1]]
    )[0]

    img_seg, _ = get_seg_img(image, seg_map)
    plt.imshow(img_seg)
    plt.show()

logits_list.append(log_per_model)

In [None]:
import torch
import matplotlib.pyplot as plt
from transformers import AutoProcessor, CLIPSegForImageSegmentation

processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

# Text prompts; one segmentation map is predicted per prompt.
prompts = [
    "street",
    "buildings",
    "people",
    "living things",
    "sidewalk",
    "shadow",
    "fence",
]

# The same image is paired with every prompt.
inputs = processor(
    text=prompts,
    images=[image] * len(prompts),
    padding="max_length",
    return_tensors="pt",
)

# predict
with torch.no_grad():
    outputs = model(**inputs)

preds = outputs.logits.unsqueeze(1)

# visualize prediction: the input image followed by one panel per prompt.
# The panel count follows len(prompts) instead of a fixed 4, so no prompt is
# silently left out.
n_panels = len(prompts) + 1
_, ax = plt.subplots(1, n_panels, figsize=(3 * n_panels, 4))
for panel in ax.flatten():
    panel.axis("off")
ax[0].imshow(image)
for idx, prompt in enumerate(prompts):
    ax[idx + 1].imshow(torch.sigmoid(preds[idx][0]))
    ax[idx + 1].text(0, -15, prompt)
plt.show()

In [None]:
# Load model directly
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation

# UperNet (Swin-small): preprocessor and model share one checkpoint id.
upernet_ckpt = "openmmlab/upernet-swin-small"
processor = AutoImageProcessor.from_pretrained(upernet_ckpt)
model = UperNetForSemanticSegmentation.from_pretrained(upernet_ckpt)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch

# Preprocess the current image into a tensor batch and report its shape.
pixel_values = processor(image, return_tensors="pt")["pixel_values"]
print(pixel_values.size())

# Forward pass only; gradient tracking is switched off.
with torch.no_grad():
    outputs = model(pixel_values)

  
def ade_palette():
    """ADE20K palette that maps each class to RGB values.

    Returns:
        A list of ``[R, G, B]`` triples. The position of a triple in the list
        is the class label it colours (the caller enumerates the list to pair
        labels with colours).
    """
    return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
            [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
            [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
            [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
            [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
            [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
            [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
            [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
            [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
            [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
            [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
            [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
            [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
            [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
            [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
            [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
            [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
            [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
            [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
            [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
            [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
            [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
            [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
            [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
            [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
            [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
            [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
            [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
            [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
            [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
            [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
            [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
            [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
            [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
            [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
            [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
            [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
            [102, 255, 0], [92, 0, 255]]

# Per-pixel class ids, resized to the original (height, width) of the image.
seg = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3
palette = np.array(ade_palette())
# Paint every pixel with the palette colour of its class; labels without a
# palette entry stay black.
for label, color in enumerate(palette):
    color_seg[seg == label, :] = color

# The palette, the PIL image and matplotlib all use RGB channel order, so the
# mask is blended as-is. Flipping it to BGR first would swap the red and blue
# channels of every class colour in the overlay.
# Show image + mask
img = np.array(image.convert("RGB")) * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)

# Display the blended overlay; without this the result is computed but
# never shown.
plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.axis("off")
plt.show()

In [None]:
from transformers import AutoProcessor, OneFormerForUniversalSegmentation

# OneFormer (ADE20K, Swin-large): preprocessor and model share one checkpoint id.
oneformer_ckpt = "shi-labs/oneformer_ade20k_swin_large"
processor = AutoProcessor.from_pretrained(oneformer_ckpt)
model = OneFormerForUniversalSegmentation.from_pretrained(oneformer_ckpt)

In [None]:
from super_gradients.training import models
from ultralytics import YOLO
import streamlit as st
from PIL import Image
import numpy as np

# Load a YOLO model
model = YOLO("yolov8n.pt")  # Or your custom model

# Run prediction on the notebook's test image. The original passed an
# undefined name `path`; image_path is the image location this notebook
# defines.
results1 = model(image_path)

# Convert results image to PIL Image for Streamlit. The predictions are read
# from `results1`, the name they were actually assigned to (`results` was
# never defined).
annotated_img = Image.fromarray(results1[0].plot()[..., ::-1])  # Convert BGR to RGB

# Display the image in Streamlit
st.image(annotated_img, caption="Annotated Image")

# Additionally load a COCO-pretrained YOLO-NAS-L model from super_gradients.
yolo_nas_l = models.get("yolo_nas_l", pretrained_weights="coco")

In [None]:
# NOTE(review): neither `show_result_pyplot` nor `result` is defined or
# imported anywhere in the visible notebook, so this call presumably fails
# with a NameError — confirm where they should come from, or remove the cell.
show_result_pyplot(model, img, result, score_thr=0.3)