In [None]:
import os
from urllib.request import urlopen

import torch
import torch.nn.functional as F
from open_clip import create_model_from_pretrained, get_tokenizer
from PIL import Image
from tqdm.notebook import tqdm

tensor(14.2849, grad_fn=<ExpBackward0>)

In [None]:
# Zero-shot classification of hotel photos with a CLIP model: every image in
# `image_folder` is scored against the text prompts in `labels_list`.

# Initialize the model and tokenizer
model, preprocess = create_model_from_pretrained("hf-hub:apple/DFN5B-CLIP-ViT-H-14-384")
tokenizer = get_tokenizer("ViT-H-14")
model.eval()
# Define the folder containing the images
image_folder = "public_test"

# Upper bound on the number of images that are classified. The value 1 keeps
# this cell a quick single-image smoke test; set it to None to process the
# whole folder.
max_images = 1

labels_list = [
    "Hotel exterior, outdoor area, or building facade",
    "Hotel room, living space, or bedroom with furniture",
    "Swimming pool or hotel pool area",
    "Billiard table, pool table, or game room",
    "Bathroom with toilet, shower, sink, or bath amenities",
    "Hotel restaurant, dining room, or eating area",
    "Hotel lobby, reception area, or entrance hall",
    "Beachfront, shoreline, or sandy beach area",
    "Corridors, hallways, or staircases in the hotel",
    "Food dishes, meals on plates, or table settings",
    "Conference room, meeting room, or seminar space",
    "Gym, fitness center, or exercise equipment area",
    "Balcony view, outdoor balcony, or terrace",
    "Terrace, patio, or outdoor courtyard",
    "Spa, sauna, wellness center, or relaxation area",
]
text = tokenizer(labels_list, context_length=model.context_length)

# The label embeddings do not depend on the image, so they are encoded and
# normalized exactly once. Doing it under no_grad avoids building an autograd
# graph that would otherwise be kept alive for the whole loop.
with torch.no_grad():
    text_features = F.normalize(model.encode_text(text), dim=-1)

# List to store the results
results = []

# Iterate over each image in the folder
for image_file in tqdm(os.listdir(image_folder)):
    if max_images is not None and len(results) >= max_images:
        break
    if not image_file.lower().endswith((".png", ".jpg", ".jpeg")):
        continue

    image_path = os.path.join(image_folder, image_file)
    try:
        # Open and preprocess the image; the context manager releases the
        # file handle as soon as the tensor has been produced.
        with Image.open(image_path) as raw_image:
            image = preprocess(raw_image).unsqueeze(0)

        # Perform inference. Autocast is only enabled when CUDA is present,
        # which mirrors the effective behavior of the deprecated
        # torch.cuda.amp.autocast() without its warning.
        with torch.no_grad(), torch.autocast(
            device_type="cuda", enabled=torch.cuda.is_available()
        ):
            image_features = F.normalize(model.encode_image(image), dim=-1)

            # Scaled cosine similarities, turned into a distribution over labels
            logits = model.logit_scale.exp() * image_features @ text_features.T
            probs = logits.softmax(dim=-1)

        # Zip the labels with their corresponding probabilities
        # and store in the results list
        zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in probs[0]]))
        results.append({"image_file": image_file, "label_probabilities": zipped_list})

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {image_file}: {e}")

# # At this point, `results` contains the label probabilities for the processed
# # images (at most `max_images` of them)
# # Example usage: print the results
# for result in results:
#     print(f"Label probabilities for {result['image_file']}: {result['label_probabilities']}")

  checkpoint = torch.load(checkpoint_path, map_location=map_location)


  0%|          | 0/1124 [00:00<?, ?it/s]

  with torch.no_grad(), torch.cuda.amp.autocast():


In [None]:
# Display the collected results: a list holding one dict per processed image,
# each with the keys "image_file" and "label_probabilities".
results

[{'image_file': 'public_test_432.jpg',
  'label_probabilities': [('Hotel exterior, outdoor area, or building facade',
    0.04),
   ('Hotel room, living space, or bedroom with furniture', 0.158),
   ('Swimming pool or hotel pool area', 0.033),
   ('Billiard table, pool table, or game room', 0.026),
   ('Bathroom with toilet, shower, sink, or bath amenities', 0.107),
   ('Hotel restaurant, dining room, or eating area', 0.032),
   ('Hotel lobby, reception area, or entrance hall', 0.081),
   ('Beachfront, shoreline, or sandy beach area', 0.018),
   ('Corridors, hallways, or staircases in the hotel', 0.143),
   ('Food dishes, meals on plates, or table settings', 0.008),
   ('Conference room, meeting room, or seminar space', 0.069),
   ('Gym, fitness center, or exercise equipment area', 0.062),
   ('Balcony view, outdoor balcony, or terrace', 0.067),
   ('Terrace, patio, or outdoor courtyard', 0.048),
   ('Spa, sauna, wellness center, or relaxation area', 0.109)]}]

In [None]:
# `results` is a list of per-image dicts, so an entry has to be selected by
# position before its "label_probabilities" can be read.
results[0]["label_probabilities"]

TypeError: list indices must be integers or slices, not str

In [None]:
# Initialize the model and tokenizer, and move model to GPU
# (falls back to the CPU when CUDA is not available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, preprocess = create_model_from_pretrained("hf-hub:apple/DFN5B-CLIP-ViT-H-14-384")
model = model.to(device)
# Inference only: switch to eval mode, as the single-image cell does.
model.eval()
tokenizer = get_tokenizer("ViT-H-14")

# Folder that is scanned for images to classify
image_folder = "public_test"

# Text prompts the images are scored against, one per candidate category
labels_list = [
    "Hotel exterior, outdoor area, or building facade",
    "Hotel room, living space, or bedroom with furniture",
    "Swimming pool or hotel pool area",
    "Billiard table, pool table, or game room",
    "Bathroom with toilet, shower, sink, or bath amenities",
    "Hotel restaurant, dining room, or eating area",
    "Hotel lobby, reception area, or entrance hall",
    "Beachfront, shoreline, or sandy beach area",
    "Corridors, hallways, or staircases in the hotel",
    "Food dishes, meals on plates, or table settings",
    "Conference room, meeting room, or seminar space",
    "Gym, fitness center, or exercise equipment area",
    "Balcony view, outdoor balcony, or terrace",
    "Terrace, patio, or outdoor courtyard",
    "Spa, sauna, wellness center, or relaxation area",
]
# Tokenized prompts live on the same device as the model
text = tokenizer(labels_list, context_length=model.context_length).to(device)

# Number of images sent through the image encoder per forward pass
batch_size = 16

# List to store the results
results = []

# Preprocess the images and prepare them in batches


def load_images(image_folder):
    """Load and preprocess every image found directly inside ``image_folder``.

    Only file names ending in ``.png``, ``.jpg`` or ``.jpeg`` (compared
    case-insensitively) are considered. Files are visited in sorted name order
    so the result order is reproducible; ``os.listdir`` alone returns entries
    in arbitrary order. A file that cannot be opened or preprocessed is
    reported on stdout and skipped.

    Args:
        image_folder: Path of the directory to scan (not recursive).

    Returns:
        A ``(image_files, images)`` pair of equal-length lists: the names of
        the files that loaded successfully and, in the same order, the outputs
        of the notebook-level ``preprocess`` transform for those files.
    """
    image_files = []
    images = []

    for image_file in sorted(os.listdir(image_folder)):
        if not image_file.lower().endswith((".png", ".jpg", ".jpeg")):
            continue

        image_path = os.path.join(image_folder, image_file)
        try:
            # The context manager closes the underlying file as soon as the
            # image has been preprocessed, so no handle stays open per file.
            with Image.open(image_path) as image:
                processed = preprocess(image)
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Error processing {image_file}: {e}")
            continue

        images.append(processed)
        image_files.append(image_file)

    return image_files, images


# Load images and preprocess
image_files, images = load_images(image_folder)

# The label embeddings are the same for every batch, so they are encoded and
# normalized once here instead of once per batch.
with torch.no_grad(), torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
    text_features = F.normalize(model.encode_text(text), dim=-1)

# Not every CLIP checkpoint carries a learned logit bias: depending on the
# model the attribute can be missing or None. Fall back to "no bias" in both
# cases instead of failing on the addition below.
logit_bias = getattr(model, "logit_bias", None)

# Split the images into batches
for i in tqdm(range(0, len(images), batch_size)):
    batch_files = image_files[i : i + batch_size]

    # Stack images into a single tensor and move to GPU
    batch_images = torch.stack(images[i : i + batch_size]).to(device)

    # Perform inference on the batch. torch.autocast replaces the deprecated
    # torch.cuda.amp.autocast(); it is only enabled when CUDA is present.
    with torch.no_grad(), torch.autocast(
        device_type="cuda", enabled=torch.cuda.is_available()
    ):
        image_features = F.normalize(model.encode_image(batch_images), dim=-1)

        # Calculate probabilities
        # NOTE(review): sigmoid scores each label independently, so a row does
        # not sum to 1 (unlike the softmax in the single-image cell) -- confirm
        # this is the intended scoring for this checkpoint.
        logits = image_features @ text_features.T * model.logit_scale.exp()
        if logit_bias is not None:
            logits = logits + logit_bias
        text_probs = torch.sigmoid(logits)

    # Store the results for the current batch
    for j, file_name in enumerate(batch_files):
        zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[j]]))
        results.append({"image_file": file_name, "label_probabilities": zipped_list})

# # At this point, `results` contains the label probabilities for all processed images
# # Example usage: print the results
# for result in results:
#     print(f"Label probabilities for {result['image_file']}: {result['label_probabilities']}")

  checkpoint = torch.load(checkpoint_path, map_location=map_location)


KeyboardInterrupt: 

In [None]:
# Minimal zero-shot example: score one image downloaded from a URL against a
# handful of text labels, entirely on the CPU.
model, preprocess = create_model_from_pretrained(
    "hf-hub:apple/DFN5B-CLIP-ViT-H-14-384", device="cpu"
)
# Inference only: switch to eval mode, as the folder-classification cell does.
model.eval()
tokenizer = get_tokenizer("ViT-H-14")

# Download and preprocess inside the `with` block so the HTTP response is
# closed once the image tensor has been produced.
with urlopen(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"
) as response:
    image = preprocess(Image.open(response)).unsqueeze(0)

labels_list = ["a dog", "a cat", "a donut", "a beignet"]
text = tokenizer(labels_list, context_length=model.context_length)

# torch.autocast replaces the deprecated torch.cuda.amp.autocast(); it is only
# enabled when CUDA is present.
with torch.no_grad(), torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    print(model.logit_scale.exp())

    # Softmax over the label axis; `dim` is mandatory for torch.softmax.
    # This assignment was previously commented out, which left `text_probs`
    # undefined for the lines below.
    text_probs = torch.softmax(
        image_features @ text_features.T * model.logit_scale.exp(), dim=-1
    )

zipped_list = list(zip(labels_list, [round(p.item(), 3) for p in text_probs[0]]))
print("Label probabilities: ", zipped_list)

  checkpoint = torch.load(checkpoint_path, map_location=map_location)
  with torch.no_grad(), torch.cuda.amp.autocast():


tensor(14.2849)


NameError: name 'text_probs' is not defined