In [1]:
import argparse
import json
import os
import random
from io import BytesIO
from pathlib import Path

import PIL
import requests
import torch
import torch.nn.functional as F
import yaml
from PIL import Image
from PIL import ImageOps
from transformers import CLIPProcessor, CLIPModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier shared by the processor and the model loaded below.
model_name = "openai/clip-vit-large-patch14"
# Preprocessor; later cells call it with either text=... or images=... to build model inputs.
processor = CLIPProcessor.from_pretrained(model_name)
# CLIP model; later cells call get_text_features / get_image_features on it.
model = CLIPModel.from_pretrained(model_name)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [15]:
# Directory whose files are listed into all_image_paths.
input_dir = "./larger-set/japan"
# Destination directories for the train / test split copied by a later cell.
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run unchanged on another machine. Consider making them relative/configurable.
train_dir = "/usr1/manhbaon/hw/img2img-turbo/data/food_extended/train_B"
test_dir = "/usr1/manhbaon/hw/img2img-turbo/data/food_extended/test_B"
# Text prompt whose CLIP text embedding every image embedding is compared against.
prompt_filter = "This is clearly a photo centered on an authentic Japanese food dish. Typical food from Japan."

In [16]:
def list_files(directory):
    """Return the absolute paths of the regular files directly inside `directory`.

    Sub-directories are ignored (no recursion). Paths are resolved to absolute
    strings and returned in sorted order: `Path.iterdir()` yields entries in
    arbitrary, filesystem-dependent order, and a stable order is needed for
    batching and train/test splitting to be reproducible between runs.

    Args:
        directory: Directory to scan (str or path-like).

    Returns:
        Sorted list of absolute path strings, one per regular file.
    """
    return sorted(
        str(entry.resolve())
        for entry in Path(directory).iterdir()
        if entry.is_file()
    )

def load_image(path):
    """Open an image file and return it upright in RGB mode.

    The EXIF orientation tag is applied via `ImageOps.exif_transpose` before
    the image is converted to "RGB".

    `ImageOps` is referenced through its own import instead of `PIL.ImageOps`:
    the attribute only exists on the `PIL` package once something has imported
    that submodule, so the old spelling depended on another library's imports.
    The file is opened in a `with` block so the underlying handle is closed
    before returning; the converted image is a separate in-memory object.

    Args:
        path: Path of the image file to read.

    Returns:
        The image produced by `.convert("RGB")`.

    Raises:
        Whatever `Image.open`, `exif_transpose` or `convert` raise for
        unreadable or non-image files.
    """
    with Image.open(path) as raw_image:
        upright_image = ImageOps.exif_transpose(raw_image)
        return upright_image.convert("RGB")

def load_batch_image(list_path):
    """Load every readable image from `list_path`, skipping the ones that fail.

    Loading is best-effort: a path whose `load_image` call raises an ordinary
    exception is silently left out. Only `Exception` subclasses are caught, so
    `KeyboardInterrupt` and `SystemExit` still propagate and a long-running
    loop can be interrupted.

    Args:
        list_path: Iterable of image paths.

    Returns:
        `(images, paths)`: two lists of equal length, where `paths[k]` is the
        path `images[k]` was loaded from, in input order.
    """
    images = []
    paths = []
    for path in list_path:
        try:
            image = load_image(path)
        except Exception:
            # Unreadable or non-image file: skip it and keep the two lists aligned.
            continue
        images.append(image)
        paths.append(path)
    return images, paths

In [17]:
# Absolute paths of every regular file found directly inside input_dir.
all_image_paths = list_files(input_dir)

# Accumulators filled by the scoring loop: the paths of the images that could
# be loaded, and their cosine similarity to the prompt_filter text embedding.
selected_image_paths = [] 
selected_sims = []

In [9]:
batch_size = 16

# Start from empty results so that re-running this cell (for example after an
# interrupt) never appends duplicates to what a previous run collected.
selected_image_paths = []
selected_sims = []

# Run on whatever device the model currently lives on. Inputs are moved to that
# device; the original instead called .cuda() on the *outputs*, which required
# a GPU without actually using it for the forward pass.
device = next(model.parameters()).device

# Inference only: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    # The prompt is identical for every image, so encode it once. The resulting
    # (1, dim) tensor broadcasts against each (n, dim) batch of image features.
    prompt_inputs = processor(text = [prompt_filter], images = None, return_tensors="pt", padding = True)
    prompt_inputs = {key: value.to(device) for key, value in prompt_inputs.items()}
    prompt_features = model.get_text_features(**prompt_inputs)

    for i in range(0, len(all_image_paths), batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        images_batch, images_path = load_batch_image(all_image_paths[i:i + batch_size])
        if not images_batch:
            # Every file in this slice failed to load; nothing to score.
            continue

        image_inputs = processor(text = None, images = images_batch, return_tensors = "pt", padding = True)
        image_inputs = {key: value.to(device) for key, value in image_inputs.items()}
        image_features = model.get_image_features(**image_inputs)

        batch_sims = F.cosine_similarity(prompt_features, image_features).cpu().tolist()

        # Record paths and scores together, only after scoring succeeded, so the
        # two lists stay index-aligned even if the loop is interrupted.
        selected_image_paths.extend(images_path)
        selected_sims.extend(batch_sims)

        print(f" {len(images_batch)} processed !")


 16 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 15 processed !
 16 processed !
 16 processed !
 14 processed !
 16 processed !
 14 processed !
 15 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 15 processed !
 15 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 15 processed !
 15 processed !
 16 processed !
 14 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !




 16 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 14 processed !
 16 processed !
 15 processed !
 16 processed !
 15 processed !
 15 processed !
 16 processed !
 16 processed !
 16 processed !
 15 processed !
 15 processed !
 16 processed !
 15 processed !
 16 processed !
 16 processed !
 15 processed !
 15 processed !
 15 processed !
 15 processed !
 16 processed !
 15 processed !
 15 processed !
 16 processed !
 16 processed !
 15 processed !
 16 processed !
 16 processed !
 15 processed !
 16 processed !
 15 processed !
 16 processed !
 14 processed !
 16 processed !
 16 processed !
 15 processed !
 15 processed !
 16 processed !
 16 processed !
 15 processed !
 16 processed !
 16 processed !
 16 processed !
 16 processed !
 15 processed !
 16 processed !
 15 processed !
 14 processed !
 15 processed !
 16 processed !
 16 processed !
 15 processed !
 14 processed !
 16 processed !
 16 processed !
 13 processed !
 15 processed !
 16 processed !
 16 proc

KeyboardInterrupt: 

In [26]:
# Keep the `topk` images most similar to the prompt.
topk = 2_000
# Only rank entries that have BOTH a path and a score. If the scoring loop was
# interrupted, selected_image_paths can be longer than selected_sims, and
# indexing the scores by range(len(selected_image_paths)) raises IndexError.
num_scored = min(len(selected_image_paths), len(selected_sims))
# Indices ordered by descending similarity; ties keep their original order.
topk_indices = sorted(range(num_scored), key=selected_sims.__getitem__, reverse=True)[:topk]
topk_paths = [selected_image_paths[i] for i in topk_indices]

In [18]:
import random

def get_train_test_paths(paths, split_ratio = 0.9, seed = None):
    """Randomly split `paths` into a train part and a test part.

    A shuffled *copy* is split, so the caller's list is left untouched (the
    previous implementation shuffled the argument in place).

    Args:
        paths: Sequence of items to split.
        split_ratio: Fraction of items that go to the train set; the split
            index is `int(len(paths) * split_ratio)`.
        seed: Optional seed for a reproducible split. When None, the global
            `random` module state is used, as before.

    Returns:
        `(train_set, test_set)`: two lists that together contain every item
        of `paths` exactly once.
    """
    rng = random if seed is None else random.Random(seed)
    shuffled = list(paths)
    rng.shuffle(shuffled)
    split_index = int(len(shuffled) * split_ratio)
    return shuffled[:split_index], shuffled[split_index:]

In [19]:
import shutil 

# NOTE(review): the split is taken over all_image_paths, not over the
# CLIP-filtered topk_paths computed earlier. Confirm that this is intended.
train_paths, test_paths = get_train_test_paths(all_image_paths)

for split_dir, split_paths in ((train_dir, train_paths), (test_dir, test_paths)):
    # shutil.copy() treats a non-existent destination as a *file name*: without
    # this, every image would be written over a single file called like the
    # directory. Create the directory first so each file is copied into it.
    os.makedirs(split_dir, exist_ok=True)
    for image_path in split_paths:
        shutil.copy(image_path, split_dir)


In [20]:
# Number of regular files that list_files() found in input_dir.
len(all_image_paths)

3587

In [32]:
import pandas as pd

# CSV with one row per image; the loop below reads its "src_image_path",
# "caption" and "llm_edit" columns.
path_captions = "./larger-set/japan-india/metadata.csv" 

# Load the CSV file into a DataFrame
df = pd.read_csv(path_captions)

# Maps an image file name to its original and edited captions.
idx2captions = {}

def extract_id_from_path(path:str):
    """Return the part of `path` that follows its last '/'.

    A path without any '/' is returned unchanged; a path ending in '/' yields
    an empty string.
    """
    _, _, last_component = path.rpartition('/')
    return last_component


# Build the file-name -> captions lookup. Iterating over the three columns in
# parallel avoids DataFrame.iterrows(), which builds a Series for every row.
# If two rows share a file name, the later row overwrites the earlier one.
for src_image_path, caption_original, caption_edited in zip(
    df["src_image_path"], df["caption"], df["llm_edit"]
):
    filename = extract_id_from_path(src_image_path)
    idx2captions[filename] = {
        # Missing cells are read as float NaN, which json.dump would emit as the
        # non-standard token NaN; store None so the output is valid JSON (null).
        "cap_original": None if pd.isna(caption_original) else caption_original,
        "cap_edited": None if pd.isna(caption_edited) else caption_edited,
    }

In [33]:
# Destination file for the file-name -> captions mapping.
output_dict = "./larger-set/captions_b.json"

# Serialise the whole mapping to a JSON string and write it out in one call.
with open(output_dict, "w") as json_file:
    json_file.write(json.dumps(idx2captions))