In [7]:
import pandas as pd
import glob
from pathlib import Path
import os
from natsort import natsorted
from PIL import Image
from utils import image_grid
import random
from imagededup.methods import PHash

In [3]:
def parse_answer(text):
    """Convert a yes/no answer string into a boolean.

    The comparison ignores letter case and every kind of whitespace
    (spaces, tabs, line breaks), so an answer with a trailing newline or
    surrounding padding is still recognised.

    Args:
        text: Answer string, expected to read "yes" or "no".

    Returns:
        True for "yes", False for "no".

    Raises:
        ValueError: If the normalized text is neither "yes" nor "no". The
            message contains the offending value to make bad rows easy to find.
    """
    # str.split() without arguments splits on any whitespace run, so joining
    # the pieces removes tabs and line breaks as well as plain spaces.
    normalized = "".join(text.split()).lower()
    if normalized == "yes":
        return True
    if normalized == "no":
        return False
    raise ValueError(f"expected 'yes' or 'no', got {text!r}")

In [28]:
# Load both result files and stack them into a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
df1 = pd.read_csv("./llava_result1.csv")
df2 = pd.read_csv("./llava_result2.csv")
df = pd.concat([df1, df2], ignore_index=True)

# Maps the CSV columns named "0".."5" to descriptive boolean flag names.
name_mapping = {
    "0": "has_subject",
    "1": "is_abstract",
    "2": "is_birdeye_view",
    "3": "is_closeup_shot",
    "4": "is_macro_shot",
    "5": "looking_sky",
}

# Rewrite the stored image paths so they are relative to the current directory.
df["img_path"] = df["img_path"].str.replace("collocation-mturk/", "./", regex=False)

# Iterate the mapping itself so the loop cannot drift out of sync with it.
# Each raw answer column is parsed into a boolean column under its new name
# (appended at the end), then the raw column is dropped.
for raw_col, flag_name in name_mapping.items():
    df[flag_name] = df[raw_col].apply(parse_answer)
    df = df.drop(columns=raw_col)
df.head()

Unnamed: 0,id,source,img_path,has_subject,is_abstract,is_birdeye_view,is_closeup_shot,is_macro_shot,looking_sky
0,0,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0801.png,True,False,False,True,False,False
1,1,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0802.png,True,False,False,True,True,False
2,2,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0803.png,True,False,False,True,True,False
3,3,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0804.png,True,False,False,False,False,False
4,4,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0805.png,True,False,False,True,False,False


## get unique images

In [8]:
root = "./data_preprocessed"
# Only sub-directories of root are hashed; plain files sitting directly under
# root are skipped because encode_images is called with image_dir=<entry>.
dirs = natsorted([entry for entry in glob.glob(os.path.join(root, "*")) if os.path.isdir(entry)])

phasher = PHash()
# Maps "<directory>/<key returned by encode_images>" to that image's hash.
# Starts as an empty dict so it is a valid mapping even when no directory is found.
encodings = {}
for image_dir in dirs:  # not named `dir`, which would shadow the builtin
    print(image_dir)
    dir_encodings = phasher.encode_images(image_dir=image_dir)
    # Prefix every key with its directory so the combined keys are full paths
    # and entries from different directories cannot collide.
    encodings.update(
        {os.path.join(image_dir, filename): phash for filename, phash in dir_encodings.items()}
    )

2024-07-22 04:04:40,317: INFO Start: Calculating hashes...


./data_preprocessed/DIV2K_valid_HR


100%|██████████| 100/100 [00:05<00:00, 19.60it/s]
2024-07-22 04:04:46,453: INFO End: Calculating hashes!
2024-07-22 04:04:46,492: INFO Start: Calculating hashes...


./data_preprocessed/Flickr2K


100%|██████████| 2650/2650 [02:51<00:00, 15.49it/s]  
2024-07-22 04:07:38,635: INFO End: Calculating hashes!
2024-07-22 04:07:38,641: INFO Start: Calculating hashes...


./data_preprocessed/Flickr1024_val


100%|██████████| 112/112 [00:06<00:00, 16.44it/s]
2024-07-22 04:07:46,481: INFO End: Calculating hashes!
2024-07-22 04:07:46,490: INFO Start: Calculating hashes...


./data_preprocessed/HRWSI_val


100%|██████████| 400/400 [00:27<00:00, 14.52it/s]
2024-07-22 04:08:15,083: INFO End: Calculating hashes!


In [9]:
# For every hashed image, look up the list of images considered its duplicates.
duplicates = phasher.find_duplicates(encoding_map=encodings)

img_unique = []
excluded = set()
for img_key, dup_keys in duplicates.items():
    # An image is kept when it has no duplicates at all, or when no
    # previously kept image has already listed it as a duplicate.
    if len(dup_keys) == 0 or img_key not in excluded:
        # Everything that duplicates a kept image is barred from being kept
        # later (a no-op when the list is empty).
        excluded.update(dup_keys)
        img_unique.append(img_key)

print(len(img_unique))
print(len(duplicates))

2024-07-22 04:10:29,301: INFO Start: Evaluating hamming distances for getting duplicates
2024-07-22 04:10:29,303: INFO Start: Retrieving duplicates using Cython Brute force algorithm
100%|██████████| 3262/3262 [00:00<00:00, 9690.90it/s]
2024-07-22 04:10:30,702: INFO End: Retrieving duplicates using Cython Brute force algorithm
2024-07-22 04:10:30,703: INFO End: Evaluating hamming distances for getting duplicates


3200
3262


In [30]:
# Keep only the rows whose image path is in the de-duplicated list.
is_unique_image = df["img_path"].isin(img_unique)
unique_df = df.loc[is_unique_image]
len(unique_df)

3200

## filter and categorize

In [38]:
# A row passes only if none of the four disqualifying flags is set.
mask = (
    ~unique_df["is_abstract"]
    & ~unique_df["is_birdeye_view"]
    & ~unique_df["is_macro_shot"]
    & ~unique_df["looking_sky"]
)

# Renumber the surviving rows from 0 so later index-based operations are unambiguous.
data_df = unique_df.loc[mask].reset_index(drop=True)
data_df.head()

Unnamed: 0,id,source,img_path,has_subject,is_abstract,is_birdeye_view,is_closeup_shot,is_macro_shot,looking_sky
0,0,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0801.png,True,False,False,True,False,False
1,3,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0804.png,True,False,False,False,False,False
2,4,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0805.png,True,False,False,True,False,False
3,5,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0806.png,True,False,False,False,False,False
4,6,DIV2K_valid_HR,./data_preprocessed/DIV2K_valid_HR/0807.png,True,False,False,False,False,False


In [45]:
# Draw a reproducible 5% sample as the small split; the remaining rows form
# the big split, so the two splits are disjoint.
small_data_df = data_df.sample(frac=0.05, random_state=0)
big_data_df = data_df.drop(index=small_data_df.index)

for split_df in (small_data_df, big_data_df):
    print(len(split_df))

# Write each split to its CSV, creating the target directory if needed.
for split_df, save_path in (
    (small_data_df, "./dataset/rev1/small_dataset.csv"),
    (big_data_df, "./dataset/rev1/big_dataset.csv"),
):
    os.makedirs(Path(save_path).parent, exist_ok=True)
    split_df.to_csv(save_path, index=False)

140
2667


In [None]:
# Preview the first rows of the small split, like the .head() previews used
# for the other frames, instead of displaying the whole frame.
small_data_df.head()