In [1]:
import pandas as pd
import numpy as np
import os
import subprocess
from tqdm import tqdm

### Create CSV of Image Locations

In [2]:
NUM_CLASSES = 10

In [3]:
data_path = "/mnt/hdd2/pillutla/data/imagenet_captions"
save_path = "/mnt/ssd/ronak/datasets/imagenet_captions"

train_path = f"{data_path}/imagenet_captions_train.csv"
val_path = f"{data_path}/imagenet_captions_val.csv"

In [4]:
df_train = pd.read_csv(train_path, sep="\t")
df_test = pd.read_csv(val_path, sep="\t")

In [5]:
images = df_train["filepath"].tolist()
train_images = [os.path.split(p)[0] for p in images]
train_folders, train_num_images_per_class = np.unique(train_images, return_counts=True)

images = df_test["filepath"].tolist()
test_images = [os.path.split(p)[0] for p in images]
test_folders, test_num_images_per_class = np.unique(test_images, return_counts=True)

In [6]:
# find 10 classes with the smallest number of images
sub_ind = np.argsort(test_num_images_per_class)[-NUM_CLASSES:]
sub_folders = test_folders[sub_ind]

In [7]:
for folder in sub_folders:
    idx = list(train_folders).index(folder)
    print(train_num_images_per_class[idx])

1185
1068
1150
1262
1150
1177
1524
1301
1120
1281


In [8]:
df_sub_train = df_train.loc[[folder in sub_folders for folder in train_images]].reset_index(drop=True)
df_sub_test = df_test.loc[[folder in sub_folders for folder in test_images]].reset_index(drop=True)

In [9]:
df_sub_train

Unnamed: 0,title,filepath
0,AL0226C-010-Meerkat-m,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
1,Erdmännchen / Meerkat,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
2,Meerkat,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
3,Ringtail Lemur,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
4,meerkat,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
...,...,...
12213,IMG_3457 Wallace & Diesel,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
12214,Terry and Posey,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
12215,thug,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
12216,DSCF2785.JPG,/mnt/hdd2/pillutla/data/imagenet_captions/imag...


In [10]:
superfolders_train = [f.split("/")[7] for f in df_sub_train['filepath'].tolist()]
superfolders_test = [f.split("/")[7] for f in df_sub_test['filepath'].tolist()]
for folder in tqdm(superfolders_train):
    os.makedirs(os.path.join(save_path, "imagenet_images_flickr", folder), exist_ok=True)
for folder in tqdm(superfolders_test):
    os.makedirs(os.path.join(save_path, "imagenet_images_flickr", folder), exist_ok=True)

100%|██████████| 12218/12218 [00:00<00:00, 98992.43it/s]
100%|██████████| 940/940 [00:00<00:00, 88830.34it/s]


In [12]:
# copy files over
for fp in tqdm(df_sub_train['filepath'].tolist()):
    subprocess.run(['cp', fp, os.path.join(save_path, "imagenet_images_flickr", fp.split("/")[7])], check=True)

100%|██████████| 12218/12218 [02:55<00:00, 69.59it/s]


In [13]:
# copy files over
for fp in tqdm(df_sub_test['filepath'].tolist()):
    subprocess.run(['cp', fp, os.path.join(save_path, "imagenet_images_flickr", fp.split("/")[7])], check=True)

100%|██████████| 940/940 [00:14<00:00, 64.60it/s]


In [14]:
# rewrite filepath to correct destination
df_sub_train['filepath'] = df_sub_train['filepath'].map(lambda x: os.path.join(save_path, x.split(data_path + "/")[-1]))
df_sub_test['filepath'] = df_sub_test['filepath'].map(lambda x: os.path.join(save_path, x.split(data_path + "/")[-1]))

In [15]:
df_sub_train.to_csv(f"{save_path}/imagenet_captions_train_c{NUM_CLASSES}.csv", sep="\t", index=False)
df_sub_test.to_csv(f"{save_path}/imagenet_captions_val_c{NUM_CLASSES}.csv", sep="\t", index=False)

In [None]:
images = df_sub_train["filepath"].tolist()
sub_train_images = [os.path.split(p)[0] for p in images]
np.unique(sub_train_images, return_counts=True)

In [None]:
images = df_sub_test["filepath"].tolist()
sub_test_images = [os.path.split(p)[0] for p in images]
np.unique(sub_test_images, return_counts=True)