In [1]:
import pandas as pd
import numpy as np
import os
import subprocess
from tqdm import tqdm

### Create CSV of Image Locations

In [2]:
NUM_CLASSES = 250

In [3]:
data_path = "/mnt/hdd2/pillutla/data/imagenet_captions"
save_path = f"/mnt/ssd/ronak/datasets/imagenet_captions_{NUM_CLASSES}k"

train_path = f"{data_path}/imagenet_captions_train.csv"
val_path = f"{data_path}/imagenet_captions_val.csv"

In [4]:
df_train = pd.read_csv(train_path, sep="\t")
df_test = pd.read_csv(val_path, sep="\t")

Load list of classes.

In [5]:
class_df = pd.read_csv("map_clsloc.txt", sep=" ", header=None)
class_df.columns = ["directory", "class_idx", "class_name"]
class_df

Unnamed: 0,directory,class_idx,class_name
0,n02119789,1,kit_fox
1,n02100735,2,English_setter
2,n02110185,3,Siberian_husky
3,n02096294,4,Australian_terrier
4,n02102040,5,English_springer
...,...,...,...
995,n03063599,996,coffee_mug
996,n04116512,997,rubber_eraser
997,n04325704,998,stole
998,n07831146,999,carbonara


Load GloVe embeddings.

In [6]:
# load class dictionary
embeddings = set()
fpath = "/mnt/ssd/ronak/datasets/glove/glove.6B.50d.txt"
with open(fpath, 'r', encoding="utf-8") as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        embeddings.add(word)

400000it [00:00, 403578.64it/s]


In [7]:
# find relevant directories
all_dirs = class_df['directory'].tolist()
all_classes = class_df['class_name'].tolist()

glove_dirs = set()
for class_name, dir_name in zip(all_classes, all_dirs):
    if class_name in embeddings:
        glove_dirs.add(dir_name)

print(f"Number of folders with GloVe embeddable captions: {len(glove_dirs)}/{len(all_dirs)}.")

Number of folders with GloVe embeddable captions: 539/1000.


In [8]:
# find classes with GloVe Embeddings.

images = df_train["filepath"].tolist()
all_train_images = [os.path.split(p)[0] for p in images]
train_images = []
for fpath in all_train_images:
    if fpath.split("/")[-1] in glove_dirs:
        train_images.append(fpath) 
train_folders, train_num_images_per_class = np.unique(train_images, return_counts=True)

images = df_test["filepath"].tolist()
all_test_images = [os.path.split(p)[0] for p in images]
test_images = []
for fpath in all_test_images:
    if fpath.split("/")[-1] in glove_dirs:
        test_images.append(fpath) 
test_folders, test_num_images_per_class = np.unique(test_images, return_counts=True)

print(f"number of classes (train): {len(train_num_images_per_class)}")
print(f"number of classes (test):  {len(test_num_images_per_class)}")

number of classes (train): 538
number of classes (test):  538


In [9]:
# find NUM_CLASSES classes with the smallest number of images
sub_ind = np.argsort(test_num_images_per_class)[-NUM_CLASSES:]
sub_folders = test_folders[sub_ind]

In [10]:
total = 0
for folder in sub_folders:
    idx = list(train_folders).index(folder)
    total += train_num_images_per_class[idx]
    print(train_num_images_per_class[idx])
print(f"Total: {total}")

503
599
537
488
564
445
488
562
436
574
505
518
596
482
561
424
493
582
603
424
487
538
455
588
441
610
460
540
611
522
588
532
586
727
567
470
586
539
478
687
488
479
544
652
557
583
537
681
501
688
508
610
666
509
461
483
617
576
638
571
496
650
638
635
556
546
623
654
625
534
596
507
633
581
572
816
634
775
517
545
701
590
513
502
479
671
550
709
641
642
445
527
596
540
738
511
594
696
605
496
515
680
751
646
703
541
634
588
617
580
594
690
535
496
594
556
596
524
545
623
587
657
590
775
759
558
753
672
663
489
812
577
817
801
725
713
439
840
692
682
821
613
730
790
633
781
844
709
543
798
777
634
634
543
685
895
760
733
837
721
840
674
716
890
743
743
805
743
905
682
697
676
788
682
827
850
798
746
900
711
711
779
731
800
811
783
725
868
844
707
734
783
698
915
660
932
846
784
885
787
907
968
888
605
947
784
907
823
964
966
848
859
802
802
836
843
868
904
1163
755
828
873
818
910
808
745
813
919
899
853
1014
949
795
918
962
1048
887
949
853
906
1002
912
975
1185
1109
1068
1262
1177

In [14]:
df_sub_train = df_train.loc[[folder in sub_folders for folder in all_train_images]].reset_index(drop=True)
df_sub_test = df_test.loc[[folder in sub_folders for folder in all_test_images]].reset_index(drop=True)

In [15]:
df_sub_train

Unnamed: 0,title,filepath
0,120-2060_IMG,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
1,127-2701_IMG,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
2,Tusker,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
3,Old Tusker,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
4,IMG_0031,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
...,...,...
174589,Royal Albatross (Northern),/mnt/hdd2/pillutla/data/imagenet_captions/imag...
174590,albatross,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
174591,mollymawk 3,/mnt/hdd2/pillutla/data/imagenet_captions/imag...
174592,Laysan albatross w/ chick,/mnt/hdd2/pillutla/data/imagenet_captions/imag...


In [16]:
superfolders_train = [f.split("/")[7] for f in df_sub_train['filepath'].tolist()]
superfolders_test = [f.split("/")[7] for f in df_sub_test['filepath'].tolist()]
for folder in tqdm(superfolders_train):
    os.makedirs(os.path.join(save_path, "imagenet_images_flickr", folder), exist_ok=True)
for folder in tqdm(superfolders_test):
    os.makedirs(os.path.join(save_path, "imagenet_images_flickr", folder), exist_ok=True)

100%|██████████| 174594/174594 [00:01<00:00, 130675.25it/s]
100%|██████████| 13635/13635 [00:00<00:00, 128679.90it/s]


In [17]:
superfolders_train

['n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n01871265',
 'n018

In [24]:
# copy files over
for folder in tqdm(np.unique(np.array(superfolders_train))):
    source_dir = os.path.join(data_path, "imagenet_images_flickr", folder)
    target_dir = os.path.join(save_path, "imagenet_images_flickr")
    subprocess.run(['cp', "-r", source_dir, target_dir], check=True)

  0%|          | 0/250 [00:00<?, ?it/s]

100%|██████████| 250/250 [02:15<00:00,  1.84it/s]


In [25]:
# rewrite filepath to correct destination
df_sub_train['filepath'] = df_sub_train['filepath'].map(lambda x: os.path.join(save_path, x.split(data_path + "/")[-1]))
df_sub_test['filepath'] = df_sub_test['filepath'].map(lambda x: os.path.join(save_path, x.split(data_path + "/")[-1]))

In [26]:
df_sub_train.to_csv(f"{save_path}/imagenet_captions_train_c{NUM_CLASSES}.csv", sep="\t", index=False)
df_sub_test.to_csv(f"{save_path}/imagenet_captions_val_c{NUM_CLASSES}.csv", sep="\t", index=False)

In [27]:
images = df_sub_train["filepath"].tolist()
sub_train_images = [os.path.split(p)[0] for p in images]
np.unique(sub_train_images, return_counts=True)

(array(['/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01498041',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01514668',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01514859',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01518878',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01531178',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01534433',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01558993',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01560419',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01580077',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01582220',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01592084',

In [28]:
images = df_sub_test["filepath"].tolist()
sub_test_images = [os.path.split(p)[0] for p in images]
np.unique(sub_test_images, return_counts=True)

(array(['/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01498041',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01514668',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01514859',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01518878',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01531178',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01534433',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01558993',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01560419',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01580077',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01582220',
        '/mnt/ssd/ronak/datasets/imagenet_captions_250k/imagenet_images_flickr/n01592084',