## Plan
- v1.0.0 -> images/class/image1.jpeg
- in v2.0.0 we will add another upper dir to specify the image's split before class name -> images/train/class/image1.jpeg (note: this also helps deepchecks to work correctly)
- update 'abs_path' in annotation_df.csv accordingly
- in v1.0.0 we also have mixes of 3 image extensions: .jpeg, .jpg, .png
- in v2.0.0 we will convert them into only 1 extension for convention 

In [27]:
import os
import cv2
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

In [28]:
def cvt_img_to_ext(img_path, out_ext='jpeg'):
    ori_ext = img_path[img_path.rfind('.')+1:]
    im = Image.open(img_path)
    im = im.convert('RGB')
    new_img_path = img_path.replace(ori_ext, out_ext)
    im.save(new_img_path)
    return new_img_path

In [29]:
ds_path = '/home/ariya/workspace/datasets/animals10-dvc/images/'

In [33]:
## check exts
all_names = []
not_unique = []
for abs_path, subdirs, files in os.walk(ds_path):
    for file in files:
        if file in all_names:
            not_unique.append(os.path.join(abs_path,file))
        all_names.append(file)

['/home/ariya/workspace/datasets/animals10-dvc/images/chicken/10.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/100.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1001.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1013.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1016.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1017.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1018.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1021.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1024.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1028.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/103.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1030.jpeg',
 '/home/ariya/workspace/datasets/animals10-dvc/images/chicken/1032.jpeg',
 '/home/ariya/workspace/datasets/animals10

In [18]:
len(all_names), len(not_unique)

(26179, 339)

In [19]:
np.unique(np.array(list(map(lambda x: x[x.rfind('.'):], all_names))), return_counts=True)

(array(['.jpeg', '.jpg', '.png'], dtype='<U5'), array([24209,  1919,    51]))

## Do them all in one go
since the majority of images are in .jpeg, so we will convert the rest of them to .jpeg

In [20]:
annot_path = '/home/ariya/workspace/datasets/animals10-dvc/annotation_df.csv'
annot_df = pd.read_csv(annot_path)

In [21]:
splits = annot_df['split'].unique()
for split in splits:
    os.makedirs(os.path.join(ds_path, split), exist_ok=True) 

In [35]:
new_image_name_col = []
new_abs_path_col = []
for idx, row in tqdm(annot_df.iterrows(), total=len(annot_df)):
    split = row['split']
    # img_abs_path = row['abs_path']
    img_abs_path = not_unique
    if not img_abs_path.endswith('.jpeg'):
        img_abs_path = cvt_img_to_ext(img_abs_path, 'jpeg')
    # split_path = img_abs_path.split(os.path.sep)
    # split_path.insert(-2, split)
    # # .../images/chicken/928_renamed.jpeg -> .../images/train/chicken/928_renamed.jpeg
    # new_img_name = split_path[-1]
    # new_out_dir = '/' + os.path.join(*split_path[:-1])
    # new_abs_path = '/' + os.path.join(*split_path)
    # if not os.path.exists(new_out_dir):
    #     os.makedirs(new_out_dir, exist_ok=True)
        
    # shutil.copy2(img_abs_path, new_abs_path)
    # new_image_name_col.append(new_img_name)
    # new_abs_path_col.append(new_abs_path)


  0%|          | 0/26179 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'endswith'

In [None]:
# quick manual check
all_names = []
not_unique = []
for abs_path, subdirs, files in os.walk(os.path.join(ds_path,'train')):
    for file in files:
        if file in all_names:
            not_unique.append(os.path.join(abs_path,file))
        all_names.append(file)

In [None]:
print(len(all_names), len(not_unique))
np.unique(np.array(list(map(lambda x: x[x.rfind('.'):], all_names))), return_counts=True)

In [None]:
# quick manual check
all_names = []
not_unique = []
for abs_path, subdirs, files in os.walk(os.path.join(ds_path,'test')):
    for file in files:
        if file in all_names:
            not_unique.append(os.path.join(abs_path,file))
        all_names.append(file)

In [None]:
print(len(all_names), len(not_unique))
np.unique(np.array(list(map(lambda x: x[x.rfind('.'):], all_names))), return_counts=True)

In [None]:
# quick manual check
all_names = []
not_unique = []
for abs_path, subdirs, files in os.walk(os.path.join(ds_path,'valid')):
    for file in files:
        if file in all_names:
            not_unique.append(os.path.join(abs_path,file))
        all_names.append(file)

In [None]:
print(len(all_names), len(not_unique))
np.unique(np.array(list(map(lambda x: x[x.rfind('.'):], all_names))), return_counts=True)

In [None]:
20943 + 2618 + 2618

In [None]:
len(new_image_name_col)

In [None]:
new_annot_df = annot_df.copy()
new_annot_df['image_name'] = new_image_name_col
new_annot_df['abs_path'] = new_abs_path_col

In [None]:
for path in new_annot_df['abs_path']:
    if 'checkpoint' in path:
        print('JUPYTER',path)
    if not os.path.exists(path):
        print('MISSING:',path)

In [None]:
np.unique(new_annot_df['split'], return_counts=True)

In [None]:
new_annot_df.to_csv('/home/ariya/workspace/datasets/animals10-dvc/annotation_df.csv', index=False)