### Agenda
1. Reconcile images folder with imgtrain, imgval, imgtest
2. Delete corrupted photos in imgtrain, imgval, imgtest (truncated images and 2D images, i.e. images whose pixel array is not 3-dimensional)

### Example of Corrupted Images
1. [truncated] /home/r8user2/Documents/HY/dress_data/datasets/imgtrain/吊带长裙/Co6q2FryfKaAR0MqAASwZkxJ6No175.jpg

### Actions Taken
1. 1682 2D images were deleted
2. 165 truncated images were deleted

### Reconciliation

In [1]:
import os
from collections import defaultdict
from PIL import Image
from PIL import ImageFile
PATH = '/home/r8user2/Documents/HY/dress_data/datasets'

In [3]:
# Number of entries in each class folder under images/
class_dict_img = {
    name: len(os.listdir(PATH + '/images/' + name))
    for name in os.listdir(PATH + '/images')
}

In [5]:
# Number of entries in each class folder under imgtrain/
class_dict_train = {
    name: len(os.listdir(PATH + '/imgtrain/' + name))
    for name in os.listdir(PATH + '/imgtrain')
}

In [6]:
# Number of entries in each class folder under imgval/
class_dict_val = {
    name: len(os.listdir(PATH + '/imgval/' + name))
    for name in os.listdir(PATH + '/imgval')
}

In [7]:
# Number of entries in each class folder under imgtest/
class_dict_test = {
    name: len(os.listdir(PATH + '/imgtest/' + name))
    for name in os.listdir(PATH + '/imgtest')
}

In [8]:
# Aggregate train, val and test
# A class folder that is absent from a partition contributes 0 images, so the
# class ends up with a smaller partition total instead of raising KeyError and
# aborting the reconciliation.
class_dict_img2 = {}
for class_ in class_dict_img:
    class_dict_img2[class_] = (
        class_dict_train.get(class_, 0)
        + class_dict_val.get(class_, 0)
        + class_dict_test.get(class_, 0)
    )

In [9]:
# Report every class whose images/ count differs from its train+val+test total
for class_, n_images in class_dict_img.items():
    n_partition = class_dict_img2[class_]
    if n_images == n_partition:
        continue
    print('[%s] %d images v.s %d partition' % (class_, n_images, n_partition))

[蝴蝶结长袖连衣裙] 2998 images v.s 4595 partition
[短袖旗袍] 2998 images v.s 2996 partition
[牛仔连衣裙子] 1340 images v.s 1339 partition
[单排扣连衣裙] 3003 images v.s 3002 partition


### Delete Error Photos (2D)

In [2]:
# Dataset root; the trailing slash is relied on by the path concatenations that use it
SOURCE_PATH = '/home/r8user2/Documents/HY/dress_data/datasets/'
# Partition index -> partition folder name
PARTITION_MAP = {1: 'imgtrain', 2: 'imgval', 3: 'imgtest'}
# Class index -> class folder name, numbered in the order os.listdir returns them
CLASS_MAP = dict(enumerate(os.listdir(SOURCE_PATH + 'imgtrain')))

In [6]:
# dist_dict[partition_idx][class_idx] -> number of file names containing 'jpg'
# NOTE(review): this count is later used as a range of indices into the
# unfiltered os.listdir() result; the two only agree if every entry in the
# folder contains 'jpg' - confirm.
dist_dict = defaultdict(dict)
for partition_idx, folder in PARTITION_MAP.items():
    for class_idx, class_ in CLASS_MAP.items():
        names = os.listdir(SOURCE_PATH + folder + '/' + class_)
        dist_dict[partition_idx][class_idx] = sum('jpg' in name for name in names)

In [12]:
# Input class index, partition set and photo index, output an image display
# [partition_idx] train: 1, val: 2, test: 3
def jpg_query(class_idx, partition_idx, file_idx, vis = True, print_path = False):
    """Open one dataset image by index and return the shape of its pixel array.

    Args:
        class_idx: key into CLASS_MAP selecting the class folder.
        partition_idx: key into PARTITION_MAP (train: 1, val: 2, test: 3).
        file_idx: position of the file in os.listdir() of the selected folder.
        vis: when true, show the image with plt.imshow and print its shape.
        print_path: when true, print the resolved file path.

    Returns:
        The shape tuple of the image converted to a uint8 NumPy array.

    Raises:
        KeyError: if class_idx or partition_idx is not a known key.
        IndexError: if file_idx is past the end of the folder listing.
    """
    class_dir = SOURCE_PATH + PARTITION_MAP[partition_idx] + '/' + CLASS_MAP[class_idx]
    # file_idx is only meaningful while os.listdir() keeps returning the same
    # order for this folder (the order is arbitrary and changes when files are
    # added or removed).
    target_path = class_dir + '/' + os.listdir(class_dir)[file_idx]
    if print_path:
        print('Path: %s' % target_path)

    # The context manager releases the underlying file handle; the array is
    # built once and reused for both the printout and the return value.
    with Image.open(target_path) as img:
        shape = np.asarray(img, dtype=np.uint8).shape
        if vis:
            # Display image
            plt.imshow(img);
            print(shape)
    return shape

In [13]:
def is_error(class_idx, partition_idx, file_idx):
    """Flag an image whose pixel array is not 3-dimensional.

    Returns a pair ((partition_idx, class_idx, file_idx), flag) where flag is
    0 when the shape reported by jpg_query has exactly three dimensions and
    1 otherwise.
    """
    shape = jpg_query(class_idx, partition_idx, file_idx, vis = False, print_path = False)
    flag = int(len(shape) != 3)
    return (partition_idx, class_idx, file_idx), flag

In [7]:
# Create input list for multiprocess
# One (class_idx, partition_idx, file_idx) argument tuple per counted file,
# ordered by partition, then class, then file index.
input_list = []
for partition_idx in PARTITION_MAP:
    for class_idx in CLASS_MAP:
        jpg_num = dist_dict[partition_idx][class_idx]
        input_list.extend((class_idx, partition_idx, i) for i in range(jpg_num))
input_list = tuple(input_list)

In [25]:
# Multi-processing
import time
import multiprocess as mp
# Set before the pool is created. Presumably this lets PIL load truncated files
# instead of raising, so this pass only flags images by array shape - confirm.
ImageFile.LOAD_TRUNCATED_IMAGES = True

start_time = time.time()

# The context manager shuts the pool down on exit, so no explicit
# close()/join() is needed afterwards.
with mp.Pool(processes = 60) as pool:
    results = pool.starmap(is_error, input_list)

print("---Multiprocess Complete: %d mins ---" % ((time.time() - start_time)/60))

---Multiprocess Complete: 6 mins ---


In [60]:
def get_path(tuples):
    """Return the file paths of the entries flagged as errors.

    Args:
        tuples: iterable of ((partition_idx, class_idx, file_idx), flag)
            pairs, the shape returned by is_error and is_truncate.

    Returns:
        A list with one path per entry whose flag equals 1, in input order.
    """
    # One os.listdir() per directory instead of one per flagged file.
    listings = {}
    file_paths = []
    for key, flag in tuples:
        if flag != 1:
            continue
        partition_idx, class_idx, file_idx = key
        sub_path = SOURCE_PATH + PARTITION_MAP[partition_idx] + '/' + CLASS_MAP[class_idx]
        if sub_path not in listings:
            listings[sub_path] = os.listdir(sub_path)
        # file_idx refers to the listing order seen when the flags were
        # computed; the folder must not have changed since then.
        file_paths.append(sub_path + '/' + listings[sub_path][file_idx])
    return file_paths

In [38]:
# Paths of the images flagged by is_error, i.e. whose pixel array is not
# 3-dimensional (1682 on this run)
files_2d = get_path(results)
len(files_2d)

1682

In [52]:
# !!!! REMOVE THE FILES
# Irreversible. Paths that are already gone are skipped so the cell can be
# re-run after a partial or repeated execution.
for path in files_2d:
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

### Delete Error Photos (Truncated)

In [45]:
# Switch PIL's truncated-image loading back off (it was set to True for the
# 2D scan). Presumably this is what makes the read in is_truncate fail on
# truncated files - confirm that the skimage reader goes through PIL.
ImageFile.LOAD_TRUNCATED_IMAGES = False

In [53]:
from skimage import io

def is_truncate(class_idx, partition_idx, file_idx):
    """Flag an image that cannot be read by skimage's io.imread.

    Args:
        class_idx: key into CLASS_MAP selecting the class folder.
        partition_idx: key into PARTITION_MAP selecting the partition folder.
        file_idx: position of the file in os.listdir() of the selected folder.

    Returns:
        A pair ((partition_idx, class_idx, file_idx), flag) where flag is 1
        when io.imread raises and 0 when the read succeeds.
    """
    # set up path
    class_dir = SOURCE_PATH + PARTITION_MAP[partition_idx] + '/' + CLASS_MAP[class_idx]
    target_path = class_dir + '/' + os.listdir(class_dir)[file_idx]

    try:
        io.imread(target_path)
    except Exception:
        # Any read failure marks the file as an error. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return (partition_idx, class_idx, file_idx), 1
    return (partition_idx, class_idx, file_idx), 0

In [56]:
# Multi-processing
import time
import multiprocess as mp

start_time = time.time()

# The context manager shuts the pool down on exit, so no explicit
# close()/join() is needed afterwards.
with mp.Pool(processes = 60) as pool:
    trun_results = pool.starmap(is_truncate, input_list)

print("---Multiprocess Complete: %d mins ---" % ((time.time() - start_time)/60))

---Multiprocess Complete: 29 mins ---


In [61]:
# Paths of the images for which is_truncate reported a read failure
files_trun = get_path(trun_results)

In [63]:
# Number of images flagged as truncated (165 on this run)
len(files_trun)

165

In [64]:
# !!!! REMOVE THE FILES
# Irreversible. Paths that are already gone are skipped so the cell can be
# re-run after a partial or repeated execution.
for path in files_trun:
    try:
        os.remove(path)
    except FileNotFoundError:
        pass