## Create balanced dataset for CRIC

Based on the paper: https://doi.org/10.3390/jimaging7070111

And source code: https://github.com/debnasser/deep-learning-ensemble-jimaging

In [1]:
import os
import random
import shutil
import cv2

from skimage.util import random_noise
from skimage.restoration import denoise_tv_chambolle, denoise_bilateral
from sklearn.model_selection import train_test_split

In [2]:
TEST_SIZE = 0.2
SEED = 42
IN_MEM = False # Create Test and Train directories instead of reading images in memory

In [3]:
base_folder = './CRIC_data/'
classes = os.listdir(base_folder)
print(classes)

['ASC-H', 'SCC', 'HSIL', 'NILM', 'LSIL', 'ASC-US']


In [4]:
class_id_dict = {
    'ASC-H': 0,
    'ASC-US': 1,
    'SCC': 2,
    'HSIL': 3,
    'LSIL': 4,
    'NILM': 5
}

In [5]:
def opAugmentation(op, img):
    if img is None:
        print("opAug img is None!!")
    # rotation
    if(op == 1):
        new_img = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
    elif(op == 2):
        new_img = cv2.rotate(img, cv2.ROTATE_180)
    elif(op == 3):
        new_img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
    # mirror
    elif(op == 4):
        new_img= cv2.flip(img, 1)
    elif(op == 5):
        img_rotate_90 = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
        new_img = cv2.flip(img_rotate_90, 1)
    elif(op == 6):
        img_rotate_180 = cv2.rotate(img, cv2.ROTATE_180)
        new_img = cv2.flip(img_rotate_180, 1)
    elif(op == 7):
        img_rotate_270 = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
        new_img = cv2.flip(img_rotate_270, 1)
    elif(op == 8):
        sigma = 0.05 
        noisy = random_noise(img, var=sigma**2)
        new_img = noisy
        new_img = new_img * 255
    elif(op == 9):
        sigma = 0.005 
        noisy = random_noise(img, var=sigma**2)
        new_img = denoise_tv_chambolle(noisy, weight=0.05, multichannel=True)
        new_img = new_img * 255
    elif(op == 10):
        sigma = 0.005 
        noisy = random_noise(img, var=sigma**2)
        new_img = denoise_bilateral(noisy, sigma_color=0.01, sigma_spatial=5, multichannel=True)
        new_img = new_img * 255
    return new_img


### Create Train and Test directories

- Use symbolic links where possible

In [6]:
def dataTest(imgs_test, class_folder, class_id, in_mem = False):
    X_test = []
    y_test = []

    for x in range(len(imgs_test)):
        if in_mem:
            X_test.append(cv2.imread(class_folder + imgs_test[x]))
            y_test.append(class_id)
        else:
            os.symlink('../' + imgs_test[x], class_folder + 'Test/' + imgs_test[x])
    return X_test, y_test

def dataaugmentationASCH(imgs_train, imgs_test,
                         class_folder = base_folder + 'ASC-H/', class_id = class_id_dict['ASC-H'], 
                         in_mem = IN_MEM):
    n_imgs = len(imgs_train)
    X_train_aug = []
    y_train_aug = []
    X_test = []
    y_test = []

    for x in range(n_imgs):
        img = cv2.imread(class_folder + imgs_train[x])
        X_train_aug.append(img)
        y_train_aug.append(class_id)
        if not in_mem:
            os.symlink('../' + imgs_train[x], class_folder + 'Train/' + imgs_train[x])
        
    imgs_selected = random.sample(range(n_imgs), 21) # 24 + 6*21 = 150
    for img_sel in imgs_selected:
        ops = random.sample(range(1,10), 6)
        parts = imgs_train[img_sel].split('.')
        for op in ops:
            new_img = opAugmentation(op, X_train_aug[img_sel])
            if not in_mem:
                cv2.imwrite(class_folder + 'Train/' + parts[-2] + '_' + str(op) + '.' + parts[-1], new_img)
            else:
                X_train_aug.append(new_img)
                y_train_aug.append(class_id)
    X_test, y_test = dataTest(imgs_test, class_folder, class_id)
    return X_train_aug, y_train_aug, X_test, y_test

def dataaugmentationASCUS(imgs_train, imgs_test,
                          class_folder = base_folder + 'ASC-US/', class_id = class_id_dict['ASC-US'],
                          in_mem = IN_MEM):
    n_imgs = len(imgs_train)
    X_train_aug = []
    y_train_aug = []
    X_test = []
    y_test = []

    for x in range(n_imgs):
        img = cv2.imread(class_folder + imgs_train[x])
        X_train_aug.append(img)
        y_train_aug.append(class_id)
        if not in_mem:
            os.symlink('../' + imgs_train[x], class_folder + 'Train/' + imgs_train[x])
        
    imgs_selected = random.sample(range(n_imgs), 35) # 80 + 2*35 = 150
    for img_sel in imgs_selected:
        ops = random.sample(range(1,10), 2)
        parts = imgs_train[img_sel].split('.')
        for op in ops:
            new_img = opAugmentation(op, X_train_aug[img_sel])
            if not in_mem:
                cv2.imwrite(class_folder + 'Train/' + parts[-2] + '_' + str(op) + '.' + parts[-1], new_img)
            else:
                X_train_aug.append(new_img)
                y_train_aug.append(class_id)
    X_test, y_test = dataTest(imgs_test, class_folder, class_id)
    return X_train_aug, y_train_aug, X_test, y_test

def dataaugmentationSCC(imgs_train, imgs_test,
                        class_folder = base_folder + 'SCC/', class_id = class_id_dict['SCC'], 
                        in_mem = IN_MEM):
    n_imgs = len(imgs_train)
    X_train_aug = []
    y_train_aug = []
    X_test = []
    y_test = []

    for x in range(n_imgs):
        img = cv2.imread(class_folder + imgs_train[x])
        X_train_aug.append(img)
        y_train_aug.append(class_id)
        if not in_mem:
            os.symlink('../' + imgs_train[x], class_folder + 'Train/' + imgs_train[x])
        
    imgs_selected = random.sample(range(n_imgs), 15) # 16 + 9*15 = 150
    for img_sel in imgs_selected:
        parts = imgs_train[img_sel].split('.')
        for op in range(1,10):
            new_img = opAugmentation(op, X_train_aug[img_sel])
            if not in_mem:
                cv2.imwrite(class_folder + 'Train/' + parts[-2] + '_' + str(op) + '.' + parts[-1], new_img)
            else:
                X_train_aug.append(new_img)
                y_train_aug.append(class_id)
    X_test, y_test = dataTest(imgs_test, class_folder, class_id)
    return X_train_aug, y_train_aug, X_test, y_test

def dataaugmentationHSIL(imgs_train, imgs_test,
                         class_folder = base_folder + 'HSIL/', class_id = class_id_dict['HSIL'], 
                         in_mem=IN_MEM):
    n_imgs = len(imgs_train)
    X_train_aug = []
    y_train_aug = []
    X_test = []
    y_test = []

    for x in range(n_imgs):
        img = cv2.imread(class_folder + imgs_train[x])
        X_train_aug.append(img)
        y_train_aug.append(class_id)
        if not in_mem:
            os.symlink('../' + imgs_train[x], class_folder + 'Train/' + imgs_train[x])        
        
    imgs_selected = random.sample(range(n_imgs), 17) # 19 + 8*17 = 155
    for img_sel in imgs_selected:
        ops = random.sample(range(1,10), 8)
        parts = imgs_train[img_sel].split('.')
        for op in ops:
            new_img = opAugmentation(op, X_train_aug[img_sel])
            if not in_mem:
                cv2.imwrite(class_folder + 'Train/' + parts[-2] + '_' + str(op) + '.' + parts[-1], new_img)
            else:
                X_train_aug.append(new_img)
                y_train_aug.append(class_id)
    X_test, y_test = dataTest(imgs_test, class_folder, class_id)
    return X_train_aug, y_train_aug, X_test, y_test

def dataaugmentationLSIL(imgs_train, imgs_test,
                         class_folder = base_folder + 'LSIL/', class_id = class_id_dict['LSIL'],
                         in_mem = IN_MEM):
    n_imgs = len(imgs_train)
    X_train_aug = []
    y_train_aug = []
    X_test = []
    y_test = []

    for x in range(n_imgs):
        img = cv2.imread(class_folder + imgs_train[x])
        X_train_aug.append(img)
        y_train_aug.append(class_id)
        if not in_mem:
            os.symlink('../' + imgs_train[x], class_folder + 'Train/' + imgs_train[x])
            
    imgs_selected = random.sample(range(n_imgs), 18) # 132 + 1*18 = 150
    for img_sel in imgs_selected:
        parts = imgs_train[img_sel].split('.')
        op = random.randint(1,10)
        new_img = opAugmentation(op, X_train_aug[img_sel])
        if not in_mem:
            cv2.imwrite(class_folder + 'Train/' + parts[-2] + '_' + str(op) + '.' + parts[-1], new_img)
        else:
            X_train_aug.append(new_img)
            y_train_aug.append(class_id)
    X_test, y_test = dataTest(imgs_test, class_folder, class_id)
    return X_train_aug, y_train_aug, X_test, y_test

def dataaugmentationNILM(imgs_train, imgs_test, 
                         class_folder = base_folder + 'NILM/', class_id = class_id_dict['NILM'],
                         in_mem = IN_MEM):
    n_imgs = len(imgs_train)
    X_train_aug = []
    y_train_aug = []
    X_test = []
    y_test = []

    for x in range(n_imgs):
        img = cv2.imread(class_folder + imgs_train[x])
        X_train_aug.append(img)
        y_train_aug.append(class_id)
        if not in_mem:
            os.symlink('../' + imgs_train[x], class_folder + 'Train/' + imgs_train[x])
            
    imgs_selected = random.sample(range(n_imgs), 26) # 47 + 4*26 = 151
    for img_sel in imgs_selected:
        ops = random.sample(range(1,10), 4)
        parts = imgs_train[img_sel].split('.')
        for op in ops:
            new_img = opAugmentation(op, X_train_aug[img_sel])
            if not in_mem:
                cv2.imwrite(class_folder + 'Train/' + parts[-2] + '_' + str(op) + '.' + parts[-1], new_img)
            else:
                X_train_aug.append(new_img)
                y_train_aug.append(class_id)
    X_test, y_test = dataTest(imgs_test, class_folder, class_id)
    return X_train_aug, y_train_aug, X_test, y_test

fn_class = {
    'ASC-H': dataaugmentationASCH,
    'ASC-US': dataaugmentationASCUS,
    'SCC': dataaugmentationSCC,
    'HSIL': dataaugmentationHSIL,
    'LSIL': dataaugmentationLSIL,
    'NILM': dataaugmentationNILM
}

### Process each of the 6 folders in the CRIC data

In [7]:
random.seed(SEED) # Ensure reproducibility

X_train_aug_bal, y_train_aug_bal = [], []
X_val_aug_bal, y_val_aug_bal = [], []
X_test_final, y_test_final = [], []

for c in classes:
    print("Processing", c)
    if os.path.isdir(base_folder + c + '/Test/'):
        shutil.rmtree(base_folder + c + '/Test/') # remove the Test directory
    if os.path.isdir(base_folder + c + '/Train/'):
        shutil.rmtree(base_folder + c + '/Train/') # remove the Train directory

    imgs = os.listdir(base_folder + c + '/')
    imgs_train, imgs_test = train_test_split(imgs, test_size = TEST_SIZE, random_state=SEED)
    
    if not IN_MEM:
        os.makedirs(base_folder + c + '/Train/', exist_ok=True)
        os.makedirs(base_folder + c + '/Test/', exist_ok=True)
        
    X_train_aug, y_train_aug, X_test, y_test = fn_class[c](imgs_train, imgs_test)
    X_train_aug, y_train_aug, X_val_aug, y_val_aug = train_test_split(X_train_aug, y_train_aug, 
                                                                      test_size = TEST_SIZE, random_state=SEED)
    X_train_aug_bal = X_train_aug_bal + X_train_aug
    y_train_aug_bal = y_train_aug_bal + y_train_aug
    X_val_aug_bal = X_val_aug_bal + X_val_aug
    y_val_aug_bal = y_val_aug_bal + y_val_aug
    X_test_final = X_test_final + X_test
    y_test_final = y_test_final + y_test

Processing ASC-H


  new_img = denoise_tv_chambolle(noisy, weight=0.05, multichannel=True)


Processing SCC
Processing HSIL
Processing NILM
Processing LSIL


  new_img = denoise_bilateral(noisy, sigma_color=0.01, sigma_spatial=5, multichannel=True)


Processing ASC-US
