In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits made to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import shutil
import subprocess
import time

import numpy as np
import yaml
from PIL import Image

# Note!
Extract the ISBI challenge data archive using either `7zip` on Windows or the `unrar` package on Ubuntu before running this notebook! Also, make sure that you rename the extracted directory to `dataset_2`: the original directory name is `RawImage`, and it needs to be changed to `dataset_2`.


In [3]:
# Read the project configuration (directory and file names) from the YAML file.
params_file_path = "../code_configs/params.yaml"
with open(params_file_path) as params_file:
    params = yaml.safe_load(params_file)

In [4]:
# Display the loaded configuration so the directory and file names used by
# the following cells are visible in the notebook.
params

{'RAW_DATA_DIRECTORY': '../data/raw',
 'PRIMARY_DATA_DIRECTORY': '../data/primary',
 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip',
 'DATASET_1_RAW_DIR_NAME': 'dataset_1',
 'DATASET_2_RAW_RAR_FILE_NAME': 'RawImage.rar',
 'DATASET_2_RAW_DIR_NAME_TEMP': 'dataset_3_bmp',
 'DATASET_2_RAW_DIR_NAME': 'dataset_3_raw_data'}

# Extract the rar file

In [5]:
# Both paths live under the raw-data directory named in the configuration.
raw_data_directory = params['RAW_DATA_DIRECTORY']

# Location of the rar archive.
dataset_2_rar_file_path = os.path.join(raw_data_directory, params['DATASET_2_RAW_RAR_FILE_NAME'])

# Temporary extraction directory, with a trailing '/' appended to the path.
dataset_2_rar_extract_dest_path = (
    os.path.join(raw_data_directory, params['DATASET_2_RAW_DIR_NAME_TEMP']) + '/'
)

In [6]:
# Extract the archive, keeping its directory structure ("x"), into the
# temporary directory.
# The arguments are passed as a list and no shell is involved, so paths that
# contain spaces or shell metacharacters are handled safely.
# "-inul" silences unrar's messages (error messages included), which leaves
# the exit status as the only failure signal: check=True raises
# subprocess.CalledProcessError on a non-zero exit instead of letting the
# following cells run against a missing or partial extraction.
subprocess.run(
    ['unrar', 'x', '-inul', dataset_2_rar_file_path, dataset_2_rar_extract_dest_path],
    check=True,
);

# start the process of conversion of bmp to jpeg

## Get the names of the subfolders in `params['DATASET_2_RAW_DIR_NAME_TEMP']`

In [7]:
# Root of the extracted archive: a folder named after the rar file without
# its extension, inside the temporary extraction directory.
# os.path.splitext strips only the final extension, so an archive name that
# itself contains dots is not cut off at the first dot.
dataset_2_archive_stem = os.path.splitext(params['DATASET_2_RAW_RAR_FILE_NAME'])[0]
dataset_2_temp_path = os.path.join(
    params['RAW_DATA_DIRECTORY'],
    params['DATASET_2_RAW_DIR_NAME_TEMP'],
    dataset_2_archive_stem,
)

# Keep directories only: a stray file in the archive root would otherwise get
# a destination directory created for it and later be listed as if it were a
# directory of images.
sub_dirs = [
    entry
    for entry in os.listdir(dataset_2_temp_path)
    if os.path.isdir(os.path.join(dataset_2_temp_path, entry))
]

In [8]:
# Display the entry names found under dataset_2_temp_path.
sub_dirs

['TrainingData', 'Test1Data', 'Test2Data']

# Create a new empty directory to store the jpeg files of images

In [9]:
# Directory that will receive the converted images.
dataset_2_dest_path = os.path.join(params['RAW_DATA_DIRECTORY'], params['DATASET_2_RAW_DIR_NAME'])

In [10]:
# Create the destination directory; if it is already there, only report it.
if os.path.exists(dataset_2_dest_path):
    print('The directory {} exists! Please remove it and try again!'.format(dataset_2_dest_path))
else:
    os.makedirs(dataset_2_dest_path)

In [11]:
# Mirror each source sub-directory inside the destination directory.
for sub_dir in sub_dirs:
    sub_dir_path = os.path.join(dataset_2_dest_path, sub_dir)
    if os.path.exists(sub_dir_path):
        # Already present: report it and move on without touching it.
        print('The directory {} exists! Please remove it and try again!'.format(sub_dir_path))
        continue
    os.makedirs(sub_dir_path)

# Save bmp files as jpeg files and remove the original bmp files after that

In [12]:
# Convert every bmp image of each sub-directory into a jpeg file written to
# the matching destination sub-directory, printing the file counts on both
# sides so the conversion can be checked by eye.
for sub_dir in sub_dirs:
    source_sub_dir_path = os.path.join(dataset_2_temp_path, sub_dir)
    dest_sub_dir_path = os.path.join(dataset_2_dest_path, sub_dir)
    # Only pick up .bmp files (case-insensitive) so that stray non-image
    # files in the source folder do not make Image.open fail.
    images_name = [
        name
        for name in os.listdir(source_sub_dir_path)
        if name.lower().endswith('.bmp')
    ]
    print('Source directory:')
    print(source_sub_dir_path)
    print('There are {} bmp images in the source directory'.format(len(images_name)))
    images_converted = []
    for image_name in images_name:
        bmp_image_path = os.path.join(source_sub_dir_path, image_name)
        # splitext drops only the final extension, so names containing extra
        # dots keep their full stem and cannot collide after conversion.
        jpg_image_path = os.path.join(
            dest_sub_dir_path,
            os.path.splitext(image_name)[0] + '.jpeg',
        )
        # The context manager closes the underlying file as soon as the image
        # has been written, instead of leaving one open handle per image.
        with Image.open(bmp_image_path) as img:
            # subsampling=0 keeps full chroma resolution (4:4:4).
            img.save(jpg_image_path, 'jpeg', subsampling=0, quality=95)
        images_converted.append(image_name)

    print('Destination directory:')
    print(dest_sub_dir_path)
    images_name = os.listdir(dest_sub_dir_path)
    # The destination holds the converted files, so report them as jpeg.
    print('There are {} jpeg images in the destination directory'.format(len(images_name)))
    print()

Source directory:
../data/raw/dataset_3_bmp/RawImage/TrainingData
There are 150 bmp images in the source directory
Destination directory:
../data/raw/dataset_3_raw_data/TrainingData
There are 150 bmp images in the destination directory

Source directory:
../data/raw/dataset_3_bmp/RawImage/Test1Data
There are 150 bmp images in the source directory
Destination directory:
../data/raw/dataset_3_raw_data/Test1Data
There are 150 bmp images in the destination directory

Source directory:
../data/raw/dataset_3_bmp/RawImage/Test2Data
There are 100 bmp images in the source directory
Destination directory:
../data/raw/dataset_3_raw_data/Test2Data
There are 100 bmp images in the destination directory



# remove the temporary bmp images

In [13]:
# Recursively delete the whole temporary extraction directory (the path built
# from RAW_DATA_DIRECTORY and DATASET_2_RAW_DIR_NAME_TEMP), including the
# original bmp files it contains.
shutil.rmtree(dataset_2_rar_extract_dest_path)