In [2]:
# Load IPython's autoreload extension and switch it to mode 2, so that imported
# modules are reloaded before each cell runs and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [21]:
import json
import os
import yaml
import shutil
import glob
from zipfile import ZipFile
from typing import List, Dict

### be aware of your current directory

In [4]:
# Show the kernel's working directory: the relative paths used in this notebook
# (e.g. "../code_configs/params.yaml") are resolved against it.
print(os.getcwd())

/home/samehr/Desktop/cephal/cvmt/notebooks


## load parameters

In [22]:
# Parse the project configuration file into `params`, the mapping from which
# the rest of the notebook reads its directory and file names.
params_file_path = "../code_configs/params.yaml"
with open(params_file_path) as params_file:
    params = yaml.safe_load(params_file)

### unzip the downloaded raw data into `../data/raw` directory

In [9]:
# Locate the raw zip archive. The configured file name is passed through glob,
# so it may contain wildcards.
raw_zip_pattern = os.path.join(
    params['RAW_DATA_DIRECTORY'],
    params['DATASET_1_RAW_ZIP_FILE_NAME'],
)
# glob returns its matches in arbitrary order; sort them so that the same
# archive is picked on every run when several files match.
raw_zip_candidates = sorted(glob.glob(raw_zip_pattern))
if not raw_zip_candidates:
    # fail with an explicit message instead of an opaque IndexError on `[0]`
    raise FileNotFoundError(
        "No raw zip file matches the pattern below! \n {}".format(raw_zip_pattern)
    )
path_to_raw_zip_file = raw_zip_candidates[0]
print("path_to_raw_zip_file:  ", path_to_raw_zip_file)

path_to_raw_zip_file:   ../data/raw/radiography-20221212T190109Z-001.zip


In [10]:
# Unpack every member of the located archive into the raw data directory.
with ZipFile(path_to_raw_zip_file, 'r') as raw_archive:
    raw_archive.extractall(path=params['RAW_DATA_DIRECTORY'])

### print the directories inside the `RAW_DATA_DIRECTORY`

In [14]:
# Print the names of those entries of the raw data directory that are
# directories (plain files such as the zip archive are left out).
raw_sub_dirs = []
for entry in os.listdir(params['RAW_DATA_DIRECTORY']):
    entry_path = os.path.join(params['RAW_DATA_DIRECTORY'], entry)
    if os.path.isdir(entry_path):
        raw_sub_dirs.append(entry)
print(raw_sub_dirs)


['radiography', '.ipynb_checkpoints']


# Transforming to the "intermediate" stage
Start cleaning the data in the directory named `radiography` and put the cleaned data 
into a new directory named `dataset_1`.

## Create the empty directory for `dataset_1`

In [18]:
# Path of the directory that will receive the cleaned data of dataset 1.
interm_root_dir = params['INTERMEDIATE_DATA_DIRECTORY']
interm_dir_name = params['DATASET_1_INTERMEDIATE_DIR_NAME']
interm_dir_path = os.path.join(interm_root_dir, interm_dir_name)

# Stop with an error when the directory is already there; otherwise create it
# (including any missing parent directories).
if os.path.exists(interm_dir_path):
    raise FileExistsError("The directory with address below exists! \n {}".format(interm_dir_path))
os.makedirs(interm_dir_path)

## clean the json files from unwanted fields

#### create a list of the sub-directories, each of which holds the data of one image

In [19]:
# Collect the names of the sub-directories of `radiography`; each one is
# expected to hold the data of a single image.
radiography_dir = os.path.join(
    params['RAW_DATA_DIRECTORY'],
    'radiography',
)
# os.listdir also returns plain files and yields entries in arbitrary order, so
# keep only the directories and sort them: every run then processes the same
# folders in the same order.
img_folders = sorted(
    entry
    for entry in os.listdir(radiography_dir)
    if os.path.isdir(os.path.join(radiography_dir, entry))
)

#### correct the json content and write to the newly created directory inside `../data/intermediate`

In [23]:
def empty_unwanted_fields_from_json(
    unwanted_fields: List[str],
    img_folder: str,
) -> None:
    """Blank out selected top-level fields of one image's json file.

    The file `<RAW_DATA_DIRECTORY>/radiography/<img_folder>/<img_folder>.json`
    is read, every name in `unwanted_fields` that is present in it gets the
    value `None` (the key itself is kept), and the result is written to
    `<INTERMEDIATE_DATA_DIRECTORY>/<DATASET_1_INTERMEDIATE_DIR_NAME>/<img_folder>.json`.
    The directory names are taken from the module-level `params`; the raw
    file is left untouched.

    Args:
        unwanted_fields: names of the fields whose values are replaced by `None`.
        img_folder: name of the image's sub-directory, which is also the stem
            of the json file inside it.
    """
    raw_root_dir = params['RAW_DATA_DIRECTORY']
    json_file_name = img_folder + '.json'

    # load the raw json content
    source_path = os.path.join(raw_root_dir, 'radiography', img_folder, json_file_name)
    with open(source_path, 'r') as source_file:
        content = json.load(source_file)

    # blank the unwanted fields; names that are absent are skipped
    for field_name in unwanted_fields:
        if field_name not in content:
            continue
        content[field_name] = None

    # store the cleaned content under the intermediate dataset directory
    interm_root_dir = params['INTERMEDIATE_DATA_DIRECTORY']
    dataset_dir_name = params['DATASET_1_INTERMEDIATE_DIR_NAME']
    target_path = os.path.join(interm_root_dir, dataset_dir_name, json_file_name)
    with open(target_path, 'w') as target_file:
        json.dump(content, target_file)

In [24]:
# Names of the folders whose json file could not be processed. The list must be
# created once, before the loop: creating it inside the loop would throw away
# the previously recorded failures on every iteration (and leave the name
# undefined when there are no folders at all).
json_correction_errors = []
for img_folder in img_folders:
    # correct the json content
    try:
        empty_unwanted_fields_from_json(
            unwanted_fields=params['DATASET_1_UNWANTED_JSON_FIELDS'],
            img_folder=img_folder,
        )
    except Exception as e:
        # best effort: report the failure, remember the folder and carry on
        print(e)
        json_correction_errors.append(img_folder)

[Errno 2] No such file or directory: '../data/raw/radiography/61/61.json'
[Errno 2] No such file or directory: '../data/raw/radiography/60/60.json'


## correct the folder structure of images

In [25]:
def unravel_image_folder(
    img_folder: str,
) -> None:
    """Move one image out of its nested raw folder into the intermediate dataset directory.

    The file `<RAW_DATA_DIRECTORY>/radiography/<img_folder>/SR0000/<img_folder>.jpg`
    is moved (not copied) to
    `<INTERMEDIATE_DATA_DIRECTORY>/<DATASET_1_INTERMEDIATE_DIR_NAME>/<img_folder>.jpg`.
    The directory names are taken from the module-level `params`.

    Args:
        img_folder: name of the image's sub-directory, which is also the stem
            of the jpg file inside its `SR0000` folder.
    """
    raw_root_dir = params['RAW_DATA_DIRECTORY']
    img_file_name = img_folder + '.jpg'
    source_path = os.path.join(raw_root_dir, 'radiography', img_folder, 'SR0000', img_file_name)

    interm_root_dir = params['INTERMEDIATE_DATA_DIRECTORY']
    dataset_dir_name = params['DATASET_1_INTERMEDIATE_DIR_NAME']
    target_path = os.path.join(interm_root_dir, dataset_dir_name, img_file_name)

    shutil.move(source_path, target_path)

In [26]:
# Names of the folders whose image could not be moved. The list must be created
# once, before the loop: creating it inside the loop would throw away the
# previously recorded failures on every iteration (and leave the name undefined
# when there are no folders at all).
shutil_errors = []
for img_folder in img_folders:
    # move the image file out of the folder SR0000
    try:
        unravel_image_folder(img_folder=img_folder)
    except Exception as e:
        # best effort: report the failure, remember the folder and carry on
        print(e)
        shutil_errors.append(img_folder)

[Errno 2] No such file or directory: '../data/raw/radiography/61/SR0000/61.jpg'
[Errno 2] No such file or directory: '../data/raw/radiography/25/SR0000/25.jpg'
[Errno 2] No such file or directory: '../data/raw/radiography/60/SR0000/60.jpg'
[Errno 2] No such file or directory: '../data/raw/radiography/24/SR0000/24.jpg'
[Errno 2] No such file or directory: '../data/raw/radiography/69/SR0000/69.jpg'
