In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell execution and edits to local helper modules are picked up
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# Substep parameters used for an interactive run.
# During a job run this cell is replaced with the parameters read from the JSON file
# in the params subfolder, so keep it limited to this single assignment.
substep_params={
}

In [None]:
# load pipeline and step parameters - do not edit
# NOTE(review): pprint=True presumably prints the loaded parameters in the cell output - confirm
from sinara.substep import get_pipeline_params, get_step_params
pipeline_params = get_pipeline_params(pprint=True)
step_params = get_step_params(pprint=True)

In [None]:
# define substep interface:
#   inputs       - entities read from other steps
#   tmp_entities - scratch entities used only while this notebook runs
#   outputs      - entities published by this substep
from sinara.substep import NotebookSubstep, ENV_NAME, PIPELINE_NAME, ZONE_NAME, STEP_NAME, RUN_ID, ENTITY_NAME, ENTITY_PATH, SUBSTEP_NAME

substep = NotebookSubstep(pipeline_params, step_params, substep_params)

substep.interface(
    inputs =
    [
        {STEP_NAME: "data_load", ENTITY_NAME: "facemask_datasets"}, # images from data_load step
    ],
    tmp_entities =
    [    
        { ENTITY_NAME: "facemask_datasets"}, # temporary images extracted from the Sinara archive
        { ENTITY_NAME: "facemask_train_dataset"}, # temporary facemask dataset for classifier training
        { ENTITY_NAME: "facemask_val_dataset"}, # temporary facemask dataset for classifier evaluation
        { ENTITY_NAME: "facemask_test_dataset"}, # temporary facemask dataset for classifier testing
    ],
    outputs = 
    [
        { ENTITY_NAME: "train_dataset"}, # archived dataset for classifier training
        { ENTITY_NAME: "val_dataset"}, # archived dataset for classifier evaluation
        { ENTITY_NAME: "test_dataset"}, # archived dataset for classifier testing
    ]
)

# show the declared interface in the cell output
substep.print_interface_info()

# NOTE(review): presumably stops the notebook here when it is executed only to
# visualize the interface - confirm against the Sinara documentation
substep.exit_in_visualize_mode()

In [None]:
# specify all notebook wide libraries imports here
# Sinara lib imports is left in the place of their usage
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os.path as osp
import os
import cv2
import matplotlib.pyplot as plt
import plotly.express as px
import json
import shutil
from tqdm import tqdm
from utils.utils import convert_facemask_detectons_to_coco

In [None]:
# run spark: start a session and create the archive helper bound to it
# (Sinara imports are kept next to the place where they are used)
from sinara.spark import SinaraSpark
from sinara.archive import SinaraArchive

# NOTE(review): the meaning of the 0 argument is not visible in this notebook - confirm
spark = SinaraSpark.run_session(0)
archive = SinaraArchive(spark)
# last expression of the cell, so its value is rendered as the cell output
SinaraSpark.ui_url()

### Loading facemask_datasets (from the previous step data_load)

In [None]:
# resolve the entities published by the data_load step and this substep's temporary entities
inputs = substep.inputs(step_name = "data_load")
tmp_entities = substep.tmp_entities()

# unpack the facemask_datasets archive from the previous step into the temporary entity directory
archive.unpack_files_from_store_to_tmp(store_path=inputs.facemask_datasets, tmp_entity_dir=tmp_entities.facemask_datasets)

### Convert facemask annotations to image classification annotations 

In [None]:
# annotation files are read from <facemask_datasets>/ds/ann
dir_annotations = osp.join(tmp_entities.facemask_datasets, "ds", "ann")
# convert the facemask detection annotations; the following cells read the
# "images", "annotations" and "categories" keys of the result
coco_annotations = convert_facemask_detectons_to_coco(dir_annotations)

#### Cropping images by object facemask

In [None]:
# Source images are read from <facemask_datasets>/ds/img; every annotated object is cropped
# and written to <facemask_datasets>/crop/<category name>/<annotation id>_<image file name>
dir_images = osp.join(tmp_entities.facemask_datasets, "ds", "img")
dir_crop_images = osp.join(tmp_entities.facemask_datasets, "crop")
os.makedirs(dir_crop_images, exist_ok=True)

# category id -> category name; the name is used as the class sub-folder
categories = {cat_info["id"]: cat_info["name"] for cat_info in coco_annotations["categories"]}
df_annotations = pd.DataFrame(coco_annotations["annotations"])

# Group the annotations once instead of re-filtering the whole frame for every image.
# A frame built from an empty annotation list has no "image_id" column, so it is
# treated as "no annotations at all" instead of failing with a KeyError.
annotations_by_image = (
    {}
    if df_annotations.empty
    else {image_id: group for image_id, group in df_annotations.groupby("image_id")}
)

for image_info in tqdm(coco_annotations["images"]):
    image_name = image_info["file_name"]
    image_id = image_info["id"]
    image_annotations = annotations_by_image.get(image_id)
    if image_annotations is None:
        # no objects annotated on this image, nothing to crop
        continue

    image_path = osp.join(dir_images, image_name)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising
        tqdm.write(f"WARNING: cannot read image, skipping: {image_path}")
        continue
    image_h, image_w = image.shape[:2]

    for id_row, row in image_annotations.iterrows():
        # bbox is [x top-left, y top-left, width, height]
        x_tl, y_tl, w_obj, h_obj = row["bbox"]
        category_id = row["category_id"]
        obj_id = row["id"]

        # Slice indices must be integers, and a negative index would wrap around the
        # image, so the box is rounded and clamped to the image bounds
        x_min = min(max(int(round(x_tl)), 0), image_w)
        y_min = min(max(int(round(y_tl)), 0), image_h)
        x_max = min(max(int(round(x_tl + w_obj)), 0), image_w)
        y_max = min(max(int(round(y_tl + h_obj)), 0), image_h)
        if x_max <= x_min or y_max <= y_min:
            # an empty crop cannot be encoded by cv2.imwrite
            tqdm.write(f"WARNING: empty bbox for annotation {obj_id} of {image_name}, skipping")
            continue

        image_crop = image[y_min:y_max, x_min:x_max]
        category_name = categories[category_id]
        image_crop_path = osp.join(dir_crop_images, category_name, f"{str(obj_id)}_{image_name}")
        os.makedirs(osp.dirname(image_crop_path), exist_ok=True)
        # cv2.imwrite reports failure through its return value
        if not cv2.imwrite(image_crop_path, image_crop):
            tqdm.write(f"WARNING: failed to write crop: {image_crop_path}")

## Get image paths for facemask dataset

In [None]:
# Collect the paths of all cropped images; every sub-folder of dir_crop_images is one class.
# os.listdir returns entries in arbitrary order, so both levels are sorted: otherwise the
# seeded train/val/test split made from this list could differ between runs and machines.
# Plain files in dir_crop_images are ignored, since they cannot be listed as a class folder.
class_names = sorted(
    entry for entry in os.listdir(dir_crop_images)
    if osp.isdir(osp.join(dir_crop_images, entry))
)
facemask_dataset = []
for class_name in class_names:
    class_dir = osp.join(dir_crop_images, class_name)
    # Get images from dataset
    for img_name in sorted(os.listdir(class_dir)):
        img_path = osp.join(class_dir, img_name)
        if osp.isdir(img_path):
            continue
        facemask_dataset.append(img_path)

### Split FaceMask Dataset to Train, Valid and Test

In [None]:
# Two-stage split with a fixed seed:
#   stage 1 holds out 33% of the crops, the rest is the training set
#   stage 2 takes 10% of the hold-out as the test set, the rest is the validation set
train_facemask_images, holdout_facemask_images = train_test_split(
    facemask_dataset,
    test_size=0.33,
    random_state=42,
)
val_facemask_images, test_facemask_images = train_test_split(
    holdout_facemask_images,
    test_size=0.1,
    random_state=42,
)

## Review FaceMask Datasets

In [None]:
# Preview a random sample of training crops in a grid, each titled with its class name
W_grid = 5  # grid columns
H_grid = 5  # grid rows

# plt.subplots returns the figure and an H_grid x W_grid array of axes,
# one axes object per grid position
fig, axes = plt.subplots(H_grid, W_grid, figsize = (10,10))
axes = axes.ravel() # flatten the 5 x 5 axes array into 25 entries

n_train = len(train_facemask_images) # number of training images to sample from

for ax in axes:
    # draw a random index in [0, n_train) and take the image stored at it
    sample_path = train_facemask_images[np.random.randint(0, n_train)]
    # the parent folder of a crop is its class name
    sample_label = osp.basename(osp.dirname(sample_path))
    # cv2 loads images as BGR, so convert to RGB before displaying
    sample_rgb = cv2.cvtColor(cv2.imread(sample_path), cv2.COLOR_BGR2RGB)
    ax.imshow(sample_rgb)
    ax.set_title(sample_label, fontsize = 8)
    ax.axis('off')

plt.subplots_adjust(hspace=0.4)

### Overview of the distribution of labeled data from train, valid and test dataset

In [None]:
def plot_barh_labels(cifar10_images: list, title_name: str = "", num_fig = 0):
    """Plot a horizontal bar chart with the number of images per class.

    The class of an image is the name of the directory that contains it.

    Args:
        cifar10_images: image file paths laid out as ``<class name>/<file name>``.
            The parameter name is historical; this notebook passes facemask crops.
        title_name: title shown above the chart.
        num_fig: not used by the function; kept so existing calls keep working.
    """
    facemask_labels = [osp.basename(osp.dirname(fpath)) for fpath in cifar10_images]
    labels, counts = np.unique(facemask_labels, return_counts=True)
    # start a new figure so every call draws its own chart
    plt.figure()
    plt.barh(labels, counts)
    plt.xlabel("number of images")
    plt.title(title_name)
    plt.show()

# Class distribution of every split: (image paths, chart title, num_fig)
split_charts = (
    (train_facemask_images, 'Class distribution in training dataset', 0),
    (val_facemask_images, 'Class distribution in validation dataset', 1),
    (test_facemask_images, 'Class distribution in testing dataset', 0),  # 0 is the default num_fig
)
for split_images, chart_title, chart_num in split_charts:
    plot_barh_labels(split_images, chart_title, chart_num)

### Temporarily save train, validation and test facemask datasets

In [None]:
# Save images for train, validation and test facemask datasets to tmp_entities
def prepare_facemask_dataset_images(facemask_images, dest_img_folder: str):
    """Copy images into ``dest_img_folder``, keeping one sub-folder per class.

    The class of an image is the name of the directory that contains it, so
    ``<any dir>/<class name>/<file name>`` is copied to
    ``<dest_img_folder>/<class name>/<file name>``.

    Args:
        facemask_images: iterable of source image file paths.
        dest_img_folder: root directory of the copied dataset. Class sub-folders
            are created on demand; nothing is created for an empty input.

    Returns:
        None.
    """
    for source_img_path in tqdm(facemask_images):
        label_img = osp.basename(osp.dirname(source_img_path))
        dest_label_dir = osp.join(dest_img_folder, label_img)
        os.makedirs(dest_label_dir, exist_ok=True)
        shutil.copyfile(source_img_path, osp.join(dest_label_dir, osp.basename(source_img_path)))

# Copy every split into its own temporary entity directory: (image paths, destination)
split_destinations = (
    (train_facemask_images, tmp_entities.facemask_train_dataset),
    (val_facemask_images, tmp_entities.facemask_val_dataset),
    (test_facemask_images, tmp_entities.facemask_test_dataset),
)
for split_images, split_dir in split_destinations:
    prepare_facemask_dataset_images(split_images, dest_img_folder=split_dir)

### Archiving train, validation and test facemask datasets to Sinara Storage

In [None]:
# save tmp_entities (facemask_train_dataset, facemask_val_dataset, facemask_test_dataset) to outputs of step data_prep
# each temporary split directory is packed into the output entity of the same split
outputs = substep.outputs()

archive.pack_files_from_tmp_to_store(tmp_entity_dir=tmp_entities.facemask_train_dataset, store_path=outputs.train_dataset)
archive.pack_files_from_tmp_to_store(tmp_entity_dir=tmp_entities.facemask_val_dataset, store_path=outputs.val_dataset)
archive.pack_files_from_tmp_to_store(tmp_entity_dir=tmp_entities.facemask_test_dataset, store_path=outputs.test_dataset)

In [None]:
# stop spark: ends the session started earlier in this notebook
SinaraSpark.stop_session()