# Create BoundingBox Labeled Datasets from CSV ground truth


This script will create two labeled datasets: <dataset_name>_training and <dataset_name>_test

1. Split the labeled datapoints into training and test sets.
2. Create JSON Lines (.jsonl) files for the training and test sets.
3. Upload the JSON Lines files to the default workspace blob store, under the fhl/datasets/<dataset_name>/label folder.
4. Register the labeled datasets.

In [None]:
import json
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split


In [None]:
# Name of the Azure ML datastore; it is embedded in every generated image URL.
datastore_name = 'workspaceblobstore'
# Base name used in the blob-store paths (fhl/datasets/<dataset_name>/...) and
# in the registered dataset names (<dataset_name>_training / <dataset_name>_test).
dataset_name = 'NoWindowOD'

In [None]:
# Read the ground-truth annotations from the CSV file. Later cells rely on the
# columns patientId, x, y, width, height and Target.
raw_df = pd.read_csv('./labels/stage_2_train_labels.csv')

# adding label
def get_label(row):
    """Map a row's binary ``Target`` value to its class-name string.

    Args:
        row: Mapping-like row (for example a pandas Series) with a
            ``Target`` entry.

    Returns:
        ``'Not Lung Opacity'`` when ``Target`` is 0 and ``'Lung Opacity'``
        when ``Target`` is 1.

    Raises:
        ValueError: If ``Target`` is neither 0 nor 1.
    """
    target = row['Target']
    if target == 0:
        return 'Not Lung Opacity'
    if target == 1:
        return 'Lung Opacity'
    # Fail loudly with a clear message rather than falling through to an
    # unbound local variable when the CSV contains an unexpected value.
    raise ValueError('Unexpected Target value: {!r}'.format(target))
# Derive the class-name column from the binary Target column.
raw_df['Label'] = raw_df.apply(get_label, axis=1)

# Fill missing bounding-box values with the column mean.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves raw_df unchanged under Copy-on-Write.
for box_column in ('x', 'y', 'width', 'height'):
    raw_df[box_column] = raw_df[box_column].fillna(raw_df[box_column].mean())


In [None]:
# Merge the annotation rows by patientId: build one JSON Lines record per
# image, holding all of that image's bounding boxes, plus one tag per image.
# Box coordinates are divided by these sizes to express them as fractions of
# the image (assumes every image is 1024x1024 - TODO confirm).
IMG_WIDTH = 1024
IMG_HEIGHT = 1024
jsonline_obj = []
tags = []
target_count = 0
# groupby(sort=False) hands over each patient's rows in a single pass, in
# order of first appearance, instead of re-filtering the whole frame once
# per patient id.
for patient_id, sub_df in raw_df.groupby('patientId', sort=False):
    obj = {}
    obj['image_url'] = 'AmlDatastore://' + datastore_name + '/fhl/datasets/' + dataset_name + '/image/' + patient_id + '.png'
    obj['label'] = []
    obj['label_confidence'] = []
    target = False
    for _, row in sub_df.iterrows():
        # The image is positive if ANY of its rows is positive, not only
        # the last one that happens to be iterated.
        target = target or row['Label'] == 'Lung Opacity'
        label = {
            'label': row['Label'],
            'topX': row['x'] / IMG_WIDTH,
            'topY': row['y'] / IMG_HEIGHT,
            'bottomX': (row['x'] + row['width']) / IMG_WIDTH,
            'bottomY': (row['y'] + row['height']) / IMG_HEIGHT,
        }
        obj['label'].append(label)
        obj['label_confidence'].append(1.0)
    jsonline_obj.append(obj)
    if target:
        tags.append('Lung Opacity')
        target_count += 1
    else:
        tags.append('Not Lung Opacity')
# One row per image: the JSON Lines record and its image-level tag.
image_df = pd.DataFrame({'obj': jsonline_obj, 'tag': tags})

In [None]:
# Display how many images carry each tag.
image_df['tag'].value_counts()

In [None]:
# Down-sample the 'Not Lung Opacity' images to the number of 'Lung Opacity'
# images, then split the balanced set into training (80%) and test (20%).
from sklearn.utils import resample
major_df = image_df[image_df['tag'] == 'Not Lung Opacity']
minor_df = image_df[image_df['tag'] == 'Lung Opacity']
# Seed the down-sampling as well as the split; otherwise a different subset
# of images is drawn on every run and the datasets are not reproducible.
down_sample_df = resample(
    major_df,
    replace=False,
    n_samples=minor_df.shape[0],
    random_state=2021,
)
source_df = pd.concat([down_sample_df, minor_df])
train, test = train_test_split(source_df, test_size=0.2, random_state=2021)
# Display the class balance of the training split.
train['tag'].value_counts()

In [None]:
# Keep only the per-image label records of each split.
train_obj, test_obj = train['obj'], test['obj']

In [None]:
import os
import json
def save_jasonline_file(objects: list, file_name: str):
    """Write ``objects`` to ``file_name`` in JSON Lines format.

    Each object is serialised with ``json.dumps`` and written on its own line.

    Args:
        objects: Iterable of JSON-serialisable objects (a list or any other
            iterable, such as a pandas Series).
        file_name: Destination path. Missing parent directories are created.
    """
    base_path = os.path.dirname(file_name)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create directories when the path actually has a directory part.
    # exist_ok=True avoids the check-then-create race.
    if base_path:
        os.makedirs(base_path, exist_ok=True)
    with open(file_name, 'w', encoding='utf-8') as jf:
        jf.writelines('{}\n'.format(json.dumps(obj)) for obj in objects)

In [None]:
# Write each split to its own JSON Lines file inside the local 'label' folder;
# these files can then be registered as labeled datasets.
label_folder = os.path.abspath('label')
for split_objects, jsonl_name in (
    (train_obj, 'labeleddatapoints_training.jsonl'),
    (test_obj, 'labeleddatapoints_test.jsonl'),
):
    save_jasonline_file(split_objects, os.path.join(label_folder, jsonl_name))

In [None]:
import azureml.core
from azureml.core import Workspace
# Load the workspace from the local configuration.
ws = Workspace.from_config()
# Look the datastore up through datastore_name so the upload target always
# matches the datastore embedded in the generated image URLs.
ds = ws.datastores[datastore_name]
# Upload the local label folder to fhl/datasets/<dataset_name>/label/,
# replacing any files already there.
ds.upload(
    src_dir=label_folder,
    target_path='/fhl/datasets/' + dataset_name + '/label/',
    overwrite=True,
)

In [None]:
# Create/register labeled datasets for training and test (inference).
from azureml.contrib.dataset.labeled_dataset import _LabeledDatasetFactory, LabeledDatasetTask

# Metadata tags attached to both registered datasets.
tags = {}
tags['labelingCreatedBy'] = "FHL Notebook"
tags['labelingProjectType'] = 'Object Identification (Bounding Box)'
# Use the configured datastore name instead of a second hard-coded copy.
tags['SourceDatastoreName'] = datastore_name
tags['SourceRelativePath'] = 'fhl/datasets/' + dataset_name + '/image/'
# Must match the label strings written to the JSON Lines files
# ('Lung Opacity' / 'Not Lung Opacity').
tags['labelingLabelName'] = '["Lung Opacity","Not Lung Opacity"]'

# The records hold bounding boxes (topX/topY/bottomX/bottomY), so the datasets
# are registered as object detection rather than image classification.
training_dataset = _LabeledDatasetFactory.from_json_lines(
    task=LabeledDatasetTask.OBJECT_DETECTION,
    path=ds.path('fhl/datasets/' + dataset_name + '/label/labeleddatapoints_training.jsonl'),
)
training_dataset.register(ws, dataset_name + '_training', tags=tags)

test_dataset = _LabeledDatasetFactory.from_json_lines(
    task=LabeledDatasetTask.OBJECT_DETECTION,
    path=ds.path('fhl/datasets/' + dataset_name + '/label/labeleddatapoints_test.jsonl'),
)
test_dataset.register(ws, dataset_name + '_test', tags=tags)