# Parameters

In [10]:
# --- Raw dataset layout ---
# Position folders and subject folders that are scanned in the source dataset.
ultrasound_positions = ['1TCC', '1TCC1', '2TMLI', '3TANI']
subject_names = [f'Sub{subject_id:03d}' for subject_id in range(11, 18)]  # Sub011 .. Sub017
num_of_classes = 6

# --- Output ---
# Root folder that receives the train/val/test splits.
dataset_path = 'datasets'

# --- Split ---
# Subjects held out entirely for the test set (leave-one-out).
test_subjects = ['Sub011']

# --- Label clean-up ---
# Class ids that are removed from the label files.
to_drop = [1, 2]

# When True, the output folders are emptied before the splits are regenerated.
reset = True

## Reset

In [18]:
import os

def delete_files_in_directory(directory_path):
    """Remove every regular file located directly inside ``directory_path``.

    Sub-directories and their contents are left untouched. A confirmation
    line is printed once all entries have been visited.
    """
    # os.listdir() is evaluated right here, before anything is removed.
    candidate_paths = (os.path.join(directory_path, entry) for entry in os.listdir(directory_path))
    for target in filter(os.path.isfile, candidate_paths):
        os.remove(target)
    print("All files deleted successfully.")

# Prepare the output layout <dataset_path>/{train,val,test}/{images,labels}.
# The folders are created when missing so the notebook also runs on a fresh
# checkout; their files are only wiped when `reset` is enabled.
folders = ['train', 'val', 'test']
sub_folders = ['images', 'labels']
for folder in folders:
  for sub_folder in sub_folders:
    # Build the path from the `dataset_path` parameter instead of a
    # hard-coded 'datasets/' so changing the parameter is honoured here too.
    target_dir = os.path.join(dataset_path, folder, sub_folder)
    os.makedirs(target_dir, exist_ok=True)
    if reset:
      print(folder, sub_folder)
      delete_files_in_directory(target_dir)

train images
All files deleted successfully.
train labels
All files deleted successfully.
val images
All files deleted successfully.
val labels
All files deleted successfully.
test images
All files deleted successfully.
test labels
All files deleted successfully.


# Parsing Filenames

In [12]:
import os
import re
from pathlib import Path
# Collect the name of every non-empty annotation file. Each entry is stored
# as "<subject>_<position>_<annotation stem>" so names from different
# folders cannot be confused with each other.
filenames = []
total_empty_count = 0
total_file_count = 0
pattern = r".*\.txt$"

for subject in subject_names:
    for position in ultrasound_positions:
        mypath = Path(f"Dataset/{subject}/Trachea/{position}/Reviewed/")

        # os.listdir() returns entries in arbitrary, filesystem-dependent
        # order. Sorting makes the order of `filenames` -- and therefore the
        # result of the seeded shuffle applied to it later -- the same on
        # every machine.
        bounding_box_filenames = sorted(
            filename for filename in os.listdir(mypath) if re.search(pattern, filename)
        )
        annotation_count = len(bounding_box_filenames)
        total_file_count += annotation_count

        empty_count = 0
        for filename in bounding_box_filenames:
            with open(mypath / filename, 'r') as file_obj:
                # Reading a single character is enough to tell a zero-length
                # annotation file from one that has content.
                is_empty = not file_obj.read(1)
            if is_empty:
                empty_count += 1
            else:
                # filename[:-4] strips the ".txt" extension.
                filenames.append(f"{subject}_{position}_{str(filename[:-4])}")

        # Results (only reported for folders that contain empty annotations)
        if empty_count > 0:
            print(f"{subject} has {empty_count} empty annotations out of {annotation_count} annotated files in {position} position")
            total_empty_count += empty_count
print("==============================")
print(f"Total {total_empty_count} empty annotations found out of {total_file_count} files")

Sub011 has 39 empty annotations out of 252 annotated files in 1TCC position
Sub011 has 11 empty annotations out of 252 annotated files in 1TCC1 position
Sub012 has 110 empty annotations out of 252 annotated files in 1TCC position
Sub012 has 53 empty annotations out of 126 annotated files in 1TCC1 position
Sub013 has 13 empty annotations out of 252 annotated files in 1TCC position
Sub014 has 156 empty annotations out of 252 annotated files in 1TCC position
Sub015 has 64 empty annotations out of 253 annotated files in 1TCC position
Sub015 has 43 empty annotations out of 126 annotated files in 1TCC1 position
Sub016 has 69 empty annotations out of 393 annotated files in 1TCC position
Sub016 has 10 empty annotations out of 131 annotated files in 1TCC1 position
Sub016 has 1 empty annotations out of 124 annotated files in 3TANI position
Sub017 has 144 empty annotations out of 252 annotated files in 1TCC position
Sub017 has 60 empty annotations out of 126 annotated files in 1TCC1 position
Tota

In [13]:
# Peek at the first collected entry; names follow "<subject>_<position>_<annotation stem>".
filenames[0]

'Sub011_1TCC_F000047_frame123'

# Utils

In [14]:
import random
# Train Test Split
def train_test(data, test_size, seed):
    """Shuffle ``data`` reproducibly and split it into two lists.

    Args:
        data: sequence of items to split. It is left unmodified.
        test_size: fraction of the items that goes into the second list.
        seed: value used to seed Python's global ``random`` module before
            shuffling.

    Returns:
        A ``(train, test)`` tuple of lists: ``test`` holds the first
        ``int(test_size * len(data))`` shuffled items, ``train`` the rest.
    """
    random.seed(seed)
    # Shuffle a copy: random.shuffle() works in place, so shuffling `data`
    # itself would silently reorder the caller's list as a side effect.
    X = list(data)
    random.shuffle(X)
    print(len(X))
    split_idx = int(test_size * len(X))
    print(split_idx)
    return X[split_idx:], X[:split_idx]

In [15]:
import collections
# Sanity check: list every generated name that occurs more than once
# (an empty list means all names are unique).
name_counts = collections.Counter(filenames)
temp = [name for name in name_counts if name_counts[name] > 1]
print(temp)
print(len(temp))

[]
0


# Generate Train-Val-Test datasets

In [16]:
import shutil

print(f'Total non-empty files {len(filenames)}')


def _copy_annotated_pairs(subjects, split_by_name):
    """Copy image/label pairs from the raw dataset into their split folders.

    Every ``.txt`` file found in ``Dataset/<subject>/Trachea/<position>/Reviewed/``
    is turned into the prefixed name ``<subject>_<position>_<stem>`` and looked
    up in ``split_by_name``. When the name is present, ``<stem>.png`` is copied
    to ``<dataset_path>/<split>/images`` and ``<stem>.txt`` to
    ``<dataset_path>/<split>/labels``, both under the prefixed name.

    Args:
        subjects: subject folder names to scan.
        split_by_name: dict mapping a prefixed name to its split folder
            ('train', 'val' or 'test'). Names that are absent are skipped.

    Returns:
        The set of prefixed names that were copied.
    """
    copied = set()
    for subject in subjects:
        for position in ultrasound_positions:
            source_dir = Path(f"Dataset/{subject}/Trachea/{position}/Reviewed/")
            for entry in os.listdir(source_dir):
                if not re.search(r".*\.txt$", entry):
                    continue
                stem = entry[:-4]  # drop the ".txt" extension
                new_filename = f"{subject}_{position}_{stem}"
                split = split_by_name.get(new_filename)
                if split is None:
                    continue
                shutil.copy(str(source_dir / stem) + '.png', f'{dataset_path}/{split}/images/{new_filename}.png')
                shutil.copy(str(source_dir / stem) + '.txt', f'{dataset_path}/{split}/labels/{new_filename}.txt')
                copied.add(new_filename)
    return copied


# Create Test Set (leave one out): every non-empty annotation of the held-out
# subjects goes to the test split.
test_names = _copy_annotated_pairs(test_subjects, dict.fromkeys(filenames, 'test'))

# Build the train/val candidates as a NEW list instead of removing entries
# from `filenames`, so re-running this cell starts from the same input and
# produces the same counts. The original order is kept, which keeps the
# seeded shuffle below unchanged.
train_val_names = [name for name in filenames if name not in test_names]
print(f'After creating test files {len(train_val_names)}')

# Create Train-Val sets
train_list, val_list = train_test(train_val_names, test_size=0.1, seed=44)
print(len(train_list), len(val_list))

# Dict lookup replaces repeated list membership tests; 'train' is written
# last so it takes precedence, matching the original if/elif order.
split_by_name = dict.fromkeys(val_list, 'val')
split_by_name.update(dict.fromkeys(train_list, 'train'))
_copy_annotated_pairs(
    [subject for subject in subject_names if subject not in test_subjects],
    split_by_name,
)

# Output: number of images that ended up in each split folder.
for split in ('train', 'val', 'test'):
    image_count = len(os.listdir(Path(dataset_path) / split / 'images'))
    print(f'{split} images {image_count}')

Total non-empty files 3913
After creating test files 3446
3446
344
3102 344
train images 3102
val images 344
test images 467


# Data Preprocessing

In [17]:
import pandas as pd
import os
from pathlib import Path

folders = ['train', 'val', 'test']
sub_folders = ['labels']

# NOTE: this cell rewrites the label files in place, so it is NOT idempotent.
# Run it exactly once after the train/val/test files were (re)generated:
# a second run would drop the classes that were just moved into the freed ids.

# create updated classes list to substitute dropped classes
# Class ids are walked from the highest down; each surviving id is moved into
# the lowest not-yet-filled dropped id below it. Example: with 6 classes and
# to_drop = [1, 2] the map is {5: 1, 4: 2}, leaving ids 0..3 in use.
# A sorted copy is used so the `to_drop` parameter itself is not reordered.
sorted_to_drop = sorted(to_drop)
j = 0
value_map = {}
for i in range(num_of_classes-1, 0, -1):
    if j == len(sorted_to_drop):
        break
    if i not in sorted_to_drop and sorted_to_drop[j] < i:
        value_map[i] = sorted_to_drop[j]
        j += 1


for folder in folders:
  for sub_folder in sub_folders:
    mypath = Path(f"{dataset_path}/{folder}/{sub_folder}/")

    for filename in os.listdir(mypath):
      label_path = mypath / filename

      # A zero-byte label file has no rows to correct, and pd.read_csv raises
      # EmptyDataError on it. Such files appear once every row of a file has
      # been dropped, so leave them untouched instead of crashing.
      if label_path.stat().st_size == 0:
        continue

      # read file
      df_file = pd.read_csv(label_path, delimiter=" ", names=['class', 'x', 'y', 'w', 'h'])

      # bounding box correction: clamp 'w' and 'h' into [0, 1]
      df_file.loc[df_file['w'] > 1, 'w'] = 1.0
      df_file.loc[df_file['w'] < 0, 'w'] = 0.0
      df_file.loc[df_file['h'] > 1, 'h'] = 1.0
      df_file.loc[df_file['h'] < 0, 'h'] = 0.0

      # drop classes (explicit copy so the remap below writes to an
      # independent frame, not to a slice of the frame read from disk)
      df_file = df_file[~df_file['class'].isin(to_drop)].copy()
      for key, value in value_map.items():
        df_file.loc[df_file['class'] == key, 'class'] = value

      # write file
      df_file.to_csv(label_path, index=False, header=False, sep=" ")