<a href="https://colab.research.google.com/github/rahiakela/computer-vision-research-and-practice/blob/main/hands-on-computer-vision-with-detectron2/04-custom-object-detection/01_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Processing

The dataset used is the brain tumor object detection dataset available from [Kaggle](https://www.kaggle.com/datasets/davidbroberts/brain-tumor-object-detection-datasets).

This dataset is chosen because medical image processing is a critical subfield in computer vision. At the same
time, the task is challenging, and the number of images is appropriate for demonstration purposes.



## Setup

In [None]:
!python -m pip install "git+https://github.com/facebookresearch/detectron2.git"
!sudo apt-get install tree
!pip install -q pylabel

In [2]:
import detectron2
from detectron2.data.datasets import register_coco_instances
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.visualizer import Visualizer

import os
from glob import glob
import shutil
from tqdm import tqdm
import yaml
from yaml.loader import SafeLoader
from pylabel import importer

import torch
import cv2

import numpy as np
import matplotlib.pyplot as plt

# Suppress some user warnings
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# Print the installed Detectron2 version
print(detectron2.__version__)

0.6


In [None]:
!wget https://github.com/PacktPublishing/Hands-On-Computer-Vision-with-Detectron2/blob/main/datasets/braintumors.zip?raw=true -O braintumors.zip
!unzip braintumors.zip -d braintumors

## Dataset

In [5]:
# Root of the extracted dataset, plus the folders that will hold the merged
# YOLO-format copy and the converted COCO-format copy.
data_folder = "braintumors"
data_folder_yolo = data_folder + "_yolo"
data_folder_coco = data_folder + "_coco"

# List the sub-dataset folders through `data_folder` (instead of repeating the
# literal) and keep directories only, so a stray file in the archive root
# cannot end up being treated as a dataset folder later on.
folders = [
  name for name in os.listdir(data_folder)
  if os.path.isdir(os.path.join(data_folder, name))
]
print(folders)

['coronal_t1wce_2_class', 'sagittal_t1wce_2_class', 'axial_t1wce_2_class']


In [6]:
# Show the directory layout of the extracted dataset (-d: list directories only)
!tree braintumors/ -d

[01;34mbraintumors/[0m
├── [01;34maxial_t1wce_2_class[0m
│   ├── [01;34mimages[0m
│   │   ├── [01;34mtest[0m
│   │   └── [01;34mtrain[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mtest[0m
│       └── [01;34mtrain[0m
├── [01;34mcoronal_t1wce_2_class[0m
│   ├── [01;34mimages[0m
│   │   ├── [01;34mtest[0m
│   │   └── [01;34mtrain[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mtest[0m
│       └── [01;34mtrain[0m
└── [01;34msagittal_t1wce_2_class[0m
    ├── [01;34mimages[0m
    │   ├── [01;34mtest[0m
    │   └── [01;34mtrain[0m
    └── [01;34mlabels[0m
        ├── [01;34mtest[0m
        └── [01;34mtrain[0m

21 directories


In [7]:
# let's count the number of images and labels from a dataset with the YOLO annotation format
def count_yolo_data(folder):
  """Print the number of image and label files in each split of a YOLO dataset.

  Looks for ``*.jpg`` files under ``<folder>/images/<split>`` and ``*.txt``
  files under ``<folder>/labels/<split>`` for the ``train`` and ``test``
  splits, printing one ``<split> <kind> <count>`` line per combination.

  Args:
    folder: Root folder of a dataset in YOLO layout.
  """
  # Insertion order matters: images are reported before labels.
  patterns = {"images": "*.jpg", "labels": "*.txt"}
  for kind, pattern in patterns.items():
    for split in ("train", "test"):
      matches = glob(os.path.join(folder, kind, split, pattern))
      print(split, kind, len(matches))

In [8]:
# Report the file counts of every sub-dataset under a banner with its name
for folder in folders:
  print(f"{'-' * 8}{folder}{'-' * 8}")
  folder = os.path.join(data_folder, folder)
  count_yolo_data(folder)

--------coronal_t1wce_2_class--------
train images 319
test images 78
train labels 318
test labels 78
--------sagittal_t1wce_2_class--------
train images 264
test images 70
train labels 264
test labels 70
--------axial_t1wce_2_class--------
train images 310
test images 75
train labels 296
test labels 75


In [9]:
# due to the small number of images, it might be more reasonable to combine images from these folders into one folder
def copy_yolo_files(from_folder, to_folder, images_labels, train_set):
  """Copy one split of images or labels from one YOLO dataset into another.

  Files are read from ``<from_folder>/<images_labels>/<train_set>`` and copied
  into the same relative location under ``to_folder``, which is created when
  missing. Only ``*.jpg`` files are copied for ``"images"``; any other value
  of ``images_labels`` selects ``*.txt`` files.

  Args:
    from_folder: Root of the source YOLO dataset.
    to_folder: Root of the destination YOLO dataset.
    images_labels: ``"images"`` or ``"labels"``.
    train_set: Split sub-folder name, e.g. ``"train"`` or ``"test"``.
  """
  src_dir = os.path.join(from_folder, images_labels, train_set)
  dst_dir = os.path.join(to_folder, images_labels, train_set)
  os.makedirs(dst_dir, exist_ok=True)

  if images_labels == "images":
    pattern = "*.jpg"
  else:
    pattern = "*.txt"

  # The files are copied, not moved, so the source dataset stays intact.
  for src_file in tqdm(glob(os.path.join(src_dir, pattern))):
    shutil.copy(src_file, dst_dir)

In [10]:
# Merge the images and labels of every sub-dataset into one YOLO-format folder
to_folder = data_folder_yolo
for from_folder in folders:
  from_folder = os.path.join(data_folder, from_folder)
  for images_labels in ("images", "labels"):
    for train_set in ("train", "test"):
      copy_yolo_files(from_folder, to_folder, images_labels, train_set)

100%|██████████| 319/319 [00:00<00:00, 4176.92it/s]
100%|██████████| 78/78 [00:00<00:00, 3376.53it/s]
100%|██████████| 318/318 [00:00<00:00, 5706.39it/s]
100%|██████████| 78/78 [00:00<00:00, 5498.23it/s]
100%|██████████| 264/264 [00:00<00:00, 5039.19it/s]
100%|██████████| 70/70 [00:00<00:00, 3838.83it/s]
100%|██████████| 264/264 [00:00<00:00, 7666.66it/s]
100%|██████████| 70/70 [00:00<00:00, 7385.64it/s]
100%|██████████| 310/310 [00:00<00:00, 8001.59it/s]
100%|██████████| 75/75 [00:00<00:00, 6672.17it/s]
100%|██████████| 296/296 [00:00<00:00, 7513.72it/s]
100%|██████████| 75/75 [00:00<00:00, 6012.25it/s]


In [11]:
# The merged YOLO folder should now contain images/ and labels/, each with train/ and test/
!tree -d {data_folder_yolo}

[01;34mbraintumors_yolo[0m
├── [01;34mimages[0m
│   ├── [01;34mtest[0m
│   └── [01;34mtrain[0m
└── [01;34mlabels[0m
    ├── [01;34mtest[0m
    └── [01;34mtrain[0m

6 directories


In [12]:
# Sanity check: count the files in the merged folder; the totals should equal
# the sums of the per-folder counts printed earlier
count_yolo_data(data_folder_yolo)

train images 893
test images 223
train labels 878
test labels 223


In [13]:
# Read the class names from the YAML file of the first listed sub-dataset
# (presumably all sub-datasets share the same class list -- TODO confirm)
yaml_path = os.path.join(data_folder, folders[0], folders[0] + ".yaml")
with open(yaml_path) as f:
  dataset_config = yaml.load(f, Loader=SafeLoader)
classes = dataset_config["names"]

# Write classes.txt with one class name per line and no trailing newline
classes_path = os.path.join(data_folder_yolo, "classes.txt")
with open(classes_path, "w") as f:
  f.write("\n".join(classes))

## Data format conversion

In [14]:
def yolo_to_coco(input_folder, output_folder, train_test):
  """Convert one split of a YOLO-format dataset into a COCO-format folder.

  The images and label files of the split are copied side by side into
  ``<output_folder>/<train_test>``, loaded with pylabel's YOLOv5 importer and
  exported to ``_annotations.coco.json`` in that same folder. The copied YOLO
  label files are temporary: they are deleted afterwards, even when the
  import or export fails, so the output folder ends up holding only images
  and the COCO annotation file.

  Args:
    input_folder: YOLO dataset root containing ``images/<train_test>``,
      ``labels/<train_test>`` and a ``classes.txt`` with one class per line.
    output_folder: Root folder of the COCO dataset to create.
    train_test: Split name, e.g. ``"train"`` or ``"test"``.
  """
  labels_path = os.path.join(input_folder, "labels", train_test)
  images_path = os.path.join(input_folder, "images", train_test)
  coco_dir = os.path.join(output_folder, train_test)
  os.makedirs(coco_dir, exist_ok=True)

  txt_files = glob(os.path.join(labels_path, "*.txt"))
  img_files = glob(os.path.join(images_path, "*.jpg"))

  # get the classes; splitlines() does not produce a spurious empty class
  # name when classes.txt ends with a newline
  with open(os.path.join(input_folder, "classes.txt"), "r") as classes_file:
    classes = classes_file.read().splitlines()

  copied_labels = []
  try:
    # copy annotations, remembering where each copy landed for the cleanup
    for label_file in tqdm(txt_files):
      copied_labels.append(shutil.copy(label_file, coco_dir))
    # copy images
    for image_file in tqdm(img_files):
      shutil.copy(image_file, coco_dir)

    # load dataset (images and labels are expected in the same folder here)
    dataset = importer.ImportYoloV5(path=coco_dir, cat_names=classes, name="brain tumors")
    # export
    coco_file = os.path.join(coco_dir, "_annotations.coco.json")
    # category ids start at 1 (original author's note: Detectron requires this)
    dataset.export.ExportToCoco(coco_file, cat_id_index=1)
  finally:
    # delete the temporary yolo annotations from the coco set
    for copied in copied_labels:
      os.remove(copied)

In [15]:
# Convert both splits of the merged YOLO dataset to COCO format
for train_test in ("train", "test"):
  yolo_to_coco(data_folder_yolo, data_folder_coco, train_test)

100%|██████████| 878/878 [00:00<00:00, 8862.16it/s]
100%|██████████| 893/893 [00:00<00:00, 7633.00it/s]
Importing YOLO files...: 100%|██████████| 1771/1771 [00:02<00:00, 861.10it/s]
Exporting to COCO file...: 100%|██████████| 925/925 [00:00<00:00, 1012.56it/s]
100%|██████████| 223/223 [00:00<00:00, 5110.70it/s]
100%|██████████| 223/223 [00:00<00:00, 4432.99it/s]
Importing YOLO files...: 100%|██████████| 446/446 [00:00<00:00, 827.79it/s]
Exporting to COCO file...: 100%|██████████| 241/241 [00:00<00:00, 1923.05it/s]


In [16]:
# Show the directory layout of the generated COCO dataset (-d: list directories only)
!tree -d {data_folder_coco}

[01;34mbraintumors_coco[0m
├── [01;34mtest[0m
└── [01;34mtrain[0m

2 directories


## Displaying Samples