This notebook shows how PyLabel can import a YOLOv5 dataset from a YAML file when the dataset's images and annotations (labels) folders have already been split into train, test, and val subfolders. This is the expected use case for the YAML importer.

In [1]:
import torch
from IPython.display import Image  # for displaying images
import os, zipfile
import random
import shutil
from sklearn.model_selection import train_test_split
import xml.etree.ElementTree as ET
from xml.dom import minidom
from tqdm import tqdm
from PIL import Image, ImageDraw
import numpy as np
import matplotlib.pyplot as plt

random.seed(108)

import logging
logging.getLogger().setLevel(logging.CRITICAL)
!pip install pylabel > /dev/null

#!pip install pylabel

from pylabel import importer
from pylabel import *

from pathlib import PurePath

import yaml

In [2]:
#A random dataset that we found online.
!wget -O roadsign_splitdata.zip https://raw.githubusercontent.com/pylabel-project/datasets_models/main/roadsign_splitdata.zip
!unzip roadsign_splitdata.zip


--2021-11-30 01:46:42--  https://raw.githubusercontent.com/pylabel-project/datasets_models/main/roadsign_splitdata.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22454439 (21M) [application/zip]
Saving to: ‘roadsign_splitdata.zip’


2021-11-30 01:46:44 (137 MB/s) - ‘roadsign_splitdata.zip’ saved [22454439/22454439]

Archive:  roadsign_splitdata.zip
  inflating: annotations/test/road10.txt  
  inflating: annotations/test/road29.txt  
  inflating: annotations/test/road30.txt  
  inflating: annotations/test/road38.txt  
  inflating: annotations/test/road41.txt  
  inflating: annotations/train/road0.txt  
  inflating: annotations/train/road1.txt  
  inflating: annotations/train/road12.txt  
  inflating: annotations/train/road13.txt  
  inflating: annotation

In [3]:
#An example annotation
!cat /content/annotations/train/road4.txt

1 0.1891 0.4325 0.2285 0.3200
1 0.5225 0.5425 0.1760 0.2750
1 0.7903 0.6050 0.1648 0.2650


In [4]:
#An example YAML file
!cat /content/road_sign_data.yaml

train: images/train/ 
val:  images/val/
test: images/test/

# number of classes
nc: 4

# class names
names: ["trafficlight","stop", "speedlimit","crosswalk"]

In [7]:
# Build a PyLabel dataset from the YAML file describing the pre-split data.
# The images are PNG files and the label files live in folders named
# "annotations", so both are stated explicitly in the call.
data0 = importer.ImportYoloV5WithYaml(
    yaml_file="/content/road_sign_data.yaml",
    image_ext="png",
    name_of_annotations_folder="annotations",
    path_to_annotations=None,
)

In [10]:
# Preview the first five rows of the imported annotations table
# (head() returns five rows by default).
data0.df.head()

Unnamed: 0_level_0,img_folder,img_filename,img_path,img_id,img_width,img_height,img_depth,ann_segmented,ann_bbox_xmin,ann_bbox_ymin,ann_bbox_xmax,ann_bbox_ymax,ann_bbox_width,ann_bbox_height,ann_area,ann_segmentation,ann_iscrowd,ann_pose,ann_truncated,ann_difficult,cat_id,cat_name,cat_supercategory,split,annotated
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,../../images/train,road23.png,,0,266,400,3,,216.0053,125.98,241.9935,178.98,25.9882,53.0,1377.3746,,,,,,1,stop,,train,1
1,../../images/train,road33.png,,1,267,400,3,,23.00205,6.02,177.99555,387.02,154.9935,381.0,59052.5235,,,,,,1,stop,,train,1
2,../../images/train,road8.png,,2,400,300,3,,91.0,72.015,129.0,136.005,38.0,63.99,2431.62,,,,,,1,stop,,train,1
3,../../images/train,road8.png,,2,400,300,3,,245.02,103.995,274.02,163.005,29.0,59.01,1711.29,,,,,,1,stop,,train,1
4,../../images/train,road16.png,,3,400,248,3,,225.0,42.9908,241.0,82.0012,16.0,39.0104,624.1664,,,,,,1,stop,,train,1


In [8]:
# Count the non-null values in every column for each split (test / train / val)
# to confirm the annotations were assigned to the splits on import.
data0.df.groupby(by="split").count()

Unnamed: 0_level_0,img_folder,img_filename,img_path,img_id,img_width,img_height,img_depth,ann_segmented,ann_bbox_xmin,ann_bbox_ymin,ann_bbox_xmax,ann_bbox_ymax,ann_bbox_width,ann_bbox_height,ann_area,ann_segmentation,ann_iscrowd,ann_pose,ann_truncated,ann_difficult,cat_id,cat_name,cat_supercategory,annotated
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
test,8,8,0,8,8,8,8,0,8,8,8,8,8,8,8,0,0,0,0,0,8,8,0,8
train,55,55,0,55,55,55,55,0,55,55,55,55,55,55,55,0,0,0,0,0,55,55,0,55
val,7,7,0,7,7,7,7,0,7,7,7,7,7,7,7,0,0,0,0,0,7,7,0,7
