# Preparing Data

This notebook prepares the data and the train/validation/test splits used for training and evaluating YOLOv5.

#### Importing Packages

In [3]:
import os
from PIL import Image
import shutil
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
import cv2

#### Processing Labels

**Renaming classes**

In [50]:
# Shift the class ids stored in every .txt file under data/labels:
# class "1" becomes "0" and class "2" becomes "1"; all other lines are kept.
# NOTE: the mapping is not idempotent (a second run would turn the new "1"
# into "0"), so this cell must be executed exactly once per dataset.
CLASS_ID_REMAP = {"1": "0", "2": "1"}

for file in sorted(os.listdir("data/labels")):
    filepath = os.path.join("data", "labels", file)
    if not filepath.endswith(".txt"):
        continue

    with open(filepath) as f:
        lines = f.readlines()

    for index, line in enumerate(lines):
        # Compare the whole first token instead of only the first character,
        # so a multi-digit id such as "10" or "21" is never mistaken for
        # "1" or "2". Lines that start with whitespace stay untouched.
        tokens = line.split(None, 1)
        class_id = tokens[0] if tokens else ""
        if class_id in CLASS_ID_REMAP and line.startswith(class_id):
            lines[index] = CLASS_ID_REMAP[class_id] + line[len(class_id):]

    with open(filepath, 'w') as f:
        f.writelines(lines)

**Removing empty label files**

In [None]:
# Delete every zero-byte .txt file found in data/labels and report its path.
for label_name in sorted(os.listdir("data/labels")):
    label_path = os.path.join("data", "labels", label_name)
    if label_path.endswith(".txt") and os.stat(label_path).st_size == 0:
        print("Empty:", label_path)
        os.remove(label_path)

#### Remove non-annotated images

In [13]:
# Delete every entry of data/images that has a file extension but no matching
# "<stem>.txt" file in data/labels. Entries without an extension are skipped.
for image_name in sorted(os.listdir("data/images")):
    stem, suffix = os.path.splitext(image_name)
    if suffix == "":
        continue
    expected_label = os.path.join("data", "labels", stem + ".txt")
    if os.path.exists(expected_label):
        continue
    print("Label file not found for:", image_name)
    os.remove(os.path.join("data", "images", image_name))

Label file not found for: 018_11y11m01d_04.tiff
Label file not found for: 018_12y02m08d_04.tiff
Label file not found for: 031_00y04m07d_03.png
Label file not found for: 031_00y06m09d_03.png
Label file not found for: 031_00y09m08d_03.png
Label file not found for: 031_01y00m14d_03.png
Label file not found for: 049_05y00m26d_04_R.png
Label file not found for: 053_13y06m18d_02_L'.png
Label file not found for: 059_05y06m15d_01.png
Label file not found for: 111_14y02m04d_04_L.jpg
Label file not found for: 114_20y05m12d_04_R.jpg
Label file not found for: 115_17y07m12d_04_R.jpg
Label file not found for: 121_13y04m08d_04_L.jpg
Label file not found for: 124_18y08m09d_04_L.jpg
Label file not found for: 135_08y00m28d_02_L.jpg
Label file not found for: 136_13y02m05d_02_L.jpg
Label file not found for: 136_18y00m00d_01.jpg
Label file not found for: 152_11y11m01d_04_L.jpg
Label file not found for: 152_12y02m08d_04_L.jpg
Label file not found for: 153_28y08m01d_02_L.jpg
Label file not found for: 153_28y

In [21]:
# Sanity check: list both folders (every directory entry, unfiltered) and
# compare how many entries each one holds.
labels = sorted(os.listdir("data/labels"))
images = sorted(os.listdir("data/images"))

print(f"{len(labels)} {len(images)}")

2084 2083


#### Processing Background Images

**Chests and shoulders: Rename images**

In [24]:
# Give the background images uniform names: "<i>_chest<ext>" for the *jpeg
# files in data/chests and "<i>_shoulder<ext>" for the *png files in
# data/shoulders, numbered from 0 in sorted order.
#
# Files that already follow the scheme are left untouched and their numbers are
# reserved. Without this, re-running the cell would rename one numbered file on
# top of another (e.g. "10_chest.jpeg" -> "1_chest.jpeg"), and os.rename
# silently replaces an existing target on POSIX systems.
for folder, wanted_suffix, tag in (("chests", "jpeg", "chest"),
                                   ("shoulders", "png", "shoulder")):
    candidates = [entry for entry in sorted(os.listdir("data/" + folder))
                  if entry.endswith(wanted_suffix)]

    taken = set()   # numbers already used by correctly named files
    pending = []    # files that still need a new name
    for file in candidates:
        number, _, rest = os.path.splitext(file)[0].partition("_")
        if rest == tag and number.isdecimal():
            taken.add(int(number))
        else:
            pending.append(file)

    i = 0
    for file in pending:
        # Skip over numbers that belong to an already renamed file.
        while i in taken:
            i += 1
        filepath = os.path.join("data", folder, file)
        name, ext = os.path.splitext(file)
        newname = str(i) + "_" + tag + ext
        newpath = os.path.join("data", folder, newname)
        os.rename(filepath, newpath)
        i += 1

**Copy chests and shoulders into the hips image folder (`data/images`)**

In [25]:
# Copy every entry of data/chests, then every entry of data/shoulders, into
# data/images under the same file name.
for background_dir in ("chests", "shoulders"):
    for background_name in sorted(os.listdir("data/" + background_dir)):
        shutil.copyfile(
            os.path.join("data", background_dir, background_name),
            os.path.join("data", "images", background_name),
        )

**Counting**

In [31]:
# Recount both folders, keeping only entries that carry a file extension.
labels = [
    entry
    for entry in sorted(os.listdir("data/labels"))
    if os.path.splitext(entry)[1] != ""
]
images = [
    entry
    for entry in sorted(os.listdir("data/images"))
    if os.path.splitext(entry)[1] != ""
]

In [32]:
# `chests` is not assigned by any cell of this notebook, so on a fresh kernel
# the last print raised NameError. It is derived here from the folder contents
# (entries of data/chests that have a file extension).
chests = [entry for entry in sorted(os.listdir("data/chests"))
          if os.path.splitext(entry)[1] != ""]

print(len(images))
print(len(labels))
print(len(chests))

2282
2083
100


### Images: Test Train Val Splits and Moving Images

In [43]:
# Build two parallel lists from data/images: the file names and, for each one,
# the image-type token parsed out of the name (used later for stratification).
images = []
category = []
for image_name in sorted(os.listdir("data/images")):
    if os.path.splitext(image_name)[1] == "":
        continue
    is_background = "chest" in str(image_name) or "shoulder" in str(image_name)
    if is_background:
        # Background names must split into exactly three "_"-separated fields
        # once the dot is turned into "_"; the middle one is the type.
        imid, imtype, ext = image_name.replace(".", "_").split("_")
    else:
        # Other names: dots and spaces become "_" and the third field is the type.
        participant, date, imtype = (
            image_name.replace(".", "_").replace(" ", "_").split("_")[:3]
        )
    images.append(image_name)
    category.append(imtype)

In [44]:
# Show the distinct values stored in `category`.
set(category)

{'01', '02', '03', '04', 'chest', 'shoulder'}

In [46]:
# Both lists are built in lockstep, so the two lengths should match.
print(len(images), len(category), sep="\n")

2282
2282


In [55]:
# Step 1: take 1598 files for training, stratified by image type.
train, test_val, train_category, test_val_category = train_test_split(
    images,
    category,
    train_size=1598,
    stratify=category,
    shuffle=True,
    random_state=3,
)
# Step 2: split the remainder, again stratified; the second returned list
# (342 files, the `test_size` part) is bound to `val`, the first one to `test`.
test, val = train_test_split(
    test_val,
    test_size=342,
    stratify=test_val_category,
    shuffle=True,
    random_state=3,
)

In [56]:
# Sizes of the train, val and test splits, in that order.
print(*(len(split) for split in (train, val, test)))

1598 342 342


**Copying image files into train, test, val**

In [61]:
# Copy the images of each split from data/images into data/images/<split>.
# The split folders are created on demand: shutil.copyfile does not create
# missing directories, so the cell used to fail when they did not exist yet.
for split_name, split_files in (("train", train), ("val", val), ("test", test)):
    split_dir = os.path.join("data", "images", split_name)
    os.makedirs(split_dir, exist_ok=True)
    for file in split_files:
        filepath = os.path.join("data", "images", file)
        copypath = os.path.join(split_dir, file)
        shutil.copyfile(filepath, copypath)

### Labels: Test Train Val Splits and Copying Labels

In [70]:
# For each split, derive the label file name ("<stem>.txt") of every image
# whose name contains neither "chest" nor "shoulder"; those images are skipped.
train_labels = [
    os.path.splitext(image_name)[0] + ".txt"
    for image_name in train
    if "chest" not in image_name and "shoulder" not in image_name
]

val_labels = [
    os.path.splitext(image_name)[0] + ".txt"
    for image_name in val
    if "chest" not in image_name and "shoulder" not in image_name
]

test_labels = [
    os.path.splitext(image_name)[0] + ".txt"
    for image_name in test
    if "chest" not in image_name and "shoulder" not in image_name
]

In [72]:
# Number of label files per split: train, test, val.
print(*map(len, (train_labels, test_labels, val_labels)))

1458 312 312


In [73]:
# Copy the label files of each split from data/labels into data/labels/<split>.
# The split folders are created on demand: shutil.copyfile does not create
# missing directories, so the cell used to fail when they did not exist yet.
for split_name, split_files in (("train", train_labels),
                                ("val", val_labels),
                                ("test", test_labels)):
    split_dir = os.path.join("data", "labels", split_name)
    os.makedirs(split_dir, exist_ok=True)
    for file in split_files:
        filepath = os.path.join("data", "labels", file)
        copypath = os.path.join(split_dir, file)
        shutil.copyfile(filepath, copypath)

In [91]:
# Inspect the dimensions of one background image.
# The array gets its own name: binding it to `test` replaced the list of
# test-split file names produced by the split cell.
# cv2.imread returns None when the file cannot be read; .shape then fails.
sample_image = cv2.imread("data/images/10_shoulder.png")
sample_image.shape

(1734, 2162, 3)

### Counting for Paper

In [13]:
from glob import glob

# Collect the png, tiff, jpg and jpeg files (in that order) that sit directly
# in data/images_renamed and in data/images, then print both totals.
images_renamed = [
    path
    for extension in ("png", "tiff", "jpg", "jpeg")
    for path in glob("data/images_renamed/*." + extension)
]
images = [
    path
    for extension in ("png", "tiff", "jpg", "jpeg")
    for path in glob("data/images/*." + extension)
]
print(len(images), len(images_renamed))

2256 2107


In [7]:
# Group the image paths by the type marker that appears in the file name.
im01 = [name for name in images if "_01." in name]
im02 = [name for name in images if "_02_" in name or "_02." in name]
im03 = [name for name in images if "_03." in name]
# Type 04 additionally matches names containing "_04 (1)".
im04 = [
    name
    for name in images
    if any(marker in name for marker in ("_04_", "_04.", "_04 (1)"))
]

In [8]:
# Number of images per type: 01, 02, 03, 04.
print(*map(len, (im01, im02, im03, im04)))

588 539 419 510


In [9]:
# All typed images together, in the order 01, 02, 03, 04.
im = [*im01, *im02, *im03, *im04]
len(im)

2056

In [10]:
# Combined number of type 01 and type 03 images.
len(im01) + len(im03)

1007

In [11]:
# Combined number of type 02 and type 04 images.
len(im02) + len(im04)

1049

### Counting Labels

In [14]:
import itertools

In [15]:
# Read every .txt file in data/labels; each file contributes one list of lines.
all_tags = []
for label_name in os.listdir("data/labels/"):
    if not label_name.endswith(".txt"):
        continue
    with open(f"data/labels/{label_name}") as handle:
        all_tags.append(handle.readlines())

In [16]:
# Flatten the per-file line lists into one list of lines.
tags_list = [line for file_lines in all_tags for line in file_lines]

In [17]:
# Keep only the first character of every line (the class id for ids 0-9).
only_tags = []
for tag in tags_list:
    only_tags.append(tag[0])

In [18]:
# Number of lines whose first character is "0".
sum(1 for tag in only_tags if tag == "0")

3055

In [19]:
# Number of lines whose first character is "1".
len([tag for tag in only_tags if tag == "1"])

0

### Make 1 Class Labels

In [15]:
# Duplicate every .txt file of data/labels into data/labels_1class.
# The target folder is created first: shutil.copyfile does not create missing
# directories, so the cell used to fail when the folder did not exist yet.
os.makedirs(os.path.join("data", "labels_1class"), exist_ok=True)

for file in os.listdir("data/labels/"):
    filepath = os.path.join("data", "labels", file)
    if filepath.endswith(".txt"):
        copypath = os.path.join("data", "labels_1class", file)
        shutil.copyfile(filepath, copypath)

In [16]:
# Reduce the label files in data/labels_1class to a single class: only the
# lines whose class id is "0" are written back.
for file in os.listdir("data/labels_1class/"):
    filepath = os.path.join("data", "labels_1class", file)
    # classes.txt is excluded, as in the empty-file clean-up of this folder:
    # filtering it by class id would strip its content.
    # NOTE(review): presumably it lists the class names - confirm.
    if not filepath.endswith(".txt") or file == "classes.txt":
        continue

    with open(filepath) as f:
        lines = f.readlines()

    # Compare the whole first token rather than the first character, so an id
    # such as "01" or "05" is not kept by accident. The startswith check keeps
    # lines with leading whitespace excluded, as before.
    filtered_lines = [
        line
        for line in lines
        if line.startswith("0") and line.split(None, 1)[:1] == ["0"]
    ]

    with open(filepath, 'w') as f:
        f.writelines(filtered_lines)

In [17]:
# Read every .txt file in data/labels_1class, flatten the lines and keep the
# first character of each line.
all_tags = []
for label_name in os.listdir("data/labels_1class/"):
    if not label_name.endswith(".txt"):
        continue
    with open(f"data/labels_1class/{label_name}") as handle:
        all_tags.append(handle.readlines())

tags_list = [line for file_lines in all_tags for line in file_lines]
only_tags = list(map(lambda line: line[0], tags_list))

In [19]:
# Number of lines whose first character is "0".
sum(tag == "0" for tag in only_tags)

3055

In [22]:
# Delete every zero-byte .txt file in data/labels_1class, except classes.txt,
# and report each removed path.
for label_name in sorted(os.listdir("data/labels_1class")):
    label_path = os.path.join("data", "labels_1class", label_name)
    is_label_file = label_path.endswith(".txt") and label_name != "classes.txt"
    if is_label_file and os.stat(label_path).st_size == 0:
        print("Empty:", label_path)
        os.remove(label_path)

Empty: data/labels_1class/001_13y09m03d_04_L.txt
Empty: data/labels_1class/009_10y08m19d_04.txt
Empty: data/labels_1class/017_08y00m26d_04.txt
Empty: data/labels_1class/042_09y01m02d_04_R.txt
Empty: data/labels_1class/048_08y00m07d_04_R.txt
Empty: data/labels_1class/056_07y08m01d_04_L.txt
Empty: data/labels_1class/101_10y07m17d_02_L.txt
Empty: data/labels_1class/101_10y08m19d_04_L.txt
Empty: data/labels_1class/111_14y02m19d_02_L.txt
Empty: data/labels_1class/111_14y02m19d_04_L.txt
Empty: data/labels_1class/116_14y01m10d_02_R.txt
Empty: data/labels_1class/121_13y04m25d_04_L.txt
Empty: data/labels_1class/121_13y04m25d_04_R.txt
Empty: data/labels_1class/121_13y06m07d_04_L.txt
Empty: data/labels_1class/121_13y09m09d_04_L.txt
Empty: data/labels_1class/139_09y07m02d_02_R.txt
Empty: data/labels_1class/155_14y03m09d_04_R.txt
Empty: data/labels_1class/155_14y05m04d_04_R.txt
Empty: data/labels_1class/162_13y00m29d_04_L.txt
Empty: data/labels_1class/174_11y00m18d_02_L.txt
Empty: data/labels_1clas

#### Remove unannotated images

In [23]:
# Copy every non-directory entry of data/images into data/images_1class.
# The target folder is created first: shutil.copyfile does not create missing
# directories, so the cell used to fail when the folder did not exist yet.
os.makedirs(os.path.join("data", "images_1class"), exist_ok=True)

for file in os.listdir("data/images/"):
    filepath = os.path.join("data", "images", file)
    if not os.path.isdir(filepath):
        copypath = os.path.join("data", "images_1class", file)
        shutil.copyfile(filepath, copypath)

In [26]:
# Delete every image in data/images_1class that has no "<stem>.txt" file in
# data/labels_1class. Entries without an extension and names containing
# "chest" or "shoulder" are never deleted.
for image_name in sorted(os.listdir("data/images_1class")):
    stem, suffix = os.path.splitext(image_name)
    if suffix == "" or "chest" in image_name or "shoulder" in image_name:
        continue
    if not os.path.exists(os.path.join("data", "labels_1class", stem + ".txt")):
        print("Label file not found for:", image_name)
        os.remove(os.path.join("data", "images_1class", image_name))

Label file not found for: 001_13y09m03d_04_L.tiff
Label file not found for: 009_10y08m19d_04.tiff
Label file not found for: 017_08y00m26d_04.tiff
Label file not found for: 042_09y01m02d_04_R.png
Label file not found for: 048_08y00m07d_04_R.png
Label file not found for: 056_07y08m01d_04_L.png
Label file not found for: 101_10y07m17d_02_L.jpg
Label file not found for: 101_10y08m19d_04_L.jpg
Label file not found for: 111_14y02m19d_02_L.jpg
Label file not found for: 111_14y02m19d_04_L.jpg
Label file not found for: 116_14y01m10d_02_R.jpg
Label file not found for: 121_13y04m25d_04_L.jpg
Label file not found for: 121_13y04m25d_04_R.jpg
Label file not found for: 121_13y06m07d_04_L.jpg
Label file not found for: 121_13y09m09d_04_L.jpg
Label file not found for: 139_09y07m02d_02_R.jpg
Label file not found for: 155_14y03m09d_04_R.jpg
Label file not found for: 155_14y05m04d_04_R.jpg
Label file not found for: 162_13y00m29d_04_L.jpg
Label file not found for: 174_11y00m18d_02_L.jpg
Label file not found 

In [27]:
# Count the entries of data/images_1class that have a file extension.
counter = sum(
    1
    for entry in os.listdir("data/images_1class")
    if os.path.splitext(entry)[1] != ""
)

In [28]:
# Display the value of `counter`.
counter

2256

#### Split Data

In [29]:
# Rebuild the parallel `images` / `category` lists, this time from
# data/images_1class; the category is the image-type token in the file name.
images, category = [], []
for entry in sorted(os.listdir("data/images_1class")):
    if os.path.splitext(entry)[1] != "":
        if "chest" not in str(entry) and "shoulder" not in str(entry):
            # Dots and spaces become "_"; the first three fields are unpacked
            # and the third one is the type.
            fields = entry.replace(".", "_").replace(" ", "_").split("_")[:3]
            participant, date, imtype = fields
        else:
            # Background names must split into exactly three fields.
            imid, imtype, ext = entry.replace(".", "_").split("_")
        images.append(entry)
        category.append(imtype)

In [30]:
# Distinct categories, followed by the lengths of the two parallel lists.
print(set(category))
print(len(images), len(category), sep="\n")

{'03', '04', '01', 'shoulder', 'chest', '02'}
2256
2256


In [34]:
# Step 1: take 1580 files for training, stratified by image type.
train, test_val, train_category, test_val_category = train_test_split(
    images,
    category,
    train_size=1580,
    stratify=category,
    shuffle=True,
    random_state=3,
)
# Step 2: split the remainder, again stratified; the second returned list
# (338 files, the `test_size` part) is bound to `val`, the first one to `test`.
test, val = train_test_split(
    test_val,
    test_size=338,
    stratify=test_val_category,
    shuffle=True,
    random_state=3,
)

In [35]:
# Sizes of the train, test and val splits, in that order.
print(*(len(split) for split in (train, test, val)))

1580 338 338


In [36]:
# Copy the images of each split from data/images_1class into
# data/images_1class/<split>.
# The split folders are created on demand: shutil.copyfile does not create
# missing directories, so the cell used to fail when they did not exist yet.
for split_name, split_files in (("train", train), ("val", val), ("test", test)):
    split_dir = os.path.join("data", "images_1class", split_name)
    os.makedirs(split_dir, exist_ok=True)
    for file in split_files:
        filepath = os.path.join("data", "images_1class", file)
        copypath = os.path.join(split_dir, file)
        shutil.copyfile(filepath, copypath)

#### Split labels 

In [37]:
# Map the image names of each split to label file names ("<stem>.txt").
# Names containing "chest" or "shoulder" are left out.
train_labels = [
    os.path.splitext(image_name)[0] + ".txt"
    for image_name in train
    if not ("chest" in image_name or "shoulder" in image_name)
]

val_labels = [
    os.path.splitext(image_name)[0] + ".txt"
    for image_name in val
    if not ("chest" in image_name or "shoulder" in image_name)
]

test_labels = [
    os.path.splitext(image_name)[0] + ".txt"
    for image_name in test
    if not ("chest" in image_name or "shoulder" in image_name)
]

In [38]:
# Number of label files per split: train, test, val.
print(*map(len, (train_labels, test_labels, val_labels)))

1440 308 308


In [39]:
# Copy the label files of each split from data/labels_1class into
# data/labels_1class/<split>.
# The split folders are created on demand: shutil.copyfile does not create
# missing directories, so the cell used to fail when they did not exist yet.
for split_name, split_files in (("train", train_labels),
                                ("val", val_labels),
                                ("test", test_labels)):
    split_dir = os.path.join("data", "labels_1class", split_name)
    os.makedirs(split_dir, exist_ok=True)
    for file in split_files:
        filepath = os.path.join("data", "labels_1class", file)
        copypath = os.path.join(split_dir, file)
        shutil.copyfile(filepath, copypath)