# **0. CONNECT TO GOOGLE DRIVE**

In [90]:
from google.colab import drive

In [91]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **1. IMPORT LIBRARIES**

In [179]:
from PIL import Image, ImageDraw
import xml.etree.ElementTree as ET
import pandas as pd
import random
import shutil
import glob
import os
import json

# **2. UNZIP THE DATASET**

In [93]:
# unzip the dataset 

!unzip '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/disease and pest in coffee leaves dataset.zip' -d '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset'

Archive:  /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/disease and pest in coffee leaves dataset.zip
  inflating: /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/miner_img_xml/bicho_mineiro0.jpg  
  inflating: /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/miner_img_xml/bicho_mineiro0.xml  
  inflating: /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/miner_img_xml/bicho_mineiro1.jpg  
  inflating: /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/miner_img_xml/bicho_mineiro1.xml  
  inflating: /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/miner_img_xml/bicho_mineiro10.jpg  
  inflating: /content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object D

# **3. IMPORTANT VARIABLES**

In [94]:
ROOT_PATH = '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection'

In [95]:
folders = ['miner_img_xml','rust_xml_image']

In [96]:
len(os.listdir(os.path.join(ROOT_PATH,'Dataset',folders[0]))), len(os.listdir(os.path.join(ROOT_PATH,'Dataset',folders[1])))

(514, 570)

# **4. MOVE IMAGES AND XML ANNOTATIONS TO CORRESPONDING FOLDERS**

In [97]:
# move images and XML annotations to corresponding folders

for folder in folders:
  for file in sorted(os.listdir(os.path.join(ROOT_PATH, 'Dataset', folder))):
    if '.xml' in file:
      source = os.path.join(ROOT_PATH,'Dataset',folder,file)
      destination = os.path.join(ROOT_PATH,'Dataset','annotations')

      shutil.copy(source, destination)

    elif '.jpg' in file:
      source = os.path.join(ROOT_PATH,'Dataset',folder,file)
      destination = os.path.join(ROOT_PATH,'Dataset','images')

      shutil.copy(source, destination)

In [113]:
len(os.listdir(os.path.join(ROOT_PATH,'Dataset','images'))), len(os.listdir(os.path.join(ROOT_PATH,'Dataset','annotations')))

(542, 542)

# **5. CONVERT XML FORMAT TO YOLO FORMAT**

In [99]:
def xml_to_yolo_bbox(bbox, w, h):
    # xmin, ymin, xmax, ymax
    x_center = ((bbox[2] + bbox[0]) / 2) / w
    y_center = ((bbox[3] + bbox[1]) / 2) / h
    width = (bbox[2] - bbox[0]) / w
    height = (bbox[3] - bbox[1]) / h
    return [x_center, y_center, width, height]

In [100]:
def yolo_to_xml_bbox(bbox, w, h):
    # x_center, y_center, width, heigth
    w_half_len = (bbox[2] * w) / 2
    h_half_len = (bbox[3] * h) / 2
    xmin = int((bbox[0] * w) - w_half_len)
    ymin = int((bbox[1] * h) - h_half_len)
    xmax = int((bbox[0] * w) + w_half_len)
    ymax = int((bbox[1] * h) + h_half_len)
    return [xmin, ymin, xmax, ymax]

In [101]:
classes = []
input_dir = os.path.join(ROOT_PATH,'Dataset','annotations')
output_dir = os.path.join(ROOT_PATH,'Dataset','labels')
image_dir = os.path.join(ROOT_PATH,'Dataset','images')

In [102]:
files = glob.glob(os.path.join(input_dir, '*.xml'))
for fil in files:
    basename = os.path.basename(fil)
    filename = os.path.splitext(basename)[0]
    if not os.path.exists(os.path.join(image_dir, f"{filename}.jpg")):
        print(f"{filename} image does not exist!")
        continue

# every image has its annotation pair

In [103]:
# identify all the xml files in the annotations folder (input directory)
files = glob.glob(os.path.join(input_dir, '*.xml'))
# loop through each 
for fil in files:
    basename = os.path.basename(fil)
    filename = os.path.splitext(basename)[0]
    # check if the label contains the corresponding image file
    if not os.path.exists(os.path.join(image_dir, f"{filename}.jpg")):
        print(f"{filename} image does not exist!")
        continue

    result = []

    # parse the content of the xml file
    tree = ET.parse(fil)
    root = tree.getroot()
    width = int(root.find("size").find("width").text)
    height = int(root.find("size").find("height").text)

    for obj in root.findall('object'):
        label = obj.find("name").text
        # check for new classes and append to list
        if label not in classes:
            classes.append(label)
        index = classes.index(label)
        pil_bbox = [int(x.text) for x in obj.find("bndbox")]
        yolo_bbox = xml_to_yolo_bbox(pil_bbox, width, height)
        # convert data to string
        bbox_string = " ".join([str(x) for x in yolo_bbox])
        result.append(f"{index} {bbox_string}")

    if result:
        # generate a YOLO format text file for each xml file
        with open(os.path.join(output_dir, f"{filename}.txt"), "w", encoding="utf-8") as f:
            f.write("\n".join(result))

# generate the classes file as reference
with open('classes.txt', 'w', encoding='utf8') as f:
    f.write(json.dumps(classes))

In [104]:
len(os.listdir(output_dir))

542

# **6. VISUALIZE BOUNDING BOXES**

In [105]:
def yolo_to_xml_bbox(bbox, w, h):
    # x_center, y_center width heigth
    w_half_len = (bbox[2] * w) / 2
    h_half_len = (bbox[3] * h) / 2
    xmin = int((bbox[0] * w) - w_half_len)
    ymin = int((bbox[1] * h) - h_half_len)
    xmax = int((bbox[0] * w) + w_half_len)
    ymax = int((bbox[1] * h) + h_half_len)
    return [xmin, ymin, xmax, ymax]

In [106]:
def draw_image(img, bboxes):
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        draw.rectangle(bbox, outline="red", width=8)
    display(img)

In [107]:
image_filename = os.path.join(image_dir, sorted(os.listdir(image_dir))[-10])
label_filename = os.path.join(output_dir, sorted(os.listdir(output_dir))[-10])
bboxes = []

In [108]:
image_filename, label_filename

('/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/images/name88.jpg',
 '/content/drive/MyDrive/Omdena/Challenge: Disease Detection in Coffee Plants/Object Detection/Dataset/labels/name88.txt')

In [109]:
img = Image.open(image_filename)
with open(label_filename, 'r', encoding='utf8') as f:
    for line in f:
        data = line.strip().split(' ')
        bbox = [float(x) for x in data[1:]]
        bboxes.append(yolo_to_xml_bbox(bbox, img.width, img.height))

In [110]:
bboxes

[[638, 1906, 825, 2102],
 [1108, 3144, 1279, 3319],
 [1167, 2889, 1325, 3064],
 [1188, 2615, 1373, 2789],
 [190, 1163, 347, 1424],
 [420, 354, 551, 589]]

In [111]:
draw_image(img, bboxes)

Output hidden; open in https://colab.research.google.com to view.

# **7. SPLIT TO TRAIN, VAL, AND TEST**

In [183]:
image_dir =  os.path.join(ROOT_PATH,'Dataset','images')
label_dir = os.path.join(ROOT_PATH,'Dataset','labels')
lower_limit = 0

files = glob.glob(os.path.join(image_dir, '*.jpg'))

In [184]:
random.shuffle(files)

In [185]:
round(0.8*len(files)), round(0.1*len(files)), round(0.1*len(files)) 

(434, 54, 54)

In [186]:
train_filename = files[0:434]
val_filename = files[434:434+54]
test_filename = files[488:488+54]


model_sets = [train_filename, val_filename, test_filename]

In [187]:
len(train_filename), len(val_filename), len(test_filename)

(434, 54, 54)

In [188]:
model_sets = {
    'train' : {
        'data' : train_filename
    },
    'test' : {
        'data' : test_filename
    },
    'val' : {
        'data' : val_filename
    }
}

In [189]:
for model_set in model_sets:
  # make directory
  os.mkdir(os.path.join(ROOT_PATH, 'Dataset', model_set))
  os.mkdir(os.path.join(ROOT_PATH, 'Dataset', model_set, 'images'))
  os.mkdir(os.path.join(ROOT_PATH, 'Dataset', model_set, 'labels'))

  # images and labels
  img_dest = os.path.join(ROOT_PATH, 'Dataset', model_set, 'images')
  ann_dest = os.path.join(ROOT_PATH, 'Dataset', model_set, 'labels')

  for img in model_sets[model_set]['data']:
    annotation = img.split('/')[-1].split('.')[0] + '.txt'
    annotation_source = os.path.join(label_dir, annotation)

    # move images
    shutil.copy(img, img_dest)
    # move labels
    shutil.copy(annotation_source, ann_dest)

In [190]:
# verify

train_images = os.path.join(ROOT_PATH,'Dataset','train','images')
test_images = os.path.join(ROOT_PATH,'Dataset','test','images')
val_images = os.path.join(ROOT_PATH,'Dataset','val','images')

train_labels = os.path.join(ROOT_PATH,'Dataset','train','labels')
test_labels = os.path.join(ROOT_PATH,'Dataset','test','labels')
val_labels = os.path.join(ROOT_PATH,'Dataset','val','labels')

In [191]:
print(len(os.listdir(train_images)), len(os.listdir(train_labels)))
print(len(os.listdir(test_images)), len(os.listdir(test_labels)))
print(len(os.listdir(val_images)), len(os.listdir(val_labels)))

434 434
54 54
54 54


In [192]:
pd.DataFrame({
    'images' : os.listdir(train_images),
    'labes' :  os.listdir(train_labels)
})

Unnamed: 0,images,labes
0,bicho_mineiro150.jpg,bicho_mineiro150.txt
1,bicho_mineiro99.jpg,bicho_mineiro99.txt
2,name296.jpg,name296.txt
3,bicho_mineiro0.jpg,bicho_mineiro0.txt
4,name172.jpg,name172.txt
...,...,...
429,name135.jpg,name135.txt
430,name160.jpg,name160.txt
431,name134.jpg,name134.txt
432,name89.jpg,name89.txt


In [193]:
pd.DataFrame({
    'images' : os.listdir(test_images),
    'labes' :  os.listdir(test_labels)
})

Unnamed: 0,images,labes
0,name260.jpg,name260.txt
1,bicho_mineiro221.jpg,bicho_mineiro221.txt
2,bicho_mineiro137.jpg,bicho_mineiro137.txt
3,bicho_mineiro77.jpg,bicho_mineiro77.txt
4,name122.jpg,name122.txt
5,name215.jpg,name215.txt
6,name42.jpg,name42.txt
7,name177.jpg,name177.txt
8,name298.jpg,name298.txt
9,name269.jpg,name269.txt


In [194]:
pd.DataFrame({
    'images' : os.listdir(val_images),
    'labes' :  os.listdir(val_labels)
})

Unnamed: 0,images,labes
0,name277.jpg,name277.txt
1,name98.jpg,name98.txt
2,bicho_mineiro234.jpg,bicho_mineiro234.txt
3,bicho_mineiro108.jpg,bicho_mineiro108.txt
4,bicho_mineiro215.jpg,bicho_mineiro215.txt
5,bicho_mineiro136.jpg,bicho_mineiro136.txt
6,bicho_mineiro65.jpg,bicho_mineiro65.txt
7,name138.jpg,name138.txt
8,bicho_mineiro232.jpg,bicho_mineiro232.txt
9,name25.jpg,name25.txt
