<a href="https://colab.research.google.com/github/ptmhoang97/convert_VOC_to_YOLOv4/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive; the dataset archive is read from
# that mount point further down.
# NOTE(review): `_mount` is underscore-prefixed, i.e. private by convention.
# The documented entry point is presumably `drive.mount` - confirm whether the
# private variant is still needed before switching.
from google.colab import drive
drive._mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Choose dataset to unzip: this is the archive's base name (without ".zip");
# the unzip cell formats it into the full path of the archive on Drive.
dataset_name = "coco2017_train_truck_VOC"

In [4]:
dataset_path = "/content/gdrive/MyDrive/_dataset/{0}.zip".format(dataset_name)
dataset_unzip_path = "/content"
!unzip {dataset_path} -d {dataset_unzip_path}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/coco2017_train_truck_VOC/000000158443.jpg  
  inflating: /content/coco2017_train_truck_VOC/000000389760.jpg  
  inflating: /content/coco2017_train_truck_VOC/000000113672.xml  
  inflating: /content/coco2017_train_truck_VOC/000000113261.xml  
  inflating: /content/coco2017_train_truck_VOC/000000359695.xml  
  inflating: /content/coco2017_train_truck_VOC/000000056664.xml  
  inflating: /content/coco2017_train_truck_VOC/000000438025.xml  
  inflating: /content/coco2017_train_truck_VOC/000000362555.jpg  
  inflating: /content/coco2017_train_truck_VOC/000000144907.xml  
  inflating: /content/coco2017_train_truck_VOC/000000397648.jpg  
  inflating: /content/coco2017_train_truck_VOC/000000580414.xml  
  inflating: /content/coco2017_train_truck_VOC/000000397132.jpg  
  inflating: /content/coco2017_train_truck_VOC/000000244986.jpg  
  inflating: /content/coco2017_train_truck_VOC/000000357799.jpg  
  inflating

# Convert xml to csv

In [1]:
import os
import glob
import pandas as pd
import io
import xml.etree.ElementTree as ET
import argparse

In [2]:
def xml_to_csv(path):
    """Iterates through all .xml files (Pascal VOC style, e.g. generated by labelImg)
    in a given directory and combines them in a single Pandas dataframe.

    Fields are looked up by tag name (``name``, ``bndbox/xmin``, ``size/width`` ...),
    so the result does not depend on the order in which a tool wrote the tags.
    If a tag is missing, the child at the position labelImg uses is read instead,
    which keeps files that only worked with positional access working.

    Parameters:
    ----------
    path : str
        The path containing the .xml files
    Returns
    -------
    Pandas DataFrame
        The produced dataframe, one row per ``<object>`` element, with columns
        filename, width, height, class, xmin, ymin, xmax, ymax. Files are
        processed in sorted name order. Sizes and coordinates are integers;
        float-formatted values such as "12.0" are truncated toward zero.
    """

    def _text(parent, tag, fallback_index):
        # Prefer the named child; fall back to the positional child only when
        # the tag is absent.
        node = parent.find(tag)
        if node is None:
            node = parent[fallback_index]
        return node.text

    def _pixels(parent, tag, fallback_index):
        # Going through float() accepts values written as "12.0", which a bare
        # int() would reject with ValueError.
        return int(float(_text(parent, tag, fallback_index)))

    xml_list = []
    # glob returns files in arbitrary order; sorting makes the row order
    # reproducible between runs and machines.
    for xml_file in sorted(glob.glob(path + '/*.xml')):
        root = ET.parse(xml_file).getroot()
        for member in root.findall('object'):
            size = root.find('size')
            bndbox = member.find('bndbox')
            if bndbox is None:
                # Position of <bndbox> in labelImg output:
                # name, pose, truncated, difficult, bndbox.
                bndbox = member[4]
            value = (root.find('filename').text,
                     _pixels(size, 'width', 0),
                     _pixels(size, 'height', 1),
                     _text(member, 'name', 0),
                     _pixels(bndbox, 'xmin', 0),
                     _pixels(bndbox, 'ymin', 1),
                     _pixels(bndbox, 'xmax', 2),
                     _pixels(bndbox, 'ymax', 3)
                     )
            xml_list.append(value)
    column_name = ['filename', 'width', 'height',
                   'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

In [3]:
# Root folder that holds the "train" and "test" split sub-folders, and the
# folder that receives the generated CSV / XLSX tables.
dataset_path = "/content"
csv_path = "/content/csv"

# Folder of each split.
paths = dict(
    TRAIN_PATH=os.path.join(dataset_path, "train"),
    TEST_PATH=os.path.join(dataset_path, "test"),
)

# Excel workbook written for each split.
files_xlsx = dict(
    TRAIN_XLSX=os.path.join(csv_path, "train.xlsx"),
    TEST_XLSX=os.path.join(csv_path, "test.xlsx"),
)

# CSV table written for each split.
files_csv = dict(
    TRAIN_CSV=os.path.join(csv_path, "train.csv"),
    TEST_CSV=os.path.join(csv_path, "test.csv"),
)

In [4]:
# Create folder to store generated csv.
# os.makedirs creates missing parent folders on every platform, and
# exist_ok=True makes the cell safe to re-run when the folder already exists,
# so neither a shell command nor an os.name check is needed.
os.makedirs(csv_path, exist_ok=True)

In [5]:
# Gather every annotation found in the training folder into one table and
# write that table to the training CSV path.
# index=None is falsy, so presumably no index column is written - confirm.
train_csv = xml_to_csv(paths['TRAIN_PATH'])
train_csv.to_csv(files_csv['TRAIN_CSV'], index=None)
print('Successfully created the CSV file: {}'.format(files_csv['TRAIN_CSV']))

Successfully created the CSV file: /content/csv/train.csv


In [6]:
# Gather every annotation found in the test folder into one table and write
# that table to the test CSV path.
# index=None is falsy, so presumably no index column is written - confirm.
test_csv = xml_to_csv(paths['TEST_PATH'])
test_csv.to_csv(files_csv['TEST_CSV'], index=None)
print('Successfully created the CSV file: {}'.format(files_csv['TEST_CSV']))

Successfully created the CSV file: /content/csv/test.csv


# Convert csv to xlsx (values are kept; only the file format changes)

In [7]:
# Read the training CSV back and save the same table as an Excel workbook,
# keeping the header row (header=True).
# index = None is falsy, so presumably no index column is written - confirm.
read_file_train_csv = pd.read_csv(files_csv['TRAIN_CSV'])
read_file_train_csv.to_excel(files_xlsx['TRAIN_XLSX'], index = None, header=True)

In [8]:
# Read the test CSV back and save the same table as an Excel workbook,
# keeping the header row (header=True).
# index = None is falsy, so presumably no index column is written - confirm.
read_file_test_csv = pd.read_csv(files_csv['TEST_CSV'])
read_file_test_csv.to_excel(files_xlsx['TEST_XLSX'], index = None, header=True)

# Convert xlsx to txt (YOLO annotation)

In [11]:
import openpyxl

In [14]:
def convert_xlsx_to_yolo(input_xlsx, output_yolo, class_dict=None):
    """Write YOLO label files (.txt) from a spreadsheet of bounding boxes.

    The sheet named 'Sheet1' is read from row 2 onwards (row 1 is the header).
    Columns 1-8 must hold: filename, width, height, class, xmin, ymin, xmax,
    ymax. For every image one file ``<output_yolo>/<filename without
    extension>.txt`` is written, with one line per box:

        <class id> <center x> <center y> <box width> <box height>

    where the four geometry values are divided by the image width / height.

    Each label file is written once per call (mode "w"), so calling the
    function again replaces the labels instead of appending duplicates.
    All rows are validated before any file is written.

    Parameters
    ----------
    input_xlsx : str
        Path of the workbook to read.
    output_yolo : str
        Existing folder that receives the .txt files.
    class_dict : dict, optional
        Maps class name to the integer class id. Defaults to ``{"car": 0}``,
        the mapping this function used before the parameter existed.

    Raises
    ------
    ValueError
        If a row's class is not a key of ``class_dict``.
    """
    if class_dict is None:
        # Historical default, kept so existing two-argument calls still work.
        class_dict = {"car": 0}

    wb = openpyxl.load_workbook(input_xlsx)
    sheet = wb['Sheet1']

    # Label lines grouped per output file stem, in first-seen order.
    lines_by_stem = {}
    rows = sheet.iter_rows(min_row=2, max_col=8, values_only=True)
    for row_number, row in enumerate(rows, start=2):
        image_name, width, height, class_obj, x_min, y_min, x_max, y_max = row
        if image_name is None:
            # Blank row (e.g. trailing rows still counted by the sheet).
            continue

        if class_obj not in class_dict:
            # Fail loudly: silently reusing another row's class id would
            # corrupt the labels.
            raise ValueError(
                "Row {} of {}: class {!r} is not in class_dict {}".format(
                    row_number, input_xlsx, class_obj, sorted(class_dict)))

        width = int(width)
        height = int(height)

        # splitext handles extensions of any length (.jpg, .jpeg, ...).
        stem = os.path.splitext(image_name)[0]

        label_line = "{} {} {} {} {}\n".format(
            class_dict[class_obj],
            (x_max + x_min) / 2 / width,
            (y_max + y_min) / 2 / height,
            (x_max - x_min) / width,
            (y_max - y_min) / height,
        )
        lines_by_stem.setdefault(stem, []).append(label_line)

    for stem, label_lines in lines_by_stem.items():
        output_file_name = "{}/{}.txt".format(output_yolo, stem)
        with open(output_file_name, "w") as f:
            f.writelines(label_lines)

In [15]:
# Write the YOLO .txt labels for the training split into the training folder.
# NOTE(review): the converter's built-in class map only lists "car", while the
# dataset selected earlier is named "..._truck_..." - confirm the class names
# in the spreadsheet match the map.
convert_xlsx_to_yolo(files_xlsx['TRAIN_XLSX'],paths['TRAIN_PATH'])

In [None]:
# Write the YOLO .txt labels for the test split into the test folder.
# NOTE(review): the converter's built-in class map only lists "car", while the
# dataset selected earlier is named "..._truck_..." - confirm the class names
# in the spreadsheet match the map.
convert_xlsx_to_yolo(files_xlsx['TEST_XLSX'],paths['TEST_PATH'])