# Data processing
Create YOLO-format annotation files for training, validation and testing from the XML annotation files.

## Import libraries

In [1]:
import os
from pathlib import Path
import xml.etree.ElementTree as ET

In [2]:
# Project root: two directory levels above the current working directory.
root_dir = Path.cwd().parent.parent
root_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor')

## Load Data

In [3]:
# Folder holding the logo-detection dataset: <root_dir>/data/logo_detection.
data_dir = root_dir.joinpath('data', 'logo_detection')
data_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/logo_detection')

In [4]:
# Names of the entries under <data_dir>/brands, in whatever order os.listdir
# returns them.
# NOTE(review): the conversion loop further down derives a `_{index+1}` suffix
# from each entry's position in this list, so the generated label file names
# depend on this ordering.
items_list = os.listdir(data_dir.joinpath('brands'))
items_list

['0samples',
 'abus',
 'accenture',
 'adidas',
 'airhawk',
 'aldi',
 'allett',
 'allianz',
 'aluratek',
 'amazon',
 'amcrest',
 'American Express',
 'apc',
 'ape',
 'aquapac',
 'aral',
 'armitron',
 'aspirin',
 'athalon',
 'audi',
 'axa',
 'azeca',
 'bank of america',
 'BASF',
 'bella taylor',
 'bellabeat',
 'bello digital',
 'bem wireless',
 'ben sherman',
 'benrus',
 'bertha watches',
 'bionade',
 'BMW',
 'boeing',
 'bosch',
 'budweiser',
 'burger king',
 'canon',
 'cartier',
 'caterpillar',
 'chanel',
 'chevrolet',
 'cisco',
 'coca-cola',
 'colgate',
 'corona',
 'costco',
 'CVS',
 'danone',
 'esso',
 'FedEx',
 'ford',
 'frito lays',
 'gillette',
 'gucci',
 'H&M',
 'heineken',
 'hershey',
 'home depot',
 'honda',
 'hp',
 'hsbc',
 'huawei',
 'hyundai',
 'IBM',
 'IKEA',
 'intel',
 'kelloggs',
 'kia',
 'kraft',
 'lego',
 'lexus',
 'LOreal',
 'louis vuitton',
 'marlboro',
 'mastercard',
 'McDonalds',
 'mercedes benz',
 'nescafe',
 'nestle',
 'netflix',
 'nike',
 'nissan',
 'nivea',
 'Pam

## Annotations

In [5]:
def get_annotations(xml_file):
    """Read one XML annotation file and collect its bounding boxes.

    Objects whose ``name`` contains ``'-symbol'`` are left out.

    Args:
        xml_file: Path or file object of the XML annotation to parse.

    Returns:
        A tuple ``(img_name, img_width, img_height, boxes)``. ``img_name`` is
        the text of the ``filename`` element, width and height are read from
        the ``size`` element, and ``boxes`` is a list of
        ``[0, x_min, y_min, x_max, y_max]`` entries taken from each object's
        ``bndbox`` (the leading 0 is a constant, presumably the class id).
    """
    annotation = ET.parse(xml_file).getroot()
    img_name = annotation.find('filename').text
    size_node = annotation.find('size')
    img_width, img_height = (
        int(size_node.find(tag).text) for tag in ('width', 'height')
    )

    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    boxes = []
    for obj in annotation.findall('object'):
        if '-symbol' in obj.find('name').text:  # symbol entries are ignored
            continue
        bnd_box = obj.find('bndbox')
        boxes.append([0] + [int(bnd_box.find(tag).text) for tag in corner_tags])
    return img_name, img_width, img_height, boxes

In [6]:
def convert_annotations(img_width, img_height, boxes):
    """Convert pixel corner boxes into normalized YOLO-style boxes.

    Args:
        img_width: Image width in pixels.
        img_height: Image height in pixels.
        boxes: Iterable of ``[class_id, x_min, y_min, x_max, y_max]`` entries
            in pixel coordinates.

    Returns:
        A list of ``[class_id, x_center, y_center, width, height]`` entries.
        The center and size are divided by the image width/height; the class
        id is passed through from the input box unchanged.

    Raises:
        ZeroDivisionError: If ``boxes`` is non-empty and ``img_width`` or
            ``img_height`` is 0.
    """
    yolo_boxes = []
    for box in boxes:
        # Keep the class id carried by the box instead of forcing it to 0.
        class_id, x_min, y_min, x_max, y_max = box[:5]
        x_center = (x_min + x_max) / 2
        y_center = (y_min + y_max) / 2
        width = x_max - x_min
        height = y_max - y_min
        yolo_boxes.append([
            class_id,
            x_center / img_width,
            y_center / img_height,
            width / img_width,
            height / img_height,
        ])
    return yolo_boxes

In [7]:
def save_annotations(annotations, save_file):
    """Write annotations to a text file, one annotation per line.

    Args:
        annotations: Iterable of annotations; each is an iterable of values
            that are converted with ``str`` and joined by single spaces.
        save_file: Path of the file to write. It is opened in ``'w'`` mode, so
            an existing file is overwritten.
    """
    rows = (' '.join(map(str, annotation)) + '\n' for annotation in annotations)
    with open(save_file, 'w') as f:
        f.writelines(rows)

## Data processing

In [8]:
# Write one YOLO label file next to every XML annotation in each brand folder.
for index, item in enumerate(items_list):
    item_dir = data_dir / 'brands' / item
    for sub_item in os.listdir(item_dir):
        if not sub_item.endswith('.xml'):
            continue  # only XML annotation files are converted
        xml_path = item_dir / sub_item
        img_name, img_width, img_height, boxes = get_annotations(xml_path)
        yolo_boxes = convert_annotations(img_width, img_height, boxes)
        # Label file name: the annotated file name up to its first '.', followed
        # by the 1-based position of the brand folder in items_list.
        save_file = item_dir / f"{img_name.split('.')[0]}_{index+1}.txt"
        save_annotations(yolo_boxes, save_file)

## Create train, val and test datasets

In [8]:
# Folder for the training split; created if it does not exist yet.
train_dir = data_dir.joinpath('train')
train_dir.mkdir(exist_ok=True)
train_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/train')

In [9]:
# Folder for the test split; created if it does not exist yet.
test_dir = data_dir.joinpath('test')
test_dir.mkdir(exist_ok=True)
test_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/test')

In [10]:
# Folder for the validation split; created if it does not exist yet.
val_dir = data_dir.joinpath('val')
val_dir.mkdir(exist_ok=True)
val_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/val')

In [11]:
# Entries under <data_dir>/brands, and an empty list that the next cell fills
# with (image name, image path) tuples.
data_items = os.listdir(data_dir.joinpath('brands'))
data_images = []

In [13]:
# Collect (file name up to its first '.', full path as str) for every .jpg in
# each brand folder.
# NOTE: data_images is initialised in a separate cell, so running this cell
# again without resetting it appends every image a second time.
for item in data_items:
    item_dir = data_dir / 'brands' / item
    for sub_item in os.listdir(item_dir):
        if not sub_item.endswith('.jpg'):
            continue
        data_images.append((sub_item.split('.')[0], str(item_dir / sub_item)))

In [14]:
data_images

[('img000001_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000001_1.jpg'),
 ('img000002_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000002_1.jpg'),
 ('img000003_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000003_1.jpg'),
 ('img000004_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000004_1.jpg'),
 ('img000005_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000005_1.jpg'),
 ('img000006_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000006_1.jpg'),
 ('img000007_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000007_1.jpg'),
 ('img000008_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000008_1.jpg'),
 ('img000009_1',
  '/mnt/d/Projects_D/Brand_Extractor/data/logo_detection/brands/0samples/img000009_1.jpg'),
 ('img000010_1',
  

In [15]:
len(data_images)

8502

## Save data

Split the data into 70% train, 20% test and 10% validation sets.

In [15]:
import random

# Shuffle in place before splitting. A dedicated, seeded generator is used so
# that the same input order always produces the same shuffled order (and hence
# the same train/test/val split) without touching the global random state.
random.Random(42).shuffle(data_images)

In [16]:
def _split_name(position, total):
    """Return 'train', 'test' or 'val' for the item at `position` of `total`."""
    if position < 0.7 * total:
        return 'train'
    if position < 0.9 * total:
        return 'test'
    return 'val'


# Tag every (name, path) entry with its split: the first 70 % of the list is
# 'train', the next 20 % 'test' and the remainder 'val'.
data_images = [
    (x[0], x[1], _split_name(i, len(data_images)))
    for i, x in enumerate(data_images)
]

In [17]:
import shutil

# Create the images/ and labels/ folders under train, test and val, then copy
# every image and its label file (same name, .txt suffix) there from the brand
# folders.
for img_name, img_path, data_type in data_images:
    img_src = Path(img_path)
    # The label file sits next to the image; only the final suffix is swapped.
    label_src = img_src.with_suffix('.txt')
    img_dest = data_dir / data_type / 'images' / (img_name + '.jpg')
    label_dest = data_dir / data_type / 'labels' / (img_name + '.txt')

    img_dest.parent.mkdir(exist_ok=True, parents=True)
    label_dest.parent.mkdir(exist_ok=True, parents=True)

    # shutil.copy runs no shell, so names containing spaces, '&', quotes or
    # parentheses need no escaping, and a failed copy raises instead of being
    # silently ignored.
    shutil.copy(img_src, img_dest)
    if label_src.exists():
        shutil.copy(label_src, label_dest)
    else:
        # The image is still copied; report the missing label rather than
        # aborting the whole run.
        print(f'Missing label file, skipped: {label_src}')