# In [None]:
import sys
import json
import argparse
import io
import os
import re
import PIL
from PIL import Image
from pathlib import Path
from google.cloud import vision
from lxml import objectify


def json2xml(json_obj, line_padding=""):
    """Render a JSON-like structure as tab-indented XML text.

    Args:
        json_obj: A dict (each key becomes an element wrapping its rendered
            value), a list (each item is rendered at the same indentation
            and the results are joined with newlines), or any other value,
            which is rendered as text via ``str()``.
        line_padding: Indentation prefix for the current nesting level;
            each dict level prepends one tab for its children.

    Returns:
        The XML fragment as a single string, without a trailing newline.

    Leaf text is XML-escaped (``&``, ``<``, ``>``) so arbitrary label text
    cannot produce malformed markup. Dict keys are used verbatim as tag
    names and are NOT validated or escaped.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import list.
    from xml.sax.saxutils import escape

    if isinstance(json_obj, list):
        return "\n".join(json2xml(item, line_padding) for item in json_obj)

    if isinstance(json_obj, dict):
        lines = []
        for tag_name, sub_obj in json_obj.items():
            lines.append("%s<%s>" % (line_padding, tag_name))
            lines.append(json2xml(sub_obj, "\t" + line_padding))
            lines.append("%s</%s>" % (line_padding, tag_name))
        return "\n".join(lines)

    # Scalar leaf: emit escaped text at the current indentation.
    return "%s%s" % (line_padding, escape(str(json_obj)))


def _normalize_label(raw_text):
    """Collapse each whitespace run in *raw_text* to one space and drop leading whitespace."""
    return re.sub(r'\s+', ' ', str(raw_text)).lstrip()


def run_tagger(in_dir='/home/seongjuk/project/images',
               xml_ori_path='/home/seongjuk/project/xml_ori',
               out_dir='/home/seongjuk/project/xml_ori_retagged'):
    """Rewrite per-image annotation XML files from existing XML labels.

    For every episode sub-directory of ``in_dir`` the images are paired, in
    sorted order, with the XML files of the corresponding sub-directory of
    ``xml_ori_path``. For each pair, the text of ``object/name`` is read from
    the original XML, the image size and band count are read from the image,
    and a new annotation XML is written under ``out_dir`` (episode folder
    name with spaces replaced by underscores, image file name with its
    extension replaced by ``.xml``).

    Args:
        in_dir: Directory containing one sub-directory of images per episode.
        xml_ori_path: Directory containing one sub-directory of original XML
            files per episode.
        out_dir: Directory that receives the regenerated XML files; created
            if it does not exist.

    Returns:
        None. Progress is printed to stdout.
    """
    in_dir = os.path.abspath(in_dir)
    xml_ori_path = os.path.abspath(xml_ori_path)
    out_dir = os.path.abspath(out_dir)

    os.makedirs(out_dir, exist_ok=True)

    episodes = sorted(os.listdir(in_dir))
    xml_oris = sorted(os.listdir(xml_ori_path))

    # NOTE(review): episodes and XML folders are paired purely by sorted
    # position, and zip() silently stops at the shorter list. This assumes
    # both trees contain matching entries in the same order - confirm.
    for episode, xml_ori in zip(episodes, xml_oris):
        epi_dir = os.path.join(in_dir, episode)
        xml_ori_epi_dir = os.path.join(xml_ori_path, xml_ori)

        if not os.path.isdir(epi_dir):
            continue

        print('\n## Episode : ', episode, xml_ori)

        # xml episode folders should not have whitespace in name.
        xml_ep = episode.replace(' ', '_')
        xml_path = os.path.join(out_dir, xml_ep)
        os.makedirs(xml_path, exist_ok=True)

        images = sorted(os.listdir(epi_dir))
        xml_ori_files = sorted(os.listdir(xml_ori_epi_dir))

        # Same positional pairing as above, at file level. No file-extension
        # filtering is performed on either listing.
        for image_file_name, xml_ori_file_name in zip(images, xml_ori_files):
            img_path = os.path.join(epi_dir, image_file_name)
            xml_ori_file_path = os.path.join(xml_ori_epi_dir, xml_ori_file_name)

            stem, _ext = os.path.splitext(os.path.join(xml_path, image_file_name))
            x_path = stem + '.xml'

            print('img: ' + img_path)
            print('xml: ' + xml_ori_file_path)

            # Read every input first so a failure here cannot leave behind a
            # truncated/empty output file.
            with open(xml_ori_file_path, 'r') as xml_file:
                xml_root = objectify.fromstring(xml_file.read())
            m_text = _normalize_label(xml_root['object']['name'])
            print(m_text)

            with PIL.Image.open(img_path) as pil_image:
                pil_width, pil_height = pil_image.size
                pil_depth = len(pil_image.getbands())

            # Build the structure directly rather than concatenating and
            # re-parsing a JSON string, which broke on quotes or backslashes
            # in the label or file names.
            annotation = {
                'annotation': {
                    'folder': episode,
                    'filename': image_file_name,
                    'size': {
                        'width': str(pil_width),
                        'height': str(pil_height),
                        'depth': str(pil_depth),
                    },
                    'object': {
                        'name': m_text,
                        'difficult': '0',
                    },
                },
            }
            xml_text = json2xml(annotation)

            with open(x_path, 'w') as xml_out_file:
                xml_out_file.write(xml_text)

# Run the tagger immediately when this script/cell is executed.
run_tagger()