# Read DEIMOS metadata from XML file

Extract relevant information about the DEIMOS bands into dataframes.

**NOTE**: DEIMOS bands are provided in `NIR-R-G-B` order, while we store them in `EOPatches` in `B-G-R-NIR` order, as in Sentinel-2 datasets. This means that the per-band info read from the XML files has to be reversed; `split_per_band` does this by remapping each band index (`revert_bands=True`).

In [None]:
import os
from xml.etree import ElementTree as ET

import pandas as pd
from fs_s3fs import S3FS

## Config

In [None]:
# Left empty in the source; presumably to be filled in locally before running.
# NOTE(review): `instance_id` is not referenced anywhere else in this section —
# confirm it is still needed.
instance_id = ''
# AWS key pair handed to `S3FS` when the bucket filesystem is created.
aws_access_key_id = ''
aws_secret_access_key = ''

In [None]:
# Filesystem handle for the S3 bucket; every listing, read and write in the cells
# below goes through it. `bucket_name` is empty in the source — presumably to be
# set before running.
filesystem = S3FS(bucket_name='',
                  aws_access_key_id=aws_access_key_id,
                  aws_secret_access_key=aws_secret_access_key)

In [None]:
# Folder on `filesystem` that is listed to discover tiles; each entry is treated
# as a sub-folder holding that tile's `.dim` metadata files.
tiles_folder = ''

In [None]:
def tag_parser(el_iterator, vals_dict, attr='text', attrib_key=None):
    """Copy one value per XML element into ``vals_dict``, keyed by the element tag.

    :param el_iterator: Iterable of XML elements to read.
    :param vals_dict: Dictionary updated in place; an existing entry with the same
        tag is overwritten.
    :param attr: Name of the element attribute to read (``'text'`` by default).
        With ``'attrib'`` the element's attribute mapping is indexed with
        ``attrib_key`` instead of being stored whole.
    :param attrib_key: Key looked up in the attribute mapping; only used when
        ``attr`` is ``'attrib'``.
    """
    from_attrib = attr == 'attrib'
    for element in el_iterator:
        value = getattr(element, attr)
        vals_dict[element.tag] = value[attrib_key] if from_attrib else value


def multitag_parser(el_iterator, vals_dict, attr='text'):
    """Collect repeated XML elements into a list of ``{child tag: child text}`` dicts.

    Every element of ``el_iterator`` contributes one dictionary built from its direct
    children. The whole list is stored in ``vals_dict`` under the tag of the *last*
    element seen, so elements with different tags end up in the same list.

    :param el_iterator: Iterable of XML elements whose children are read.
    :param vals_dict: Dictionary updated in place. It is left untouched when
        ``el_iterator`` is empty.
    :param attr: Accepted for signature compatibility; the child text is always read.
    """
    children = []
    tag_name = None
    for sub in el_iterator:
        tag_name = sub.tag
        # Iterate the element directly: `Element.getchildren()` was removed in Python 3.9.
        children.append({child.tag: child.text for child in sub})

    # Nothing was parsed, so there is no tag to store the (empty) list under.
    if tag_name is not None:
        vals_dict[tag_name] = children


def parse_bbox(el_iterator, vals_dict, outname, use_xy=True):
    """Store the extent of a set of frame vertices in ``vals_dict[outname]``.

    The result is ``[min_first, min_second, max_first, max_second]``, where the two
    coordinates are ``FRAME_X``/``FRAME_Y`` when ``use_xy`` is true and
    ``FRAME_LAT``/``FRAME_LON`` otherwise. Values are kept as the strings read from
    the XML, but they are ordered numerically.

    :param el_iterator: Iterable of vertex elements, each holding both coordinate tags.
    :param vals_dict: Dictionary updated in place.
    :param outname: Key under which the extent is stored.
    :param use_xy: Selects the ``X``/``Y`` tags (default) or the ``LAT``/``LON`` tags.
    :raises ValueError: If ``el_iterator`` is empty or a coordinate is not numeric.
    :raises AttributeError: If a vertex lacks one of the expected coordinate tags.
    """
    appendices = ['X', 'Y'] if use_xy else ['LAT', 'LON']
    coords = {appendix: [] for appendix in appendices}
    for vertex in el_iterator:
        for appendix in appendices:
            coords[appendix].append(vertex.find(f'./FRAME_{appendix}').text)

    first, second = (coords[appendix] for appendix in appendices)
    # Compare as numbers: plain string comparison is lexicographic, which puts
    # '10.5' before '9.5' and '-1.5' before '-10.5'.
    vals_dict[outname] = [min(first, key=float), min(second, key=float),
                          max(first, key=float), max(second, key=float)]


def split_per_band(columns, column, query_keys, revert_bands=True,
                   index_col='BAND_INDEX', n_bands=4):
    """Flatten a list of per-band dictionaries into ``<key>_<band index>`` entries.

    ``columns[column]`` is expected to hold a list of dictionaries. For every
    dictionary that contains all of ``query_keys``, each requested value is copied to
    ``columns[f'{key}_{idx}']``, where ``idx`` is the integer read from ``index_col``.
    With ``revert_bands`` the index is mirrored to ``n_bands - idx + 1``, i.e. the band
    order is reversed. The nested ``column`` entry is removed afterwards.

    :param columns: Dictionary updated in place.
    :param column: Key of the nested list to flatten. A missing key is a no-op.
    :param query_keys: Keys to copy; dictionaries lacking any of them are skipped.
    :param revert_bands: Whether to mirror the band index.
    :param index_col: Key holding the band index inside each dictionary.
    :param n_bands: Number of bands used when mirroring the index.
    :raises KeyError: If a matching dictionary has no ``index_col`` entry.
    """
    # `.get` mirrors the tolerant `pop(column, None)` below, so an absent column is
    # skipped instead of raising.
    for valdict in columns.get(column, ()):
        # Different kinds of entries can share the list; use only complete ones.
        if not query_keys or any(key not in valdict for key in query_keys):
            continue

        idx = int(valdict[index_col])
        if revert_bands:
            idx = n_bands - idx + 1

        for key in query_keys:
            columns[f'{key}_{idx}'] = valdict[key]

    columns.pop(column, None)


def parse_deimos_metadata_file(metadata_file, filesystem):
    """Parse one DEIMOS XML metadata file into a single-row dataframe.

    The selected XML sections are flattened into one dictionary keyed by tag name, so
    a tag parsed later overwrites an earlier tag with the same name. The per-band
    ``Band_Statistics`` and ``Spectral_Band_Info`` lists are then expanded into
    ``<key>_<band index>`` columns with the band order reversed.

    NOTE(review): ``split_per_band`` is called with its default ``n_bands=4``, also
    for the ``_PAN_`` files parsed by the caller — confirm this is intended.

    :param metadata_file: Path of the metadata file on ``filesystem``.
    :param filesystem: Filesystem object providing ``open``.
    :return: ``pd.DataFrame`` with exactly one row.
    """
    # Close the handle as soon as the tree is built instead of leaving it open.
    with filesystem.open(metadata_file) as xml_file:
        root = ET.parse(xml_file).getroot()

    columns = {}
    tag_parser(root.findall('./Dataset_Id/'), columns)
    tag_parser(root.findall('./Production/'), columns)
    tag_parser(root.findall('./Data_Processing/'), columns)
    tag_parser(root.findall('./Raster_CS/'), columns)
    parse_bbox(root.findall('./Dataset_Frame/'), columns, 'bbox')
    tag_parser(root.findall('./Raster_Encoding/'), columns)
    tag_parser(root.findall('./Data_Access/'), columns)
    # Runs after the generic `Data_Access` pass so that the `href` attribute replaces
    # the text stored for the same tag.
    tag_parser(root.findall('./Data_Access/Data_File/'), columns, attr='attrib', attrib_key='href')
    tag_parser(root.findall('./Raster_Dimensions/'), columns)
    multitag_parser(root.findall('./Image_Interpretation/'), columns)
    multitag_parser(root.findall('./Image_Display/'), columns)
    tag_parser(root.findall('./Dataset_Sources/Source_Information/Coordinate_Reference_System/'), columns)
    tag_parser(root.findall('./Dataset_Sources/Source_Information/Scene_Source/'), columns)
    multitag_parser(root.findall('./Dataset_Sources/Source_Information/Quality_Assessment/'), columns)
    parse_bbox(root.findall('./Dataset_Sources/Source_Information/Source_Frame/'),
               columns, 'source_frame_bbox_latlon', use_xy=False)

    # Expand the nested per-band lists into flat, band-indexed columns.
    split_per_band(columns,
                   'Band_Statistics',
                   ['STX_STDV', 'STX_MEAN', 'STX_MIN', 'STX_MAX'])
    split_per_band(columns,
                   'Spectral_Band_Info',
                   ['PHYSICAL_GAIN', 'PHYSICAL_BIAS', 'PHYSICAL_UNIT', 'ESUN'])
    return pd.DataFrame([columns])

In [None]:
# Build one metadata dataframe per product type (multispectral `_MS4_` and
# panchromatic `_PAN_`), with one row per tile folder.
ms4_dfs = []
pan_dfs = []

tiles = filesystem.listdir(tiles_folder)

for tile in tiles:
    tile_folder = f'{tiles_folder}/{tile}'

    # this is needed because folder was copied from somewhere else
    if not filesystem.exists(tile_folder):
        filesystem.makedirs(tile_folder)

    metadata = [meta for meta in filesystem.listdir(tile_folder)
                if os.path.splitext(meta)[-1] == '.dim']

    # Pick each file by its name marker rather than by list position, so a missing
    # or unexpected file is reported instead of the wrong file being parsed.
    metadata_file_ms4 = next((meta for meta in metadata if '_MS4_' in meta), None)
    metadata_file_pan = next((meta for meta in metadata if '_PAN_' in meta), None)
    if metadata_file_ms4 is None or metadata_file_pan is None:
        raise ValueError(f'Expected an _MS4_ and a _PAN_ .dim file in {tile_folder}, '
                         f'found: {metadata}')

    ms4_dfs.append(parse_deimos_metadata_file(f'{tile_folder}/{metadata_file_ms4}',
                                              filesystem))
    pan_dfs.append(parse_deimos_metadata_file(f'{tile_folder}/{metadata_file_pan}',
                                              filesystem))

# Every per-tile frame has a single row labelled 0; renumber the rows so the
# combined frames do not carry duplicate index labels.
ms4_metadata = pd.concat(ms4_dfs, ignore_index=True)
pan_metadata = pd.concat(pan_dfs, ignore_index=True)

In [None]:
# Write both tables as parquet files. The handles are closed explicitly so the data
# is flushed deterministically rather than whenever the objects are garbage-collected.
# NOTE(review): S3-backed filesystems typically upload on close — confirm for fs_s3fs.
with filesystem.openbin('metadata/deimos_ms4_metadata.pq', 'wb') as ms4_file:
    ms4_metadata.to_parquet(ms4_file)
with filesystem.openbin('metadata/deimos_pan_metadata.pq', 'wb') as pan_file:
    pan_metadata.to_parquet(pan_file)