In [1]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

from lxml import etree
import pandas as pd

In [2]:
# Parse 'eda.als.xml' and keep its root element; the later cells build
# their summaries from `root`.
tree = etree.parse('eda.als.xml')
root = tree.getroot()

In [3]:
class XmlDocumentInfo:
    """
    Tabular summary of the elements and attributes of an XML document.

    Attributes:
        elements_df: one row per node yielded by ``etree_root.iter()``, in
            iteration order, with the columns "etree node", "tag name",
            "content" and "parent index". "parent index" is a nullable
            "Int64" column holding the row index of the node's parent; it
            is missing when the parent is not part of the table (e.g. for
            the root).
        attributes_df: one row per attribute, with the columns "name",
            "value" and "element index" (the row index of the owning
            element in elements_df).
    """

    def __init__(self, etree_root):
        """
        Builds both tables from etree_root.

        etree_root must provide iter(), and every node it yields must
        provide tag, text, attrib and getparent().
        """
        self.elements_df = self._get_elements_df(etree_root)
        self.attributes_df = self._get_attributes_df()

    def get_all_element_tag_names(self, parent_tag_name=None):
        """
        Returns a set of all unique element tag names within the XML document.
        If parent_tag_name is specified, then only the element tags that appear
        with parents of that type are returned.
        """
        tag_names = self.elements_df["tag name"].tolist()
        if parent_tag_name is None:
            return set(tag_names)

        parent_indices = self.elements_df["parent index"].tolist()
        return {
            tag_name
            for tag_name, parent_index in zip(tag_names, parent_indices)
            # Elements without a parent row (the root) can never match.
            if not pd.isna(parent_index)
            and tag_names[parent_index] == parent_tag_name
        }

    def get_all_attribute_names(self, element_tag_name=None):
        """
        Returns a set of all unique attribute names within the XML document.
        If element_tag_name is specified, then only the attributes that appear
        in elements of that type are returned.
        """
        names = self.attributes_df["name"]
        if element_tag_name is None:
            return set(names)

        tag_names = self.elements_df["tag name"].tolist()
        element_indices = self.attributes_df["element index"]
        return {
            name
            for name, element_index in zip(names, element_indices)
            if tag_names[element_index] == element_tag_name
        }

    def get_tags_with_parent_dependent_attributes(self):
        """
        Returns a set of all unique element tag names such that one
        or more of its instances contains an attribute that appears
        only when the element's parent has a specific tag name, unless
        all the elements share the same parent tag.

        For the example XML below, the set {"C"} will be returned

        <document>
            <A>
                <C c1="1"/>
                <D d1="1"/>
                <E e1="1"/>
            </A>
            <A>
                <E e1="1"/>
            </A>
            <B>
                <C c1="1" c2="2"/>
                <D d1="1"/>
            </B>
        </document>

        "C" qualifies because c2 only appears under "B". "D" carries the
        same attribute under both parents, and "E" only ever has the
        parent "A", so neither is returned.
        """
        tag_names = self.elements_df["tag name"].tolist()
        parent_indices = self.elements_df["parent index"].tolist()

        # Group the attribute names by owning element once, instead of
        # filtering attributes_df again for every element instance.
        attribute_names_by_element = {}
        for name, element_index in zip(self.attributes_df["name"],
                                       self.attributes_df["element index"]):
            attribute_names_by_element.setdefault(element_index, set()).add(name)

        # tag name -> set of parent tag names seen for that tag.
        parent_tags_by_tag = {}
        # tag name -> attribute name -> set of parent tag names under which
        # an instance of the tag carried that attribute.
        parent_tags_by_attribute = {}
        for element_index, (tag_name, parent_index) in enumerate(zip(tag_names, parent_indices)):
            if pd.isna(parent_index):
                # Parentless elements contribute no parent tag.
                continue
            parent_tag_name = tag_names[parent_index]
            parent_tags_by_tag.setdefault(tag_name, set()).add(parent_tag_name)
            per_attribute = parent_tags_by_attribute.setdefault(tag_name, {})
            for name in attribute_names_by_element.get(element_index, ()):
                per_attribute.setdefault(name, set()).add(parent_tag_name)

        return {
            tag_name
            for tag_name, parent_tags in parent_tags_by_tag.items()
            # Ignore tag names with only one parent type.
            if len(parent_tags) > 1
            # An attribute tied to exactly one parent tag is parent dependent.
            and any(len(tags) == 1 for tags in parent_tags_by_attribute[tag_name].values())
        }

    # TODO: get_common_attribute_names(tag_name) -- the set of attribute names
    # shared by all instances of a tag, as opposed to attributes that only
    # appear under a specific parent tag -- is not implemented yet.

    def _get_elements_df(self, etree_root):
        """
        Returns the elements table for all nodes yielded by etree_root.iter().

        "content" is the stripped node text, or None when the node has no
        text or only whitespace.
        """
        nodes = list(etree_root.iter())

        # Parents are matched by object identity. `nodes` keeps every node
        # alive, so ids are unique for the duration of the lookup.
        # NOTE(review): relies on getparent() returning the same object that
        # iter() yielded -- lxml documents this for live proxies; confirm.
        row_index_by_node_id = {id(node): row_index for row_index, node in enumerate(nodes)}

        rows = []
        parent_indices = []
        for node in nodes:
            text = node.text
            rows.append({
                "etree node": node,
                "tag name": node.tag,
                "content": text.strip() if text is not None and not text.isspace() else None,
                "parent index": None,
            })
            parent = node.getparent()
            parent_indices.append(row_index_by_node_id.get(id(parent)))

        # Build the frame in one go: DataFrame.append was removed in pandas 2.0
        # and copied the whole frame on every call before that.
        df = pd.DataFrame(rows, columns=["etree node", "tag name", "content", "parent index"])
        df["parent index"] = pd.Series(parent_indices, dtype="Int64")

        return df

    def _get_attributes_df(self):
        """
        Returns the attributes table, one row per attribute of every node
        stored in self.elements_df.
        """
        rows = [
            {"name": name, "value": value, "element index": element_index}
            for element_index, node in self.elements_df["etree node"].items()
            for name, value in node.attrib.items()
        ]
        return pd.DataFrame(rows, columns=["name", "value", "element index"])

    def _get_parent_index_from_etree_node(self, df, node):
        """
        Returns the index of the row in df whose "etree node" equals
        node.getparent(), or None if there is no such row.

        Raises an Exception if more than one row matches. This scans the
        whole frame on every call; the tables themselves are built without
        it, and it is kept for existing callers.
        """
        is_parent_node = df["etree node"] == node.getparent()
        parent_indices = df.index[is_parent_node].tolist()
        num_parents = len(parent_indices)
        if num_parents == 0:
            return None
        elif num_parents == 1:
            return int(parent_indices[0])
        else:
            raise Exception(f"Found more than one parent for node: {etree.tostring(node)}")


In [4]:
# Build the element and attribute tables for the parsed document.
xml_info = XmlDocumentInfo(root)

In [5]:
# Show the last 30 rows of the elements table.
xml_info.elements_df.iloc[-30:]

Unnamed: 0,etree node,tag name,content,parent index
3320,[],AutomationMode,,1
3321,[],ArrangementOverdub,,1
3322,[],ColorSequenceIndex,,1
3323,[[]],AutoColorPickerForPlayerAndGroupTracks,,1
3324,[],NextColorIndex,,3323
3325,[[]],AutoColorPickerForReturnAndMasterTracks,,1
3326,[],NextColorIndex,,3325
3327,[],ViewData,,1
3328,[],MidiFoldIn,,1
3329,[],MidiPrelisten,,1


In [6]:
# Show the last 30 rows of the attributes table.
xml_info.attributes_df.iloc[-30:]

Unnamed: 0,name,value,element index
2636,Value,false,3320
2637,Value,false,3321
2638,Value,3,3322
2639,Value,8,3324
2640,Value,0,3326
2641,Value,{},3327
2642,Value,false,3328
2643,Value,false,3329
2644,Value,false,3330
2645,Top,-2147483648,3331


In [7]:
# Print every unique tag name in the document, then only the tag names
# that occur with a "SequencerNavigator" parent.
pp.pprint(xml_info.get_all_element_tag_names())
pp.pprint(xml_info.get_all_element_tag_names(parent_tag_name="SequencerNavigator"))

{   'Ableton',
    'Active',
    'AllPassGain',
    'AllPassSize',
    'AnchorTime',
    'Annotation',
    'ArrangementOverdub',
    'ArrangerAutomation',
    'ArrangerIO',
    'ArrangerMixer',
    'ArrangerReturns',
    'ArrangerShowOverView',
    'ArrangerTrackDelay',
    'AudioInputRouting',
    'AudioOutputRouting',
    'AudioSequencer',
    'AudioTrack',
    'AutoColorPickerForPlayerAndGroupTracks',
    'AutoColorPickerForReturnAndMasterTracks',
    'AutoQuantisation',
    'Automation',
    'AutomationEnvelope',
    'AutomationEnvelopes',
    'AutomationLane',
    'AutomationLanes',
    'AutomationMode',
    'AutomationTarget',
    'BandFreq',
    'BandHighOn',
    'BandLowOn',
    'BandWidth',
    'BeatDelayEnumL',
    'BeatDelayEnumR',
    'BeatTimeHelper',
    'BranchDeviceId',
    'BranchSourceContext',
    'BrowserContentPath',
    'ChooserBar',
    'ChorusOn',
    'ClientSize',
    'ClipEnvelopeChooserViewState',
    'ClipSlot',
    'ClipSlotList',
    'ClipSlotsListWrapper'

{'ClientSize', 'ScrollerPos', 'BeatTimeHelper'}


In [8]:
# Print every unique attribute name in the document, then only the
# attribute names that occur on "VideoWindowRect" elements.
pp.pprint(xml_info.get_all_attribute_names())
pp.pprint(xml_info.get_all_attribute_names(element_tag_name="VideoWindowRect"))

{   'Bottom',
    'Creator',
    'Dir',
    'Id',
    'Left',
    'LomId',
    'MajorVersion',
    'MinorVersion',
    'Revision',
    'Right',
    'SchemaChangeCount',
    'Time',
    'Top',
    'Value',
    'X',
    'Y'}
{'Right', 'Top', 'Left', 'Bottom'}


In [9]:
# Print the tag names that have at least one attribute tied to a single
# parent tag even though the tag occurs under several parent tags.
pp.pprint(xml_info.get_tags_with_parent_dependent_attributes())

{'ClipSlot', 'FileRef', 'Value'}


In [31]:


class DocumentInfo:
    """
    Tabular summary of an XML document plus a tree of ElementInfo objects.

    Attributes:
        elements_df: one row per node yielded by ``etree_root.iter()``, in
            iteration order, with the columns "tag name", "content" and
            "parent index" (nullable "Int64"; missing when the parent is
            not part of the table, e.g. for the root). The "etree node"
            column used while building is dropped in __init__.
        attributes_df: one row per attribute, with the columns "name",
            "value" and "element index" (the row index of the owning
            element in elements_df).
        root_element: the ElementInfo built for row 0, with its children
            attached recursively.
    """

    def __init__(self, etree_root):
        """
        Builds the tables and the ElementInfo tree from etree_root.

        etree_root must provide iter(), and every node it yields must
        provide tag, text, attrib and getparent().
        """
        self.elements_df = self._get_elements_df(etree_root)
        self.attributes_df = self._get_attributes_df()
        # The node objects are only needed to read the attributes above.
        self.elements_df.drop("etree node", axis=1, inplace=True)
        self.root_element = self._get_element(0)

    def _get_elements_df(self, etree_root):
        """
        Returns the elements table for all nodes yielded by etree_root.iter().

        "content" is the stripped node text, or None when the node has no
        text or only whitespace.
        """
        nodes = list(etree_root.iter())

        # Parents are matched by object identity. `nodes` keeps every node
        # alive, so ids are unique for the duration of the lookup.
        # NOTE(review): relies on getparent() returning the same object that
        # iter() yielded -- lxml documents this for live proxies; confirm.
        row_index_by_node_id = {id(node): row_index for row_index, node in enumerate(nodes)}

        rows = []
        parent_indices = []
        for node in nodes:
            text = node.text
            rows.append({
                "etree node": node,
                "tag name": node.tag,
                "content": text.strip() if text is not None and not text.isspace() else None,
                "parent index": None,
            })
            parent = node.getparent()
            parent_indices.append(row_index_by_node_id.get(id(parent)))

        # Build the frame in one go: DataFrame.append was removed in pandas 2.0
        # and copied the whole frame on every call before that.
        df = pd.DataFrame(rows, columns=["etree node", "tag name", "content", "parent index"])
        df["parent index"] = pd.Series(parent_indices, dtype="Int64")

        return df

    def _get_attributes_df(self):
        """
        Returns the attributes table, one row per attribute of every node
        stored in self.elements_df. Must run before the "etree node" column
        is dropped.
        """
        rows = [
            {"name": name, "value": value, "element index": element_index}
            for element_index, node in self.elements_df["etree node"].items()
            for name, value in node.attrib.items()
        ]
        return pd.DataFrame(rows, columns=["name", "value", "element index"])

    def _get_parent_index_from_etree_node(self, df, node):
        """
        Returns the index of the row in df whose "etree node" equals
        node.getparent(), or None if there is no such row.

        Raises an Exception if more than one row matches. This scans the
        whole frame on every call; the tables themselves are built without
        it, and it is kept for existing callers.
        """
        is_parent_node = df["etree node"] == node.getparent()
        parent_indices = df.index[is_parent_node].tolist()
        num_parents = len(parent_indices)
        if num_parents == 0:
            return None
        elif num_parents == 1:
            return int(parent_indices[0])
        else:
            raise Exception(f"Found more than one parent for node: {etree.tostring(node)}")

    def _get_element(self, id):
        """
        Returns the ElementInfo for the element at row `id`, with all of its
        descendants built and linked to their parents, or None if id is None.

        The "content" column is not passed on: the ElementInfo constructor
        takes only id, name, attributes and children.
        """
        if id is None:
            return None

        tag_names = self.elements_df["tag name"].tolist()

        # Group attributes and children once per call instead of filtering
        # both frames again for every element in the subtree.
        attributes_by_element = {}
        for name, value, element_index in zip(self.attributes_df["name"],
                                              self.attributes_df["value"],
                                              self.attributes_df["element index"]):
            attributes_by_element.setdefault(element_index, {})[name] = value

        children_by_parent = {}
        for element_index, parent_index in enumerate(self.elements_df["parent index"].tolist()):
            if not pd.isna(parent_index):
                children_by_parent.setdefault(parent_index, []).append(element_index)

        def build(element_index):
            name = tag_names[element_index]
            children = [build(child_index) for child_index in children_by_parent.get(element_index, [])]
            element = ElementInfo(element_index, name, attributes_by_element.get(element_index, {}), children)
            for child in element.children():
                child.set_parent(element)
            return element

        return build(id)


class ElementInfo:
    """
    Plain record describing one XML element: its id, tag name, attributes,
    parent and children. The parent starts as None and is linked afterwards
    through set_parent().
    """

    def __init__(self, id, name, attributes, children):
        self._id, self._name = id, name
        self._attributes = attributes
        self._children = children
        self._parent = None

    def id(self):
        """Returns the id given at construction."""
        return self._id

    def name(self):
        """Returns the tag name given at construction."""
        return self._name

    def attributes(self):
        """Returns the attribute mapping given at construction."""
        return self._attributes

    def parent(self):
        """Returns the parent set through set_parent(), or None."""
        return self._parent

    def set_parent(self, parent):
        """Stores `parent` as this element's parent."""
        self._parent = parent

    def children(self):
        """Returns the children collection given at construction."""
        return self._children

    def has_children(self):
        """Returns True when the children collection is not None and not empty."""
        kids = self.children()
        if kids is None:
            return False
        return len(kids) > 0

    def get_tag_string(self):
        """
        Returns this element's opening tag with its attributes. The tag
        ends in ">" when the element has children and in "/>" otherwise.
        """
        pieces = [f"<{self.name()}"]
        if len(self.attributes().keys()) > 0:
            pieces.extend(
                f' {attribute}="{value}"'
                for attribute, value in self.attributes().items()
            )
            # A single space separates the last attribute from the tag end.
            pieces.append(" ")
        pieces.append(">" if self.has_children() else "/>")
        return "".join(pieces)

    def __str__(self):
        """
        Returns the opening tag, followed by one tab-indented line per
        direct child tag and a closing tag when there are children.
        """
        lines = [self.get_tag_string()]
        if self.has_children():
            lines.extend(f"\t{child.get_tag_string()}" for child in self.children())
            lines.append(f"</{self.name()}>")
        return "\n".join(lines)


In [32]:
# Build the tables and the ElementInfo tree from the same parsed root.
doc = DocumentInfo(root)

In [33]:
# Print the root's first child: its opening tag followed by the opening
# tags of its direct children.
print(doc.root_element.children()[0])

<LiveSet>
	<NextPointeeId Value="17911" />
	<OverwriteProtectionNumber Value="2560" />
	<LomId Value="0" />
	<LomIdView Value="0" />
	<Tracks>
	<MasterTrack>
	<PreHearTrack>
	<SendsPre>
	<SceneNames>
	<Transport>
	<SongMasterValues>
	<GlobalQuantisation Value="4" />
	<AutoQuantisation Value="0" />
	<Grid>
	<ScaleInformation>
	<SmpteFormat Value="0" />
	<TimeSelection>
	<SequencerNavigator>
	<ViewStateLaunchPanel Value="false" />
	<ViewStateEnvelopePanel Value="false" />
	<ViewStateSamplePanel Value="true" />
	<ContentSplitterProperties>
	<ViewStateFxSlotCount Value="4" />
	<ViewStateSessionMixerHeight Value="120" />
	<Locators>
	<DetailClipKeyMidis/>
	<TracksListWrapper LomId="0" />
	<VisibleTracksListWrapper LomId="0" />
	<ReturnTracksListWrapper LomId="0" />
	<ScenesListWrapper LomId="0" />
	<CuePointsListWrapper LomId="0" />
	<ChooserBar Value="1" />
	<Annotation Value="" />
	<SoloOrPflSavedValue Value="true" />
	<SoloInPlace Value="true" />
	<CrossfadeCurve Value="2" />
	<LatencyCo