## Extracting Rules from XML

In [1]:
from PIL import Image
import xml.etree.ElementTree as ET

In [2]:
# Find the first dict in a list of dicts whose `key` entry equals `value`
def search(rules, key, value):
    """Locate the first dict in ``rules`` whose ``key`` entry equals ``value``.

    Returns an ``(index, rule)`` tuple for the first match. When nothing
    matches, ``None`` is returned, so callers that unpack the result must be
    sure a match exists. A ``KeyError`` propagates if an inspected dict has
    no ``key`` entry.
    """
    hits = (
        (position, rule)
        for position, rule in enumerate(rules)
        if rule[key] == value
    )
    # Taking only the first hit keeps the scan lazy, like an early return.
    return next(hits, None)

# Look up a connection (edge) by its ID; only the direct children of root are scanned
def query_edge_by_id(root, id_):
    """Return the attribute dict of the ``edge`` child of ``root`` with ID ``id_``.

    Only the direct children of ``root`` are inspected. The first matching
    edge wins; ``None`` is returned when no edge carries that ID.
    """
    matching_attribs = (
        node.attrib
        for node in root
        if node.tag == 'edge' and node.attrib['id'] == id_
    )
    return next(matching_attribs, None)

In [19]:
# Initializing. `path` is concatenated with the file names as-is, so it must
# end with a path separator.
path = "C:/Users/Isada/Workspace/arg-microtexts/corpus/en/"
file_name = "micro_b019"

# Display argumentative text in jpg format
jpg_file = file_name + ".pdf.jpg"
im = Image.open(path+jpg_file)
im.show()

# Read argumentative text in XML format
xml_file = file_name + ".xml"
tree = ET.parse(path + xml_file)
root = tree.getroot()

# Print XML elements
for child in root:
    print(child.tag, child.attrib)

print("----" * 5)

ADU = []  # IDs of all ADU elements
Not_BE = []  # IDs of ADUs targeted by a non-segmentation edge (i.e. not in BE)
processed_c = []  # IDs of the edges that have already been processed

# 1. Extract BE
for child in root:
    if child.tag == 'adu':
        ADU.append(child.attrib['id'])
    # A target ID starting with 'c' refers to another edge, not to an ADU
    elif child.tag == 'edge' and child.attrib['type'] != 'seg' and child.attrib['trg'][0] != 'c':
        Not_BE.append(child.attrib['trg'])
    # Segmentation edges never produce a rule, so mark them as done right away
    elif child.tag == 'edge' and child.attrib['type'] == 'seg':
        processed_c.append(child.attrib['id'])

BE = list(set(ADU).difference(set(Not_BE)))  # BE = ADU - Not_BE

# Display BE in form of set
print("BE: {}".format(set(BE)))

ADD = []  # Attribute dicts of the "add" (join) edges

# 2. Extract join relations
for child in root:
    if child.tag == "edge" and child.attrib["type"] == "add":
        ADD.append(child.attrib)
        processed_c.append(child.attrib["id"])

R_D = []  # Collect defeasible rules
R_S = []  # Collect strict rules

# 3. Extract defeasible rules
for child in root:
    if (child.tag == "edge") and (child.attrib["id"] not in processed_c):
        body = []
        head = None

        # If the connection is an undercut -> it indicates a defeasible rule
        if child.attrib["type"] == "und":
            # temp is the edge undercut by the current child; it becomes a
            # defeasible rule.
            # NOTE: query_edge_by_id returns None when the target edge does not
            # exist, in which case the subscripting below raises TypeError.
            temp = query_edge_by_id(root, child.attrib["trg"])
            d_body = []

            # Case: temp is itself an undercut of another edge
            if temp["type"] == "und":
                # Recall the rule already extracted into R_S whose body equals
                # [temp["src"]] and move it to R_D instead.
                # NOTE: search returns None when there is no such rule (e.g. it
                # has not been extracted yet), and this unpacking then raises
                # TypeError.
                idx, d = search(R_S, 'body', [temp["src"]])
                R_S.pop(idx)
                d["rule"] = "=>"

            # Basic case
            else:
                d_body.append(temp["src"])
                if temp["type"] == "reb":
                    d_head = "neg_" + temp["trg"]
                else:  # Any other type keeps the target as the head
                    d_head = temp["trg"]

                # Add the sources of the join edges that point at the current
                # child (to its body) or at temp (to the defeasible body)
                for ele in ADD:
                    if ele['trg'] == child.attrib['id']:  # The current child is joined
                        body.append(ele["src"])
                    elif ele['trg'] == temp['id']:  # The temp is joined
                        d_body.append(ele["src"])

                # R_D format
                d = {
                    'body' : d_body,
                    'head' : d_head,
                    'rule' : "=>"
                }

            R_D.append(d)
            processed_c.append(temp["id"])

            # Next, add the current child to R_S
            body.append(child.attrib["src"])
            # R_S format; the head embeds the string form of the rule dict d
            r = {
                'body' : body,
                'head' : 'ab_'+str(d),
                'rule' : "->"
            }
            processed_c.append(child.attrib["id"])
            R_S.append(r)


# 4. Extract strict rules
for child in root:
    if (child.tag == "edge") and (child.attrib["id"] not in processed_c):
        # Decide the head first: only support, example and rebuttal edges give
        # a strict rule. Any other edge type is skipped; previously the rule
        # left over from an earlier iteration was appended again (or a
        # NameError was raised when no rule had been built yet).
        if (child.attrib["type"] == "sup") or (child.attrib["type"] == "exa"):
            head = child.attrib["trg"]
        elif child.attrib["type"] == "reb":
            head = "neg_" + child.attrib["trg"]
        else:
            continue

        # Sources of the join edges pointing at this edge come first in the body
        body = []
        for ele in ADD:
            if ele['trg'] == child.attrib['id']:
                body.append(ele["src"])
        body.append(child.attrib["src"])

        r = {
            'body' : body,
            'head' : head,
            'rule' : "->"
        }
        R_S.append(r)
        processed_c.append(child.attrib["id"])

# Display pretty R_D
for i in range(len(R_D)):
    print("d{}: {} {} {}".format(str(i+1),
                                 R_D[i]["body"],
                                 R_D[i]["rule"],
                                 R_D[i]["head"]))
# Display pretty R_S
for i in range(len(R_S)):
    print("r{}: {} {} {}".format(str(i+1),
                                 R_S[i]["body"],
                                 R_S[i]["rule"],
                                 R_S[i]["head"]))

edu {'id': 'e1'}
edu {'id': 'e2'}
edu {'id': 'e3'}
edu {'id': 'e4'}
edu {'id': 'e5'}
edu {'id': 'e6'}
adu {'id': 'a1', 'type': 'opp'}
adu {'id': 'a2', 'type': 'opp'}
adu {'id': 'a3', 'type': 'pro'}
adu {'id': 'a4', 'type': 'pro'}
adu {'id': 'a5', 'type': 'pro'}
adu {'id': 'a6', 'type': 'pro'}
edge {'id': 'c6', 'src': 'e1', 'trg': 'a1', 'type': 'seg'}
edge {'id': 'c7', 'src': 'e2', 'trg': 'a2', 'type': 'seg'}
edge {'id': 'c8', 'src': 'e3', 'trg': 'a3', 'type': 'seg'}
edge {'id': 'c9', 'src': 'e4', 'trg': 'a4', 'type': 'seg'}
edge {'id': 'c10', 'src': 'e5', 'trg': 'a5', 'type': 'seg'}
edge {'id': 'c11', 'src': 'e6', 'trg': 'a6', 'type': 'seg'}
edge {'id': 'c1', 'src': 'a1', 'trg': 'a2', 'type': 'sup'}
edge {'id': 'c2', 'src': 'a2', 'trg': 'a6', 'type': 'reb'}
edge {'id': 'c3', 'src': 'a3', 'trg': 'a4', 'type': 'sup'}
edge {'id': 'c4', 'src': 'a4', 'trg': 'a6', 'type': 'sup'}
edge {'id': 'c5', 'src': 'a5', 'trg': 'a6', 'type': 'sup'}
--------------------
BE: {'a1', 'a3', 'a5'}
r1: ['a1'] 

In [20]:
def extract_rule_based(file_name, path):
    """Read an argument-graph XML file and print the rules extracted from it.

    The file ``path + file_name + ".xml"`` is parsed and the following is
    printed to standard output, in this order:

    1. the tag and attributes of every direct child of the XML root,
    2. a separator line,
    3. the set ``BE`` (ADU IDs that are never the target of a
       non-segmentation edge),
    4. the defeasible rules (``d1``, ``d2``, ...) derived from undercut edges,
    5. the strict rules (``r1``, ``r2``, ...).

    Parameters:
        file_name: name of the XML file without the ``.xml`` extension.
        path: directory prefix; it is concatenated as-is, so it must end with
            a path separator.

    Returns:
        None. The results are only printed.

    Edges whose type is not one of ``seg``, ``add``, ``und``, ``sup``, ``exa``
    or ``reb`` produce no rule and are skipped.
    """
    # Read argumentative text in XML format
    xml_file = file_name + ".xml"
    tree = ET.parse(path + xml_file)
    root = tree.getroot()

    # Print XML elements
    for child in root:
        print(child.tag, child.attrib)

    print("----" * 5)

    ADU = []  # IDs of all ADU elements
    Not_BE = []  # IDs of ADUs targeted by a non-segmentation edge
    processed_c = set()  # IDs of the edges that have already been processed

    # 1. Extract BE
    for child in root:
        if child.tag == 'adu':
            ADU.append(child.attrib['id'])
        # A target ID starting with 'c' refers to another edge, not to an ADU
        elif child.tag == 'edge' and child.attrib['type'] != 'seg' and child.attrib['trg'][0] != 'c':
            Not_BE.append(child.attrib['trg'])
        # Segmentation edges never produce a rule, so mark them as done
        elif child.tag == 'edge' and child.attrib['type'] == 'seg':
            processed_c.add(child.attrib['id'])

    BE = list(set(ADU).difference(set(Not_BE)))  # BE = ADU - Not_BE

    # Display BE in form of set
    print("BE: {}".format(set(BE)))

    # 2. Extract join relations
    ADD = []  # Attribute dicts of the "add" (join) edges
    for child in root:
        if child.tag == "edge" and child.attrib["type"] == "add":
            ADD.append(child.attrib)
            processed_c.add(child.attrib["id"])

    R_D = []  # Collect defeasible rules
    R_S = []  # Collect strict rules

    # 3. Extract defeasible rules: every undercut edge marks the edge it
    #    targets as a defeasible rule.
    for child in root:
        if child.tag != "edge" or child.attrib["id"] in processed_c:
            continue
        if child.attrib["type"] != "und":
            continue

        body = []
        # temp is the edge undercut by the current child.
        # NOTE: query_edge_by_id returns None when the target edge does not
        # exist, in which case the subscripting below raises TypeError.
        temp = query_edge_by_id(root, child.attrib["trg"])

        if temp["type"] == "und":
            # Undercut of an undercut: the rule already extracted into R_S
            # whose body equals [temp["src"]] is moved to R_D.
            # NOTE: search returns None when there is no such rule (e.g. it
            # has not been extracted yet), and this unpacking then raises
            # TypeError.
            idx, d = search(R_S, 'body', [temp["src"]])
            R_S.pop(idx)
            d["rule"] = "=>"
        else:
            # Basic case: build the defeasible rule from temp itself
            d_body = [temp["src"]]
            if temp["type"] == "reb":
                d_head = "neg_" + temp["trg"]
            else:  # Any other type keeps the target as the head
                d_head = temp["trg"]

            # Sources of join edges extend the body of whichever edge they
            # point at: the current child or temp.
            for ele in ADD:
                if ele['trg'] == child.attrib['id']:
                    body.append(ele["src"])
                elif ele['trg'] == temp['id']:
                    d_body.append(ele["src"])

            d = {
                'body': d_body,
                'head': d_head,
                'rule': "=>",
            }

        R_D.append(d)
        processed_c.add(temp["id"])

        # The undercut itself becomes a strict rule; its head embeds the
        # string form of the defeasible rule dict.
        body.append(child.attrib["src"])
        R_S.append({
            'body': body,
            'head': 'ab_' + str(d),
            'rule': "->",
        })
        processed_c.add(child.attrib["id"])

    # 4. Extract strict rules from the remaining edges
    for child in root:
        if child.tag != "edge" or child.attrib["id"] in processed_c:
            continue

        edge_type = child.attrib["type"]
        if edge_type in ("sup", "exa"):
            # Support and example relations
            head = child.attrib["trg"]
        elif edge_type == "reb":
            # Rebuttal relation
            head = "neg_" + child.attrib["trg"]
        else:
            # Unsupported edge type: no rule can be built. Previously the
            # rule left over from an earlier edge was appended again, or an
            # UnboundLocalError was raised when none existed yet.
            continue

        # Sources of the join edges pointing at this edge come first
        body = [ele["src"] for ele in ADD if ele['trg'] == child.attrib['id']]
        body.append(child.attrib["src"])

        R_S.append({
            'body': body,
            'head': head,
            'rule': "->",
        })
        processed_c.add(child.attrib["id"])

    # Display pretty R_D
    for number, rule in enumerate(R_D, start=1):
        print("d{}: {} {} {}".format(number, rule["body"], rule["rule"], rule["head"]))
    # Display pretty R_S
    for number, rule in enumerate(R_S, start=1):
        print("r{}: {} {} {}".format(number, rule["body"], rule["rule"], rule["head"]))

In [37]:
# TEST CASE:
# - micro_b011 : normal case w/ undercut
# - micro_b005 : undercut of undercut
# - micro_b047 : undercut of join
# - micro_b062 : three-join
# - micro_d09 : example rel

# Pick the corpus item to analyse (file name without extension)
file_name = "micro_d09"

# Corpus directory; it is concatenated with the file names as-is, so it ends
# with a separator
path = "C:/Users/Isada/Workspace/arg-microtexts/corpus/en/"

# Open the JPG rendering of the argumentative text in the image viewer
jpg_file = file_name + ".pdf.jpg"
im = Image.open(path + jpg_file)
im.show()

# Print the XML elements, BE and the extracted rules for the chosen item
extract_rule_based(file_name, path)

edu {'id': 'e1'}
edu {'id': 'e2'}
edu {'id': 'e3'}
edu {'id': 'e4'}
edu {'id': 'e5'}
adu {'id': 'a1', 'type': 'pro'}
adu {'id': 'a2', 'type': 'pro'}
adu {'id': 'a3', 'type': 'pro'}
adu {'id': 'a4', 'type': 'opp'}
adu {'id': 'a5', 'type': 'pro'}
edge {'id': 'c6', 'src': 'e1', 'trg': 'a1', 'type': 'seg'}
edge {'id': 'c7', 'src': 'e2', 'trg': 'a2', 'type': 'seg'}
edge {'id': 'c8', 'src': 'e3', 'trg': 'a3', 'type': 'seg'}
edge {'id': 'c9', 'src': 'e4', 'trg': 'a4', 'type': 'seg'}
edge {'id': 'c10', 'src': 'e5', 'trg': 'a5', 'type': 'seg'}
edge {'id': 'c2', 'src': 'a2', 'trg': 'a1', 'type': 'sup'}
edge {'id': 'c3', 'src': 'a3', 'trg': 'a2', 'type': 'exa'}
edge {'id': 'c4', 'src': 'a4', 'trg': 'c2', 'type': 'und'}
edge {'id': 'c5', 'src': 'a5', 'trg': 'a4', 'type': 'reb'}
--------------------
BE: {'a5', 'a3'}
d1: ['a2'] => a1
r1: ['a4'] -> ab_{'body': ['a2'], 'head': 'a1', 'rule': '=>'}
r2: ['a3'] -> a2
r3: ['a5'] -> neg_a4
