In [None]:
from bs4 import BeautifulSoup
import csv

In [None]:
docs = []

# List of paths to the XML files to extract; each entry is opened in turn by the parsing loop.

In [None]:
# Filler text that is skipped rather than written to the csv.
# NOTE(review): presumably this marks omitted text in the source XML - confirm.
_DOTTED_FILLER = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."

# Tags whose presence inside a provision causes that provision to be skipped.
_AMENDMENT_TAGS = ('BlockAmendment', 'InlineAmendment')


def _write_collapsed(tag, level_w, ignore_tag):
    """Write one csv row holding the space-joined text of every Text tag inside *tag*.

    Nothing is written when *tag* contains ``ignore_tag`` (unless ``ignore_tag`` is False), when it
    contains a BlockAmendment or InlineAmendment tag, or when the joined text is empty or filler.
    """
    # Test ignore_tag first so that no tree search is run when there is nothing to ignore.
    if ignore_tag is not False and tag.find_all(ignore_tag):
        return
    if any(tag.find_all(name) for name in _AMENDMENT_TAGS):
        return

    txt_out = " ".join(text_tag.text for text_tag in tag.find_all('Text'))
    if txt_out in ("", _DOTTED_FILLER):
        return

    # The row is written through the module-level csv writer, which must exist before this is called.
    legWriter.writerow([level_w, tag.get('id', "Null"), txt_out])


def Extract_collapse(input_tag, target_tag, level_w, ignore_tag=False):
    """Write one csv row per provision, collapsing all of its Text tags into a single string.

    Args:
        input_tag: iterable of tags to process.
        target_tag: name of a tag to sub-select within each item; "" processes the item itself.
        level_w: label written in the first column of every row (e.g. 'L0').
        ignore_tag: name of a tag which, when present inside a provision, causes that provision
            to be skipped; False (the default) disables this filter.

    Each row is [level_w, id attribute of the provision or "Null", joined text]. Rows are written
    with the module-level ``legWriter``.
    """
    for item in input_tag:
        # An empty target_tag means the item itself is the provision to write.
        provisions = item.find_all(target_tag) if target_tag != "" else [item]
        for provision in provisions:
            _write_collapsed(provision, level_w, ignore_tag)
                
# Extract_collapse takes each tag in the input_tag set and extracts the text of every Text tag within it, joined into a
# single string. It also extracts the tag's id attribute ("Null" if absent) as the section identifier and writes the
# output to a new line in the csv file. target_tag can be set to sub-select a tag within each item of the input_tag set
# for processing. ignore_tag can be set so that tags which contain the ignore_tag are not processed (useful for
# selecting sections with only 1 or 2 sub-section levels etc.). Tags containing a BlockAmendment or InlineAmendment
# are always skipped.

In [None]:
def Extract_fragment(input_tag, level_w):
    """Write one csv row per tag, holding the text of the first Text tag found inside it.

    Args:
        input_tag: iterable of tags to process.
        level_w: label written in the first column of every row (e.g. 'L2').

    Each row is [level_w, id attribute of the tag or "Null", text]. Rows are written with the
    module-level ``legWriter``. Tags with no Text tag, with empty text, or whose text is only the
    dotted filler line are skipped.
    """
    dotted_filler = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."

    for item in input_tag:
        first_text = item.find('Text')
        if first_text is None:
            # find() returns None when there is no Text tag; skip instead of failing on `.text`.
            continue

        txt_out = first_text.text
        if txt_out in ("", dotted_filler):
            continue

        legWriter.writerow([level_w, item.get('id', "Null"), txt_out])
            
# Extract_fragment takes each tag in the input_tag set and extracts the text within the first Text tag in that tag (whereas
# Extract_collapse extracts all text in all Text tags). It also extracts the section identifier (the tag's id attribute)
# and writes the output to a new line in the csv file.

In [None]:
print("Parsing...")

# Loop over documents. Open each document as a BeautifulSoup object.

for doc in docs:
    with open(doc, 'r', encoding='utf-8') as file:
        xml = BeautifulSoup(file, "xml")

    # Open the csv in append mode and define the writer object. Writer encoding should match the open doc encoding.
    # Both files are closed by their `with` blocks, so no explicit close() calls are needed afterwards.

    with open('leg_data.csv', 'a', newline='', encoding='utf-8') as outfile:
        # legWriter is deliberately module-level: Extract_collapse and Extract_fragment write through it.
        legWriter = csv.writer(outfile, delimiter='|', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        title = xml.title.text
        print("Parsed: " + title)

        # Select all P1 tags.

        P1s = xml.find_all('P1')

        # Level 0 captures those provisions which are expected to exist as sentences at section level
        # (i.e. either the section does not have sub-sections or sub-sections are within an overall sentence).
        # Clauses which set out amendments are excluded (since structure is nested within these amendments).

        Extract_collapse(input_tag=P1s, target_tag='', level_w='L0', ignore_tag='P2')

        # Level 1 captures complete sub-sections (and condenses sub-sub-sections where necessary). Expect this
        # text to be complete sentences.
        # Clauses which set out amendments are excluded (since structure is nested within these amendments).

        Extract_collapse(input_tag=P1s, target_tag='P2', level_w='L1')

        for p1 in P1s:

            # Level 2 captures sub-section level fragments (i.e. text at sub-section level which may or may not form
            # a complete sentence.) E.g. the text following "section 1(1)..." until "-:(a)"

            P2s = p1.find_all('P2')
            Extract_fragment(input_tag=P2s, level_w='L2')

            for p2 in P2s:

                # Level 3 captures sub-sub-section level fragments (i.e. text at sub-sub-section level which may or
                # may not form a complete sentence.) E.g. the text following "section 1(1)(a)..." until "-:(i)"
                # Extract_fragment does nothing for an empty result set, so no emptiness check is needed.

                P3s = p2.find_all('P3')
                Extract_fragment(input_tag=P3s, level_w='L3')

                for p3 in P3s:

                    # Level 4 captures sub-sub-sub-section level fragments, e.g. the text following
                    # "section 1(1)(a)(i)".

                    P4s = p3.find_all('P4')
                    Extract_fragment(input_tag=P4s, level_w='L4')

        # There is necessarily some duplication between layers. Every L1 should also be decomposed in L2, L3 and L4 as
        # relevant. There should be no inherent duplication between L2, L3 and L4 text. There should be no duplication
        # between L0 and L1. Note that legislation itself can also be repetitive (e.g. setting out identical provisions
        # to apply to different areas of the UK).

print("Done!")