In [62]:
import re

import torch
import pandas as pd

from transformers import MarkupLMFeatureExtractor
from transformers import AutoProcessor, AutoModelForSequenceClassification

In [2]:
def filter_p_tags(node_xpath_pairs, max_length=30):
    filtered_pairs = []
    for idx, pair in enumerate(node_xpath_pairs):
#         last_two_tags = "/".join(pair[1].rsplit("/", 2)[1:])
#         print(last_two_tags)
#         if 'body' in last_two_tags and 'p' in last_two_tags and not 'table' in last_two_tags:
        if 'body' in pair[1] and 'p' in pair[1] and not 'table' in pair[1]:
            node_text = pair[0]
            
            if len(node_text.split()) <= max_length:
                filtered_pairs.append((*pair, f"idx={idx}"))

    return filtered_pairs


def get_text_tag(node_xpath_pairs, text):
    matches = []
    for idx, pair in enumerate(node_xpath_pairs):
        if re.search(fr"{text}", pair[0].lower()):
            matches.append((*pair, f"idx={idx}"))

    return matches

def get_table_tags(node_xpath_pairs):
    filtered_pairs = []
    for idx, pair in enumerate(node_xpath_pairs):
        last_five_tags = "/".join(pair[1].rsplit("/", 5)[1:])
        
        if 'table' in last_five_tags:
            filtered_pairs.append((*pair, f"idx={idx}"))

    return filtered_pairs
    


def get_div_tags(node_xpath_pairs):
    filtered_pairs = []
    for idx, pair in enumerate(node_xpath_pairs):
        last_five_tags = "/".join(pair[1].rsplit("/", 5)[1:])
        
        if 'div' in last_five_tags:
            filtered_pairs.append((*pair, f"idx={idx}"))

    return filtered_pairs


In [3]:
# Sukrit's input

# input_html_filelist = [
#     "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground/alaska_project8.htm",
#     "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground/Community Bankers Trust Corporation_United Bankshares, Inc.htm",
#     "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground/County Bancorp, Inc._Nicolet Bankshares, Inc.html",
#     "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground/Perspecta Inc._Veritas Capital.htm",
#     "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground/MTS Systems Corporation_Amphenol Corporation.htm",
#     "/Users/sukritrao/Documents/NYU/Coursework/Spring2023/Independent-Study/project/playground/Select_Bancorp_Inc_First_Bancorp.htm"
# ]

In [4]:
# Rohith's input

input_html_filelist = ['./alaska_project8.htm', 
                       './glacier_altabancorp.html', 
                       './open_zix.html', 
                       './california_pacific.html', 
                       './ready_anworth.htm']

In [66]:
file_idx = 0
input_html = input_html_filelist[file_idx]
with open(input_html) as f:
    html_code = f.read()
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

feature_extractor = MarkupLMFeatureExtractor()
encoding = feature_extractor(html_code)

Using device: cpu


In [67]:
start_node_idx = 0
end_node_idx = 400

nodes = encoding['nodes'][0][start_node_idx:end_node_idx]
xpaths = encoding['xpaths'][0][start_node_idx:end_node_idx]
important_nodes = []
possible_text_starts = ['TABLE OF CONTENTS', 'TOC', 'TABLE OF CONTENT']
possible_text_ends = ['AGREEMENT AND PLAN OF MERGER', 'PLAN AND AGREEMENT OF MERGER', 'AGREEMENT AND PLAN OF REORGANIZATION']
possible_xpath_starts = ['table', 'table[1]', 'table[2]', 'table[3]'] # Modify code to allow for any number of tables before end value
possible_xpath_ends = []
text_types = ['section_title', 'paragraph', 'table', 'other']

def table_in_xpath(xpath):
    S1, S2 = set(possible_xpath_starts), set(xpath)
    return len(S1.intersection(S2)) > 0


print("###################################################")
started = False
ended = False
xpath_lists = [path.split('/')[-7:] for path in xpaths]
start = [0, '', '']
end = [0, '', '']
section_titles = []
for i in range(len(xpath_lists)):
    text = nodes[i].strip()
    xpath_list = xpath_lists[i]
    xpath_str = xpaths[i]

    if not started:
        if (text in possible_text_starts) or table_in_xpath(xpath_list): 
            print(i)
            print(f'Possible START at Node {i}: ', text, xpath_list)
            started = True
            start = [i, text, xpath_list]

    elif started and not ended:

#         if 'table' in xpath_list:
#             print(nodes[i], xpath_lists[i])
        if (text in possible_text_ends):
            print(i)
            print(f'Possible END at Node {i}: ', text, xpath_list)
            ended = True
            end = [i, text, xpath_list]
            
    
print("###################################################")



###################################################
13
Possible START at Node 13:  TABLE OF CONTENTS ['filename', 'description', 'text', 'html', 'body', 'p[18]', 'b']
345
Possible END at Node 345:  AGREEMENT AND PLAN OF MERGER ['filename', 'description', 'text', 'html', 'body', 'p[33]', 'b']
###################################################


In [68]:
# Extracts all paths and nodes within TOC boundaries
toc_paths = xpaths[start[0] + (0 if 'table' in start[2] else 1):end[0]-1]
toc_nodes = nodes[start[0] + (0 if 'table' in start[2] else 1):end[0]-1]

In [69]:
toc_nodes[0:10]

['Page',
 'ARTICLE 1',
 'DEFINITIONS',
 '1',
 'Section 1.01',
 'Definitions',
 '1',
 'Section 1.02',
 'Other Definitional and Interpretative Provisions',
 '16']

In [244]:
# Using regex to find table tag and splitting on the table tag

# Change index() to find() if exceptions become an issue
# TODO: Automate finding table tag instead of hardcoding for indexing
reg_str = r'table(\[[0-9]+\])?'
table_end_idx = re.search(reg_str, toc_paths[0]).span()[0]
TOC_root_str = toc_paths[0][:table_end_idx]

shortened_toc_paths = [path[table_end_idx:].split('/') for path in toc_paths]
shortened_toc_paths[:10]

toc_paths_full = []
for i in range(len(toc_paths)):
    full_paths_nodes = [toc_nodes[i], toc_paths[i]] + shortened_toc_paths[i]
    
        
    toc_paths_full.append(full_paths_nodes)
# group_list = [path.index('1') for path in shortened_toc_paths]
# group_list[:5]


# After extracting TOC, then use get_text_tag to match to section title in document

In [245]:
full_paths_df = pd.DataFrame(toc_paths_full).rename(columns={0 : "node_text", 1 : "full_xpath", 2 : "table", 3 : "tr", 4 : "td"})


In [246]:
sorted_df = full_paths_df.sort_values(by=['table', 'tr'], axis=0)
sorted_df.iloc[99:120]

Unnamed: 0,node_text,full_xpath,table,tr,td,5,6
122,Section 4.23,/document/type/sequence/filename/description/t...,table[2],tr[12],td[1],p,
123,Brokers’ Fees,/document/type/sequence/filename/description/t...,table[2],tr[12],td[2],p,
124,48,/document/type/sequence/filename/description/t...,table[2],tr[12],td[3],p,
125,Section 4.24,/document/type/sequence/filename/description/t...,table[2],tr[13],td[1],p,
126,Opinion of Financial Advisor,/document/type/sequence/filename/description/t...,table[2],tr[13],td[2],p,
127,48,/document/type/sequence/filename/description/t...,table[2],tr[13],td[3],p,
128,Section 4.25,/document/type/sequence/filename/description/t...,table[2],tr[14],td[1],p,
129,Trade Practices,/document/type/sequence/filename/description/t...,table[2],tr[14],td[2],p,
130,49,/document/type/sequence/filename/description/t...,table[2],tr[14],td[3],p,
131,Section 4.26,/document/type/sequence/filename/description/t...,table[2],tr[15],td[1],p,


In [265]:
grouped_df = full_paths_df.groupby(by=['table', 'tr'], axis=0)
def combine_rows(x):
    return pd.Series({'node_text': ' '.join(x['node_text']), 'full_xpath': x.iloc[0]['full_xpath']})
concat_text_df = grouped_df.apply(combine_rows)
concat_text_df = concat_text_df.reset_index(drop=False).sort_values(by=['table', 'tr'])
concat_text_df['tr_idx'] = concat_text_df['tr'].apply(lambda x: int(x[3:-1]) if 'div' not in x else -999)
concat_text_df['table_idx'] = concat_text_df['table'].apply(lambda x: int(x[6:-1]) if 'div' not in x else -999)
concat_text_df = concat_text_df.sort_values(by=['table_idx', 'tr_idx']).drop(columns=['tr_idx', 'table_idx'])
concat_text_df.to_csv('toc_node_xpath_pairs.csv')

In [225]:
full_paths_df

Unnamed: 0,node_text,full_xpath,table,tr,td,5,6
0,Page,/document/type/sequence/filename/description/t...,table[1],tr[1],td[3],b,
1,ARTICLE 1,/document/type/sequence/filename/description/t...,table[1],tr[3],td[1],p,b
2,DEFINITIONS,/document/type/sequence/filename/description/t...,table[1],tr[3],td[1],p,
3,1,/document/type/sequence/filename/description/t...,table[1],tr[3],td[2],p,
4,Section 1.01,/document/type/sequence/filename/description/t...,table[1],tr[4],td[1],p,
...,...,...,...,...,...,...,...
325,Section 9.17,/document/type/sequence/filename/description/t...,table[4],tr[15],td[1],p,
326,Transfer Taxes,/document/type/sequence/filename/description/t...,table[4],tr[15],td[2],p,
327,91,/document/type/sequence/filename/description/t...,table[4],tr[15],td[3],p,
328,Exhibit A – Form of Certificate of Incorporati...,/document/type/sequence/filename/description/t...,p[28],,,,


Main idea for classifying text into our (currently 4) labels: section titles, paragraphs, tables, or other. The code first detects the boundaries of the TOC for future reference in identifying section titles. 

#### TOC Extraction
Here we extract out each section title. We can conveniently group by the tr, or table row, xpath value to get each line in the TOC. We then split and join based on the td, data cell, tag to get the full section title and corresponding page number. We can use this page number when finding the section titles.

#### Section Titles
We first run through lines of the merger and detect or group by(pd groupby?) those elements which are in the same p tag, or div tag. If the lines grouped together match a section title which we extracted from the TOC, then it can be considered a label for the title.

#### Paragraph Tags
We follow the same grouping procedure as for titles. We know each title should theoretically have corresponding paragraphs. We locate each section title and track all text in p tags underneath as a paragraph until the next section title.
 


In [37]:
# TOC Extraction

xpaths[14:34]

['/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[3]/td[1]/p',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[3]/td[4]',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[4]/td[1]/p',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[4]/td[3]',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[4]/td[6]',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[5]/td[1]/p',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[5]/td[3]',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[5]/td[6]',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[6]/td[1]/p',
 '/document/type/sequence/filename/description/text/html/body/p/center/div/table/tr[6]/td[3]',
 '/document/type/sequence/filename/descrip

In [38]:
nodes = encoding['nodes'][0]
xpaths = encoding['xpaths'][0]

# cropped_xpaths = ["/".join(x.rsplit("/", 5)[1:]) for x in xpaths]

# node_xpath_pairs = list(zip(nodes, cropped_xpaths))
node_xpath_pairs = list(zip(nodes, xpaths))

In [39]:
for x in node_xpath_pairs[376:382]:
    print(x, '\n')

('”) to commence a cash tender offer (as it may be amended from time to time as permitted under this Agreement, the “', '/document/type/sequence/filename/description/text/html/body/p/p/p/p/center/div/p[4]') 

('Offer', '/document/type/sequence/filename/description/text/html/body/p/p/p/p/center/div/p[4]/b[2]/i') 

('”) to purchase\nany and all of the outstanding shares of the Company Common Stock (the “', '/document/type/sequence/filename/description/text/html/body/p/p/p/p/center/div/p[4]') 

('Shares', '/document/type/sequence/filename/description/text/html/body/p/p/p/p/center/div/p[4]/b[3]/i') 

('”), at a price per Share of $8.50, without interest and subject to any applicable withholding Taxes (such amount, or any higher amount per share\nthat may be paid pursuant to the Offer, the “', '/document/type/sequence/filename/description/text/html/body/p/p/p/p/center/div/p[4]') 

('Offer Price', '/document/type/sequence/filename/description/text/html/body/p/p/p/p/center/div/p[4]/b[4]/i') 


In [14]:
raw = []
for i in node_xpath_pairs:
    path_list = i[1].split('/')
    if len(path_list) > 9 and path_list[9][:5] == 'table':
        l = [i[0]] + path_list[9:12]
        raw.append(l)

In [15]:
df = pd.DataFrame(raw)
df.columns = ['text', 'table', 'tr', 'td']
df.to_csv('raw.csv')

In [17]:
# Create function for merging mutiple xpath node pairs in a list
new_paths = []
table = [raw[0][1], [[raw[0][0], raw[0][2], raw[0][3]]]]
for i in raw:
    if i[1] != table[0]:
        new_paths.append(table)
        table = [i[1], [[i[0], i[2], i[3]]]]
    if i[2] != table[1][-1][1]:
        table[1].append([i[0], i[2], i[3]])
    else:
        table[1][-1][0] += (' ' + i[0])
        table[1][-1][2] = i[3]
new_paths.append(table)

import json
with open('new_path.json', 'w') as f:
    json.dump(new_paths, f)

In [21]:
filtered_p_tags = filter_p_tags(node_xpath_pairs)
# filtered_p_tags[2305]

In [18]:
new_paths

[['table[1]',
  [['Page Page', 'tr[1]', 'td[3]'],
   ['ARTICLE 1 DEFINITIONS 1', 'tr[3]', 'td[2]'],
   ['Section 1.01 Definitions 1', 'tr[4]', 'td[3]'],
   ['Section 1.02 Other Definitional and Interpretative Provisions 16',
    'tr[5]',
    'td[3]'],
   ['ARTICLE 2 THE MERGER 17', 'tr[6]', 'td[2]'],
   ['Section 2.01 The Closing 17', 'tr[7]', 'td[3]'],
   ['Section 2.02 The Merger 17', 'tr[8]', 'td[3]'],
   ['Section 2.03 Conversion of Shares 18', 'tr[9]', 'td[3]'],
   ['Section 2.04 Exchange and Payment 18', 'tr[10]', 'td[3]'],
   ['Section 2.05 Dissenting Shares 20', 'tr[11]', 'td[3]'],
   ['Section 2.06 Company Equity Awards; ESPP 20', 'tr[12]', 'td[3]'],
   ['Section 2.07 Adjustments 23', 'tr[13]', 'td[3]'],
   ['Section 2.08 Withholding Rights 23', 'tr[14]', 'td[3]'],
   ['Section 2.09 Termination of Macquarie/GCM Merger Agreement and Payment of Termination Fee 23',
    'tr[15]',
    'td[3]'],
   ['ARTICLE 3 THE SURVIVING CORPORATION 23', 'tr[16]', 'td[2]'],
   ['Section 3.01 Cer

# Alaska Project 

## Section Title
"the following terms have the following meanings", 379


## Signature Page

in witness whereof, 2305

## Table of Contents

table of contents 13

# Community Bankers

## Section Title

"following terms are used in this agreement" 247


### Signature

in witness, 1701

# Perspecta

### Section Title
the following terms have the meanings set forth, 349


### Signature 

in witness whereof, 2809

In [91]:
# get_text_tag(filtered_p_tags, 'section')
# get_text_tag(get_div_tags(node_xpath_pairs), '1')

In [62]:
get_text_tag(node_xpath_pairs, "in witness")

[('IN WITNESS WHEREOF',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/center/div/p[1]/b',
  'idx=1701'),
 ('IN WITNESS WHEREOF, Essex Bank and United Bank have each caused this Agreement and Plan of\nMerger to be executed as of the date first above written.',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/center/div/p[1]',
  'idx=2059'),
 ('IN WITNESS WHEREOF, the parties hereto have caused this Agreement to be executed as of the\nday first above written.',
  '/document/type/sequence/filename/description/text/html/body/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/p/

In [92]:
node_idx = 379
node_xpath_pairs[node_idx - 2 : node_idx + 10]

[('DEFINITIONS',
  '/document/type/sequence/filename/description/text/html/body/p[47]'),
 ('Section 1.01',
  '/document/type/sequence/filename/description/text/html/body/p[49]'),
 ('Definitions',
  '/document/type/sequence/filename/description/text/html/body/p[49]/u'),
 ('.', '/document/type/sequence/filename/description/text/html/body/p[49]'),
 ('(a)\xa0\xa0\xa0\xa0As used herein, the following terms have the following meanings:',
  '/document/type/sequence/filename/description/text/html/body/p[51]'),
 ('“2018 Cash Award',
  '/document/type/sequence/filename/description/text/html/body/p[54]/b'),
 ('s” means the cash awards granted to certain employees of the Company in 2018 in lieu of any Company RSU Awards or Company PSU Awards for 2018.',
  '/document/type/sequence/filename/description/text/html/body/p[54]'),
 ('“Acceptable Confidentiality Agreement',
  '/document/type/sequence/filename/description/text/html/body/p[56]/b'),
 ('” means a confidentiality agreement (i) containing terms

## DJANGO

In [9]:
from django.template import Template, Context
from django.conf import settings
 # We have to do this to use django templates standalone - see
# http://stackoverflow.com/questions/98135/how-do-i-use-django-templates-without-the-rest-of-django

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'APP_DIRS': False,
    },
]

settings.configure(TEMPLATES)




# Our template. Could just as easily be stored in a separate file
template = """
<html>
<head>
"""

"""
<title>Template {{ section title }}</title>
</head>
"""

"""
<body>
Body with {{ mystring }}.
{{%if%}}
</body>
"""

"""
</html>
"""

t = Template(template)
c = Context({"title": "title from code",
             "mystring":"string from code"})
print(t.render(c))

AttributeError: 'list' object has no attribute 'TEMPLATES'

## Generate HTML

In [10]:
from lxml import etree

root = etree.Element('DOCUMENT')

type_node = etree.SubElement(root, 'TYPE')
type_node.text = 'EX-2.1'

div = etree.SubElement(type_node, 'sequence')
div.text = '2'

fname = etree.SubElement(div, 'filename')
fname.text = 'ex_219664.htm'

description = etree.SubElement(fname, 'description')
description.text = 'EXHIBIT 2.1'

text = etree.SubElement(description, 'text')

html = etree.SubElement(text, 'html')

head = etree.SubElement(html, 'head')

title = etree.SubElement(html, 'title')
title.text = 'ex_219664.htm'

body = etree.SubElement(html, 'body')

p1 = etree.SubElement(body, 'p')
p1_f = etree.SubElement(p1, 'font')
p1_b = etree.SubElement(p1_f, 'b')
p1_b.text = 'AGREEMENT AND PLAN OF MERGER'

p2 = etree.SubElement(body, 'p')
p2_f = etree.SubElement(p2, 'font')
p2_b = etree.SubElement(p2_f, 'b')
p2_b.text = 'by and among'



html_string = etree.tostring(root, pretty_print=True, method='html').decode('utf-8')

In [11]:
print(html_string)

<DOCUMENT><TYPE>EX-2.1<sequence>2<filename>ex_219664.htm<description>EXHIBIT 2.1<text><html>
<head></head>
<title>ex_219664.htm</title>
<body>
<p><font><b>AGREEMENT AND PLAN OF MERGER</b></font></p>
<p><font><b>by and among</b></font></p>
</body>
</html></text></description></filename></sequence></TYPE></DOCUMENT>



In [12]:
feature_extractor = MarkupLMFeatureExtractor()
encoding = feature_extractor(html_string)

In [13]:
nodes = encoding['nodes'][0]
xpaths = encoding['xpaths'][0]

list(zip(nodes, xpaths))

[('EX-2.1', '/document/type'),
 ('2', '/document/type/sequence'),
 ('ex_219664.htm', '/document/type/sequence/filename'),
 ('EXHIBIT 2.1', '/document/type/sequence/filename/description'),
 ('ex_219664.htm',
  '/document/type/sequence/filename/description/text/html/title'),
 ('AGREEMENT AND PLAN OF MERGER',
  '/document/type/sequence/filename/description/text/html/body/p[1]/font/b'),
 ('by and among',
  '/document/type/sequence/filename/description/text/html/body/p[2]/font/b')]

In [None]:
[('EX-2.1', '/document/type'),
 ('2', '/document/type/sequence'),
 ('ex_219664.htm', '/document/type/sequence/filename'),
 ('EXHIBIT 2.1', '/document/type/sequence/filename/description'),
 ('ex_219664.htm',
  '/document/type/sequence/filename/description/text/html/head/title'),
 ('Exhibit 2.1',
  '/document/type/sequence/filename/description/text/html/body/p[1]/font/b'),
 ('AGREEMENT AND PLAN OF MERGER',
  '/document/type/sequence/filename/description/text/html/body/p[3]/b'),
 ('by and among',
  '/document/type/sequence/filename/description/text/html/body/p[5]/b'),
 ('ALASKA COMMUNICATIONS SYSTEMS GROUP, INC.,',
  '/document/type/sequence/filename/description/text/html/body/p[7]/b'),
 ('PROJECT 8 BUYER, LLC,',
  '/document/type/sequence/filename/description/text/html/body/p[9]/b'),
 ('and',
  '/document/type/sequence/filename/description/text/html/body/p[11]/b'),
 ('PROJECT 8 MERGERSUB, INC.',
  '/document/type/sequence/filename/description/text/html/body/p[13]/b'),
 ('December 31, 2020',
  '/document/type/sequence/filename/description/text/html/body/p[15]/b'),
 ('TABLE OF CONTENTS',
  '/document/type/sequence/filename/description/text/html/body/p[18]/b'),
 ('Page',
  '/document/type/sequence/filename/description/text/html/body/table[1]/tr[1]/td[3]/b'),
 ('ARTICLE 1',
  '/document/type/sequence/filename/description/text/html/body/table[1]/tr[3]/td[1]/p/b'),
 ('DEFINITIONS',
  '/document/type/sequence/filename/description/text/html/body/table[1]/tr[3]/td[1]/p'),
 ('1',
  '/document/type/sequence/filename/description/text/html/body/table[1]/tr[3]/td[2]/p'),
 ('Section 1.01',
  '/document/type/sequence/filename/description/text/html/body/table[1]/tr[4]/td[1]/p'),
 ('Definitions',
  '/document/type/sequence/filename/description/text/html/body/table[1]/tr[4]/td[2]/p')]

In [None]:
tag_list = ['DOCUMENT', 'TYPE', 'SEQUENCE', 'filename', 'description', ]
node_list = []

In [10]:
class Ele(object):
    def __init__(self, tag_name, node_text, parent):
        self.tag_name = tag_name
        self.node_text = node_text
        self.parent = parent
        
    def get_root_tag(self):
        
        root_tag = self.parent
        if not root_tag:
            return self
        
        else:
            while root_tag.parent is not None:
                root_tag = self.parent.get_root_tag()
            
            return root_tag
        
    def add_to_tree(self):
        pass
    
    
    def __str__(self):
        return f"Ele(tag_name={self.tag_name}, node_text={self.node_text}, parent={self.parent})"
    
    def __repr__(self):
        return f"Ele(tag_name={self.tag_name}, node_text={self.node_text}, parent={self.parent})"
        

In [11]:
e1 = Ele('DOCUMENT', None, None)
e2 = Ele('TYPE', 'EX-2.1', e1)

In [12]:
e1

Ele(tag_name=DOCUMENT, node_text=None, parent=None)

In [13]:
e1.get_root_tag()

Ele(tag_name=DOCUMENT, node_text=None, parent=None)

In [14]:
e2 

Ele(tag_name=TYPE, node_text=EX-2.1, parent=Ele(tag_name=DOCUMENT, node_text=None, parent=None))

In [None]:
e2.get_root_tag()

In [None]:
from yattag import Doc

In [None]:
doc, tag, text = Doc().tagtext()

In [None]:
with tag('DOCUMENT'):
    with tag('TYPE'):
        text('EX-2.1')
        with tag('sequence'):
            text('2')
            with tag('filename'):
                text('ex_219664.htm')
                with tag('description'):
                    text('EXHIBIT 2.1')
                    
                    

In [None]:
doc.getvalue()

In [None]:
feature_extractor2 = MarkupLMFeatureExtractor()
encoding2 = feature_extractor2(html_string)

In [None]:
encoding2

In [None]:
nodes, xpaths = encoding2['nodes'][0], encoding['xpaths'][0]

In [None]:
list(zip(nodes, xpaths))

In [None]:
processor2 = AutoProcessor.from_pretrained("microsoft/markuplm-base", truncation=True)
processor2.parse_html = True

encoding2 = processor2(html_string, return_tensors="pt")
encoding2

In [None]:
start_node_idx2 = 0
end_node_idx2 = 200

In [None]:

nodes2 = encoding2['nodes'][0][start_node_idx2: end_node_idx2]
xpaths2 = encoding2['xpaths'][0][start_node_idx2: end_node_idx2]
#xpaths = [x[49:] for x in xpaths]
# print("###################################################")
# print(xpaths)
# print("###################################################")
enc2 = processor(nodes=nodes2, xpaths=xpaths2, return_tensors="pt").to(device)