# Data preprocessing

The raw data is not in a format suitable for training, so it has to be preprocessed first. It comes as HTML files, which the cells below convert to CSV files.


In [37]:
import os
# Base directory of the corpus; the raw pages live in ROOT/html
ROOT = "../data/doec"

# Create the "processed" dir if it doesn't exist.
# exist_ok=True makes this a single call with no check-then-create race.
os.makedirs(os.path.join(ROOT, "processed"), exist_ok=True)


In [38]:
# Collect the names in the "html" dir that start with "T", in sorted order
files = sorted(
    name
    for name in os.listdir(os.path.join(ROOT, "html"))
    if name.startswith("T")
)

In [70]:
import re
import pandas as pd

# Matches everything from the first "[" to the last "]" on a line
# (the quantifier is greedy and "." does not match newlines).
regex = re.compile(r"\[.*\]")


def split_text(text):
    """
    Split text into id and text

    The id is the first span matched by ``regex`` (square brackets
    included). The returned text is the input with every span matched by
    ``regex`` removed; surrounding whitespace is left untouched.

    Raises IndexError if ``text`` contains no bracketed span; the offending
    text is included in the error message.
    """
    match = regex.search(text)
    if match is None:
        # Same exception type the bare findall(...)[0] lookup used to raise,
        # but carrying the text that failed so it can be located.
        raise IndexError(f"no bracketed id found in: {text!r}")

    # The match always starts with "[" and ends with "]", so no strip needed
    entry_id = match.group(0)
    remainder = regex.sub("", text)

    return entry_id, remainder

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """
    Collect the text of an HTML fragment as a list of string entries.

    A <p> start tag arms the parser, so the next non-empty text chunk starts
    a new entry. Any other start tag disarms it, so the text that follows is
    appended to the most recent entry. An <img> tag is recorded inline as
    "<src>" on the most recent entry.

    Each text chunk is stripped before it is stored, so chunks appended to
    the same entry are joined without any separating whitespace.
    """

    def __init__(self):
        super().__init__()
        # Collected entries, in document order
        self.data = []
        # True while the next text chunk should start a new entry
        self.next_entry = False

    def _append_to_last(self, fragment):
        """Append fragment to the latest entry, starting one if none exists."""
        if self.data:
            self.data[-1] += fragment
        else:
            # Content seen before any entry was opened: start an entry
            # instead of failing on self.data[-1].
            self.data.append(fragment)

    def handle_starttag(self, tag, attrs):
        # Only <p> arms the parser; every other start tag disarms it
        self.next_entry = tag == "p"
        if tag == "img":
            # Use the first src attribute; an <img> without a src value
            # contributes nothing.
            src = next((value for name, value in attrs if name == "src"), None)
            if src is not None:
                self._append_to_last(f"<{src}>")

    def handle_data(self, data):
        data = data.strip()
        if not data:
            return
        if self.next_entry:
            self.data.append(data)
        else:
            self._append_to_last(data)

    def feed(self, data: str) -> list:
        """
        Feed data to the parser and return the entries collected so far.

        Unlike HTMLParser.feed, which returns None, this returns self.data
        (the same list object on every call).
        """
        super().feed(data)
        return self.data

def parse_file(file_path:str):
    """
    Parse one html file into a DataFrame with the columns "id" and "text".

    file_path is a file name relative to ROOT/html. Only the part of the
    file between the first and the second "<hr>" (or the end of the file,
    if there is no second one) is parsed.
    """
    html_path = os.path.join(ROOT,"html",file_path)
    with open(html_path, "r") as handle:
        body = handle.read().split("<hr>")[1]

    # One string per entry, each then split into an (id, text) tuple
    entries = MyHTMLParser().feed(body)
    rows = list(map(split_text, entries))

    return pd.DataFrame(rows, columns=["id", "text"])


# Parse all files and write each one to its own CSV in the "processed" dir
for file in files:
    # Name the CSV after the file being parsed. Using files[0] here sent
    # every result to the same path, each one overwriting the last.
    parse_file(file).to_csv(
        os.path.join(ROOT, "processed", os.path.splitext(file)[0] + ".csv"),
        index=False,
    )




T00010.htm
T00020.htm
T00030.htm
T00040.htm
T00050.htm
T00060.htm
T00070.htm
T00080.htm
T00090.htm
T00100.htm
T00110.htm
T00120.htm
T00130.htm
T00140.htm
T00150.htm
T00160.htm
T00170.htm
T00180.htm
T00190.htm
T00200.htm
T00210.htm
T00220.htm
T00230.htm
T00240.htm
T00250.htm
T00260.htm
T00270.htm
T00280.htm
T00290.htm
T00300.htm
T00310.htm
T00320.htm
T00330.htm
T00340.htm
T00350.htm
T00360.htm
T00370.htm
T00380.htm
T00390.htm
T00400.htm
T00410.htm
T00420.htm
T00430.htm
T00440.htm
T00450.htm
T00460.htm
T00470.htm
T00480.htm
T00490.htm
T00500.htm
T00510.htm
T00520.htm
T00530.htm
T00540.htm
T00550.htm
T00560.htm
T00570.htm
T00580.htm
T00590.htm
T00600.htm
T00610.htm
T00620.htm
T00630.htm
T00640.htm
T00650.htm
T00660.htm
T00670.htm
T00680.htm
T00690.htm
T00700.htm
T00710.htm
T00720.htm
T00730.htm
T00740.htm
T00750.htm
T00760.htm
T00770.htm
T00780.htm
T00790.htm
T00800.htm
T00810.htm
T00820.htm
T00830.htm
T00840.htm
T00850.htm
T00860.htm
T00870.htm
T00880.htm
T00890.htm
T00900.htm
T00910.htm

In [65]:
# Match "[<number> (<number><optional letters>)]", i.e. a number with a unit
# NOTE(review): the class [\d\.*\d] also accepts a literal "*" and repeated
# "." characters — confirm that is intended.
# NOTE(review): this pattern appears to match both bracketed groups in the
# sample below, so the recorded [] output looks stale — re-run to confirm.
regex = re.compile(r"\[[\d\.*\d]+\s*\([\d\.*\d]+[a-zA-Z]*\)\]")
re.findall(regex, "1.0 [1.00 (1111.2a)] [1 (111.11445)]")

[]

In [67]:
# Match everything from the first "[" to the last "]" (greedy), so several
# bracketed groups on one line come back as a single match
regex = re.compile(r"\[.*\]")
re.findall(regex, "1.0 [1.00 (1111.2a)] [1 (111.11445)]")

['[1.00 (1111.2a)] [1 (111.11445)]']