# Data preprocessing

The raw data is not in the format needed for training, so it has to be preprocessed first. It comes as HTML files, which the cells below convert into a single table (the result is saved as a Parquet file).


In [1]:
import os
ROOT = "../data/doec"

# Create the "processed" dir if it doesn't exist.
# makedirs(exist_ok=True) avoids the check-then-create race of
# os.path.exists + os.mkdir and also creates missing parent directories.
os.makedirs(os.path.join(ROOT, "processed"), exist_ok=True)


In [2]:
# Names of all entries in the "html" dir that start with "T", sorted
files = sorted(
    name
    for name in os.listdir(os.path.join(ROOT, "html"))
    if name.startswith("T")
)

In [4]:
import re
import pandas as pd

# Match everything inside square brackets. The ".*" is greedy, so within one
# line the match runs from the first "[" to the last "]".
regex = re.compile(r"\[.*\]")


def split_text(text):
    """
    Split an entry into its bracketed id and the remaining text.

    The id is the first match of the module-level ``regex`` (brackets
    included). The text is what is left after removing every match of that
    pattern, with surrounding whitespace stripped.

    Args:
        text: Raw entry string, expected to contain a "[...]" id.

    Returns:
        A tuple ``(id, text)``.

    Raises:
        ValueError: If ``text`` contains no bracketed id. The offending
            entry is included in the message.
    """
    match = regex.search(text)
    if match is None:
        # Fail once, with the offending entry in the message, instead of
        # printing it and then crashing on the same lookup.
        raise ValueError(f"no bracketed id found in entry: {text!r}")

    # The match starts with "[" and ends with "]", so there is no
    # surrounding whitespace to strip from the id.
    entry_id = match.group(0)
    remainder = regex.sub("", text).strip()

    return entry_id, remainder

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """
    Collect the text of an HTML fragment as a list of entry strings.

    Text that directly follows a ``<p>`` start tag opens a new entry. Text
    that follows any other start tag is appended to the latest entry without
    a separator. An ``<img>`` tag contributes ``<SRC>`` (its ``src``
    attribute in angle brackets) to the latest entry.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        super().__init__()
        # Entries collected so far, in document order
        self.data = []
        # True when the most recent start tag was <p>, i.e. the next text
        # chunk should open a new entry
        self.next_entry = False

    def _append_to_current(self, chunk):
        """Append chunk to the latest entry, opening one if none exists yet."""
        if self.data:
            self.data[-1] += chunk
        else:
            # Content seen before any <p>: start an entry rather than
            # failing with IndexError on self.data[-1].
            self.data.append(chunk)

    def handle_starttag(self, tag, attrs):
        # Only a <p> tag makes the following text start a new entry
        self.next_entry = tag == "p"
        if tag != "img":
            return

        # Record the image source inline; an <img> without a src value has
        # nothing to record and is skipped.
        src = next((value for name, value in attrs if name == "src"), None)
        if src is not None:
            self._append_to_current(f"<{src}>")

    def handle_data(self, data):
        data = data.strip()
        if not data:
            # Whitespace-only chunks are ignored and leave next_entry as is
            return
        if self.next_entry:
            self.data.append(data)
        else:
            self._append_to_current(data)

    def feed(self, data: str) -> list:
        """Feed HTML to the parser and return the list of entries so far."""
        super().feed(data)
        return self.data

def parse_file(file_path: str, encoding=None):
    """
    Parse an html file and return a dataframe with columns "id" and "text".

    Only the section between the first and the second "<hr>" of the file is
    parsed. Each entry found there is split into its bracketed id and the
    remaining text.

    Args:
        file_path: File name relative to the "html" dir under ROOT.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default; pass e.g. "utf-8" to make the
            result independent of the machine's locale.

    Returns:
        A ``pandas.DataFrame`` with one row per entry.

    Raises:
        ValueError: If the file contains no "<hr>" separator.
    """
    with open(os.path.join(ROOT, "html", file_path), "r", encoding=encoding) as f:
        raw = f.read()

    sections = raw.split("<hr>")
    if len(sections) < 2:
        # Name the file instead of surfacing a bare IndexError
        raise ValueError(f"no <hr> separator found in {file_path!r}")

    parser = MyHTMLParser()
    entries = parser.feed(sections[1])

    rows = [split_text(entry) for entry in entries]

    return pd.DataFrame(rows, columns=["id", "text"])


# Parse every file into its own dataframe, tagged with the source file name
frames = [parse_file(name).assign(filename=name) for name in files]

# Stack the per-file frames into one table with a fresh 0..n-1 index
data = pd.concat(frames, ignore_index=True)

# Persist the combined table as parquet
data.to_parquet("../data/doec.parquet")


In [9]:
# Show the text of the row labelled 0, i.e. the first parsed entry
# (the index was reset by ignore_index=True when concatenating).
data["text"][0]

'Us is riht micel ðæt we rodera weard, wereda wuldorcining, wordum herigen, modum lufien.'

In [65]:
# Match a bracketed token of the form "[<number> (<number><letters>)]",
# e.g. "[1.00 (1111.2a)]". A number is digits with an optional fraction;
# the previous character class [\d\.*\d] also accepted "*" and bare dots.
# Non-capturing groups keep findall() returning the whole match.
# NOTE: this rebinds the module-level `regex` that split_text() reads.
regex = re.compile(r"\[\d+(?:\.\d+)?\s*\(\d+(?:\.\d+)?[a-zA-Z]*\)\]")
regex.findall("1.0 [1.00 (1111.2a)] [1 (111.11445)]")

[]

In [67]:
# Regex match everything inside the square brackets.
# The ".*" is greedy, so within one line the match runs from the first "["
# to the last "]": both bracketed groups of the sample come back as a
# single match.
regex = re.compile(r"\[.*\]")
regex.findall("1.0 [1.00 (1111.2a)] [1 (111.11445)]")

['[1.00 (1111.2a)] [1 (111.11445)]']