In [None]:
import pandas as pd
import re

In [None]:
class WikiPreprocessor:
    """Flatten a JSON-lines dump of documents into one row per leaf section.

    Each input record provides ``id``, ``url``, ``title`` and ``text``.
    Inside ``text``, a line starting with ``###section<N>###`` opens a section
    of nesting level ``N``; the rest of that line is the section title.
    """

    # Heading marker, compiled once.  Raw string so ``\d`` is a regex escape
    # rather than an (invalid) string escape.
    _SECTION_RE = re.compile(r'###section(\d+)###')

    def _read_json(self, path):
        """Read a JSON-lines file, keeping only id, url, title and text.

        Any other column (e.g. ``revid``) is dropped.
        """
        df = pd.read_json(path, lines=True)
        return df.loc[:, ['id', 'url', 'title', 'text']]

    def _split_doc(self, df, doc_idx):
        """Split one document into sections.

        Args:
            df: Frame with ``id``, ``url``, ``title`` and ``text`` columns.
            doc_idx: Index label of the document inside ``df``.

        Returns:
            Tuple ``(doc_id, doc_url, doc_title, sub_lvs, sub_titles,
            sub_texts)``.  The three lists are parallel; entry 0 is the
            document itself (level 1, title = document title).  A section's
            title is prefixed with the title of its parent (the nearest
            preceding entry with a strictly lower level), and its text starts
            as a copy of what the parent had accumulated when the heading was
            met.  A heading with no such parent gets an unprefixed title and
            an empty starting text.
        """
        doc_id = df['id'][doc_idx]
        doc_url = df['url'][doc_idx]
        doc_title = df['title'][doc_idx]
        doc_text = df['text'][doc_idx]

        sub_lvs = [1]
        sub_titles = [doc_title]
        sub_texts = ['']
        for line in doc_text.splitlines():
            # Referenced through the class name so the helper does not depend
            # on what is passed as ``self`` (``preproc`` passes the class).
            m = WikiPreprocessor._SECTION_RE.match(line)
            if m is None:
                # Body text always belongs to the most recently opened section.
                sub_texts[-1] += ' ' + line
                continue

            level = int(m.group(1))
            heading = line[m.end():]

            # Parent = nearest preceding entry with a strictly lower level.
            parent = next(
                (i for i in range(len(sub_lvs) - 1, -1, -1)
                 if sub_lvs[i] < level),
                None,
            )
            if parent is None:
                # No shallower entry exists (heading level <= root level).
                # Still append a text slot so the three lists stay aligned;
                # otherwise later text is glued to the wrong section and
                # _append_rows indexes past the end of sub_texts.
                sub_titles.append(heading)
                sub_texts.append('')
            else:
                sub_titles.append(sub_titles[parent] + ' ' + heading)
                sub_texts.append(sub_texts[parent])
            sub_lvs.append(level)
        return doc_id, doc_url, doc_title, sub_lvs, sub_titles, sub_texts

    def _append_rows(self, table, doc_id, doc_url, doc_title, sub_lvs, sub_titles, sub_texts):
        """Append one row per leaf section of a document to ``table``.

        A section is a leaf when it is the last one or when the next section
        is not nested deeper.  Rows are stored under consecutive integer keys
        starting at ``len(table)``.  ``sub_id`` numbers the leaves from 0 and
        ``nos`` is the number of leaf rows emitted for this document.

        Returns:
            The same ``table`` dict, mutated in place.
        """
        start = len(table)
        last = len(sub_lvs) - 1
        leaves = [
            i for i, lv in enumerate(sub_lvs)
            if i == last or lv >= sub_lvs[i + 1]
        ]
        nos = len(leaves)
        for sub_id, i in enumerate(leaves):
            table[start + sub_id] = {
                'id': doc_id,
                'url': doc_url,
                'title': doc_title,
                'nos': nos,
                'sub_id': sub_id,
                'sub_title': sub_titles[i],
                'text': sub_texts[i],
            }
        return table

    @classmethod
    def preproc(cls, path):
        """Read the JSON-lines file at ``path`` and return the section table.

        Returns:
            DataFrame with columns id, url, title, nos, sub_id, sub_title and
            text; one row per leaf section.
        """
        table = {}
        # The helpers take ``self`` but use no instance state, so the class
        # itself is passed in its place.
        df = cls._read_json(cls, path)
        # Iterate index labels: _split_doc looks rows up by label.
        for doc_idx in df.index:
            parts = cls._split_doc(cls, df, doc_idx)
            table = cls._append_rows(cls, table, *parts)
        return pd.DataFrame.from_dict(table, 'index')