In [None]:
import pandas as pd
import re
import os
import multiprocessing
from konlpy.tag import Mecab

In [None]:
class WikiPreprocessor:
    """Split JSON-lines wiki documents into leaf sections and tokenize them.

    The input files are read with ``pandas.read_json(..., lines=True)`` and
    must provide the columns ``id``, ``url``, ``title`` and ``text``.  Section
    headings inside ``text`` are lines starting with ``###section<N>###``
    where ``<N>`` is the heading level.
    """

    # Heading marker at the start of a line; the captured number is the level.
    _SECTION_RE = re.compile(r'###section(\d+)###')

    # Column order of the rows produced by ``_append_rows``.
    _ROW_COLUMNS = ['id', 'url', 'title', 'nos', 'sub_id', 'sub_title', 'text']

    def _read_json(self, path):
        """Read a JSON-lines file, keeping only id, url, title and text."""
        df = pd.read_json(path, lines=True)
        return df.loc[:, ['id', 'url', 'title', 'text']]

    @staticmethod
    def _find_parent(sub_lvs, sub_lv):
        """Return the index of the nearest earlier section with a lower level.

        Returns ``None`` when no earlier section has a level below ``sub_lv``.
        """
        for idx in range(len(sub_lvs) - 1, -1, -1):
            if sub_lvs[idx] < sub_lv:
                return idx
        return None

    def _split_doc(self, df, doc_idx):
        """Split one document into parallel lists of section data.

        Args:
            df: Frame with ``id``, ``url``, ``title`` and ``text`` columns.
            doc_idx: Index label of the document row inside ``df``.

        Returns:
            ``(doc_id, doc_url, doc_title, sub_lvs, sub_titles, sub_texts)``.
            The three lists always have the same length.  Entry 0 is the
            document itself (level 1, titled with the document title).  Each
            later entry is one heading: its title is prefixed with the title
            of the nearest preceding lower-level section, and its text starts
            as a copy of that section's text collected so far.
        """
        doc_id = df['id'][doc_idx]
        doc_url = df['url'][doc_idx]
        doc_title = df['title'][doc_idx]
        doc_text = df['text'][doc_idx]

        sub_lvs = [1]
        sub_titles = [doc_title]
        sub_texts = ['']
        for line in doc_text.splitlines():
            m = self._SECTION_RE.match(line)
            if m is None:
                # Body line: belongs to the most recently opened section.
                sub_texts[-1] += ' ' + line
                continue

            sub_lv = int(m.group(1))
            sub_title = line[m.end():]
            parent = self._find_parent(sub_lvs, sub_lv)
            if parent is None:
                # No lower-level ancestor (heading at or above the root
                # level): start with empty text so that the three lists stay
                # aligned instead of merging into the previous section.
                sub_text = ''
            else:
                sub_title = sub_titles[parent] + ' ' + sub_title
                sub_text = sub_texts[parent]

            sub_lvs.append(sub_lv)
            sub_titles.append(sub_title)
            sub_texts.append(sub_text)
        return doc_id, doc_url, doc_title, sub_lvs, sub_titles, sub_texts

    def _append_rows(self, table, doc_id, doc_url, doc_title, sub_lvs, sub_titles, sub_texts):
        """Append one row per non-empty leaf section of a document to ``table``.

        A section is a leaf when it is the last one or when the next section
        is not deeper than it.  Sections with empty text are skipped.

        Args:
            table: Dict keyed by consecutive integers; mutated in place.
            doc_id, doc_url, doc_title: Document metadata copied to each row.
            sub_lvs, sub_titles, sub_texts: Parallel lists from ``_split_doc``.

        Returns:
            The same ``table`` object.  Each new row stores ``nos`` (number
            of rows added for this document) and ``sub_id`` (0-based position
            among them).
        """
        first_key = len(table)
        last = len(sub_lvs) - 1
        leaves = []
        for i, lv in enumerate(sub_lvs):
            if not sub_texts[i]:
                continue
            if i == last or lv >= sub_lvs[i + 1]:
                leaves.append((sub_titles[i], sub_texts[i]))

        nos = len(leaves)
        for sub_id, (sub_title, text) in enumerate(leaves):
            table[first_key + sub_id] = {
                'id': doc_id, 'url': doc_url, 'title': doc_title, 'nos': nos,
                'sub_id': sub_id, 'sub_title': sub_title, 'text': text,
            }
        return table

    def _split_doc_in_files(self, fpaths):
        """Split every document of every file into a frame of leaf sections."""
        table = {}
        for fpath in fpaths:
            df = self._read_json(fpath)
            for doc_idx in range(df.shape[0]):
                table = self._append_rows(table, *self._split_doc(df, doc_idx))
        if not table:
            # Keep the expected columns even when nothing was extracted.
            return pd.DataFrame(columns=self._ROW_COLUMNS)
        return pd.DataFrame.from_dict(table, 'index')

    def _get_fpaths(self, dpath):
        """Return the sorted paths of all non-directory entries under ``dpath``."""
        fpaths = []
        pending = [dpath]
        while pending:
            current = pending.pop()
            for item in os.listdir(current):
                path = os.path.join(current, item)
                if os.path.isdir(path):
                    pending.append(path)
                else:
                    fpaths.append(path)
        # One sort at the end yields the same order as sorting per directory.
        fpaths.sort()
        return fpaths

    def split_doc_in_files(self, path, save_path=None):
        """Split the documents of a file, or of all files under a directory.

        Args:
            path: A JSON-lines file, or a directory searched recursively.
            save_path: If given, the resulting frame is also written there
                with ``DataFrame.to_json``.

        Returns:
            A frame with one row per leaf section.
        """
        fpaths = self._get_fpaths(path) if os.path.isdir(path) else [path]
        df = self._split_doc_in_files(fpaths)
        if save_path:
            df.to_json(save_path)
        return df

    def df_to_txt(self, df, save_path=None):
        """Flatten a section frame into alternating title and text lines.

        Args:
            df: Frame with ``sub_title`` and ``text`` columns.
            save_path: If given, the lines are also written there as UTF-8.

        Returns:
            A list ``[sub_title + '\\n', text + '\\n', ...]`` in row order.
        """
        docs = []
        if len(df.index) > 0:  # an empty frame may not have the columns at all
            for sub_title, text in zip(df['sub_title'], df['text']):
                docs.append(sub_title + '\n')
                docs.append(text + '\n')
        if save_path:
            # Explicit encoding: the platform default may not cover the text.
            with open(save_path, 'wt', encoding='utf-8') as f:
                f.writelines(docs)
        return docs

    def _tokenize_process(self, outlist, pid, lines, a, b, tags):
        """Worker body: tokenize ``lines[a:b]`` and store it in ``outlist[pid]``.

        Each output line is the space-joined first elements of the
        ``Mecab().pos(line)`` items whose second element is in ``tags``,
        followed by a newline.
        """
        tokenizer = Mecab()
        tokenized_lines = []
        for line in lines[a:b]:
            kept = [token[0] for token in tokenizer.pos(line) if token[1] in tags]
            tokenized_lines.append(' '.join(kept) + '\n')
        outlist[pid] = tokenized_lines

    def tokenize(self, lines, save_path=None, workers=8, tags=None):
        """Tokenize each line with Mecab in parallel, keeping only some tags.

        Args:
            lines: Sliceable sequence of strings.
            save_path: If given, the tokenized lines are written there (UTF-8).
            workers: Maximum number of worker processes (at least 1).
            tags: POS tags to keep; defaults to ``['NNG', 'NNP']``
                (presumably common and proper nouns -- confirm against the
                Mecab tag set).

        Returns:
            One newline-terminated string per input line, in input order.

        Raises:
            ValueError: If ``workers`` is smaller than 1.
            RuntimeError: If a worker process ends without producing output.
        """
        if workers < 1:
            raise ValueError('workers must be at least 1')
        if not tags:
            tags = ['NNG', 'NNP']

        tokenized = []
        total = len(lines)
        if total:  # nothing to do for empty input, so spawn no processes
            n_procs = min(workers, total)
            base, extra = divmod(total, n_procs)
            with multiprocessing.Manager() as manager:
                # None marks "no result yet" so a dead worker is detectable.
                outlist = manager.list([None] * n_procs)
                procs = []
                start = 0
                for pid in range(n_procs):
                    # The first `extra` workers take one additional line.
                    stop = start + base + (1 if pid < extra else 0)
                    # Hand each worker only its own chunk.
                    proc = multiprocessing.Process(
                        target=self._tokenize_process,
                        args=(outlist, pid, lines[start:stop], 0, stop - start, tags))
                    proc.start()
                    procs.append(proc)
                    start = stop
                for proc in procs:
                    proc.join()
                for pid, chunk in enumerate(outlist):
                    if chunk is None:
                        raise RuntimeError(
                            'tokenizer worker %d produced no output (exit code %s)'
                            % (pid, procs[pid].exitcode))
                    tokenized += chunk

        if save_path:
            with open(save_path, 'wt', encoding='utf-8') as f:
                f.writelines(tokenized)
        return tokenized