In [55]:
import sys
import re
import os
from docx import Document
import pandas as pd
from IPython.display import display, HTML

class CorpusProcessor:
    """Load numbered example sentences from Word files and search them.

    On construction every ``.docx``/``.doc`` file found directly inside
    ``path`` is opened with ``Document`` and split into sentence records.
    ``self.result`` then holds one dict per file::

        {"file_name": <base name>,
         "data": [{"num": <str>, "content": <str>}, ...]}

    ``content`` is the text of all paragraphs that follow the numbered
    paragraph, each one prefixed with a newline.
    """

    # A paragraph starting with one or two digits and a dot ("1.", "12.")
    # opens a new sentence record.
    _NUM_RE = re.compile(r"(\d{1,2})\.")

    # Typing one of these at a prompt leaves the search without output.
    _EXIT_INPUTS = ("e", "E")

    # (line prefix, heading printed before the first line with that prefix)
    _ANNOTATIONS = (
        ("#e", "[英文翻譯]"),
        ("#c", "[中文翻譯]"),
        ("#n", "[註釋]"),
    )

    # Row labels for the first three non-empty lines of a record.
    _TIER_LABELS = {0: "泰雅：", 1: "英文：", 2: "中文："}

    def __init__(self, path='.'):
        """Read every Word file in ``path`` into ``self.result``.

        Args:
            path: Directory to scan (not recursive). Defaults to the
                current working directory.
        """
        self.result = []
        self.path = path
        self._process_path(self.path)

    def regex_search(self, query_word):
        """Display every record whose content matches a regular expression.

        Args:
            query_word: Pattern string; it is passed to ``re.compile()``
                first, so regex syntax is accepted. For example
                ``"(LOC|INS)"`` shows all records tagged LOC or INS.

        Raises:
            re.error: If ``query_word`` is not a valid pattern.
        """
        pattern = re.compile(query_word)
        self._display_as_dataframe(self._query_keyword(pattern))

    def easy_search(self):
        """Prompt for comma-separated patterns and display matching records.

        A record is shown only when *every* pattern is found in its content.
        Patterns are not stripped, because surrounding spaces are meaningful
        (``" i "`` finds a free-standing ``i``). Entering ``e`` or ``E``
        returns without searching.
        """
        print('請輸入想要搜尋的字串:')
        print('如果想搜尋多個字串，請在字串之間以逗號分隔，例如: "主格,i,媽媽"')
        print('如果想找的i為單獨出現的i，而非出現在某單詞中的i，請在i的兩邊加上空格')

        x = str(input("請輸入想要查找字詞："))
        if x in self._EXIT_INPUTS:
            return

        # Compile once up front instead of once per record per pattern.
        patterns = [re.compile(word) for word in x.split(',')]
        self._display_as_dataframe(self._query_all_keywords(patterns))

    def search_note(self):
        """Prompt for one pattern and display records whose note matches.

        The note is the text following the first ``#n`` in a record.
        Entering ``e`` or ``E`` returns without searching.
        """
        print('請輸入想要搜尋#note當中的字串:')

        x = str(input("請輸入想要查找字詞："))
        if x in self._EXIT_INPUTS:
            return

        self._display_as_dataframe(self._query_note(re.compile(x)))

    def all_data(self):
        """Return the list of per-file dicts built at construction time."""
        return self.result

    def _display_as_dataframe(self, r):
        """Print each file's hits as a gloss table plus its annotations.

        Args:
            r: Mapping of file name to a list of sentence records, as
                returned by the ``_query_*`` helpers.
        """
        for k, v in r.items():
            print(f"<< {k} >>")
            if not v:
                print("本週無相關資料")
                continue
            for item in v:
                cc = [line for line in item['content'].split('\n') if line != '']
                # Slice rather than index: a record may hold fewer than
                # three non-empty lines.
                tiers = [line.split() for line in cc[:3]]
                if tiers:
                    display(pd.DataFrame(tiers).rename(self._TIER_LABELS, axis='index'))
                for line in self._annotation_lines(cc):
                    print(line)

    @classmethod
    def _annotation_lines(cls, cc):
        """Return the heading/text pairs to print for one record.

        For each of ``#e``, ``#c`` and ``#n`` the first line starting with
        that prefix is emitted after its heading. A missing annotation is
        skipped on its own, so it no longer hides the ones that follow.

        Args:
            cc: The record's non-empty lines.
        """
        out = []
        for prefix, heading in cls._ANNOTATIONS:
            first = next((line for line in cc if line.startswith(prefix)), None)
            if first is not None:
                out.extend((heading, first))
        return out

    def _process_path(self, p):
        """Open and parse every Word file in directory ``p``."""
        for full_path in self._docx_paths(p):
            document = self._open_file(full_path)
            self.result.append(self._process(document, os.path.basename(full_path)))

    @staticmethod
    def _docx_paths(p):
        """Return the full paths of the Word files directly inside ``p``.

        Names starting with ``~`` are skipped. Names are sorted because
        ``os.listdir`` returns them in arbitrary order, which would make
        the output order vary between runs.
        """
        # NOTE(review): ".doc" is accepted as before, but whether Document()
        # can read legacy binary .doc files should be confirmed.
        candidates = (
            os.path.join(p, filename)
            for filename in sorted(os.listdir(p))
            if not filename.startswith("~") and filename.endswith((".docx", ".doc"))
        )
        return [full_path for full_path in candidates if os.path.isfile(full_path)]

    def _open_file(self, name):
        """Open the Word file at path ``name`` and return the document."""
        return Document(name)

    def _process(self, doc, filename):
        """Split a document into numbered sentence records.

        Args:
            doc: Object exposing ``paragraphs``, each with a ``text`` string.
            filename: Name stored under ``"file_name"`` in the result.

        Returns:
            ``{"file_name": filename, "data": [...]}``. A paragraph matching
            ``_NUM_RE`` starts a new record (its own text is not kept);
            every later paragraph is appended to that record's content.
            Paragraphs before the first numbered one are ignored.
        """
        result = {
            "file_name": filename,
            "data": []
        }
        for p in doc.paragraphs:
            match = self._NUM_RE.match(p.text)
            if match:
                result["data"].append({"num": match.group(1), "content": ""})
            elif result["data"]:
                result["data"][-1]["content"] += "\n" + p.text
        return result

    def _query_keyword(self, q):
        """Map each file name to the records whose content matches ``q``."""
        return self._query_all_keywords([q])

    def _query_all_keywords(self, patterns):
        """Map each file name to the records matching every pattern.

        Args:
            patterns: Compiled patterns (or pattern strings) that must all
                be found in a record's content.
        """
        result = {}
        for file in self.result:
            result[file["file_name"]] = [
                sentence
                for sentence in file["data"]
                if all(re.search(q, sentence["content"]) for q in patterns)
            ]
        return result

    def _query_note(self, q):
        """Map each file name to the records whose note matches ``q``.

        The note is everything after the first ``#n`` in the content, so
        records without ``#n`` never match.
        """
        result = {}
        for file in self.result:
            hits = []
            for sentence in file["data"]:
                _, marker, note = sentence["content"].partition("#n")
                if marker and re.search(q, note):
                    hits.append(sentence)
            result[file["file_name"]] = hits
        return result


In [None]:
# Parse the Word files in the current directory, then prompt for a pattern
# to look up inside the "#n" notes of the parsed records.
c = CorpusProcessor(path='.')
c.search_note()


請輸入想要搜尋#note當中的字串:


## 使用說明

### STEP 1: 先實例化`CorpusProcessor`物件

```
p = CorpusProcessor()
```

也可以指定`.docx`檔所在路徑:

```
p = CorpusProcessor(path="<檔案所在路徑>")
```

若不指定path，則預設為與此`.ipynb`檔相同路徑

### STEP 2-1: 簡單搜尋

直接呼叫`easy_search()`方法，會跳出輸入框

```
p.easy_search()
```

### STEP 2-2: 或者你也可以使用regular expression進行搜尋

```
p.regex_search(regex)
```

此方法接收一個regex的字串參數，此字串會先丟入`re.compile()`當中
