In [1]:
import sys
import os
# Make the local source trees importable (each path is appended at most once).
for extra_path in ("../src", "../../pyASBC/src"):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Set before the tagger is created in a later cell.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import math
from pathlib import Path
from itertools import islice
from pyASBC import Asbc5Corpus
from dotted_wsd import DottedWsdTagger
from tqdm.auto import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
# Build the corpus reader, pointing it at the ../../pyASBC/data directory.
corpus = Asbc5Corpus("../../pyASBC/data")

In [4]:
# Sanity check: show the first five words yielded by the corpus.
list(islice(corpus.iter_words(), 5))

['時間', '三月', '十日', '星期四', '上午']

In [5]:
# Instantiate the tagger with default arguments; the cells below use it
# through sense_tag_per_sentence().
tagger = DottedWsdTagger()

```
n_sentence = sum(1 for _ in corpus.iter_sentences())
print(n_sentence)
## 1,396,133
```

In [6]:
# Hard-coded sentence count, used as the progress-bar total and to derive the
# number of batches, so the corpus does not have to be iterated just to count.
n_sentence = 1_396_133

In [7]:
def parse_prediction(pred_str: str):
    """Split a tagger prediction string into three string fields.

    The string is split on single spaces. When that yields exactly three
    fields, the first and last fields are returned with their first and last
    characters stripped, and the middle field is returned unchanged. In the
    sample output of this notebook the three fields are a sense id, a sense
    definition and a score.

    Args:
        pred_str: Prediction string attached to a token by the tagger.

    Returns:
        A 3-tuple of strings; ``("", "", "")`` when the string does not split
        into exactly three fields.
    """
    fields = pred_str.split(" ")
    if len(fields) != 3:
        # NOTE(review): this also covers strings with more than three fields
        # (e.g. a middle field that itself contains a space); such tokens are
        # reported as untagged. Confirm that this is intended.
        return ("", "", "")
    first, middle, last = fields
    return first[1:-1], middle, last[1:-1]


def tok_func(x):
    """Return the first two items of a token tuple as a pair."""
    return (x[0], x[1])


def tagged_func(x):
    """Expand a tagged token into a flat 5-tuple.

    Keeps the first two items of ``x`` and replaces its third item (the raw
    prediction string) with the three fields from ``parse_prediction``.
    """
    return (*x[:2], *parse_prediction(x[2]))


## Debug on one sentence

In [8]:
# Pick the sentence at index 10 (the eleventh one yielded) as a debugging sample.
sent_t = list(islice(corpus.iter_sentences(), 10, 11))[0]
# Display it as the cell output.
sent_t

[('與', 'P', ''),
 ('美國', 'Nc', ''),
 ('大學', 'Nc', ''),
 ('聯繫', 'VC', ''),
 ('商討', 'VE', ''),
 ('長期', 'Nd', ''),
 ('合作', 'VH', ''),
 ('事宜', 'Na', ''),
 ('，', 'COMMACATEGORY', '')]

In [9]:
# Run the same per-sentence pipeline as the main loop on the sample sentence:
# reduce each token to its first two items, tag, then flatten each prediction.
tok_seq_t = [tok_func(tok) for tok in sent_t]
sense_tagged_t = [
    tagged_func(tagged)
    for tagged in tagger.sense_tag_per_sentence(tok_seq_t)
]
print(sense_tagged_t)

[('與', 'P', '04001305', '引介共同做事的對象。', '0.5112'), ('美國', 'Nc', '06681801', '國名，北美洲中部的國家，位於加拿大、墨西哥之間，共有五十州。', '0.9446'), ('大學', 'Nc', '06587001', '學校系統中最高等級，授予學位的學校。', '0.9890'), ('聯繫', 'VC', '', '', ''), ('商討', 'VE', '', '', ''), ('長期', 'Nd', '09290701', '一段長時間。', '1.0000'), ('合作', 'VH', '06639101', '形容兩個以上特定對象互相配合做特定事件，以達成特定目標。', '1.0000'), ('事宜', 'Na', '', '', ''), ('，', 'COMMACATEGORY', '', '', '')]


## Main loop

In [10]:
# Directory that receives the tagged batch files; created (with any missing
# parents) if it does not exist yet.
out_dir = Path("../data/dt-asbc")
out_dir.mkdir(exist_ok=True, parents=True)

In [None]:
batch_size = 10_000
# batch_size = 20
# Highest batch index to produce; the loop stops once batch_idx exceeds it.
max_batch_idx = 6
batch_idx = 0
n_batch = math.ceil(n_sentence / batch_size)
path_templ = f"asbc_dotted_tagged_{{batch_idx:03d}}-of-{n_batch}.txt"


def format_tagged_sentence(sense_tagged):
    """Render one tagged sentence as a single line of space-separated items.

    Each token becomes ``<item 0>-<item 2>`` when item 2 is non-empty and does
    not start with ``"RP:"``; otherwise it becomes ``<item 0>-<item 1>``.
    """
    items = []
    for tagged_tok in sense_tagged:
        if tagged_tok[2] and not tagged_tok[2].startswith("RP:"):
            # it is tagged
            items.append(f"{tagged_tok[0]}-{tagged_tok[2]}")
        else:
            # it is not tagged
            items.append(f"{tagged_tok[0]}-{tagged_tok[1]}")
    return " ".join(items)


def start_batch(idx):
    """Prepare the output for batch ``idx``.

    Returns ``(final_path, part_path, handle)``. ``handle`` is ``None`` when the
    final file already exists, which makes the loop skip tagging for that
    batch. Otherwise ``handle`` writes to ``part_path``, a temporary sibling of
    the final file, so an interrupted run never leaves a truncated file under
    the final name.
    """
    final_path = out_dir / path_templ.format(batch_idx=idx)
    part_path = final_path.with_name(final_path.name + ".part")
    if final_path.exists():
        return final_path, part_path, None
    return final_path, part_path, part_path.open("w", encoding="utf-8")


def finish_batch(handle, part_path, final_path):
    """Close a fully written batch and move it to its final name."""
    if handle is None:
        return
    handle.close()
    part_path.replace(final_path)


batch_path, part_path, fout = start_batch(batch_idx)

try:
    for sent_i, sent_x in enumerate(tqdm(corpus.iter_sentences(), total=n_sentence)):
        ## if the batch file already exists, fout is None,
        ## then skip the tagging part
        if fout:
            ## tagging
            try:
                tok_seq = list(map(tok_func, sent_x))
                sense_tagged = tagger.sense_tag_per_sentence(tok_seq)
                sense_tagged = list(map(tagged_func, sense_tagged))
                line = format_tagged_sentence(sense_tagged)
            except Exception as ex:
                # Best effort: report the failing sentence and move on; the
                # sentence is omitted from the output.
                print(f"sentence {sent_i}: {ex!r}")
            else:
                # The line is written in one call, so a failure above can
                # never leave a partial line in the file.
                fout.write(line + "\n")

        if (sent_i + 1) % batch_size == 0:
            finish_batch(fout, part_path, batch_path)
            fout = None
            batch_idx += 1

            if batch_idx > max_batch_idx:
                break
            batch_path, part_path, fout = start_batch(batch_idx)

    # Reached only when the loop ended normally (corpus exhausted or batch
    # limit hit): the batch still open, if any, is complete.
    finish_batch(fout, part_path, batch_path)
    fout = None
finally:
    # Interrupted or failed mid-batch: release the handle but leave the
    # ".part" file unpromoted, so the batch is redone on the next run.
    if fout:
        fout.close()

  0%|          | 0/1396133 [00:00<?, ?it/s]

In [None]:
# Print SHA-1 checksums of the .txt files in the output directory.
!sha1sum ../data/dt-asbc/*.txt