In [22]:
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from transformers import BertJapaneseTokenizer
from pathlib import Path


In [25]:
seed = 202105

# main'
main_path = Path('/home/jupyter/gogolook')
main_cached_path = Path('/home/jupyter/gogolook/data')

# general setting
main_data_path = main_path / 'data' / 'learning_test_data'
cache_models_path = main_cached_path / 'cache_models_dir'


In [None]:
mecab_tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese", word_tokenizer_type="mecab", cache_dir=cache_models_path)
basic_tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese", word_tokenizer_type="basic", cache_dir=cache_models_path)
char_tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese-char", word_tokenizer_type="basic", subword_tokenizer_type="character", cache_dir=cache_models_path)


In [28]:
# Input Japanese Text
#line = "„Ç¢„É≥„Éë„Çµ„É≥„Éâ (&„ÄÅËã±Ë™ûÂêçÔºö) „Å®„ÅØ‰∏¶Á´ãÂä©Ë©û„Äå‚Ä¶„Å®‚Ä¶„Äç„ÇíÊÑèÂë≥„Åô„ÇãË®òÂè∑„Åß„ÅÇ„Çã„ÄÇ„É©„ÉÜ„É≥Ë™û„ÅÆ „ÅÆÂêàÂ≠ó„Åß„ÄÅTrebuchet MS„Éï„Ç©„É≥„Éà„Åß„ÅØ„ÄÅ„Å®Ë°®Á§∫„Åï„Çå \"et\" „ÅÆÂêàÂ≠ó„Åß„ÅÇ„Çã„Åì„Å®„ÅåÂÆπÊòì„Å´„Çè„Åã„Çã„ÄÇ"
line = "„Ç¨„ÉÉ„Ç≠„ÉºÈÄÉ„ÅíÊÅ•Â©ö"
mecab_inputs = mecab_tokenizer(line, return_tensors="pt")
print(mecab_tokenizer.tokenize(line))
print(mecab_tokenizer.decode(mecab_inputs['input_ids'][0]))
corpus_size = len(mecab_tokenizer)


['„Ç¨', '##„ÉÉ„Ç≠„Éº', 'ÈÄÉ„Åí', 'ÊÅ•', 'Â©ö']
[CLS] „Ç¨„ÉÉ„Ç≠„Éº ÈÄÉ„Åí ÊÅ• Â©ö [SEP]


In [8]:
mecab_tokenizer.word_tokenizer.mecab.dictionary_info

[{'filename': '/home/jupyter/.local/lib/python3.7/site-packages/ipadic/dicdir/sys.dic',
  'charset': 'utf8',
  'size': 392126,
  'version': 102}]

In [13]:
mecab_tokenizer.vocab_size, basic_tokenizer.vocab_size, char_tokenizer.vocab_size

(32000, 32000, 4000)

In [26]:

print(mecab_tokenizer.tokenize(line))
print(basic_tokenizer.tokenize(line))
print(char_tokenizer.tokenize(line))


['„Ç¨', '##„ÉÉ„Ç≠„Éº', 'ÈÄÉ„Åí', 'ÊÅ•', 'Â©ö']
['„Ç¨', '##„ÉÉ„Ç≠„Éº', '##ÈÄÉ', '##„Åí', '##ÊÅ•', '##Â©ö']
['„Ç¨', '„ÉÉ', '„Ç≠', '„Éº', 'ÈÄÉ', '„Åí', 'ÊÅ•', 'Â©ö']


In [19]:
basic_inputs['input_ids'].shape


torch.Size([1, 69])

In [20]:
mecab_inputs['input_ids'].shape

torch.Size([1, 62])

In [20]:
mecab_tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [37]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
                      

In [38]:
# pre tokenizers ÊòØÁî®‰æÜËôïÁêÜÂ¶Ç‰ΩïÊñ∑Ë©û
from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()
#files = [str(main_data_path / "wikitext-103-raw" / f"wiki.{split}.raw") for split in ["test", "train", "valid"]]
#tokenizer.train(files, trainer)


In [40]:
jp_file = "/home/jupyter/gogolook/data/jp_data/total_pretraining_data/train_all-maxseq512_AA.parquet"
tokenizer.train(jp_file, trainer)


Exception: No such file or directory (os error 2)

In [30]:
tokenizer.save(str(main_data_path / "tokenizer-wiki.json"))


In [34]:
tokenizer = Tokenizer.from_file(str(main_data_path / "tokenizer-wiki.json"))
output = tokenizer.encode("Hello, y'all! How are you üòÅ ?")
output.tokens


['Hello', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '[UNK]', '?']

In [None]:
# post processing ÊòØÁÇ∫‰∫ÜÂ∞ç [CLS] Ëàá [SEP] ÁöÑËôïÁêÜ
from tokenizers.processors import TemplateProcessing

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Trace japanese tokenizer

In [47]:
from transformers.models import bert
import collections
import os
import unicodedata
from typing import List, Optional

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.
        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.
        Returns:
          A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
    

In [63]:

def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab

#vocab = load_vocab('./wiki-ja_albert.vocab')
vocab = mecab_tokenizer.vocab


In [71]:
def _is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False



def _tokenize_chinese_chars(text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if _is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

In [88]:
text = "„Ç¢„É≥„Éë„Çµ„É≥„Éâ (&„ÄÅËã±Ë™ûÂêçÔºö) „Å®„ÅØ‰∏¶Á´ãÂä©Ë©û„Äå‚Ä¶„Å®‚Ä¶„Äç„ÇíÊÑèÂë≥„Åô„ÇãË®òÂè∑„Åß„ÅÇ„Çã„ÄÇ„É©„ÉÜ„É≥Ë™û„ÅÆ „ÅÆÂêàÂ≠ó„Åß„ÄÅTrebuchet MS„Éï„Ç©„É≥„Éà„Åß„ÅØ„ÄÅ„Å®Ë°®Á§∫„Åï„Çå \"et\" „ÅÆÂêàÂ≠ó„Åß„ÅÇ„Çã„Åì„Å®„ÅåÂÆπÊòì„Å´„Çè„Åã„Çã„ÄÇ"
print(_is_chinese_char(ord('Ëã±')))
print(_tokenize_chinese_chars(text))


True
„Ç¢„É≥„Éë„Çµ„É≥„Éâ (&„ÄÅ Ëã±  Ë™û  Âêç Ôºö) „Å®„ÅØ ‰∏¶  Á´ã  Âä©  Ë©û „Äå‚Ä¶„Å®‚Ä¶„Äç„Çí ÊÑè  Âë≥ „Åô„Çã Ë®ò  Âè∑ „Åß„ÅÇ„Çã„ÄÇ„É©„ÉÜ„É≥ Ë™û „ÅÆ „ÅÆ Âêà  Â≠ó „Åß„ÄÅTrebuchet MS„Éï„Ç©„É≥„Éà„Åß„ÅØ„ÄÅ„Å® Ë°®  Á§∫ „Åï„Çå "et" „ÅÆ Âêà  Â≠ó „Åß„ÅÇ„Çã„Åì„Å®„Åå ÂÆπ  Êòì „Å´„Çè„Åã„Çã„ÄÇ


In [89]:
norm_text = unicodedata.normalize("NFKC", text)
print(norm_text)

„Ç¢„É≥„Éë„Çµ„É≥„Éâ (&„ÄÅËã±Ë™ûÂêç:) „Å®„ÅØ‰∏¶Á´ãÂä©Ë©û„Äå...„Å®...„Äç„ÇíÊÑèÂë≥„Åô„ÇãË®òÂè∑„Åß„ÅÇ„Çã„ÄÇ„É©„ÉÜ„É≥Ë™û„ÅÆ „ÅÆÂêàÂ≠ó„Åß„ÄÅTrebuchet MS„Éï„Ç©„É≥„Éà„Åß„ÅØ„ÄÅ„Å®Ë°®Á§∫„Åï„Çå "et" „ÅÆÂêàÂ≠ó„Åß„ÅÇ„Çã„Åì„Å®„ÅåÂÆπÊòì„Å´„Çè„Åã„Çã„ÄÇ


In [95]:
import nagisa
words = nagisa.tagging(text)
print(words.words)

['„Ç¢„É≥„Éë', '„Çµ„É≥„Éâ', '\u3000', '(', '&', '„ÄÅ', 'Ëã±Ë™û', 'Âêç', ':', ')', '\u3000', '„Å®', '„ÅØ', '‰∏¶Á´ã', 'Âä©Ë©û', '„Äå', '...', '„Å®', '...', '„Äç', '„Çí', 'ÊÑèÂë≥', '„Åô„Çã', 'Ë®òÂè∑', '„Åß', '„ÅÇ„Çã', '„ÄÇ', '„É©„ÉÜ„É≥', 'Ë™û', '„ÅÆ', '\u3000', '„ÅÆ', 'ÂêàÂ≠ó', '„Åß', '„ÄÅ', 'Trebuchet', '\u3000', 'MS', '„Éï„Ç©„É≥„Éà', '„Åß', '„ÅØ', '„ÄÅ', '„Å®', 'Ë°®Á§∫', '„Åï', '„Çå', '\u3000', '"et', '"', '\u3000', '„ÅÆ', 'ÂêàÂ≠ó', '„Åß', '„ÅÇ„Çã', '„Åì„Å®', '„Åå', 'ÂÆπÊòì', '„Å´', '„Çè„Åã„Çã', '„ÄÇ']


In [69]:

max_input_chars_per_word = 100
token_list =whitespace_tokenize(text)
chars = list(token_list[0])
unk_token = '[UNK]'
output_tokens = []
for token in whitespace_tokenize(text):
    chars = list(token)
    if len(chars) > max_input_chars_per_word:
        output_tokens.append(unk_token)
        continue

    is_bad = False
    start = 0
    sub_tokens = []
    print(chars)
    while start < len(chars):
        end = len(chars)
        cur_substr = None
        while start < end:
            substr = "".join(chars[start:end])
            print(substr)
            if start > 0:
                substr = "##" + substr
            if substr in vocab:
                cur_substr = substr
                print(f'cur substr: {cur_substr}')
                break
            end -= 1
        if cur_substr is None:
            is_bad = True
            break
        sub_tokens.append(cur_substr)
        start = end  
    if is_bad:
        output_tokens.append(unk_token)
    else:
        output_tokens.extend(sub_tokens)
        

['„Ç¢', '„É≥', '„Éë', '„Çµ', '„É≥', '„Éâ']
„Ç¢„É≥„Éë„Çµ„É≥„Éâ
„Ç¢„É≥„Éë„Çµ„É≥
„Ç¢„É≥„Éë„Çµ
„Ç¢„É≥„Éë
„Ç¢„É≥
cur substr: „Ç¢„É≥
„Éë„Çµ„É≥„Éâ
„Éë„Çµ„É≥
„Éë„Çµ
„Éë
cur substr: ##„Éë
„Çµ„É≥„Éâ
cur substr: ##„Çµ„É≥„Éâ
['(', '&', '„ÄÅ', 'Ëã±', 'Ë™û', 'Âêç', 'Ôºö', ')']
(&„ÄÅËã±Ë™ûÂêçÔºö)
(&„ÄÅËã±Ë™ûÂêçÔºö
(&„ÄÅËã±Ë™ûÂêç
(&„ÄÅËã±Ë™û
(&„ÄÅËã±
(&„ÄÅ
(&
(
cur substr: (
&„ÄÅËã±Ë™ûÂêçÔºö)
&„ÄÅËã±Ë™ûÂêçÔºö
&„ÄÅËã±Ë™ûÂêç
&„ÄÅËã±Ë™û
&„ÄÅËã±
&„ÄÅ
&
cur substr: ##&
„ÄÅËã±Ë™ûÂêçÔºö)
„ÄÅËã±Ë™ûÂêçÔºö
„ÄÅËã±Ë™ûÂêç
„ÄÅËã±Ë™û
„ÄÅËã±
„ÄÅ
cur substr: ##„ÄÅ
Ëã±Ë™ûÂêçÔºö)
Ëã±Ë™ûÂêçÔºö
Ëã±Ë™ûÂêç
Ëã±Ë™û
Ëã±
cur substr: ##Ëã±
Ë™ûÂêçÔºö)
Ë™ûÂêçÔºö
Ë™ûÂêç
Ë™û
cur substr: ##Ë™û
ÂêçÔºö)
ÂêçÔºö
Âêç
cur substr: ##Âêç
Ôºö)
Ôºö
['„Å®', '„ÅØ', '‰∏¶', 'Á´ã', 'Âä©', 'Ë©û', '„Äå', '‚Ä¶', '„Å®', '‚Ä¶', '„Äç', '„Çí', 'ÊÑè', 'Âë≥', '„Åô', '„Çã', 'Ë®ò', 'Âè∑', '„Åß', '„ÅÇ', '„Çã', '„ÄÇ', '„É©', '„ÉÜ', '„É≥', 'Ë™û', '„ÅÆ']
„Å®„ÅØ‰∏¶Á´ãÂä©Ë©û„Äå‚Ä¶„Å®‚Ä¶„Äç„ÇíÊÑèÂë≥„Åô„ÇãË®òÂè∑„Åß„ÅÇ„Çã„ÄÇ„É©„ÉÜ„É≥Ë™û„ÅÆ
„Å®„ÅØ‰∏¶Á´ãÂä©Ë©û„Äå‚Ä¶„Å®‚