In [None]:
import pandas as pd
import math

In [None]:
# Progress bar
class ProgressBar:
  """Single-line text progress bar that redraws itself in place.

  Every call to `show` prints one line of the form
  `<pct>%|<bar>| <id>/<max_id>:<comment>`. After the first call each line is
  prefixed with a carriage return so it overwrites the previous one. The line
  is terminated with a newline once `id` reaches `max_id`.
  """

  _BAR_WIDTH = 20  # number of cells drawn between the two '|' characters

  def __init__(self, max_id):
    """Create a bar whose 100% mark corresponds to `max_id`."""
    self.call = 0  # 0 until show() has been called once, 1 afterwards
    self.max_id = max_id

  def show(self, id, comment=''):
    """Print the bar for progress `id` out of `max_id`.

    Args:
      id: current progress value.
      comment: text shown after the counter, padded to 128 characters.
    """
    # A non-positive total would divide by zero; treat it as already complete.
    ratio = id / self.max_id if self.max_id > 0 else 1.0
    # Clamp so an out-of-range id can never draw a bar wider (or narrower)
    # than _BAR_WIDTH cells.
    filled = max(0, min(self._BAR_WIDTH, int(ratio * self._BAR_WIDTH)))
    bar = '■' * filled + '_' * (self._BAR_WIDTH - filled)
    line = f'{int(ratio*100): >3}%|{bar}| {id}/{self.max_id}:{comment:128}'

    # The very first line has nothing to overwrite, so it gets no '\r'.
    prefix = '' if self.call == 0 else '\r'
    self.call = 1

    # Finish the line (even on the first call) once the total is reached.
    finished = id >= self.max_id
    print(prefix + line, end='\n' if finished else '')

In [None]:
# With L denoting one character matched by the regex [あ-んー], build the list
# of every ordered two-character sequence LL.
# NOTE(review): KANA does not contain every character in that range (e.g. the
# small vowels are absent) — confirm this is intended.
KANA = list(
    'あいうえお'
    'かがきぎくぐけげこご'
    'さざしじすずせぜそぞ'
    'ただちぢっつづてでとど'
    'なにぬねの'
    'はばぱひびぴふぶぷへべぺほぼぽ'
    'まみむめも'
    'ゃやゅゆょよ'
    'らりるれろ'
    'わをんー'
)
# Outer loop is the first character, inner loop the second.
LL_LIST = [head + tail for head in KANA for tail in KANA]

In [None]:
# Load the table of words and their LL columns.
# Column dtypes for the read: '単語' as object, every LL column as bool.
dtype_dict = {'単語': 'object'}
dtype_dict.update({ll: 'bool' for ll in LL_LIST})
corpus_df = pd.read_csv('../output/words.csv', encoding='utf-8', dtype=dtype_dict)
corpus_df.info()

In [None]:
# Working state for the selection loop: the LL pairs still to be covered
# (starts as a copy of LL_LIST) and the words picked so far.
corpus_for_fingerspelling = []
ll_filter = LL_LIST[:]

In [None]:
# Initialize the progress bar; progress is counted in covered LL pairs
progress_bar = ProgressBar(len(LL_LIST))

# Maximum number of words to output
MAX_WORDS = 1000


def _uncovered_lls(word, uncovered):
  """Return the distinct two-character substrings of `word` that are in `uncovered`, in order of first appearance."""
  found = []
  for idx in range(len(word) - 1):
    ll = word[idx:idx+2]
    if ll in uncovered and ll not in found:
      found.append(ll)
  return found


while True:
  # For each remaining word: how many still-uncovered LL columns are set.
  # Computed once per pass and reused below.
  new_ll_per_word = corpus_df[ll_filter].sum(axis=1)
  if new_ll_per_word.empty or new_ll_per_word.max() == 0:
    # Stop once no remaining word contributes a new LL. The explicit empty
    # check is needed because max() of an empty Series is NaN and NaN == 0 is
    # False, which would otherwise let the loop append '' as a "word".
    print('No new LL combinations.')
    break

  if len(corpus_for_fingerspelling) >= MAX_WORDS:
    # '>=' rather than '==': step (1) below can add several words per pass.
    print(f'{MAX_WORDS} word.')
    break

  # For each uncovered LL: number of remaining words whose column is set.
  ll_counts = corpus_df[ll_filter].sum()
  # LLs that appear in exactly one remaining word
  one_list = list(ll_counts[ll_counts == 1].index)

  if not one_list:
    # Step (2): no LL is unique to a single word.
    # Among the words with the most uncovered LLs, pick the one that maximizes
    # the sum of -p*log2(p) over its uncovered LLs, where p is the fraction of
    # remaining words whose column for that LL is set.
    max_count = new_ll_per_word.max()
    candidate_words_idx = list(new_ll_per_word[new_ll_per_word == max_count].index)

    result = ''
    maximum_entropy = -100
    n_words = len(corpus_df)
    for word_idx in candidate_words_idx:
      word = corpus_df.loc[word_idx, '単語']
      entropy = 0
      for idx in range(len(word)-1):
        ll = word[idx:idx+2]
        # Already-covered LLs do not contribute
        if ll not in ll_filter:
          continue
        pn = ll_counts[ll] / n_words
        entropy += pn * math.log2(pn)
      # Strict '<' keeps the first candidate on ties
      if maximum_entropy < -entropy:
        maximum_entropy = -entropy
        result = word

    # Mark the chosen word's LLs as covered and take the word out of the corpus
    covered = _uncovered_lls(result, ll_filter)
    corpus_df = corpus_df.drop(columns=covered)
    for ll in covered:
      ll_filter.remove(ll)

    corpus_df = corpus_df.drop(corpus_df.index[corpus_df['単語']==result])
    corpus_for_fingerspelling.append(result)
    progress_bar.show(len(LL_LIST) - len(ll_filter), f'{result}')
    continue

  # Step (1): some LLs appear in exactly one remaining word, so those words
  # are taken. one_list is iterated as a fixed snapshot (never mutated while
  # looping); LLs covered by a word picked earlier in this pass are skipped.
  for key in one_list:
    if len(corpus_for_fingerspelling) >= MAX_WORDS:
      # Honor the cap inside the pass as well; the outer loop then exits
      break
    if key not in ll_filter:
      # Covered earlier in this pass; its column no longer exists
      continue
    matches = corpus_df.loc[corpus_df[key]==1, '単語']
    if matches.empty:
      continue
    word = matches.iloc[-1]

    covered = _uncovered_lls(word, ll_filter)
    corpus_df = corpus_df.drop(columns=covered)
    for ll in covered:
      ll_filter.remove(ll)

    corpus_df = corpus_df.drop(corpus_df.index[corpus_df['単語']==word])
    corpus_for_fingerspelling.append(word)

    progress_bar.show(len(LL_LIST) - len(ll_filter), f'{word}')

In [None]:
# Summary: number of LL pairs covered, coverage percentage, and word count.
n_covered = len(LL_LIST) - len(ll_filter)
coverage_pct = (n_covered / len(LL_LIST)) * 100
print(f'Number of LL Covers: {n_covered}')
print(f'Coverage of LL: {coverage_pct:.1f}%')
print(f'Number of words: {len(corpus_for_fingerspelling)}')

In [None]:
# Write the selected words in sorted order, one per line.
corpus_for_fingerspelling_file = '../output/corpus_for_fingerspelling_1000.txt'
corpus_for_fingerspelling = sorted(corpus_for_fingerspelling)
# The context manager closes the file even if a write fails part-way through.
with open(corpus_for_fingerspelling_file, 'w', encoding='utf-8') as f:
  for word in corpus_for_fingerspelling:
    f.write(f'{word}\n')