In [None]:
import pandas as pd
import jaconv
from pykakasi import kakasi
import re
import csv

In [None]:
# Shared filter pattern: 2 to 10 characters, each taken from U+3041-U+309F or the
# prolonged sound mark 'ー'. The cells below apply it with fullmatch(), so a string
# containing any other character is rejected as a whole.
p = re.compile('[\u3041-\u309Fー]{2,10}')  # regex for (full-width hiragana Unicode block + prolonged sound mark)

In [None]:
# Progress bar
class ProgressBar:
  """Single-line text progress bar that redraws itself in place on stdout.

  Every call to `show` prints `<percent>%|<20-cell bar>| <id>/<max_id>:<comment>`.
  The first call starts the line, later calls return to the start of the line
  with a carriage return, and the call that reaches `max_id` ends the line with
  a newline.
  """

  def __init__(self, max_id):
    """Remember the step count that corresponds to 100 %.

    Args:
      max_id: value of `id` at which the bar is complete.
    """
    self.call = 0  # 0 until `show` has been called once, 1 afterwards
    self.max_id = max_id

  def _render(self, id, comment):
    """Return the bar text for step `id`, without carriage return or newline."""
    # An empty job (max_id == 0) is drawn as already complete instead of dividing by zero.
    ratio = id / self.max_id if self.max_id else 1.0
    # Clamp only what is drawn, so an out-of-range id cannot stretch the bar beyond 20 cells.
    ratio = min(max(ratio, 0.0), 1.0)
    filled = int(ratio * 20)
    bar = "■" * filled + "_" * (20 - filled)
    # `comment` is padded to 128 columns so that a shorter comment overwrites a longer previous one.
    return f'{int(ratio*100): >3}%|{bar}| {id}/{self.max_id}:{comment:128}'

  def show(self, id, comment=''):
    """Draw the bar for step `id`.

    Args:
      id: current step; the bar is complete once `id >= max_id`.
      comment: text shown after the counter.
    """
    lead = '' if self.call == 0 else '\r'
    self.call = 1
    # Terminate the line whenever the job is complete, even if this is also the
    # very first call (single-step or empty jobs).
    tail = '\n' if id >= self.max_id else ''
    print(lead + self._render(id, comment), end=tail)

In [None]:
# Load the postal-code database (the CSV is read without a header row)
postal_database = '../datasets/japanesepost/KEN_ALL_utf8.CSV'
postal_df = pd.read_csv(postal_database, header=None, encoding='utf-8')
prefecture = set(postal_df[3])    # prefecture names (half-width kana)
municipality = set(postal_df[4])  # municipality names (half-width kana)
# Column 5 (town-area names, half-width kana) is intentionally not part of the corpus.

# Merge both name sets so that every name is present only once
postal_corpus = set()
postal_corpus.update(prefecture, municipality)

# Normalise every name (half-width -> full-width, then katakana -> hiragana) and keep
# only the ones the pattern `p` matches in full; names containing anything other than
# hiragana (digits, symbols, ...) are dropped.
postal_readings = (jaconv.kata2hira(jaconv.hankaku2zenkaku(name)) for name in postal_corpus)
formatted_postal_corpus = [reading for reading in postal_readings if p.fullmatch(reading)]

print(f'Postal corpus total: {len(formatted_postal_corpus)}')

In [None]:
# Japanese-Language Proficiency Test word list (tab-separated; words are in the '語' column)
noryoku_database = '../datasets/noryoku/noryoku.txt'
noryoku_df = pd.read_csv(noryoku_database, encoding='utf-8',sep='\t')
noryoku_corpus = set(noryoku_df['語'])

# Normalise every word (half-width -> full-width, then katakana -> hiragana) and keep
# only the ones the pattern `p` matches in full.
noryoku_readings = (jaconv.kata2hira(jaconv.hankaku2zenkaku(word)) for word in noryoku_corpus)
formatted_noryoku_corpus = [reading for reading in noryoku_readings if p.fullmatch(reading)]

print(f'Noryoku corpus total: {len(formatted_noryoku_corpus)}')

In [None]:
# Agency for Cultural Affairs word list (tab-separated; words are in the '語' column)
bunkacho_database = '../datasets/bunkacho/katakana_shiyou.txt'
bunkacho_df = pd.read_csv(bunkacho_database, encoding='utf-8',sep='\t')
bunkacho_corpus = set(bunkacho_df['語'])

# Normalise every word (half-width -> full-width, then katakana -> hiragana) and keep
# only the ones the pattern `p` matches in full.
bunkacho_readings = (jaconv.kata2hira(jaconv.hankaku2zenkaku(word)) for word in bunkacho_corpus)
formatted_bunkacho_corpus = [reading for reading in bunkacho_readings if p.fullmatch(reading)]

print(f'Bunkacho corpus total: {len(formatted_bunkacho_corpus)}')

In [None]:
# openBDから本のISBNとタイトルのリストを取得
import time
import requests
from more_itertools import chunked

# Base URL of the openBD API; the endpoint names ('coverage', 'get') are appended to it below.
OPENBD_ENDPOINT = 'https://api.openbd.jp/v1/'
# ISBN prefix used below to keep only the codes of Japanese books.
JA_BOOK_CODE = '9784'
# Number of ISBNs sent per bulk request (chunk size used when splitting the ISBN list).
DATA = 10000

# Fetch every ISBN code known to openBD
def get_coverage():
  """Return the decoded JSON body of the openBD `coverage` endpoint.

  The caller below treats the result as a collection of ISBN strings.

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.Timeout: connecting took longer than 10 s, or the server stayed
      silent for more than 120 s while sending the body.
  """
  # requests applies no timeout unless asked to, so a stalled connection would
  # otherwise block the notebook forever. Tuple is (connect, read) in seconds.
  response = requests.get(OPENBD_ENDPOINT + 'coverage', timeout=(10, 120))
  # Fail with a clear HTTP error instead of trying to decode an error page as JSON.
  response.raise_for_status()
  return response.json()

# 
def get_bibs(items) -> list:
  """Fetch the bibliographic records for a batch of ISBNs in one POST request.

  Args:
    items: iterable of ISBN strings; they are joined with commas into the
      `isbn` form field.

  Returns:
    The decoded JSON body. The caller below iterates it and skips entries that
    are None, so it is annotated as a list (presumably one entry per requested
    ISBN - confirm against the openBD API documentation).

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.Timeout: connecting took longer than 10 s, or the server stayed
      silent for more than 120 s while sending the body.
  """
  # requests applies no timeout unless asked to; tuple is (connect, read) in seconds.
  response = requests.post(OPENBD_ENDPOINT + 'get', data={'isbn': ','.join(items)}, timeout=(10, 120))
  # Fail with a clear HTTP error instead of trying to decode an error page as JSON.
  response.raise_for_status()
  return response.json()

# Fetch every ISBN code
all_coverage = get_coverage()

# Keep only the ISBNs that start with the Japanese book prefix
filter_coverage = [code for code in all_coverage if code.startswith(JA_BOOK_CODE)]

# Split the ISBN list into chunks of DATA items (one bulk request per chunk)
chunked_coverage = chunked(filter_coverage, DATA)

print(f'ISBN total: {len(filter_coverage)} items')

# Record id, ISBN and title of every book in a CSV file
count = 0
start = time.time()
progress_bar = ProgressBar(max_id=len(filter_coverage))
# The context manager closes the file even when a request or a record lookup
# raises in the middle of the download, so the rows written so far are flushed.
with open('../datasets/openBD/book_title.csv', 'w', encoding='utf-8', newline="") as openBD_f:
  openBD_writer = csv.writer(openBD_f)
  openBD_writer.writerow(['id','isbn','title'])
  for coverage in chunked_coverage:
    result = get_bibs(coverage)
    for bib in result:
      # Every entry advances the counter, so `id` is the position in filter_coverage.
      count += 1
      progress_bar.show(count)  # progress bar

      # Entries without a record are counted but not written.
      if bib is None:
        continue
      # Per-record processing
      isbn = bib['summary']['isbn']
      title = bib['summary']['title']
      openBD_writer.writerow([f'{count}', f'{isbn}', f'{title}'])

# Report the wall-clock duration measured since `start`.
print(f'Elapsed: {time.time() - start:.1f}s')

In [None]:
# Collect the book titles whose hiragana reading is accepted by the pattern `p`
openBD_database = '../datasets/openBD/book_title.csv'
openBD_df = pd.read_csv(openBD_database, encoding='utf-8')
title_list = openBD_df['title']
book_title_corpus = []
kks = kakasi()
progress_bar = ProgressBar(len(title_list))
for position, raw_title in enumerate(title_list, start=1):
  progress_bar.show(position, raw_title)  # progress bar
  # NaN is the only value that differs from itself: skip rows without a title
  if raw_title != raw_title:
    continue
  # Concatenate the hiragana reading of every segment returned by the converter
  reading = ''.join(segment['hira'] for segment in kks.convert(raw_title))
  # half-width -> full-width, then katakana -> hiragana
  reading = jaconv.kata2hira(jaconv.hankaku2zenkaku(reading))
  # Keep the reading only if the whole string matches the pattern
  if p.fullmatch(reading):
    book_title_corpus.append(reading)
print(f'OpenBD corpus total: {len(book_title_corpus)}')

In [None]:
# Save the converted book titles, one per line
# (`with` closes the file even if a write fails part-way through)
with open('../datasets/openBD/book_title.txt', 'w', encoding='utf-8') as f:
  for title in book_title_corpus:
    f.write(title+'\n')

In [None]:
# L is the set of single kana listed in KANA (hiragana plus the prolonged sound
# mark 'ー'); LL_LIST holds every ordered pair "LL" of two such characters.
# Note: the small vowels (ぁぃぅぇぉ) and ゎ/ゐ/ゑ are not part of KANA.
KANA = ['あ', 'い', 'う', 'え', 'お',
        'か', 'が', 'き', 'ぎ', 'く', 'ぐ', 'け', 'げ', 'こ', 'ご',
        'さ', 'ざ', 'し', 'じ', 'す', 'ず', 'せ', 'ぜ', 'そ', 'ぞ',
        'た', 'だ', 'ち', 'ぢ', 'っ', 'つ', 'づ', 'て', 'で', 'と', 'ど',
        'な', 'に', 'ぬ', 'ね', 'の',
        'は', 'ば', 'ぱ', 'ひ', 'び', 'ぴ', 'ふ', 'ぶ', 'ぷ', 'へ', 'べ', 'ぺ', 'ほ', 'ぼ', 'ぽ',
        'ま', 'み', 'む', 'め', 'も',
        'ゃ', 'や', 'ゅ', 'ゆ', 'ょ', 'よ',
        'ら', 'り', 'る', 'れ', 'ろ',
        'わ', 'を', 'ん', 'ー']
# The first character varies slowest: 'ああ', 'あい', ..., 'あー', 'いあ', ...
LL_LIST = [head + tail for head in KANA for tail in KANA]

In [None]:
# Write every word together with its LL indicator columns as CSV
# (one row per word, one 0/1 column per entry of LL_LIST)
formatted_corpus = set(formatted_postal_corpus + formatted_noryoku_corpus + formatted_bunkacho_corpus + book_title_corpus)
progress_bar = ProgressBar(len(formatted_corpus))
# newline='' is what the csv module expects from its file object (otherwise an
# extra '\r' is inserted on platforms that translate line endings); `with`
# closes the file even if writing a row fails.
with open('../output/words.csv', 'w', encoding='utf-8', newline='') as f:
  write = csv.writer(f)
  write.writerow(['単語']+LL_LIST)  # header
  # Iterate in sorted order so that the file is identical from run to run
  # (set iteration order of strings changes between interpreter sessions).
  for idx, word in enumerate(sorted(formatted_corpus)):
    progress_bar.show(idx+1, word)
    # 1 if the pair occurs anywhere in the word as two adjacent characters, else 0
    write.writerow([word] + [int(ll in word) for ll in LL_LIST])

In [None]:
# Load the word / LL table back from disk
# Column types for parsing: the word column stays text, every LL column is read as bool
dtype_dict = {'単語': 'object'}
dtype_dict.update(dict.fromkeys(LL_LIST, 'bool'))
corpus_df = pd.read_csv('../output/words.csv', dtype=dtype_dict, encoding='utf-8')
corpus_df.info()

In [None]:
# Share of LL patterns that occur in at least one word of the corpus
ll_pattern_count = len(LL_LIST)
ll_hits = corpus_df[LL_LIST].sum()  # per pattern: number of words containing it
ll_cover_rate = (ll_hits[ll_hits != 0].count() / ll_pattern_count)*100
print(f'LL pattern: {ll_pattern_count} ways\nLL cover rate: {ll_cover_rate: .1f}%')

In [None]:
# Save the word corpus in sorted order, one word per line
# (`with` closes the file even if a write fails part-way through)
with open('../output/words.txt', 'w', encoding='utf8') as f:
  for word in sorted(corpus_df['単語']):
    f.write(word+'\n')