In [None]:
! pip install pybind11
! pip install git+https://github.com/TadaoYamaoka/cmajiang
! pip install git+https://github.com/po3rin/mahjong-question-generator.git
! pip install kanjize pydub librosa
! pip install -q espnet pypinyin parallel_wavegan gdown espnet_model_zoo
! pip install -q --no-build-isolation pyopenjtalk

In [None]:
import io
import time
import warnings
from base64 import b64decode

import librosa
import numpy as np
import pandas as pd
from IPython.display import display, Audio, Javascript
from cmajiang import Shoupai
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text
from kanjize import number2kanji
from google.colab import output
from base64 import b64decode
from mahjong_question_generator import generate_question
from pydub import AudioSegment

In [None]:
# Silence every Python warning from here on (presumably to keep the cell
# output of the quiz readable). NOTE(review): this also hides deprecation
# warnings from the audio/ASR libraries.
warnings.filterwarnings('ignore')

In [None]:
# Imported in this cell because it is only needed to choose the inference device.
import torch

# Build the speech recogniser from the pretrained model identified by the tag
# below. Use the GPU when one is available and fall back to the CPU otherwise,
# so the notebook does not fail on a runtime without CUDA.
asr_device = "cuda" if torch.cuda.is_available() else "cpu"

d = ModelDownloader()
speech2text = Speech2Text(
  **d.download_and_unpack("kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"),
  device=asr_device
)

In [None]:
# JavaScript helper that record() injects into the notebook output.
# It defines `record(time)`, which captures microphone audio for `time`
# milliseconds and resolves to the recording encoded as a data URL.
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
// Kept as `var`, as before, so that `record` remains callable from a later evaluation.
var record = async time => {
  // An async function (instead of an async Promise executor) lets a rejected
  // getUserMedia call, e.g. a denied permission, reach the caller instead of hanging.
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  const stopped = new Promise(resolve => { recorder.onstop = resolve })
  recorder.start()
  await sleep(time)
  recorder.stop()
  await stopped
  // Release the microphone; otherwise the capture stream stays open after each recording.
  stream.getTracks().forEach(track => track.stop())
  return b2text(new Blob(chunks))
}
"""

def record(sec, filename='audio.wav'):
  """Record audio through the browser and write the decoded bytes to a file.

  Args:
    sec: Recording duration in seconds; it is multiplied by 1000 before being
      passed to the JavaScript ``record`` helper.
    filename: Path of the file the decoded bytes are written to.

  NOTE(review): the bytes are written exactly as the browser produced them; the
  ``.wav`` default name does not by itself guarantee WAV content - confirm.
  """
  # Make the JavaScript helper available, then run it and wait for its result.
  display(Javascript(RECORD))
  duration_ms = sec * 1000
  data_url = output.eval_js('record(%d)' % duration_ms)

  # The base64 payload is the part of the data URL after the comma.
  audio_bytes = b64decode(data_url.split(',')[1])
  with open(filename, 'wb+') as audio_out:
    audio_out.write(audio_bytes)

In [None]:
%%capture
# Generate the quiz questions (n=3 - presumably the number of questions; confirm
# against mahjong_question_generator) with the cell's output captured.
df = generate_question(n=3)
# One dict per question, keyed by column name
# (assumes `df` is a pandas DataFrame - TODO confirm).
questions = df.to_dict('records')

In [None]:
def convert_answer_for_voice_assertion(q: dict) -> str:
  """Build the expected answer string that the recognised speech is compared with.

  Args:
    q: Question record. The keys ``hand_value_cost_additional`` and
      ``hand_value_cost_main`` are read.

  Returns:
    The expected declaration, with the point values rendered by ``number2kanji``.
  """
  additional_cost = q["hand_value_cost_additional"]
  main_cost = q["hand_value_cost_main"]

  # Ron: only the main cost is declared.
  if additional_cost == 0:
    return number2kanji(main_cost)

  # Tsumo win by the dealer: both costs are equal, declared as "<cost> all".
  if additional_cost == main_cost:
    return number2kanji(main_cost) + 'オール'

  # Tsumo win by a non-dealer: additional cost first, then the main cost.
  return number2kanji(additional_cost) + number2kanji(main_cost)


def adjust_speech(text: str) -> str:
  """Correct known mis-recognitions in the transcribed score declaration.

  Args:
    text: Transcript produced by the speech recogniser.

  Returns:
    The corrected transcript.
  """
  # "千" is often transcribed as a lone "ん".
  if text == "ん":
    return "千"
  # A stray "ん" sometimes slips into the transcript; drop every occurrence.
  return text.replace('ん', '')

In [None]:
# Quiz loop: for each question, show the hand, record the spoken score
# declaration, transcribe it and compare it with the expected answer.
audiofile = "audio.wav"  # file record() writes to and the recogniser reads from
second = 5               # recording time limit in seconds
sample_rate = 16000      # rate the recording is resampled to when loaded

for q in questions[:3]:
  answer_for_voice_assertion = convert_answer_for_voice_assertion(q)

  # Present the question: winds, dora indicator, hand, win conditions and winning tile.
  print('======================================')
  print(f'場風: {q["round_wind"]}, 自風: {q["player_wind"]}')
  print('ドラ表示牌')
  display(Shoupai(q['dora_str']))
  print('------------------------------------------------------------------')
  print('手牌')
  display(Shoupai(q['hand']))
  print('リーチ' if q['is_riichi'] else '', 'ツモ' if q['is_tsumo'] else 'ロン', '海底摸月' if q['is_haitei'] else '', '河底撈魚' if q['is_houtei'] else '')
  display(Shoupai(q['win_tile']))

  # Pause before recording starts (presumably to give time to read the hand).
  time.sleep(3)
  print(f"点数申告をマイクで入力してください 制限時間 {second} 秒...")
  record(second, audiofile)
  print("回答を受け付けました")

  # Read back the file that was just recorded; using `audiofile` here keeps the
  # write path and the read path from drifting apart.
  speech, sr = librosa.load(audiofile, sr=sample_rate)
  nbests = speech2text(speech)
  # Only the text of the best hypothesis is needed.
  text, *_ = nbests[0]

  text = adjust_speech(text)

  print(f"音声が認識した点数申告: {text}")

  if answer_for_voice_assertion == text:
    print('正解です')
  else:
    print(f'間違っています。正解は {q["hand_value_han"]}飜{q["hand_value_hu"]}符, {q["hand_value_cost_additional"]}, {q["hand_value_cost_main"]}です。')
    print(f'役: {q["hand_value_yaku"]}')

print("お疲れ様でした。全ての問題が終了しました")
