## 下載模組

In [1]:
!pip install SpeechRecognition  # speech recognition
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg  # speech recognition (system audio libraries)
!pip install pydub  # audio file conversion
!pip install opencc-python-reimplemented  # Traditional / Simplified Chinese conversion
!pip install datasets

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 153 kB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libasound2-dev is already the newest version (1.1.3-5ubuntu0.6).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 42 not upgraded.
Need to get 184 kB of archives.
After this operation, 891 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0

## 匯入模組

In [2]:
import os
from typing import List

import numpy as np
import speech_recognition as sr  # 語音辨識

from datasets import concatenate_datasets, load_dataset
from opencc import OpenCC  # 繁體、簡體中文轉換
from pydub import AudioSegment  # 轉換檔案格式

## 下載網路資料

來源：https://huggingface.co/datasets/common_voice \
官方網站：https://commonvoice.mozilla.org/zh-TW/datasets \
sr模組語言代碼：https://cloud.google.com/speech-to-text/docs/languages

In [3]:
# Languages to process (change these to look at other languages).
# For the matching codes, see "Subset" on the linked "Hugging Face" page.
load_lang = ["zh-TW", "ja"]  # used when loading the dataset files

# For the matching codes, see the "sr module language codes" link.
lang_list = ["zh-TW", "ja-JP"]  # used for speech recognition

In [4]:
def _load_split(split: str):
    """Load one Common Voice split for every language and merge the results.

    Args:
        split: Split name passed to ``load_dataset`` (e.g. "train").

    Returns:
        The result of ``concatenate_datasets`` over the per-language
        datasets, in the order the languages appear in ``load_lang``.
    """
    per_language = [load_dataset("common_voice", lang, split=split) for lang in load_lang]
    return concatenate_datasets(per_language)  # combine the languages into one dataset


# Same three splits as before, loaded in the same order.
train_dataset = _load_split("train")
test_dataset = _load_split("test")
valid_dataset = _load_split("validation")

Downloading builder script:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/zh-TW (download: 2.03 GiB, generated: 2.38 GiB, post-processed: Unknown size, total: 4.41 GiB) to /root/.cache/huggingface/datasets/common_voice/zh-TW/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e...


Downloading data:   0%|          | 0.00/2.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3507 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2895 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2895 [00:00<?, ? examples/s]

Generating other split:   0%|          | 0/22477 [00:00<?, ? examples/s]

Generating validated split:   0%|          | 0/61232 [00:00<?, ? examples/s]

Generating invalidated split:   0%|          | 0/3584 [00:00<?, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/zh-TW/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e. Subsequent calls will reuse this data.
Downloading and preparing dataset common_voice/ja (download: 145.80 MiB, generated: 224.59 MiB, post-processed: Unknown size, total: 370.39 MiB) to /root/.cache/huggingface/datasets/common_voice/ja/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e...


Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/722 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/632 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/586 [00:00<?, ? examples/s]

Generating other split:   0%|          | 0/885 [00:00<?, ? examples/s]

Generating validated split:   0%|          | 0/3072 [00:00<?, ? examples/s]

Generating invalidated split:   0%|          | 0/504 [00:00<?, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/ja/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e. Subsequent calls will reuse this data.


Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/zh-TW/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e)
Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/ja/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e)
Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/zh-TW/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e)
Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/ja/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e)


In [5]:
# Number of examples in the train, test and validation datasets, space-separated.
print(*map(len, (train_dataset, test_dataset, valid_dataset)))

4229 3527 3481


In [6]:
def try_return(lang:str, audio:sr.AudioData) -> str:
    """Try to transcribe ``audio`` in one language.

    Args:
        lang: Language code forwarded to ``recognize_google`` as ``language``.
        audio: Audio data to transcribe.

    Returns:
        Whatever ``recognize_google`` returns, or the placeholder string
        "I don't know" when it raises ``sr.UnknownValueError``.

    Note:
        Reads the module-level recognizer ``r``, which must be defined before
        this function is called. Only ``sr.UnknownValueError`` is caught; any
        other exception propagates to the caller.
    """
    try:
        transcript = r.recognize_google(audio, language=lang)
    except sr.UnknownValueError:  # keep the program from failing when the speech is not understood
        transcript = "I don't know"
    return transcript

In [7]:
def recognize(path:str, lang_list:List[str]) -> List[str]:
    """Convert an MP3 file to WAV and transcribe it once per language.

    Args:
        path: Path of the MP3 file to read.
        lang_list: Language codes; each one is passed to ``try_return``.

    Returns:
        One entry per language in ``lang_list``, in the same order, holding
        the value ``try_return`` produced for that language.

    Note:
        The WAV file is written next to the source file (same name with a
        ``.wav`` extension) and is left on disk. Reads the module-level
        recognizer ``r``, which must be defined before this is called.
    """
    # Convert to WAV. splitext strips whatever extension the path carries
    # instead of assuming it is exactly four characters long.
    dst = f"{os.path.splitext(path)[0]}.wav"
    # export() hands back the output file handle; close it explicitly rather
    # than leaving it to the garbage collector.
    AudioSegment.from_mp3(path).export(dst, format="wav").close()

    # Speech recognition: read the whole WAV file into an audio object
    with sr.AudioFile(dst) as source:
        audio = r.record(source)

    # Try each language in turn and collect the results
    return [try_return(lang, audio) for lang in lang_list]

In [8]:
# Takes a long time to run (about 2 h 40 min); try it when you have time.
r = sr.Recognizer()  # shared recognizer read by try_return() and recognize()
s2t = OpenCC('s2t').convert  # OpenCC converter for the 's2t' configuration
# Iterate the path column directly; wrapping it in np.array only added a copy.
sentence_list = [recognize(path, lang_list) for path in train_dataset["path"]]

In [9]:
# Show the first result: one transcript per entry in lang_list.
sentence_list[0]

['並做出行動', '院長 商品と']

In [13]:
# Save the recognition results to the "train.json" file.
import json

with open("train.json", mode="w+", encoding="utf-8") as out_file:
    # ensure_ascii=False keeps non-ASCII characters readable instead of \u escapes.
    out_file.write(json.dumps(sentence_list, ensure_ascii=False))