# 下載模組

In [None]:
# %pip installs into the environment of the running kernel; a bare `!pip` uses
# whichever pip happens to be first on PATH, which may be a different interpreter.

# Speech recognition
%pip install SpeechRecognition
# System packages (ALSA / PortAudio / ffmpeg) used for speech recognition.
# apt-get with -y never waits for a confirmation prompt, which a notebook cell cannot answer.
!apt-get install -y libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
# Audio file conversion
%pip install pydub
# Traditional / Simplified Chinese conversion
%pip install opencc-python-reimplemented

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 156 kB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libasound2-dev is already the newest version (1.1.3-5ubuntu0.6).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following packages were automatically installed and are no longer required:
  libnvidia-common-460 nsight-compute-2020.2.0
Use 'apt autoremove' to remove them.
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 42 not upgraded.
Need to get 184 kB of archives.
After this operation, 891 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd

In [None]:
import speech_recognition as sr  # 語音辨識
import os
import zipfile

import pandas as pd
import numpy as np

from pydub import AudioSegment  # 轉換檔案格式
from collections import Counter  # 計算次數用
from opencc import OpenCC  # 繁體、簡體中文轉換

# 主程式

### 讀入整理好的資料

In [None]:
# Load the prepared train / test tables, taking the first CSV column as the row index.
df_train, df_test = [
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("total_train.csv", "total_test.csv")
]

##### 檢查用

In [None]:
# Display the training table as a quick visual check of the load.
df_train

Unnamed: 0,path,sentence,language
0,common_voice_zh-TW_20290066.mp3,大家可以公開的下載,0
1,common_voice_zh-TW_20290078.mp3,洗洗睡了,0
2,common_voice_zh-TW_20290080.mp3,不好說，不好說阿,0
3,common_voice_zh-TW_20290111.mp3,才是女性每天面對的現實,0
4,common_voice_zh-TW_20290115.mp3,實際作道路供公眾通行,0
...,...,...,...
53706,common_voice_ar_24170884.mp3,وَقِيلَ لِرَجُلٍ كَيْفَ كِتْمَانُك لِلسِّرِّ ؟...,1
53707,common_voice_ar_24170917.mp3,اتّصل سامي بصديق كي يصطحبه.,1
53708,common_voice_ar_24170942.mp3,اشتدي يا أزمة تنفرجي,1
53709,common_voice_ar_24170991.mp3,بعض الحلم ذل بعض العفو ضعف,1


In [None]:
# Display the test table as a quick visual check of the load.
df_test

Unnamed: 0,path,sentence,language
0,common_voice_zh-TW_31210486.mp3,高鐵聯外公路嘉義段,0
1,common_voice_zh-TW_30534333.mp3,沒有要讓人看清楚,0
2,common_voice_zh-TW_31250509.mp3,日式照燒豬排飯,0
3,common_voice_zh-TW_31149482.mp3,台北捷運淡海線先導公車,0
4,common_voice_zh-TW_18500863.mp3,在黑暗中進行,0
...,...,...,...
19652,common_voice_ar_24038982.mp3,هل بإمكاني إجراء مكالمة هاتفية بعشرة ينات ؟,1
19653,common_voice_ar_24038983.mp3,فَلَمَّا جَاءَ السَّحَرَةُ قَالُوا لِفِرْعَوْن...,1
19654,common_voice_ar_24038984.mp3,توم جاسوس فرنسي.,1
19655,common_voice_ar_24038985.mp3,حصل ما قال أنه سيحصل.,1


### 取出 array

In [None]:
# Audio file names of each split, pulled out of the "path" column as NumPy arrays.
train_path, test_path = (
    split["path"].to_numpy() for split in (df_train, df_test)
)

In [None]:
# Ground-truth answers: the "language" column of each split as NumPy arrays.
# (In the rows displayed earlier, zh-TW clips carry 0 and ar clips carry 1.)
y_train, y_test = (
    split["language"].to_numpy() for split in (df_train, df_test)
)

### 解壓縮語音檔

In [None]:
# Unpack the zipped test clips relative to the current working directory.
local_zip = "test_clips.zip"
# The context manager closes the archive even when extraction raises part-way,
# so the file handle is never leaked.
with zipfile.ZipFile(local_zip, "r") as zip_ref:
    zip_ref.extractall("")

# os.remove("")  # delete non-audio files

In [None]:
# Create the folder that will hold the converted wav files.
newpath = "wav"
# exist_ok=True keeps the cell safe to re-run and removes the gap between
# "check that it exists" and "create it" that the two-step version had.
os.makedirs(newpath, exist_ok=True)

### mp3 轉 wav

In [None]:
# Convert every mp3 clip in test_clips/ into a wav file with the same base name in wav/.
for mp3_file in os.listdir("test_clips"):
    # Split on the real extension instead of assuming it is exactly four characters long.
    stem, ext = os.path.splitext(mp3_file)
    # Skip anything that is not an mp3 (stray non-audio files, sub-folders) so a
    # single unexpected entry cannot abort the whole conversion run.
    if ext.lower() != ".mp3":
        continue
    src = os.path.join("test_clips", mp3_file)
    dst = os.path.join("wav", f"{stem}.wav")
    audSeg = AudioSegment.from_mp3(src)
    audSeg.export(dst, format="wav")

### 語音辨識

In [None]:
def recognize(wav_file):
    """Transcribe one file from the ``wav/`` folder with Google speech recognition.

    No ``language`` argument is passed to ``recognize_google``, so the
    recognizer's own default language applies. Variants that were tried pass
    ``language="zh-TW"`` (with the result run through ``s2t`` to get
    Traditional Chinese) or ``language="ar-SA"``.

    Relies on the notebook-level ``r`` (an ``sr.Recognizer``) existing at call time.
    NOTE(review): a later cell defines ``recognize`` again and shadows this one.

    Args:
        wav_file: Name of a file inside ``wav/``.

    Returns:
        The recognized text, or ``None`` when the recognizer raises
        ``sr.UnknownValueError`` for the audio.
    """
    # Read the whole clip into memory before sending it off for recognition.
    with sr.AudioFile(f"wav/{wav_file}") as source:
        audio = r.record(source)

    try:
        transcript = r.recognize_google(audio)
    except sr.UnknownValueError:
        # Unintelligible audio: report "no result" instead of raising.
        return None
    return transcript

In [None]:
# Notebook-level helpers used inside recognize(): the shared recognizer and the
# converter that turns recognition results into Traditional Chinese.
r = sr.Recognizer()
s2t = OpenCC('s2t').convert

# Transcribe every entry of wav/, in the order os.listdir reports them.
sentences_list = list(map(recognize, os.listdir("wav")))

In [None]:
def recognize(wav_file, language="zh-TW"):
    """Transcribe one file from ``wav/``, falling back to Arabic when needed.

    The clip is first sent to Google speech recognition with ``language``.
    If the recognizer cannot understand it (``sr.UnknownValueError``), one
    further attempt is made with ``"ar-SA"``. The audio is read from disk only
    once and reused for both attempts.

    Fixes over the previous version:
      * the fallback result is actually returned (the recursive call had no
        ``return``, so a successful fallback still produced ``None``);
      * the fallback is tried at most once (a clip that also failed as
        ``"ar-SA"`` used to recurse until ``RecursionError``);
      * languages other than ``"zh-TW"`` / ``"ar-SA"`` are passed through to the
        recognizer instead of silently returning ``None``.

    Relies on the notebook-level ``r`` (an ``sr.Recognizer``) and ``s2t``
    (the OpenCC ``'s2t'`` converter) existing at call time.

    Args:
        wav_file: Name of a file inside ``wav/``.
        language: Language code tried first. Defaults to ``"zh-TW"``.

    Returns:
        The recognized text (converted to Traditional Chinese via ``s2t`` when
        the successful attempt was ``"zh-TW"``), or ``None`` when no attempted
        language produced a transcript.
    """
    with sr.AudioFile(f"wav/{wav_file}") as source:
        audio = r.record(source)

    fallback = "ar-SA"
    # Never queue the fallback twice: that is what made the old version loop forever.
    attempts = [language] if language == fallback else [language, fallback]

    for lang in attempts:
        try:
            text = r.recognize_google(audio, language=lang)
        except sr.UnknownValueError:
            # Not intelligible in this language; move on to the next candidate.
            continue
        # Only Chinese results go through the Traditional Chinese conversion.
        return s2t(text) if lang == "zh-TW" else text

    return None

In [None]:
# Notebook-level helpers used inside recognize(): the shared recognizer and the
# converter that turns recognition results into Traditional Chinese.
r = sr.Recognizer()
s2t = OpenCC('s2t').convert

# Transcribe every entry of wav/ with recognize()'s default language,
# in the order os.listdir reports them.
sentences_list = list(map(recognize, os.listdir("wav")))

In [None]:
# Display the transcripts collected by the recognition cell.
sentences_list

['客廳都沒開燈',
 '伺服器出現異常',
 '目前臺灣分最多段的道路',
 'as simple as that happen in only',
 'www.279 office2003',
 '連接臺北車站',
 '萬萬不能被tanzania underdeveloped',
 '你能玩party on',
 '怎麼把什麼好',
 '里約公主明301 Mi admitted that use Chrome',
 'panorama of the Korea',
 '球場路',
 'youtube.com',
 '有小兒麻痹的美國總統',
 'What is and energetic and what is antithetical to',
 'int mr.black',
 '現在已經會了',
 '臺中港路',
 '屏東縣墾丁上汗潮州鎮交界',
 '臺北捷運忠縣',
 '拉麪共和國',
 '以前真的是也整理規劃',
 '保存着他們的記憶',
 '問過身邊的人',
 'innovative',
 '我可以慢慢聽你說',
 'helena State feedback',
 '第二高速公路',
 '罵我的還是說看起來沒',
 '外婆格外沉沒',
 '我們決定工作職掌',
 '恩妃的規律的祕密聽了的法語',
 'v2ray',
 '臺中市建國北路',
 '才能坐着休息',
 '指揮中心的記者會',
 'article on internet',
 '臺北港端',
 'admin template',
 '注視着你的一點機會',
 '面對面不好意思',
 '東西向快速公路東石嘉義縣',
 '淡水漁人碼頭',
 '阿聯交流道',
 '剩下三分之二的能吃',
 '新店家樂福要開始動工了',
 'ntv7',
 'covid-19 limited',
 '0港幣',
 '神岡區中山路',
 '說明病貓的重要性',
 '你的多個聯合一個名單取消',
 'violin',
 '若沒被記錄下來',
 '給Peter拉布麻sense',
 '你想幹什麼那個蘋果很差嗎',
 '而不是放大某些意見的代表性',
 '新莊廟街商圈',
 '努力破解的問題',
 'xxxxx',
 '市民大道高架橋',
 'halion Sonic 3',
 '不是這麼豐富的了',
 'k407',
