In [1]:
# !pip install jiwer -qq

In [2]:
# !pip install git+https://github.com/redapesolutions/suara-kami-community

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sk import predict
from pathlib import Path
import jiwer
import pandas as pd
import numpy as np

# Using predict function

The `predict` function is a generic entry point that handles every supported type of input: a single file, a list of files, a folder, or a list of folders.

It is a high-level API for transcription; if you need more flexible transcription, use the `SK` class instead.

# Predict single or multiple files

In [3]:
# Transcribe one audio file; as noted inline, a list of paths is accepted too.
fn = "/content/test/youtube/0228444ff34081eda587c0ca53712486.wav" # or ["/content/audio1.wav","/content/audio2.wav"]
out = predict(fn)

Total input path: 1
Total audio found(.wav): 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.53it/s]


In [4]:
# Show the transcript stored next to the audio file (same stem, ".txt" suffix).
Path(fn).with_suffix(".txt").read_text()

'pada peringkat ini pembukaan sekolah hanya melibatkan'

In [5]:
# Display the full result dictionary returned by `predict`.
out

{'texts': ['pada peringkat ini pembukaan sekolah hanya melibatkan'],
 'filenames': [PosixPath('/content/test/youtube/0228444ff34081eda587c0ca53712486.wav')],
 'entropy': [0.081219584],
 'timestamps': [[0]],
 'speakers': [['not enabled']]}

# Predict from single or multiple folders

In [6]:
# Transcribe a folder; as noted inline, a list of folders is accepted too.
# NOTE: this rebinds `fn` (previously a single file path) and `out`.
fn = "/content/test/" # or ["/content/test","/content/data"]
out = predict(fn)

Total input path: 1
Total audio found(.wav): 1700


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1700/1700 [04:42<00:00,  6.03it/s]


In [7]:
# List the fields available in the result dictionary.
out.keys()

dict_keys(['texts', 'filenames', 'entropy', 'timestamps', 'speakers'])

In [8]:
# Pull out the transcripts and the paths they came from. The two lists are
# zipped together when scoring, so they are assumed to be index-aligned.
preds = out["texts"]
files = out["filenames"]

In [9]:
# Score every prediction against its reference transcript, which is read from
# the ".txt" file sitting next to each audio file.
# Each row is [path, label, pred, wer, cer].
data = []
for pred, audio_path in zip(preds, files):
    label = audio_path.with_suffix(".txt").read_text()

    # Word error rate on the raw strings.
    word_error = jiwer.wer(label, pred)

    # Character error rate, ignoring whitespace: drop all whitespace, then
    # space-separate the remaining characters so each one is scored as a
    # "word" of a single sentence. This avoids passing a list of
    # one-character "sentences" to jiwer, whose handling of list inputs
    # (equal-length and non-empty requirements) varies between releases.
    label_chars = " ".join("".join(label.split()))
    pred_chars = " ".join("".join(pred.split()))
    char_error = jiwer.wer(label_chars, pred_chars)

    data.append([audio_path, label, pred, word_error, char_error])

In [10]:
# Collect the per-file scores into a DataFrame. Column names are passed to the
# constructor so that an empty `data` list yields an empty, correctly labelled
# frame instead of a length-mismatch error on column assignment.
df = pd.DataFrame(data, columns=["path", "label", "pred", "wer", "cer"])
df.head()

Unnamed: 0,path,label,pred,wer,cer
0,/content/test/youtube/acce76874b0d5edc51fdd5fa...,sebaik saja kita selesai makan malam,sebaik saja kita selesai makan malam,0.0,0.0
1,/content/test/youtube/e8c3214529ef34af368c0439...,pentaksiran tingkatan tiga p t tiga bagi tahun...,pentaksiran tingkatan tiga pt tiga bagi tahun ...,0.2,0.0
2,/content/test/youtube/6c4ee08c30a81d280ea66dd7...,tengok dekat mana,dekat mana tengok,0.666667,0.8
3,/content/test/youtube/223d189c07d45aa377436a41...,gil keras kepala macam mak engkau engkau,gerah kepala macam manggaum lenggau,0.714286,0.294118
4,/content/test/youtube/87f7a2b2d1d88b235f43c28c...,pengusaha sekolah swasta antarabangsa dan pusa...,pengusaha sekolah swasta antarabangsa dan pusa...,0.0,0.0


In [11]:
# Mean word error rate and mean character error rate over all scored files.
df["wer"].mean(),df["cer"].mean()

(0.07793928162784017, 0.025547171006191426)

# Using SK class

Why use the `SK` class? Each call to `predict` loads the model and the language model (LM) again, which is inefficient when you transcribe multiple times. With an `SK` object you can transcribe as many times as you like without reloading anything.

In [12]:
from sk import SK

In [13]:
# Build the transcriber once so it can be reused by the calls below.
# NOTE(review): `decoder=None` presumably disables language-model decoding —
# confirm against the sk documentation.
asr = SK(model="conformer_small",decoder=None)

In [14]:
# Transcribe a file with the already-constructed `SK` object.
asr.transcribe_file(fn="/content/test/youtube/0228444ff34081eda587c0ca53712486.wav")

Total input path: 1
Total audio found(.wav): 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.67it/s]


{'texts': ['pada peringkat ini pembukaan sekolah hanya melibatkan'],
 'filenames': ['/content/test/youtube/0228444ff34081eda587c0ca53712486.wav'],
 'entropy': [0.081219584],
 'timestamps': [[0]],
 'speakers': [['not enabled']],
 'all_logits': []}

In [15]:
import librosa

# Decode the audio at a 16000 Hz sample rate and transcribe the in-memory
# samples directly. `sr` is passed by keyword: recent librosa releases accept
# only the path positionally, so a positional sample rate raises a TypeError.
audio, _ = librosa.load("/content/test/youtube/0228444ff34081eda587c0ca53712486.wav", sr=16000)
asr.transcribe_array(array=audio)

{'texts': 'pada peringkat ini pembukaan sekolah hanya melibatkan',
 'filenames': array([0.00311279, 0.00430298, 0.00643921, 0.00476074, 0.00372314,
        0.00167847, 0.00363159, 0.0072937 , 0.00421143, 0.00289917,
        0.0022583 , 0.00341797, 0.00488281, 0.00982666, 0.0098877 ,
        0.00900269], dtype=float32),
 'entropy': 0.025752233,
 'timestamps': [0]}