In [1]:
# !pip install jiwer -qq

In [2]:
# !pip install git+https://github.com/redapesolutions/suara-kami-community

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from sk import predict
from pathlib import Path
import jiwer
import pandas as pd
import numpy as np

# Using predict function

Predict function is a generic function that will handle all type of input. 

It is a high layer api for transcription, if want to do more flexible transcription can use SK class

# Predict single or multiple file

In [5]:
fn = "/content/test/youtube/0228444ff34081eda587c0ca53712486.wav" # or ["/content/audio1.wav","/content/audio2.wav"]
out = predict(fn)

loaded model: /home/ubuntu/.sk/models/conformer_small.onnx ['CPUExecutionProvider']
Total input path: 1
Total audio found(.wav): 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.60it/s]


In [6]:
Path(fn).with_suffix(".txt").read_text()

'pada peringkat ini pembukaan sekolah hanya melibatkan'

In [7]:
out

{'texts': ['pada peringkat ini pembukaan sekolah hanya melibatkan'],
 'filenames': [PosixPath('/content/test/youtube/0228444ff34081eda587c0ca53712486.wav')],
 'entropy': [0.081219584],
 'timestamps': [[0]],
 'speakers': [['not enabled']]}

# Predict from single or multiple folder

In [8]:
fn = "/content/test/" # or ["/content/test","/content/data"]
out = predict(fn)

loaded model: /home/ubuntu/.sk/models/conformer_small.onnx ['CPUExecutionProvider']
Total input path: 1
Total audio found(.wav): 1700


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1700/1700 [04:45<00:00,  5.96it/s]


In [9]:
out.keys()

dict_keys(['texts', 'filenames', 'entropy', 'timestamps', 'speakers'])

In [10]:
preds = out["texts"]
files = out["filenames"]

In [11]:
data = []
for i,j in zip(preds,files):
    label = j.with_suffix('.txt').read_text()
    data.append([j,label,i,jiwer.compute_measures(label,i)['wer'],jiwer.compute_measures([l for l in label],[ii for ii in i])['wer']])

In [12]:
df = pd.DataFrame(data)
df.columns = ["path","label","pred","wer","cer"]
df.head()

Unnamed: 0,path,label,pred,wer,cer
0,/content/test/youtube/acce76874b0d5edc51fdd5fa...,sebaik saja kita selesai makan malam,sebaik saja kita selesai makan malam,0.0,0.0
1,/content/test/youtube/e8c3214529ef34af368c0439...,pentaksiran tingkatan tiga p t tiga bagi tahun...,pentaksiran tingkatan tiga pt tiga bagi tahun ...,0.2,0.0
2,/content/test/youtube/6c4ee08c30a81d280ea66dd7...,tengok dekat mana,dekat mana tengok,0.666667,0.8
3,/content/test/youtube/223d189c07d45aa377436a41...,gil keras kepala macam mak engkau engkau,gerah kepala macam manggaum lenggau,0.714286,0.294118
4,/content/test/youtube/87f7a2b2d1d88b235f43c28c...,pengusaha sekolah swasta antarabangsa dan pusa...,pengusaha sekolah swasta antarabangsa dan pusa...,0.0,0.0


In [13]:
df["wer"].mean(),df["cer"].mean()

(0.07793928162784017, 0.025547171006191426)

# Using SK class

Why use SK class? if want to transcribe multiple time predict will load the model and lm multiple time which is not efficient, with SK object you can call it multiple time without reloading anything again.

In [14]:
from sk import SK

In [15]:
asr = SK(model="conformer_small",decoder=None)

loaded model: /home/ubuntu/.sk/models/conformer_small.onnx ['CPUExecutionProvider']


In [16]:
asr.transcribe_file(fn="/content/test/youtube/0228444ff34081eda587c0ca53712486.wav")

Total input path: 1
Total audio found(.wav): 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.07it/s]


{'texts': ['pada peringkat ini pembukaan sekolah hanya melibatkan'],
 'filenames': ['/content/test/youtube/0228444ff34081eda587c0ca53712486.wav'],
 'entropy': [0.081219584],
 'timestamps': [[0]],
 'speakers': [['not enabled']],
 'all_logits': []}

In [17]:
import librosa
asr.transcribe_array(array=librosa.load("/content/test/youtube/0228444ff34081eda587c0ca53712486.wav",16000)[0])

{'texts': 'pada peringkat ini pembukaan sekolah hanya melibatkan',
 'filenames': array([0.00311279, 0.00430298, 0.00643921, 0.00476074, 0.00372314,
        0.00167847, 0.00363159, 0.0072937 , 0.00421143, 0.00289917,
        0.0022583 , 0.00341797, 0.00488281, 0.00982666, 0.0098877 ,
        0.00900269], dtype=float32),
 'entropy': 0.081219584,
 'timestamps': [0]}

In [19]:
asr = SK(model="silero_en",decoder=None)

downloading: en_v5.onnx


114836it [00:06, 18753.09it/s]                                                                                                                                                                                                               


saved to: /home/ubuntu/.sk/models/en_v5.onnx
loaded model: /home/ubuntu/.sk/models/en_v5.onnx ['CPUExecutionProvider']


In [20]:
asr.transcribe_file(fn="/content/testset-imda/0.wav")

Total input path: 1
Total audio found(.wav): 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.09it/s]


{'texts': ["because if the next person doesn't do a good job at as you all your efforts may be wasted"],
 'filenames': ['/content/testset-imda/0.wav'],
 'entropy': [-2.5951443],
 'timestamps': [[0]],
 'speakers': [['not enabled']],
 'all_logits': []}

In [21]:
asr = SK(model="silero_en",decoder="en")

downloading mixed-lower.binary.zip language model of size 600+MB, might take a while


778999it [00:34, 22601.06it/s]                                                                                                                                                                                                               


saved to: /home/ubuntu/.sk/lm/mixed-lower.binary.klm


Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
No known unigrams provided, decoding results might be a lot worse.


loaded model: /home/ubuntu/.sk/models/en_v5.onnx ['CPUExecutionProvider']
loaded lm: en


In [22]:
asr.transcribe_file(fn="/content/testset-imda/0.wav")

Total input path: 1
Total audio found(.wav): 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.83it/s]


{'texts': ["because if the next person doesn't do a good job as you all your efforts may be wasted"],
 'filenames': ['/content/testset-imda/0.wav'],
 'entropy': [[-5.385753631591797,
   -2.7172725200653076,
   -2.7174205780029297,
   -2.717975378036499,
   -2.717954635620117,
   -12.571464538574219,
   -2.7051992416381836,
   -5.339262962341309,
   -2.7172226905822754,
   -7.258273601531982,
   -4.206550598144531,
   -2.591944694519043,
   -2.349553346633911,
   -5.384856700897217,
   -10.193320274353027,
   -2.7152457237243652,
   -2.7167961597442627,
   -12.797026634216309]],
 'timestamps': [[('because', 0.03, 0.04),
   ('if', 0.05, 0.05),
   ('the', 1.0, 1.01),
   ('next', 1.02, 1.02),
   ('person', 1.03, 1.04),
   ("doesn't", 1.05, 2.01),
   ('do', 2.01, 2.02),
   ('a', 2.02, 2.03),
   ('good', 2.04, 2.04),
   ('job', 2.05, 3.0),
   ('as', 3.04, 3.04),
   ('you', 3.05, 3.06),
   ('all', 4.04, 4.04),
   ('your', 4.05, 5.0),
   ('efforts', 5.01, 5.03),
   ('may', 5.03, 5.04),
   ('be