In [1]:
import os

In [2]:
# define utility functions for metrics performance
def avg_wer(wer_scores, combined_ref_len):
	"""Return the summed error scores divided by the combined reference length.

	:param wer_scores: Iterable of numeric per-utterance error values.
	:param combined_ref_len: Value the summed errors are divided by.
	:return: ``sum(wer_scores) / combined_ref_len`` as a float.
	:rtype: float
	"""
	total_errors = float(sum(wer_scores))
	denominator = float(combined_ref_len)
	return total_errors / denominator

def _levenshtein_distance(ref, hyp):
	"""Return the Levenshtein (edit) distance between two sequences.

	The distance is the minimum number of single-item substitutions,
	insertions or deletions needed to turn one sequence into the other.
	Items are compared with ``==``, so passing strings gives a
	character-level distance and passing lists of words gives a word-level
	distance.

	Implemented with two plain Python rows, so it needs no third-party
	array library and returns a built-in ``int``.

	:param ref: The reference sequence (e.g. ``str`` or list of words).
	:param hyp: The hypothesis sequence of the same kind.
	:return: Edit distance between ``ref`` and ``hyp``.
	:rtype: int
	"""
	m = len(ref)
	n = len(hyp)

	# special cases: identical input, or one side empty
	if ref == hyp:
		return 0
	if m == 0:
		return n
	if n == 0:
		return m

	# The distance is symmetric, so make ``hyp`` the shorter sequence;
	# the rows below then only need O(min(m, n)) space.
	if m < n:
		ref, hyp = hyp, ref
		m, n = n, m

	# Row 0: turning an empty prefix of ``ref`` into hyp[:j] costs j inserts.
	prev_row = list(range(n + 1))

	for i, ref_item in enumerate(ref, start=1):
		# Column 0: turning ref[:i] into an empty prefix costs i deletions.
		cur_row = [i] + [0] * n
		for j, hyp_item in enumerate(hyp, start=1):
			if ref_item == hyp_item:
				cur_row[j] = prev_row[j - 1]
			else:
				cur_row[j] = 1 + min(
					prev_row[j - 1],  # substitution
					cur_row[j - 1],   # insertion
					prev_row[j],      # deletion
				)
		prev_row = cur_row

	return prev_row[n]


def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
	"""Compute the word-level Levenshtein distance between two sentences.

	Both sentences are split on ``delimiter`` and empty tokens are dropped,
	so leading, trailing or repeated delimiters do not create phantom words
	(and an empty reference yields a reference length of 0).

	:param reference: The reference sentence.
	:type reference: str
	:param hypothesis: The hypothesis sentence.
	:type hypothesis: str
	:param ignore_case: If true, both sentences are lower-cased first.
	:type ignore_case: bool
	:param delimiter: Delimiter of input sentences.
	:type delimiter: str
	:return: Levenshtein distance (as float) and word count of the reference.
	:rtype: tuple
	"""
	if ignore_case:
		reference = reference.lower()
		hypothesis = hypothesis.lower()

	# str.split with an explicit separator yields '' between consecutive
	# delimiters; filter those out so they are not counted as words.
	ref_words = [word for word in reference.split(delimiter) if word]
	hyp_words = [word for word in hypothesis.split(delimiter) if word]

	edit_distance = _levenshtein_distance(ref_words, hyp_words)
	return float(edit_distance), len(ref_words)


def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
	"""Compute the character-level Levenshtein distance between two sentences.

	Each sentence is split on single spaces, empty pieces are discarded and
	the remaining pieces are re-joined with one space (or with nothing when
	``remove_space`` is set) before the distance is computed.

	:param reference: The reference sentence.
	:type reference: str
	:param hypothesis: The hypothesis sentence.
	:type hypothesis: str
	:param ignore_case: When ``True``, both sentences are lower-cased first.
	:type ignore_case: bool
	:param remove_space: When ``True``, space characters are removed entirely.
	:type remove_space: bool
	:return: Levenshtein distance (as float) and length of the normalised
		reference.
	:rtype: tuple
	"""
	if ignore_case == True:
		reference, hypothesis = reference.lower(), hypothesis.lower()

	separator = '' if remove_space == True else ' '

	def _normalize(sentence):
		# Keep only the non-empty space-separated pieces, then glue them back.
		return separator.join(piece for piece in sentence.split(' ') if piece)

	ref_chars = _normalize(reference)
	hyp_chars = _normalize(hypothesis)

	return float(_levenshtein_distance(ref_chars, hyp_chars)), len(ref_chars)


def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
	"""Calculate the word error rate (WER) of a hypothesis against a reference.

	WER is defined as:
	.. math::
		WER = (Sw + Dw + Iw) / Nw
	where
	.. code-block:: text
		Sw is the number of words substituted,
		Dw is the number of words deleted,
		Iw is the number of words inserted,
		Nw is the number of words in the reference
	The edit count and the reference word count both come from
	``word_errors``, which does the splitting and case handling.

	:param reference: The reference sentence.
	:type reference: str
	:param hypothesis: The hypothesis sentence.
	:type hypothesis: str
	:param ignore_case: Forwarded to ``word_errors``.
	:type ignore_case: bool
	:param delimiter: Delimiter of input sentences, forwarded to
		``word_errors``.
	:type delimiter: str
	:return: Word error rate.
	:rtype: float
	:raises ValueError: If ``word_errors`` reports a reference length of zero.
	"""
	num_errors, num_ref_words = word_errors(
		reference, hypothesis, ignore_case, delimiter)

	if num_ref_words == 0:
		raise ValueError("Reference's word number should be greater than 0.")

	return float(num_errors) / num_ref_words


def cer(reference, hypothesis, ignore_case=False, remove_space=False):
	"""Calculate the character error rate (CER) of a hypothesis.

	CER is defined as:
	.. math::
		CER = (Sc + Dc + Ic) / Nc
	where
	.. code-block:: text
		Sc is the number of characters substituted,
		Dc is the number of characters deleted,
		Ic is the number of characters inserted
		Nc is the number of characters in the reference
	The edit count and the reference length both come from ``char_errors``,
	which normalises spaces (and optionally case) before comparing.

	:param reference: The reference sentence.
	:type reference: str
	:param hypothesis: The hypothesis sentence.
	:type hypothesis: str
	:param ignore_case: Forwarded to ``char_errors``.
	:type ignore_case: bool
	:param remove_space: Forwarded to ``char_errors``.
	:type remove_space: bool
	:return: Character error rate.
	:rtype: float
	:raises ValueError: If ``char_errors`` reports a reference length of zero.
	"""
	num_errors, num_ref_chars = char_errors(
		reference, hypothesis, ignore_case, remove_space)

	if num_ref_chars == 0:
		raise ValueError("Length of reference should be greater than 0.")

	return float(num_errors) / num_ref_chars
	
def GreedyDecoder(output, labels, label_lengths, blank_label=0, collapse_repeated=True):
	"""Greedily decode model output and convert the targets to text.

	For every item the arg-max over ``dim=2`` of ``output`` is taken, blank
	labels are dropped, optionally consecutive repeats are collapsed, and the
	remaining indices are passed to ``text_transform.int_to_text``.

	:param output: Score tensor; assumed to be (batch, time, classes) since
		the arg-max is taken over ``dim=2`` -- TODO confirm.
	:param labels: Per-item target index tensors.
	:param label_lengths: Number of valid entries in each row of ``labels``.
	:param blank_label: Index that is dropped from the decoded sequence.
	:param collapse_repeated: Whether an index equal to the one at the
		previous position is skipped.
	:return: ``(decodes, targets)``: decoded texts and target texts.
	:rtype: tuple
	"""
	best_paths = torch.argmax(output, dim=2)
	decodes, targets = [], []

	for row, path in enumerate(best_paths):
		target_ids = labels[row][:label_lengths[row]].tolist()
		targets.append(text_transform.int_to_text(target_ids))

		kept = []
		for pos, index in enumerate(path):
			# Indices above 34 are ignored; presumably they fall outside the
			# text_transform vocabulary -- verify against its character map.
			if index > 34:
				continue
			if index == blank_label:
				continue
			if collapse_repeated and pos != 0 and index == path[pos - 1]:
				continue
			kept.append(index.item())

		decodes.append(text_transform.int_to_text(kept))

	return decodes, targets

### Create Metadata CSV

In [10]:
# Surah numbers (first three characters of the file name) to keep.
surat = ["078", "080", "089", "098", "099", "104", "105", "106", "107", "108", "109", "110", "111", "112", "113", "114"]
file_transcript = "data/transliterasi_id_juz_30.csv" # this csv is formatted as file_name|ayat|transcript
file_output = "data/transliterasi_id_juz_30_filter.csv"
# Reciter ids; one output row is written per (kept line, reciter) pair.
id_reciter = ["01", "03", "04", "05",  "06", "07", "08", "09"]

with open(file_transcript, "r") as f:
    lines = f.readlines()

with open(file_output, "w") as f:
    for line in lines:
        line = line.strip()
        # A blank line (e.g. a trailing newline at end of file) has no
        # fields and would break the three-way unpack below.
        if not line:
            continue
        file_name, ayat, transcript = line.split("|")
        # Keep only the part of the name before the first underscore.
        file_name = file_name.split("_")[0]
        # check if the first three characters of the file name are in surat
        if file_name[:3] in surat:
            # `reciter_id` rather than `id`, so the builtin id() is not
            # rebound at notebook top level.
            for reciter_id in id_reciter:
                f.write(f"{file_name}_{reciter_id}.wav|{ayat}|{transcript}\n")

print("Done")

Done


### Remove Reciter

In [3]:
accepted_reciters = ['01', '03', '09', '10', '12', '13', '14', '15', '16', '19', '22', '24', '27']
# removed_files holds every two-digit reciter id from 01 to 30 that is not
# listed in accepted_reciters
removed_files = [
    reciter_id
    for reciter_id in ("{:02d}".format(number) for number in range(1, 31))
    if reciter_id not in accepted_reciters
]

### Cleansing Markdown

In [7]:
# Read the markdown file; the `with` block closes the handle afterwards.
with open("data/full-id-transliteration-kemenag.md", "r") as file_transliterasi:
    lines = file_transliterasi.readlines()

# Drop lines that start with "#" and lines that contain only a newline.
lines = [
    line for line in lines
    if not line.startswith("#") and line != "\n"
]

# Write the cleaned file; closing it via `with` guarantees the content is
# flushed to disk.
with open("data/clean-full-id-transliteration-kemenag.md", "w") as clean_file:
    clean_file.writelines(lines)

### MOVE FILEs

In [12]:
import shutil

# Flatten data/audio/<folder>/<ayat folder>/<file> into data/audio/<file>
# and delete the emptied folders.  Uses shutil instead of shell commands so
# names containing spaces or shell metacharacters are handled correctly and
# failures raise instead of being silently ignored.

# define path
path = 'data/audio'
# get all entries directly under path
all_folder = os.listdir(path)
for folder in all_folder:
    folder_path = os.path.join(path, folder)
    # Skip plain files (e.g. audio already moved by an earlier run) so the
    # cell can be re-run without os.listdir failing on a non-directory.
    if not os.path.isdir(folder_path):
        continue
    # get all ayat folders in this folder
    folder_ayat = os.listdir(folder_path)
    for f_ayat in folder_ayat:
        ayat_path = os.path.join(folder_path, f_ayat)
        # get all files in the ayat folder
        all_file = os.listdir(ayat_path)
        for file in all_file:
            # Move into data/audio.  Passing the full destination file path
            # lets an existing file of the same name be overwritten, as the
            # previous `mv` did.
            shutil.move(os.path.join(ayat_path, file), os.path.join(path, file))
        # remove the ayat folder
        shutil.rmtree(ayat_path)
    # remove the folder
    shutil.rmtree(folder_path)


### REMOVE FILEs

In [24]:
# Delete every .wav in `path` whose reciter id (the text between the first
# "_" and the following ".") is listed in removed_files.
for file in os.listdir(path):
    if not file.endswith('.wav'):
        continue
    name_parts = file.split("_")
    # A .wav without "_" carries no reciter id; leave it alone instead of
    # failing with IndexError.
    if len(name_parts) < 2:
        continue
    reciter_id = name_parts[1].split(".")[0]
    if reciter_id in removed_files:
        os.remove(os.path.join(path, file))
        print(f'{file} removed')

100007_26.wav removed
080006_20.wav removed
096013_06.wav removed
099003_20.wav removed
091010_07.wav removed
099006_05.wav removed
105004_05.wav removed
087014_06.wav removed
088002_17.wav removed
086009_29.wav removed
090002_18.wav removed
078005_18.wav removed
101008_20.wav removed
086016_20.wav removed
114005_07.wav removed
079034_20.wav removed
092019_25.wav removed
079033_07.wav removed
092008_30.wav removed
098006_04.wav removed
101001_18.wav removed
081017_29.wav removed
091008_23.wav removed
083021_05.wav removed
084020_04.wav removed
091012_11.wav removed
114004_05.wav removed
089023_20.wav removed
091002_29.wav removed
083026_17.wav removed
092015_07.wav removed
080031_20.wav removed
094007_25.wav removed
084013_11.wav removed
094003_30.wav removed
081018_04.wav removed
093007_29.wav removed
089020_07.wav removed
078037_11.wav removed
083027_05.wav removed
085004_25.wav removed
096015_07.wav removed
089005_02.wav removed
078033_07.wav removed
099005_06.wav removed
081028_06.

In [6]:
# Replace 'path_to_file' with the appropriate path
path_to_file = 'data/raw_transcription_phonem.csv'

# Read the text file line by line
with open(path_to_file, 'r') as file:
    lines = file.readlines()

# Suffixes "_01" .. "_30" for every reciter that has not been removed;
# computed once instead of once per input line.
kept_suffixes = [
    "_{:02d}".format(i)
    for i in range(1, 31)
    if "{:02d}".format(i) not in removed_files
]

# Convert the ID format
new_lines = []
for line in lines:
    # Drop the line ending here and add exactly one back per output row, so
    # the result does not depend on whether the input ends with a newline.
    line = line.rstrip('\n')
    if not line.strip():
        # Blank lines carry no ID fields.
        continue

    parts = line.split('|')
    # Join the first two fields into the ID, zero-padding each to three
    # digits so the base ID is always six digits wide.
    base_id = parts[0].strip().zfill(3) + parts[1].strip().zfill(3)
    remainder = '|'.join(parts[2:])

    # Emit one row per kept reciter suffix.
    for suffix in kept_suffixes:
        new_lines.append(base_id + suffix + '|' + remainder + '\n')

# create the output file
new_file = "data/transcription_phonem.csv"
with open(new_file, 'w') as file:
    file.writelines(new_lines)