# Creating an ASR dataset for Khanty

In this notebook, we convert the audio recordings and their corresponding ELAN (`.eaf`) annotations into the format used by the `datasets` library, so that the resulting dataset can be used to fine-tune a multilingual Whisper model.

In [None]:
pip install pydub pympi-ling

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting pympi-ling
  Downloading pympi_ling-1.70.2-py2.py3-none-any.whl (24 kB)
Installing collected packages: pympi-ling, pydub
Successfully installed pydub-0.25.1 pympi-ling-1.70.2


In [None]:
from pydub import AudioSegment
import pympi
import os
import pandas as pd
import re

In [None]:
from google.colab import drive

# Mount Google Drive so that the /content/gdrive/MyDrive/... paths used below are reachable.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Function to cut audio into chunks (we cut the audio into segments corresponding with the annotations)
def cut_audio(audio_path, start_ms, end_ms, output_path, audio=None):
    """Export the ``start_ms``..``end_ms`` slice of a recording as a WAV file.

    Args:
        audio_path: Path of the source recording; only read when ``audio`` is None.
        start_ms: Start of the slice (pydub slices are expressed in milliseconds).
        end_ms: End of the slice.
        output_path: Path the WAV chunk is written to.
        audio: Optional, already decoded ``AudioSegment``. Passing it lets a caller
            that cuts many chunks out of one recording decode the file only once.
    """
    if audio is None:
        audio = AudioSegment.from_file(audio_path)
    audio[start_ms:end_ms].export(output_path, format="wav")


# Function to create chunks and DataFrame
def process_audio(audio_folder, eaf_folder,
                  chunks_folder="/content/gdrive/MyDrive/asr/chunks",
                  csv_path="chunks_info.csv",
                  tier_name="phrase"):
    """Cut every annotated segment into its own WAV chunk and write an index CSV.

    For each ``<name>.eaf`` in ``eaf_folder`` the annotations of ``tier_name`` are
    read, the matching ``<name>.wav`` from ``audio_folder`` is sliced accordingly,
    and each slice is saved as ``<name>_<idx>.wav`` in ``chunks_folder``.

    Args:
        audio_folder: Folder containing the source ``.wav`` recordings.
        eaf_folder: Folder containing the ELAN ``.eaf`` annotation files.
        chunks_folder: Output folder for the chunks (created if missing).
        csv_path: Path of the CSV listing ``Filename`` and ``Transliteration``.
        tier_name: Name of the ELAN tier holding the transcriptions.

    EAF files without the requested tier are reported by printing their name and
    skipped. EAF files whose recording is missing are reported and skipped as
    well, instead of aborting the whole run.
    """
    os.makedirs(chunks_folder, exist_ok=True)

    chunk_data = []

    # Sorted so the row order of the CSV does not depend on the file system.
    for eaf_file in sorted(os.listdir(eaf_folder)):
        if not eaf_file.endswith(".eaf"):
            continue

        base_name = os.path.splitext(eaf_file)[0]
        eaf = pympi.Elan.Eaf(os.path.join(eaf_folder, eaf_file))

        if tier_name not in eaf.get_tier_names():
            print(eaf_file)
            continue

        audio_path = os.path.join(audio_folder, base_name + ".wav")
        if not os.path.isfile(audio_path):
            print(f"{eaf_file}: audio file not found: {audio_path}")
            continue

        # Decode the recording once per file rather than once per annotation.
        audio = AudioSegment.from_file(audio_path)

        for idx, annotation in enumerate(eaf.get_annotation_data_for_tier(tier_name)):
            # Tiers holding reference annotations may yield a fourth element
            # (the referenced value); only the first three fields are needed.
            start_ms, end_ms, transliteration = annotation[:3]
            chunk_name = f"{base_name}_{idx}.wav"
            chunk_path = os.path.join(chunks_folder, chunk_name)
            cut_audio(audio_path, start_ms, end_ms, chunk_path, audio=audio)

            chunk_data.append({"Filename": chunk_name, "Transliteration": transliteration})

    # Fixed columns keep the CSV header present even when no chunk was produced.
    df = pd.DataFrame(chunk_data, columns=["Filename", "Transliteration"])

    # Save DataFrame to CSV
    df.to_csv(csv_path, index=False, encoding="utf-8")

# Example usage
# Google Drive folders holding the source recordings and their .eaf annotation files.
audio_folder = "/content/gdrive/MyDrive/asr/audios"
eaf_folder = "/content/gdrive/MyDrive/asr/eafs"
# Cuts the annotated segments into WAV chunks and writes chunks_info.csv.
process_audio(audio_folder, eaf_folder)

In [None]:
# Define the paths to the folders
audios_folder = "/content/gdrive/MyDrive/asr/audios"
eafs_folder = "/content/gdrive/MyDrive/asr/eafs"

# Get the list of audio files (sorted so the report is stable between runs)
audio_files = sorted(os.listdir(audios_folder))

# Get the list of eaf files. Only real .eaf annotations count: any other file in the
# folder that happens to share a recording's base name must not hide a missing annotation.
eaf_files = sorted(name for name in os.listdir(eafs_folder) if name.endswith(".eaf"))

# Extract the base names (without extensions) of eaf files
eaf_base_names = {os.path.splitext(eaf)[0] for eaf in eaf_files}

# Check which audio files don't have a corresponding eaf file
audios_without_eafs = [audio for audio in audio_files if os.path.splitext(audio)[0] not in eaf_base_names]

# Print the result
print("Audio files without corresponding EAF files:")
for audio in audios_without_eafs:
    print(audio)


Audio files without corresponding EAF files:


In [None]:
# Merge the two chunk tables into asr_info.csv, keeping a single header row.
# The files are read and written explicitly as UTF-8: the transcriptions contain
# non-ASCII characters and the platform default encoding is not guaranteed to be UTF-8.
with open('chunks_info.csv', 'r', encoding='utf-8') as file1, \
        open('chunks_info_2.csv', 'r', encoding='utf-8') as file2:

    table1_content = file1.read()

    # Skip the header of the second table; the default avoids StopIteration on an empty file.
    next(file2, None)
    table2_content = file2.read()

# Terminate the last row of the first table so it cannot fuse with the first row of the second.
if table1_content and not table1_content.endswith('\n'):
    table1_content += '\n'

merged_content = table1_content + table2_content


with open('asr_info.csv', 'w', encoding='utf-8') as merged_file:
    merged_file.write(merged_content)


In [None]:
# Load the merged table and discard rows with any missing value in one chained step.
# NOTE(review): this reads asr_info.csv from Google Drive, whereas the merge cell writes
# asr_info.csv to the working directory — confirm the merged file is copied to Drive first.
df = pd.read_csv('/content/gdrive/MyDrive/asr/asr_info.csv').dropna()


In [None]:
# We transliterate those segments that use different notation to make all the annotations the same format
# Mapping dictionary: source character -> target notation. Upper- and lower-case source
# letters map to the same lower-case output, and a few entries expand to two characters
# (e.g. 'ю' -> 'ju', 'Љ' -> "λ'").
# Characters that have no entry here are left untouched by transliterate_string.
# NOTE(review): 'Д' appears to be listed twice with the same value, so the second entry is redundant.
transliteration_map = {
    'И': 'i', 'Ы': 'i', 'Э': 'e', 'Є': 'ԑ', 'ä': 'ă', 'Ă': 'ă', 'Ө': 'ɵ', 'Ў': 'ʉ', 'У': 'u', 'О': 'o',
    'А': 'a', 'ə': 'ə', 'Ӆ': 'λ', 'Т': 't', 'С': 's', 'Ш': 'š', 'Щ': 'ś', 'Њ': 'ń', 'њ': 'ń', 'Ӈ': 'ŋ',
    'Љ': "λ'", 'љ': "λ'", 'Л': 'l', 'В': 'w', 'Й': 'j', 'П': 'p', 'Р': 'r', 'З': 'z', 'К': 'k', 'Б': 'b', 'Д': 'd',
    'п': 'p', 'у': 'u', 'т': 't', 'ӑ': 'ă', 'р': 'r', 'э': 'e', 'м': 'm', 'М': 'm', 'й': 'j', 'с': 's', 'н': 'n',
    'в': 'w', 'о': 'o', 'ӊ': 'ň', 'Х': 'χ', 'щ': 'ś', 'и': 'i', 'ш': 'š', 'ю': 'ju', 'ә': 'ə', 'ў': 'ʉ',
    'к': 'k', 'д': 'd', 'Д': 'd', 'ӆ': 'λ', 'а': 'a', 'л': 'l', 'з': 'z', 'є': 'ԑ', 'ө': 'ɵ', 'ӈ': 'ŋ',
    'ӧ': 'ɵ', 'ы': 'i', 'б': 'b', 'г': 'g', 'ф': 'f', 'х': 'χ',
    'ь': "'", 'я' : 'ja', 'Я' : 'ja', 'ё': 'jo', 'Ё': 'jo'
}

# Function to transliterate a string using the mapping dictionary
def transliterate_string(text, mapping=None):
    """Return ``text`` with every mapped character replaced by its transliteration.

    Args:
        text: String to transliterate; it is processed one character at a time.
        mapping: Optional dict of single characters to replacement strings.
            Defaults to the notebook-level ``transliteration_map``.

    Returns:
        The transliterated string. Characters without an entry in the mapping
        are copied through unchanged.
    """
    if mapping is None:
        mapping = transliteration_map
    # join() builds the result in one pass instead of repeated string concatenation.
    return ''.join(mapping.get(char, char) for char in text)

# Apply transliteration to every row whose Filename does NOT start with 'kh_' (note the `~`).
# NOTE(review): the mask is named `kh_rows`, which suggests the opposite selection —
# confirm which group of files actually needs to be transliterated.
kh_rows = ~df['Filename'].str.startswith('kh_')
df.loc[kh_rows, 'Transliteration'] = df.loc[kh_rows, 'Transliteration'].apply(transliterate_string)


In [None]:
# We clean the annotations
def clean_text(text):
    """Normalise one transcription string.

    Removes digits, guillemets, double quotes and angle brackets, basic Cyrillic
    letters, sentence punctuation, hyphens/dashes, ellipses and parentheses;
    lower-cases the text and unifies 'ä' to 'ă'.

    Whitespace is collapsed and trimmed as the very last step, so removing a
    symbol that stood between two spaces (e.g. "a … b" or "( a )") cannot leave
    double, leading or trailing spaces in the result.

    Args:
        text: Raw transcription string.

    Returns:
        The cleaned, lower-cased string with single spaces between tokens.
    """
    cleaned_text = re.sub(r'\d', '', text)
    # Apostrophes are deliberately kept: the transliteration uses them (e.g. "λ'").
    cleaned_text = re.sub(r'[«»"<>]', '', cleaned_text)
    # Drop whatever basic Cyrillic letters are still present at this point.
    cleaned_text = re.sub(r'[а-яА-Я]', '', cleaned_text)
    cleaned_text = re.sub(r'[.?!:,\-–…()]', '', cleaned_text)
    cleaned_text = cleaned_text.lower().replace('ä', 'ă')
    return re.sub(r'\s+', ' ', cleaned_text).strip()

# Clean every transcription, overwriting the 'Transliteration' column.
df['Transliteration'] = df['Transliteration'].apply(clean_text)


In [None]:
#We add the path to the files for the model to be able to find them
def add_prefix(filename, prefix="/content/data/"):
    """Return ``filename`` with the data-folder ``prefix`` in front of it.

    The function is idempotent: a name that already starts with ``prefix`` is
    returned unchanged, so re-running the notebook cell that applies it does not
    stack the prefix twice.

    Args:
        filename: Chunk file name (or an already prefixed path).
        prefix: Folder prefix to prepend; defaults to "/content/data/".

    Returns:
        The prefixed path as a string.
    """
    if filename.startswith(prefix):
        return filename
    return prefix + filename


# Turn every chunk file name into the full path under the data folder.
df['Filename'] = df['Filename'].apply(add_prefix)

In [None]:
# Rename the columns to 'file_name' / 'transcription'.
# Presumably these are the column names the downstream `datasets` loader expects — confirm.
df = df.rename(columns={'Filename': 'file_name', 'Transliteration': 'transcription'})

In [None]:
# Preview the final table.
# NOTE(review): the saved output below shows paths under a different folder than the
# "/content/data/" prefix added by add_prefix — it looks stale; re-run before relying on it.
df

Unnamed: 0,file_name,transcription
0,/content/gdrive/MyDrive/folder_for_export/data...,χʉλ suχ ewăλt tʉtśaň χir jontsăt
1,/content/gdrive/MyDrive/folder_for_export/data...,jis pura wʉš ewăλt χănnԑχujatăt χʉλ suχ ewa...
2,/content/gdrive/MyDrive/folder_for_export/data...,xvii xviiimit nəpătn χănšum nԑpkătn uša wԑr...
3,/content/gdrive/MyDrive/folder_for_export/data...,śi purajn pannԑ səχ kări χʉλ suχăt ewăλt λə...
4,/content/gdrive/MyDrive/folder_for_export/data...,śimăś λəmătsuχăt ăn toχsăt pa χʉw măr ta...
...,...,...
3559,/content/gdrive/MyDrive/folder_for_export/data...,šăŋk wăra
3560,/content/gdrive/MyDrive/folder_for_export/data...,χʉλ χăλti wăr ăntɵm wɵs
3561,/content/gdrive/MyDrive/folder_for_export/data...,śit iśi jăm chto χʉλan λiλiŋa χaśsət arašək j...
3562,/content/gdrive/MyDrive/folder_for_export/data...,śi oλaŋən tăm χʉλ oλaŋən muλti jăša putərsəm


In [None]:
# Write the final table without the pandas index: after dropna() the index is only a
# row counter with gaps, and it would otherwise be saved as an extra unnamed first column.
df.to_csv('metadata.csv', index=False, encoding='utf-8')