# Testing speechbrain SLID model

In this notebook, we test the ability of the SpeechBrain spoken language identification (SLID) model to distinguish between Russian and Khanty. Even though the model is not trained on Khanty, we hope that it will identify Khanty as one of its closest high-resource relatives from the same language family. We limit the model to just six languages: Finnish, Estonian, and Hungarian on the one hand, and Russian, Belarusian, and Ukrainian on the other. If the highest probability for a segment belongs to one of the first three languages, we consider the language of the segment to be Khanty. If it belongs to one of the last three languages, we consider the language of the segment to be Russian.

As a result, we achieve 54% accuracy and conclude that additional fine-tuning of the SLID model on a novel dataset specifically designed for language identification is needed.

In [None]:
# Mount Google Drive into the Colab filesystem; the corpus paths used in this
# notebook all live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
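# Class indices in the model's output: 26 - Finnish, 38 - Hungarian, 23 - Estonian,
# 77 - Russian, 7 - Belarusian, 99 - Ukrainian.
# After selecting these six probabilities, positions 0, 1, 2 count as Khanty and positions 3, 4, 5 as Russian.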

In [None]:
import os
import subprocess

import pandas as pd
from pympi import Eaf

def process_files(eafs_folder, audios_folder, processing_list,
                  diar_chunks_folder="/content/gdrive/MyDrive/diarizationcorpora/diar_chunks",
                  output_csv="chunks_table.csv"):
    """Cut annotated segments out of recordings and index them in a CSV.

    For each name in ``processing_list`` the files ``<name>.eaf`` and
    ``<name>.wav`` are looked up in ``eafs_folder`` and ``audios_folder``.
    Every annotation found on a tier whose name contains "language"
    (case-insensitive) is extracted with ffmpeg into its own WAV file named
    ``<name>_<start_ms>_<end_ms>.wav``.

    Args:
        eafs_folder: Directory holding the ELAN ``.eaf`` annotation files.
        audios_folder: Directory holding the matching ``.wav`` recordings.
        processing_list: Base names (without extension) of the recordings.
        diar_chunks_folder: Directory the extracted chunks are written to;
            created if it does not exist.
        output_csv: Path of the CSV index with the columns "Chunk Name" and
            "Annotation".

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable is not installed.

    Recordings with a missing EAF/WAV file or without language tiers are
    skipped with a message. Chunks for which ffmpeg fails are reported and
    left out of the CSV so the index only lists files that were extracted.
    """
    os.makedirs(diar_chunks_folder, exist_ok=True)

    chunks_table = []

    for filename in processing_list:
        eaf_path = os.path.join(eafs_folder, filename + ".eaf")
        audio_path = os.path.join(audios_folder, filename + ".wav")

        if not (os.path.exists(eaf_path) and os.path.exists(audio_path)):
            print(f"Skipping {filename}: EAF or audio file not found.")
            continue

        eaf = Eaf(eaf_path)
        language_tiers = [
            tier for tier in eaf.get_tier_names() if 'language' in tier.lower()
        ]
        if not language_tiers:
            print(f"No language tiers found in {filename}. Skipping.")
            continue

        for tier_name in language_tiers:
            for annotation in eaf.get_annotation_data_for_tier(tier_name):
                # Reference tiers yield an extra fourth element; only the
                # first three (start ms, end ms, value) are needed here.
                start_time, end_time, text = annotation[:3]
                chunk_name = f"{filename}_{start_time}_{end_time}.wav"
                chunk_path = os.path.join(diar_chunks_folder, chunk_name)

                # Argument list instead of a shell string: paths containing
                # spaces or shell metacharacters are passed through intact.
                # -y overwrites chunks left by a previous run and -nostdin
                # keeps ffmpeg from waiting on an interactive prompt.
                command = [
                    "ffmpeg", "-nostdin", "-y",
                    "-i", audio_path,
                    "-ss", str(start_time / 1000),  # EAF times are in ms
                    "-to", str(end_time / 1000),
                    "-c", "copy",
                    chunk_path,
                ]
                result = subprocess.run(command, check=False)
                if result.returncode != 0:
                    print(f"ffmpeg failed for {chunk_name} "
                          f"(exit code {result.returncode}). Skipping.")
                    continue

                chunks_table.append((chunk_name, text))

    chunks_df = pd.DataFrame(chunks_table, columns=["Chunk Name", "Annotation"])
    chunks_df.to_csv(output_csv, index=False)


# Recordings to cut into chunks, and where their annotation and audio files
# are stored on the mounted Drive.
processing_list = [
    'knn_vp',
    'rai_sds_1',
    'rai_vm_2',
    'x_sds_1',
    'sed_as_1',
    'rai_sds_2',
]
audios_folder = "/content/gdrive/MyDrive/diarizationcorpora/audio_final/"
eafs_folder = "/content/gdrive/MyDrive/diarizationcorpora/eafs/"

process_files(
    eafs_folder=eafs_folder,
    audios_folder=audios_folder,
    processing_list=processing_list,
)


In [None]:
# Load the SpeechBrain language-identification model used by the cells below.
import os
import torch
import torchaudio
import pandas as pd
from speechbrain.inference.classifiers import EncoderClassifier
# Pretrained classifier identified by "speechbrain/lang-id-voxlingua107-ecapa";
# savedir="/tmp" is presumably where the downloaded model files are kept
# (see the SpeechBrain documentation to confirm).
audio_detector = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="/tmp")

hyperparams.yaml:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/84.5M [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/763k [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Classifier output indices of the candidate languages, as listed in this
# notebook's notes: 26 Finnish, 38 Hungarian, 23 Estonian stand in for Khanty;
# 77 Russian, 7 Belarusian, 99 Ukrainian stand in for Russian.
_KHANTY_PROXY_CLASSES = (26, 38, 23)
_RUSSIAN_PROXY_CLASSES = (77, 7, 99)


def detect_language(audio_path):
    """Label one audio file as "khanty" or "russian".

    The file is loaded and embedded with the module-level ``audio_detector``,
    the classifier scores are turned into probabilities with a softmax, and
    only the six candidate classes are compared. The label is "khanty" when
    the most probable candidate is one of ``_KHANTY_PROXY_CLASSES`` and
    "russian" otherwise.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The string "khanty" or "russian".
    """
    audio = audio_detector.load_audio(audio_path)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        emb = audio_detector.encode_batch(audio.unsqueeze(0))
        out_prob = audio_detector.mods.classifier(emb).squeeze(1)

    # Row 0 is the single utterance that was passed in.
    probabilities = torch.nn.functional.softmax(out_prob, dim=-1)[0]

    candidates = _KHANTY_PROXY_CLASSES + _RUSSIAN_PROXY_CLASSES
    candidate_probs = [probabilities[class_id].item() for class_id in candidates]

    # Khanty proxies come first in `candidates`, so a winning position below
    # their count means a Khanty proxy had the highest probability.
    best_position = candidate_probs.index(max(candidate_probs))
    if best_position < len(_KHANTY_PROXY_CLASSES):
        return "khanty"
    return "russian"

def add_language(chunk_folder, chunks_table_path, *,
                 output_csv="chunks_table_with_language.csv"):
    """Add a detected-language column to the chunk table and save it.

    Reads the CSV at ``chunks_table_path``, runs ``detect_language`` on every
    file named in its "Chunk Name" column, and writes a new CSV with the
    columns "Chunk Name", "Annotation" and "Detected Language".

    Args:
        chunk_folder: Directory containing the audio chunks.
        chunks_table_path: CSV with the columns "Chunk Name" and "Annotation".
        output_csv: Path of the CSV to write.
    """
    chunks_df = pd.read_csv(chunks_table_path)

    detected_languages = [
        detect_language(os.path.join(chunk_folder, chunk_name))
        for chunk_name in chunks_df["Chunk Name"]
    ]

    # Keep only the two input columns so the output has exactly three columns.
    result_df = chunks_df[["Chunk Name", "Annotation"]].copy()
    result_df["Detected Language"] = detected_languages
    result_df.to_csv(output_csv, index=False)

# The chunk index to annotate and the folder holding the chunk audio files.
chunks_table_path = "chunks_table.csv"
chunk_folder = "/content/gdrive/MyDrive/diarizationcorpora/diar_chunks/"

add_language(
    chunk_folder=chunk_folder,
    chunks_table_path=chunks_table_path,
)

In [None]:
# Compare the manual annotations with the detector's labels.
df = pd.read_csv('chunks_table_with_language.csv')
# Map the annotation label 'rus' onto the detector's 'russian' label.
# Exact-value replacement is used instead of a regex substitution, which would
# also rewrite the 'rus' inside an existing 'russian' (giving 'russiansian').
df['Annotation'] = df['Annotation'].replace('rus', 'russian')
# Fraction of chunks whose annotation equals the detected language.
accuracy = (df["Annotation"] == df["Detected Language"]).mean()

In [None]:
# Show the fraction of chunks whose annotation matches the detected language.
accuracy

0.5443094916779128