<a href="https://colab.research.google.com/github/sahith2004/Indic-Codecs-Evaluation/blob/main/Codecs_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install Requirements**

In [None]:
# Environment setup: installs the dataset, audio, metric and codec packages used by this notebook.
!pip install -U datasets
# !pip install git+https://github.com/huggingface/transformers.git@main
# EnCodec is fetched as a source checkout; the import cell imports it as `encodec.encodec`,
# which presumably resolves to this cloned directory (TODO confirm it is not also pip-installed).
!git clone https://github.com/facebookresearch/encodec.git
# NOTE(review): --force-reinstall replaces any torch/torchaudio already present in the runtime.
!pip install torch torchaudio pesq pystoi librosa snac torchmetrics soundfile --quiet --force-reinstall
!pip install xcodec2
# transformers is pinned and installed last, after xcodec2
# (presumably to keep a version xcodec2 works with - TODO confirm).
!pip install transformers==4.40.1

**Import Requirements**

In [None]:
import torch
import time
import torchaudio
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
import os
from encodec.encodec import EncodecModel
from encodec.encodec.utils import convert_audio
from snac import SNAC
from xcodec2.modeling_xcodec2 import XCodec2Model
from pesq import pesq
from pystoi import stoi
from torchmetrics.audio import ScaleInvariantSignalDistortionRatio


In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

**Codecs Evaluation Metrics**

In [None]:
class CodecsQualityEvaluator:
    """Objective quality metrics between a reference and a degraded waveform.

    Every metric takes two 1-D sample sequences (``ref`` = original signal,
    ``deg`` = codec output), truncates them to their common length and returns
    a single float. Metrics that cannot be computed for a given pair return
    NaN so that callers can aggregate with ``np.nanmean``.

    Args:
        sr: Sample rate (Hz) both signals are assumed to be at. It is used as
            the source rate when resampling to 16 kHz for PESQ/STOI and as the
            analysis rate for the MFCC and mel features.
    """

    def __init__(self, sr=24000):
        self.sr = sr

    @staticmethod
    def _finite_or_nan(value):
        """Return ``value`` as a float, mapping +/-inf (and NaN) to NaN.

        A silent or empty reference makes the log-ratio metrics evaluate
        ``log10(0) = -inf``; a single ``-inf`` would turn a whole
        ``np.nanmean`` aggregate into ``-inf``, whereas NaN is skipped.
        """
        value = float(value)
        return value if np.isfinite(value) else float("nan")

    def _align_length(self, ref, deg):
        """Truncate both signals to the length of the shorter one."""
        n = min(len(ref), len(deg))
        return ref[:n], deg[:n]

    def _compute_sdr(self, ref, deg):
        """Signal-to-distortion ratio in dB: 10*log10(|ref|^2 / |ref - deg|^2).

        Returns NaN when the ratio is not finite (e.g. silent or empty reference).
        """
        ref, deg = self._align_length(ref, deg)
        ref_t = torch.tensor(ref, dtype=torch.float32)
        deg_t = torch.tensor(deg, dtype=torch.float32)
        eps = 1e-8  # keeps the denominator non-zero for a perfect reconstruction
        num = torch.sum(ref_t ** 2)
        den = torch.sum((ref_t - deg_t) ** 2) + eps
        return self._finite_or_nan(10 * torch.log10(num / den))

    def _compute_si_sdr(self, ref, deg):
        """Scale-invariant SDR in dB.

        ``deg`` is projected onto ``ref``; the projection is treated as the
        target and the residual as noise. Returns NaN when the ratio is not
        finite (e.g. silent reference, or ``deg`` orthogonal to ``ref``).
        """
        ref, deg = self._align_length(ref, deg)
        ref_t = torch.tensor(ref, dtype=torch.float32)
        deg_t = torch.tensor(deg, dtype=torch.float32)
        eps = 1e-8
        # Optimal scaling of the reference that best explains the degraded signal.
        a = torch.dot(deg_t, ref_t) / (torch.dot(ref_t, ref_t) + eps)
        proj = a * ref_t
        noise = deg_t - proj
        ratio = torch.sum(proj ** 2) / (torch.sum(noise ** 2) + eps)
        return self._finite_or_nan(10 * torch.log10(ratio))

    def _compute_pesq(self, ref, deg):
        """Wide-band PESQ computed on 16 kHz resampled copies of both signals.

        Returns NaN for clips shorter than 4800 samples or when either
        resampled signal is (near-)silent.
        """
        ref, deg = self._align_length(ref, deg)

        if len(ref) < 4800:
            return np.nan

        r16 = librosa.resample(y=ref, orig_sr=self.sr, target_sr=16000)
        d16 = librosa.resample(y=deg, orig_sr=self.sr, target_sr=16000)

        # Skip (near-)silent signals instead of handing them to PESQ.
        if np.max(np.abs(r16)) < 1e-6 or np.max(np.abs(d16)) < 1e-6:
            return np.nan

        return pesq(16000, r16, d16, "wb")

    def _compute_stoi(self, ref, deg):
        """STOI (non-extended) on 16 kHz resampled copies of both signals.

        Returns NaN for clips shorter than 2000 samples.
        """
        ref, deg = self._align_length(ref, deg)

        if len(ref) < 2000:
            return np.nan

        r16 = librosa.resample(y=ref, orig_sr=self.sr, target_sr=16000)
        d16 = librosa.resample(y=deg, orig_sr=self.sr, target_sr=16000)

        return stoi(r16, d16, 16000, extended=False)

    def _compute_mcd(self, ref, deg, n_mfcc=13):
        """Cepstral distortion computed from librosa MFCCs.

        The per-frame Euclidean distance between the two MFCC matrices is
        averaged over frames and scaled by ``10 / ln(10) * sqrt(2)``.
        All ``n_mfcc`` coefficients are used, including the first one.
        """
        ref, deg = self._align_length(ref, deg)

        mf1 = librosa.feature.mfcc(y=ref.astype(float), sr=self.sr, n_mfcc=n_mfcc)
        mf2 = librosa.feature.mfcc(y=deg.astype(float), sr=self.sr, n_mfcc=n_mfcc)

        frames = min(mf1.shape[1], mf2.shape[1])
        diff = mf1[:, :frames] - mf2[:, :frames]
        d = np.mean(np.sqrt(np.sum(diff ** 2, axis=0)))
        return float(10 / np.log(10) * np.sqrt(2) * d)

    def _compute_mel_distance(self, ref, deg):
        """Mean absolute difference between 80-band mel spectrograms."""
        ref, deg = self._align_length(ref, deg)

        mel_r = librosa.feature.melspectrogram(y=ref, sr=self.sr, n_mels=80)
        mel_d = librosa.feature.melspectrogram(y=deg, sr=self.sr, n_mels=80)

        frames = min(mel_r.shape[1], mel_d.shape[1])
        return float(np.mean(np.abs(mel_r[:, :frames] - mel_d[:, :frames])))

    def _compute_stft_distance(self, ref, deg):
        """Mean absolute difference between STFT magnitudes (n_fft=1024, hop=256)."""
        ref, deg = self._align_length(ref, deg)

        S_r = np.abs(librosa.stft(y=ref, n_fft=1024, hop_length=256))
        S_d = np.abs(librosa.stft(y=deg, n_fft=1024, hop_length=256))

        frames = min(S_r.shape[1], S_d.shape[1])
        return float(np.mean(np.abs(S_r[:, :frames] - S_d[:, :frames])))

    def evaluate(self, ref, deg):
        """Compute every metric for one reference/degraded pair.

        Returns:
            dict mapping the metric name ("SDR", "SI-SDR", "PESQ", "STOI",
            "MCD", "Mel_Distance", "STFT_Distance") to its value; individual
            entries may be NaN when a metric is undefined for the pair.
        """
        ref, deg = self._align_length(ref, deg)

        return {
            "SDR": self._compute_sdr(ref, deg),
            "SI-SDR": self._compute_si_sdr(ref, deg),
            "PESQ": self._compute_pesq(ref, deg),
            "STOI": self._compute_stoi(ref, deg),
            "MCD": self._compute_mcd(ref, deg),
            "Mel_Distance": self._compute_mel_distance(ref, deg),
            "STFT_Distance": self._compute_stft_distance(ref, deg),
        }


**Loading Codecs**




**Encodec**

In [None]:


class EncodecCodec:
    """Thin wrapper around a pretrained EnCodec model.

    Args:
        model_type: "encodec_24khz" or "encodec_48khz"; anything else raises
            ``ValueError``.
        bitrate: Optional target bandwidth forwarded to ``set_bitrate``.
        nq: Optional number of quantizers forwarded to ``set_nq``.
        device: Device to place the model on; defaults to "cuda" when
            available, else "cpu".
    """

    def __init__(self, model_type="encodec_24khz", bitrate=None, nq=None, device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        if model_type == "encodec_24khz":
            self.model = EncodecModel.encodec_model_24khz().to(self.device)
        elif model_type == "encodec_48khz":
            self.model = EncodecModel.encodec_model_48khz().to(self.device)
        else:
            raise ValueError("Invalid model type")
        self.sr = self.model.sample_rate
        self.channels = self.model.channels
        if bitrate is not None:
            self.set_bitrate(bitrate)
        if nq is not None:
            self.set_nq(nq)

    def set_bitrate(self, kbps):
        """Set the model's target bandwidth."""
        self.model.set_target_bandwidth(kbps)

    def set_nq(self, nq):
        """Override the quantizer's ``n_q``; a no-op if the model has no quantizer."""
        if hasattr(self.model, "quantizer"):
            self.model.quantizer.n_q = nq

    def load_audio(self, file_path):
        """Load a file, convert it to the model's rate/channels and add a batch dim."""
        wav, sr = torchaudio.load(file_path)
        wav = convert_audio(wav, sr, self.sr, self.channels)
        return wav.unsqueeze(0).to(self.device)

    @torch.no_grad()
    def encode(self, wav):
        """Encode a batched waveform with the wrapped model."""
        return self.model.encode(wav)

    @torch.no_grad()
    def decode(self, encoded):
        """Decode the output of ``encode`` back to a waveform."""
        return self.model.decode(encoded)

    def _bits_per_code(self, codes):
        """Bits needed to store one code entry.

        Uses the model's ``bits_per_codebook`` when it exposes one; otherwise
        falls back to the storage width of the code tensor (an upper bound).
        """
        bits = getattr(getattr(self, "model", None), "bits_per_codebook", None)
        if not bits:
            bits = codes.element_size() * 8
        return int(bits)

    def count_bytes(self, encoded):
        """Size in bytes of an encoded stream.

        ``encoded`` is an iterable of frames. Each frame is either a
        ``(codes, scale)`` sequence (``scale`` may be ``None``) or a dict with
        optional "codes" / "scales" entries. The previous implementation only
        recognised dict frames, so tuple frames always counted as 0.

        Codes are counted bit-packed (``numel * bits per code``, rounded up to
        whole bytes once for the stream); scales are counted at their tensor
        storage size.

        Returns:
            int: total number of bytes; 0 for an empty stream.
        """
        code_bits = 0
        scale_bytes = 0
        for frame in encoded:
            if isinstance(frame, dict):
                codes = frame.get("codes")
                scale = frame.get("scales")
            else:
                codes = frame[0] if len(frame) > 0 else None
                scale = frame[1] if len(frame) > 1 else None
            if codes is not None:
                code_bits += codes.numel() * self._bits_per_code(codes)
            if scale is not None:
                scale_bytes += scale.numel() * scale.element_size()
        # Round the packed bit stream up to whole bytes.
        return int((code_bits + 7) // 8 + scale_bytes)


**SNAC**

In [None]:


class SNACCodec:
    """Adapter exposing a pretrained SNAC model through ``encode`` / ``decode``.

    Args:
        model_name: Identifier passed to ``SNAC.from_pretrained``.
        device: Device for the model; "cuda" when available, else "cpu",
            if not given.
    """

    def __init__(self, model_name="hubertsiuzdak/snac_32khz", device=None):
        if not device:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        pretrained = SNAC.from_pretrained(model_name)
        self.model = pretrained.eval().to(self.device)

    def load_audio(self, tensor):
        """Move an already-loaded waveform tensor onto the codec's device."""
        on_device = tensor.to(self.device)
        return on_device

    def encode(self, audio):
        """Run the model's encoder under inference mode."""
        with torch.inference_mode():
            return self.model.encode(audio)

    def decode(self, codes):
        """Run the model's decoder under inference mode."""
        with torch.inference_mode():
            return self.model.decode(codes)


**XCodec2**

In [None]:

class XCodec2Codec:
    """Adapter exposing a pretrained XCodec2 model through ``encode`` / ``decode``.

    Args:
        model_path: Identifier passed to ``XCodec2Model.from_pretrained``.
        device: Device for the model; "cuda" when available, else "cpu",
            if not given.
    """

    def __init__(self, model_path="HKUSTAudio/xcodec2", device=None):
        if not device:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        pretrained = XCodec2Model.from_pretrained(model_path)
        self.model = pretrained.eval().to(self.device)

    def load_audio(self, path):
        """Read an audio file and return ``(float tensor with a leading batch dim, sample rate)``."""
        samples, sample_rate = sf.read(path)
        batch = torch.from_numpy(samples).float().unsqueeze(0)
        return batch.to(self.device), sample_rate

    def encode(self, wav_tensor):
        """Encode a waveform tensor into codes without tracking gradients."""
        with torch.no_grad():
            return self.model.encode_code(input_waveform=wav_tensor)

    def decode(self, codes):
        """Decode codes back into a waveform without tracking gradients."""
        with torch.no_grad():
            return self.model.decode_code(codes)

    def save_audio(self, tensor, sr, path):
        """Write the first item / first channel of ``tensor`` to ``path`` at rate ``sr``."""
        first_channel = tensor[0, 0].cpu().numpy()
        sf.write(path, first_channel, sr)

**Initializing Codecs**

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Language name -> directory that holds the audio clips for that language.
languages = dict(
    bengali="/content/drive/MyDrive/indicvoices_audio_bengali",
    malayalam="/content/drive/MyDrive/indicvoices_audio_malayalam",
    marathi="/content/drive/MyDrive/indicvoices_audio_marathi",
    hindi="/content/drive/MyDrive/indicvoices_audio_hindi",
    gujarati="/content/drive/MyDrive/indicvoices_audio_gujarati",
    english="/content/drive/MyDrive/librispeech_audios",
)

# Bandwidth value (float key) -> quantizer count (int) configured together with it.
bandwidths = dict([
    (1.5, 2),
    (3.0, 4),
    (6.0, 8),
    (12.0, 16),
    (24.0, 32),
])

# One instance of each codec wrapper, created with its default checkpoint.
enc_model = EncodecCodec(model_type="encodec_24khz")
snac_model = SNACCodec()
xcodec_model = XCodec2Codec()

# Display name -> codec wrapper.
codecs = dict(Encodec=enc_model, SNAC=snac_model, XCodec2=xcodec_model)

# Shared metric evaluator (24 kHz) and the per-language result tables filled in later.
evaluator = CodecsQualityEvaluator(sr=24000)
language_dfs = {}


  WeightNorm.apply(module, name, dim)
  state_dict = torch.load(model_path, map_location="cpu")
You are using a model of type xcodec2 to instantiate a model of type xcodec. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at HKUSTAudio/xcodec2 were not used when initializing XCodec2Model: ['CodecEnc.conv_blocks.0.weight_g', 'CodecEnc.conv_blocks.0.weight_v', 'CodecEnc.conv_blocks.1.block.0.block.1.weight_g', 'CodecEnc.conv_blocks.1.block.0.block.1.weight_v', 'CodecEnc.conv_blocks.1.block.0.block.3.weight_g', 'CodecEnc.conv_blocks.1.block.0.block.3.weight_v', 'CodecEnc.conv_blocks.1.block.1.block.1.weight_g', 'CodecEnc.conv_blocks.1.block.1.block.1.weight_v', 'CodecEnc.conv_blocks.1.block.1.block.3.weight_g', 'CodecEnc.conv_blocks.1.block.1.block.3.weight_v', 'CodecEnc.conv_blocks.1.block.2.block.1.weight_g', 'CodecEnc.conv_blocks.1.block.2.block.1.weight_v', 'CodecEnc.conv_blocks.1.block.2.block.3.weight_g', 'CodecEnc.con

In [None]:



def _sync_device():
    """Wait for queued CUDA work so wall-clock timings cover the real compute.

    CUDA kernels are launched asynchronously; without a synchronize the timer
    can stop before the GPU has finished. No-op when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def _count_code_elements(encoded):
    """Number of entries in an encoder output.

    Accepts a single tensor or a (possibly nested) list/tuple of tensors.
    Returns NaN when the structure is not recognised.
    """
    if hasattr(encoded, "numel"):
        return encoded.numel()
    if isinstance(encoded, (list, tuple)):
        counts = [_count_code_elements(item) for item in encoded if item is not None]
        return sum(counts) if counts else np.nan
    return np.nan


def _mean_ignoring_nan(values):
    """``np.nanmean`` that returns NaN quietly for empty or all-NaN input."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)


# NOTE(review): every codec is fed the same 24 kHz mono waveform, yet the default
# SNAC checkpoint is named "snac_32khz" - confirm each codec's expected input rate.
# NOTE(review): only enc_model is reconfigured per bandwidth, so the other codecs
# are re-evaluated with identical settings on every bandwidth iteration.
for lang, base in tqdm(languages.items(), desc="Languages"):

    lang_results = []
    # Evaluate the first 10 .wav files (alphabetical order) of each language.
    files = sorted(f for f in os.listdir(base) if f.endswith(".wav"))[:10]

    for bw, nq in tqdm(bandwidths.items(), desc=f"{lang} bandwidth", leave=False):

        enc_model.set_bitrate(bw)
        enc_model.set_nq(nq)

        for codec_name, codec in codecs.items():

            metrics_log = {
                "SDR": [], "SI-SDR": [], "PESQ": [], "STOI": [],
                "MCD": [], "Mel_Distance": [], "STFT_Distance": [],
                "Enc_Latency_ms": [], "Dec_Latency_ms": [],
                "RTF": [], "Compression_Ratio": [],
            }

            for fname in tqdm(files, leave=False, desc=f"{codec_name}"):

                path = os.path.join(base, fname)

                wav, sr = torchaudio.load(path)

                # Resample to 24 kHz mono and add a batch dimension.
                wav = convert_audio(wav, sr, 24000, 1).to(device).unsqueeze(0)
                wav_seconds = wav.shape[-1] / 24000
                input_size_bytes = os.path.getsize(path)

                _sync_device()
                t0 = time.perf_counter()

                encoded = codec.encode(wav)

                _sync_device()
                t1 = time.perf_counter()
                enc_time = (t1 - t0) * 1000

                if codec_name == "Encodec":
                    encoded_bytes = enc_model.count_bytes(encoded)
                else:
                    # NOTE(review): this is a count of code entries, not bytes;
                    # the codebook bit widths of these codecs are not known here.
                    encoded_bytes = _count_code_elements(encoded)

                t2 = time.perf_counter()
                try:
                    decoded = codec.decode(encoded)
                except Exception as err:
                    # Best effort: report the failure and skip this file rather
                    # than abort the run (KeyboardInterrupt still propagates).
                    tqdm.write(
                        f"[{lang}/{codec_name}] decode failed for {fname}: {err!r}; skipping"
                    )
                    continue
                _sync_device()
                t3 = time.perf_counter()
                dec_time = (t3 - t2) * 1000

                comp_ratio = input_size_bytes / encoded_bytes if encoded_bytes > 0 else np.nan
                # Real-time factor: processing time divided by audio duration.
                rtf = ((t1 - t0) + (t3 - t2)) / wav_seconds

                ref_np = wav.squeeze().cpu().numpy()
                dec_np = decoded.squeeze().cpu().numpy()

                m = evaluator.evaluate(ref_np, dec_np)

                for k in m:
                    metrics_log[k].append(m[k])
                metrics_log["Enc_Latency_ms"].append(enc_time)
                metrics_log["Dec_Latency_ms"].append(dec_time)
                metrics_log["RTF"].append(rtf)
                metrics_log["Compression_Ratio"].append(comp_ratio)

            mean_rtf = _mean_ignoring_nan(metrics_log["RTF"])

            lang_results.append({
                "Model": codec_name,
                "Bandwidth": f"{bw} kbps" if codec_name == "Encodec" else "fixed",
                "Nq": nq if codec_name == "Encodec" else "fixed",

                "SDR": _mean_ignoring_nan(metrics_log["SDR"]),
                "SI-SDR": _mean_ignoring_nan(metrics_log["SI-SDR"]),
                "PESQ": _mean_ignoring_nan(metrics_log["PESQ"]),
                "STOI": _mean_ignoring_nan(metrics_log["STOI"]),
                "MCD": _mean_ignoring_nan(metrics_log["MCD"]),
                "Mel_Distance": _mean_ignoring_nan(metrics_log["Mel_Distance"]),
                "STFT_Distance": _mean_ignoring_nan(metrics_log["STFT_Distance"]),
                "Encode_Latency_ms": _mean_ignoring_nan(metrics_log["Enc_Latency_ms"]),
                "Decode_Latency_ms": _mean_ignoring_nan(metrics_log["Dec_Latency_ms"]),
                "RTF": mean_rtf,
                "Compression_Ratio": _mean_ignoring_nan(metrics_log["Compression_Ratio"]),
                "Speed_xRealTime": 1 / mean_rtf,
            })

    df = pd.DataFrame(lang_results)

    # Columns where a larger value is better / where a smaller value is better.
    high = ["SDR", "SI-SDR", "PESQ", "STOI", "Compression_Ratio", "Speed_xRealTime"]
    low = ["MCD", "Mel_Distance", "STFT_Distance", "Encode_Latency_ms",
           "Decode_Latency_ms", "RTF"]

    # The metric columns become display strings with a direction arrow.
    for col in high:
        df[col] = df[col].apply(lambda x: f"{x:.3f} ↑")

    for col in low:
        df[col] = df[col].apply(lambda x: f"{x:.3f} ↓")

    language_dfs[lang] = df

    print(f"Results for {lang.upper()}")
    display(df)


Languages:   0%|          | 0/6 [00:00<?, ?it/s]
bengali bandwidth:   0%|          | 0/5 [00:00<?, ?it/s][A



Encodec:  10%|█         | 1/10 [00:01<00:13,  1.51s/it][A[A

Encodec:  20%|██        | 2/10 [00:08<00:36,  4.59s/it][A[A

Encodec:  30%|███       | 3/10 [00:23<01:06,  9.56s/it][A[A

Encodec:  40%|████      | 4/10 [00:25<00:39,  6.61s/it][A[A

Encodec:  50%|█████     | 5/10 [00:29<00:27,  5.42s/it][A[A

Encodec:  60%|██████    | 6/10 [01:33<01:41, 25.45s/it][A[A

Encodec:  70%|███████   | 7/10 [01:56<01:13, 24.52s/it][A[A

Encodec:  80%|████████  | 8/10 [03:19<01:26, 43.19s/it][A[A

Encodec:  90%|█████████ | 9/10 [04:10<00:45, 45.73s/it][A[A

Encodec: 100%|██████████| 10/10 [05:39<00:00, 59.08s/it][A[A

  "Compression_Ratio": np.nanmean(metrics_log["Compression_Ratio"]),




SNAC:  10%|█         | 1/10 [00:06<00:54,  6.03s/it][A[A

SNAC:  20%|██        | 2/10 [00:20<01:27, 10.93s/it][A[A

SNAC:  30%|███       | 3/10 [00:52<02:25, 20.76s/it][A[A

SNAC