In [4]:
# !pip3 install resemblyzer -i https://mirrors.aliyun.com/pypi/simple/
# !pip3 install noisereduce -i https://mirrors.aliyun.com/pypi/simple/ 

In [16]:
# !pip3 install git+https://github.com/wenet-e2e/wespeaker.git --no-dependencies
# !pip3 install silero-vad --no-dependencies -i https://mirrors.aliyun.com/pypi/simple/
# !pip3 install transformers -i https://mirrors.aliyun.com/pypi/simple/ 
# !pip3 install kaldiio==2.17.2 -i https://mirrors.aliyun.com/pypi/simple/ 
# !pip3 install fairseq -i https://mirrors.aliyun.com/pypi/simple/ 
# !pip3 install s3prl@git+https://github.com/s3prl/s3prl.git@7ab62aaf2606d83da6c71ee74e7d16e0979edbc3#egg=s3prl

In [1]:
import warnings
warnings.filterwarnings("ignore")

from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import wespeaker
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
from tqdm import tqdm
import torch
import librosa
import numpy as np
import glob
import os

import sys
sys.path.append("../../cuhksz-phd/sho_util/pyfiles/")
from sound import play_audio

from IPython.display import clear_output

sr = 16000

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [4]:
### Objective Scores ###
import requests
from wespeaker.cli.hub import Hub, download
# For Wespeaker
def download_wespeaker_model_local(lang: str, model_dir: str) -> str:
    """Make sure the pretrained WeSpeaker model for ``lang`` is available locally.

    Model files live in ``<model_dir>/.wespeaker/<lang>``. If that directory
    already holds both ``avg_model.pt`` and ``config.yaml`` it is returned
    untouched; otherwise the ModelScope asset listing is queried for the entry
    whose ``Key`` equals ``Hub.Assets[lang]`` and its ``Url`` is handed to
    ``wespeaker.cli.hub.download``.

    Args:
        lang: Key into ``Hub.Assets`` selecting which pretrained model to use.
        model_dir: Base directory under which ``.wespeaker/<lang>`` is created.

    Returns:
        Path of the directory that holds (or now holds) the model files.

    Raises:
        SystemExit: ``lang`` is not a key of ``Hub.Assets`` (exit status 1).
        requests.HTTPError: the asset listing request returned an error status.
        RuntimeError: the asset listing has no entry for the requested model.
    """
    if lang not in Hub.Assets:
        print('ERROR: Unsupported lang {} !!!'.format(lang))
        sys.exit(1)
    model = Hub.Assets[lang]
    # model_dir = os.path.join(Path.home(), ".wespeaker", lang)
    model_dir = os.path.join(model_dir, ".wespeaker", lang)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(model_dir, exist_ok=True)
    if {"avg_model.pt", "config.yaml"}.issubset(os.listdir(model_dir)):
        return model_dir

    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and raise_for_status surfaces HTTP errors before the JSON is parsed.
    response = requests.get(
        "https://modelscope.cn/api/v1/datasets/wenet/wespeaker_pretrained_models/oss/tree",  # noqa
        timeout=30,
    )
    response.raise_for_status()
    model_info = next(
        (data for data in response.json()["Data"] if data["Key"] == model),
        None,
    )
    if model_info is None:
        # Without the default above, a missing asset would leak a bare
        # StopIteration, which is hard to diagnose.
        raise RuntimeError(
            "Model asset {!r} for lang {!r} not found in the remote listing".format(model, lang)
        )
    download(model_info['Url'], model_dir)
    return model_dir

def cosine_similarity(e1, e2):
    """Return the plain cosine similarity of two embedding tensors as a float.

    Adapted from wespeaker; the ``(score + 1.0) / 2`` rescaling applied there
    is deliberately left out, so the raw cosine value is returned.
    """
    norm_product = torch.norm(e1) * torch.norm(e2)
    return (torch.dot(e1, e2) / norm_product).item()

class SpeechObjectiveEvaluation:
    """Objective speaker-similarity and speech-quality scoring for audio files.

    Only the back-ends named in ``target_models`` are loaded. Recognised names
    are ``"wavlm"``, ``"wespeaker"``, ``"wespeaker_lm"``, ``"wespeaker_nolm"``
    and ``"resemblyzer"`` (speaker similarity) plus ``"speechmos"`` (quality).
    """

    def __init__(self, sr=16000, target_models=["speechmos", "wavlm", "wespeaker"],
                 # The hub id 'microsoft/wavlm-base-plus-sv' is the same model; a
                 # local snapshot is used because loading is done with local_files_only=True.
                 wavlm_path='/home/sho/.cache/huggingface/hub/models--microsoft--wavlm-base-plus-sv/snapshots/feb593a6c23c1cc3d9510425c29b0a14d2b07b1e/',
                 device="cuda",
                ):
        """Load the requested back-ends.

        Args:
            sr: Sampling rate passed to ``librosa.load`` and to the models.
            target_models: Names of the back-ends to load (see class docstring).
            wavlm_path: Local directory of the WavLM x-vector checkpoint.
            device: Device the WavLM and wespeaker models are moved to while
                they run; they are moved back to ``"cpu"`` afterwards.
        """
        self.target_models = target_models
        self.sr = sr
        # Respect the caller's device instead of forcing "cuda".
        self.device = device
        if "wavlm" in target_models:
            self.wavlm_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wavlm_path, local_files_only=True)
            self.wavlm = WavLMForXVector.from_pretrained(wavlm_path, local_files_only=True)
        # Maps back-end name -> loaded wespeaker model.
        self.wespeaker = {}
        wespeaker_dir = "/mntcephfs/data/audiow/shoinoue/Model/models/wespeaker"
        if "wespeaker" in target_models:
            wespeaker_model_dir = download_wespeaker_model_local("english", wespeaker_dir)
            # The loaded model must be stored under its name, otherwise
            # get_speaker_similarity fails with KeyError for "wespeaker".
            self.wespeaker["wespeaker"] = wespeaker.load_model_local(wespeaker_model_dir)
        if "wespeaker_lm" in target_models:
            self.wespeaker["wespeaker_lm"] = wespeaker.load_model_local(f'{wespeaker_dir}/voxceleb_resnet34_LM/')
        if "wespeaker_nolm" in target_models:
            self.wespeaker["wespeaker_nolm"] = wespeaker.load_model_local(f'{wespeaker_dir}/voxceleb_resnet34/')
        if "resemblyzer" in target_models:
            self.resemblyzer = VoiceEncoder()
        if "speechmos" in target_models:
            self.speechmos = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong")

    def _resemblyzer_embedding(self, audio_path):
        """Return the flattened Resemblyzer utterance embedding of one file as a tensor."""
        wav = preprocess_wav(audio_path, self.sr)
        return torch.tensor(self.resemblyzer.embed_utterance(wav).reshape(-1))

    def get_speaker_similarity(self, f1, f2):
        """Score how similar the speakers of two audio files are.

        Args:
            f1: Path of the first audio file.
            f2: Path of the second audio file.

        Returns:
            Dict mapping each loaded speaker-similarity back-end name to its
            cosine similarity for the pair. Empty when none is loaded.
        """
        similarity = {}
        if "wavlm" in self.target_models:
            audio = [librosa.load(path, sr=self.sr)[0] for path in (f1, f2)]
            self.wavlm = self.wavlm.to(self.device)
            try:
                inputs = self.wavlm_feature_extractor(audio, padding=True, return_tensors="pt", sampling_rate=self.sr).to(self.device)
                # Inference only: skip autograd bookkeeping to save memory.
                with torch.no_grad():
                    embeddings = self.wavlm(**inputs).embeddings
                embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
                cosine_sim = torch.nn.CosineSimilarity(dim=-1)
                sim = cosine_sim(embeddings[0], embeddings[1])
                similarity["wavlm"] = np.array(sim.detach().cpu()).sum()
            finally:
                # Always release the device, even when the forward pass fails.
                self.wavlm = self.wavlm.to("cpu")

        for ws in ("wespeaker", "wespeaker_lm", "wespeaker_nolm"):
            if ws not in self.target_models:
                continue
            speaker = self.wespeaker[ws]
            speaker.model = speaker.model.to(self.device)
            speaker.device = self.device
            try:
                e1 = speaker.extract_embedding(f1)
                e2 = speaker.extract_embedding(f2)
                similarity[ws] = cosine_similarity(e1, e2)
            finally:
                speaker.model = speaker.model.to("cpu")
                speaker.device = "cpu"

        if "resemblyzer" in self.target_models:
            similarity["resemblyzer"] = cosine_similarity(
                self._resemblyzer_embedding(f1),
                self._resemblyzer_embedding(f2),
            )
        return similarity

    def get_speech_quality(self, path):
        """Predict speech-quality scores for one audio file.

        Args:
            path: Path of the audio file; it is loaded as mono at ``self.sr``.

        Returns:
            Dict with a ``"speechmos"`` entry (output of the ``utmos22_strong``
            predictor) when that back-end is loaded, otherwise empty.
        """
        quality = {}
        if "speechmos" in self.target_models:
            wave, sr = librosa.load(path, sr=self.sr, mono=True)
            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                score = self.speechmos(torch.from_numpy(wave).unsqueeze(0), sr)
            quality["speechmos"] = np.array(score.detach().cpu()).sum()
        return quality
# Build the evaluator with only the Resemblyzer and WavLM back-ends enabled,
# using the notebook-wide sampling rate `sr`.
soe = SpeechObjectiveEvaluation(sr, ["resemblyzer", "wavlm"])

In [None]:
# NOTE(review): presumably the directory holding the ground-truth audio; it is
# left empty here and must be set before use — confirm the intended path.
gt_dir = ""

In [3]:
soe

<__main__.SpeechObjectiveEvaluation at 0x1554af1a9940>