In [18]:
# !pip install noisereduce -i https://pypi.tuna.tsinghua.edu.cn/simple

In [1]:
import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
import torch
import librosa
import numpy as np
import glob
import os
from tqdm import tqdm
import pandas as pd

import sys
sys.path.append("../../cuhksz-phd/sho_util/pyfiles/")
from sound import play_audio
from secs import SpeechObjectiveEvaluation

from IPython.display import clear_output

information = [
    ["ABA", "Arabic", "M"],
    ["SKA", "Arabic", "F"],
    ["YBAA", "Arabic", "M"],
    ["ZHAA", "Arabic", "F"],
    ["BWC", "Mandarin", "M"],
    ["LXC", "Mandarin", "F"],
    ["NCC", "Mandarin", "F"],
    ["TXHC", "Mandarin", "M"],
    ["ASI", "Hindi", "M"],
    ["RRBI", "Hindi", "M"],
    ["SVBI", "Hindi", "F"],
    ["TNI", "Hindi", "F"],
    ["HJK", "Korean", "F"],
    ["HKK", "Korean", "M"],
    ["YDCK", "Korean", "F"],
    ["YKWK", "Korean", "M"],
    ["EBVS", "Spanish", "M"],
    ["ERMS", "Spanish", "M"],
    ["MBMPS", "Spanish", "F"],
    ["NJS", "Spanish", "F"],
    ["HQTV", "Vietnamese", "M"],
    ["PNV", "Vietnamese", "F"],
    ["THV", "Vietnamese", "F"],
    ["TLV", "Vietnamese", "M"],
]
spk2acc = {info[0]: info[1] for info in information}
spk2sex = {info[0]: info[2] for info in information}
acc2spk = {key: [] for key in set(list(spk2acc.values()))}
sex2spk = {key: [] for key in set(list(spk2sex.values()))}
for spk in spk2acc:
    acc2spk[spk2acc[spk]] += [spk]
    sex2spk[spk2sex[spk]] += [spk]

In [2]:
basedir = "/mntcephfs/lab_data/shoinoue/Dataset/L2-ARCTIC/"
speakers = [os.path.basename(a[:-1]) for a in glob.glob(basedir+"*/")]
speakers.remove("suitcase_corpus")
speakers.sort()
speakers_ordered = [a for la in acc2spk for a in acc2spk[la]]
fulllist = [os.path.basename(a) for a in glob.glob(basedir + f"{speakers[4]}/wav/*")]
fulllist.sort()
commonfiles = [base for base in fulllist if len(glob.glob(basedir + f"*/wav/{base}"))==len(speakers)]
sr = 16000

models = ["resemblyzer", "wavlm", "wespeaker", "wespeaker_lm", "wespeaker_nolm"]
soe = SpeechObjectiveEvaluation(sr, models)

Some weights of the model checkpoint at /mntcephfs/data/audiow/shoinoue/Model/hf_hub/wavlm/models--microsoft--wavlm-base-plus-sv/snapshots/feb593a6c23c1cc3d9510425c29b0a14d2b07b1e/ were not used when initializing WavLMForXVector: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForXVector were not initialized from the model checkpoint at /mntcephfs/data/audiow/shoinoue/Model/hf_hub/wavlm/models--microsoft--wavlm-base-plus-sv/snapshots/feb593a6c23c1cc3d9510425c29b0a1

Loaded the voice encoder model on cuda in 2.61 seconds.


In [3]:
def cosine_similarity(e1, e2): # from wespeaker, delete the normalizing part
    e1 = torch.tensor(e1)
    e2 = torch.tensor(e2)
    cosine_score = torch.dot(e1, e2) / (torch.norm(e1) * torch.norm(e2))
    cosine_score = cosine_score.item()
    return cosine_score

def get_similarity_matrix(data, base):
    arrays = np.zeros((len(speakers), len(speakers)))
    for s1, spk1 in enumerate(speakers_ordered):
        for s2, spk2 in enumerate(speakers_ordered):
            if s1==s2:
                score = np.nan
            else:
                score = cosine_similarity(data[base][spk1], data[base][spk2])
            arrays[s1, s2] = score
    spksim = pd.DataFrame(arrays, index=speakers_ordered, columns=speakers_ordered)
    step = 4
    nsteps = spksim.shape[0]//step
    arrays = np.zeros((nsteps, nsteps))
    for i in range(nsteps):
        for j in range(nsteps):
            score = spksim.iloc[i*step:i*step+step, j*step:j*step+step].mean().mean()
            arrays[i, j] = score
    accsim = pd.DataFrame(arrays, index=list(acc2spk.keys()), columns=list(acc2spk.keys()))
    return spksim, accsim

In [4]:
sample_num = 100

load = True
if load:
    data = np.load("temp.npy", allow_pickle=True).item()
else:
    data = {key: {} for key in models}
    np.random.seed(0)
    for b, base in enumerate(np.random.choice(commonfiles, sample_num, False)):
        for key in models:
            data[key][base] = {}
        for s, spk in enumerate(speakers):
            clear_output(wait=True)
            print(f"{b+1} / {sample_num}: {base}")
            print(f"{s+1} / {len(speakers)}: {spk}")
            path = basedir + f"{spk}/wav/{base}"
            scores = soe.get_speaker_embedding(path)
            for key in scores:
                data[key][base][spk] = scores[key]
    np.save("temp.npy", data)

In [6]:
dt = data["resemblyzer"]

base = list(dt.keys())[0]
arrays = []
for base in dt:
    spksim, accsim = get_similarity_matrix(dt, base)
    arrays += [accsim.values]
accsim.loc[:] = np.array(arrays).mean(0)

min_, max_ = accsim.values.min(), accsim.values.max()
def heatmap_color(val):
    val = (val-min_) / (max_-min_)
    color = f'background-color: rgba(255, 255, 255, {val*0.7});'
    return color
accsim.style.applymap(heatmap_color)

Unnamed: 0,Arabic,Hindi,Mandarin,Korean,Spanish,Vietnamese
Arabic,0.541653,0.573041,0.577823,0.571077,0.591679,0.583084
Hindi,0.573041,0.612271,0.560145,0.543087,0.554356,0.585162
Mandarin,0.577823,0.560145,0.573442,0.583355,0.601656,0.587967
Korean,0.571077,0.543087,0.583355,0.57515,0.581732,0.561066
Spanish,0.591679,0.554356,0.601656,0.581732,0.579616,0.586228
Vietnamese,0.583084,0.585162,0.587967,0.561066,0.586228,0.587078


In [5]:
l = []

for key in tqdm(data):
    dt = data[key]
    base = list(dt.keys())[0]
    arrays = []
    for base in dt:
        spksim, accsim = get_similarity_matrix(dt, base)
        arrays += [accsim.values]
    accsim.loc[:] = np.array(arrays).mean(0)

    array = []
    for i, acc in enumerate(acc2spk):
        same = accsim.iloc[i, i]
        diff = np.array([accsim.iloc[i, j] for j in range(accsim.shape[1]) if i!=j]).mean()
        value = (same-diff)
        array += [value]
    l += [array]

100%|██████████| 5/5 [00:28<00:00,  5.61s/it]


In [6]:
colors = ["green", "red"]
def color_gradient_custom_minmax(val, min_val, max_val, higher_better=True):
    if val==0:
        color = "gray"
        min_lighter = False
    else:
        if val>0:
            min_ = 0
            max_ = max_val
            color = "green" if higher_better else "red"
            # min_lighter = False if higher_better else True
            min_lighter = False
        if val<0:
            min_ = min_val
            max_ = 0
            color = "red" if higher_better else "green"
            # min_lighter = True if higher_better else False
            min_lighter = True
        val = (val-min_)/(max_-min_)
    if min_lighter:
        intensity = int(255 * (1 - val))
    else:
        intensity = 255-int(255 * (1 - val))
    if color=="red":
        return f'background-color: rgb({intensity}, 0, 0)'
    elif color=="green":
        return f'background-color: rgb(0, {intensity}, 0)'
    elif color=="blue":
        return f'background-color: rgb(0, 0, {intensity})'
    else: # Gray
        intensity = intensity /1.3
        return f'background-color: rgb({intensity}, {intensity}, {intensity})'

def apply_color_gradient(df):
    min_val = df[df<0].min().min()
    max_val = df[df>0].max().max()
    styled_df = df.style.applymap(lambda val: color_gradient_custom_minmax(val, min_val, max_val, True))
    return styled_df

In [7]:
df = pd.DataFrame(np.array(l), index=data, columns=acc2spk)
apply_color_gradient(df)

Unnamed: 0,Spanish,Vietnamese,Korean,Arabic,Mandarin,Hindi
resemblyzer,-0.003514,0.006377,0.007087,-0.037688,-0.008748,0.049113
wavlm,-0.052483,0.006468,-0.072829,-0.058542,-0.068041,0.155372
wespeaker,0.019331,0.032736,0.023881,-0.028179,0.052947,0.121516
wespeaker_lm,0.019165,0.031926,0.019491,-0.038159,0.026311,0.099001
wespeaker_nolm,0.016382,0.027922,0.017401,-0.036329,0.023348,0.123795
