In [None]:
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class
import glob
import os

import IPython
def play_audio(data, rate):
    """Show an inline audio player for ``data`` played back at ``rate``.

    Both arguments are forwarded unchanged to ``IPython.display.Audio``;
    nothing is returned.
    """
    player = IPython.display.Audio(data=data, rate=rate)
    IPython.display.display(player)

# Accent classifier used to extract embeddings. It is loaded from a local
# snapshot directory; the line below is a disabled alternative source
# (presumably the upstream model id -- confirm before re-enabling).
# classifier = foreign_class(source="Jzuluaga/accent-id-commonaccent_xlsr-en-english", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
classifier = foreign_class(
    source="pretrained_models/CustomEncoderWav2vec2Classifier-a72df039c801fa14a1c3226e95ab8c14/",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# Speaker metadata, one row per speaker: [speaker id, accent label, sex].
# The speaker id doubles as the directory name used when building wav paths.
information = [
    ["ABA", "Arabic", "M"],
    ["SKA", "Arabic", "F"],
    ["YBAA", "Arabic", "M"],
    ["ZHAA", "Arabic", "F"],
    ["BWC", "Mandarin", "M"],
    ["LXC", "Mandarin", "F"],
    ["NCC", "Mandarin", "F"],
    ["TXHC", "Mandarin", "M"],
    ["ASI", "Hindi", "M"],
    ["RRBI", "Hindi", "M"],
    ["SVBI", "Hindi", "F"],
    ["TNI", "Hindi", "F"],
    ["HJK", "Korean", "F"],
    ["HKK", "Korean", "M"],
    ["YDCK", "Korean", "F"],
    ["YKWK", "Korean", "M"],
    ["EBVS", "Spanish", "M"],
    ["ERMS", "Spanish", "M"],
    ["MBMPS", "Spanish", "F"],
    ["NJS", "Spanish", "F"],
    ["HQTV", "Vietnamese", "M"],
    ["PNV", "Vietnamese", "F"],
    ["THV", "Vietnamese", "F"],
    ["TLV", "Vietnamese", "M"],
    ["cmu_us_bdl_arctic", "US", "M"],
    ["cmu_us_eey_arctic", "US", "F"],
    ["cmu_us_slt_arctic", "US", "F"],
    ["cmu_us_rms_arctic", "US", "M"],
]

# Forward lookups: speaker id -> accent / sex.
spk2acc = {info[0]: info[1] for info in information}
spk2sex = {info[0]: info[2] for info in information}

# Reverse lookups: accent / sex -> list of speaker ids.
# Groups are created in first-seen order (dicts preserve insertion order)
# instead of iterating over a set: the iteration order of a set of strings
# changes between interpreter runs, which made the accent ordering -- and
# everything derived from it -- differ from one kernel restart to the next.
acc2spk = {}
sex2spk = {}
for spk in spk2acc:
    acc2spk.setdefault(spk2acc[spk], []).append(spk)
    sex2spk.setdefault(spk2sex[spk], []).append(spk)

In [None]:
# Root of the corpus: one sub-directory per speaker, each with a "wav" folder.
basedir = "./../Dataset/L2-ARCTIC_v5/"

# Every sub-directory is treated as a speaker except the "suitcase_corpus"
# folder. It is filtered out rather than removed with list.remove(), so a
# corpus copy without that folder no longer raises ValueError.
speakers = sorted(
    {os.path.basename(os.path.normpath(d)) for d in glob.glob(basedir + "*/")}
    - {"suitcase_corpus"}
)
if not speakers:
    # An empty glob almost always means basedir points at the wrong place.
    raise FileNotFoundError(f"No speaker directories found under {basedir!r}")

# Speakers grouped accent by accent, following the key order of acc2spk.
speakers_ordered = [a for la in acc2spk for a in acc2spk[la]]

# List each speaker's wav folder once instead of globbing the whole corpus
# once per candidate file name.
wav_names = {
    spk: {os.path.basename(p) for p in glob.glob(basedir + f"{spk}/wav/*")}
    for spk in speakers
}

# File list of a single reference speaker, kept for compatibility. The
# reference is the speaker at index 4 when there are at least five speakers,
# otherwise the last one (avoids an IndexError on small corpora).
fulllist = sorted(wav_names[speakers[min(4, len(speakers) - 1)]])

# Utterances recorded by every speaker, in sorted order.
commonfiles = sorted(set.intersection(*wav_names.values()))

In [None]:
import torch
import numpy as np
from tqdm import tqdm
import pandas as pd
from IPython.display import clear_output

def get_accent_embedding(path, sr=16000):
    """Load an audio file and return the classifier's embedding for it.

    Args:
        path: Audio file readable by ``torchaudio.load``.
        sr: Sampling rate the waveform is resampled to before encoding.

    Returns:
        The first entry of ``classifier.encode_batch(...)``.
        NOTE(review): presumably a 1-D tensor for a mono file -- confirm
        against the classifier's ``encode_batch`` implementation.
    """
    waveform, native_sr = torchaudio.load(path)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=native_sr, new_freq=sr
    )
    # Only the first row of the encoded batch is kept.
    return classifier.encode_batch(waveform)[0]

def cosine_similarity(e1, e2):
    """Return the cosine similarity of two 1-D tensors as a Python float.

    Adapted from wespeaker with its score-normalisation step removed, so the
    raw cosine value is returned.
    """
    norm_product = torch.norm(e1) * torch.norm(e2)
    return (torch.dot(e1, e2) / norm_product).item()

def get_similarity_matrix(data, base):
    """Build speaker-level and accent-level similarity tables for one utterance.

    Reads the module-level ``speakers_ordered`` (row/column order),
    ``acc2spk`` (accent -> speaker ids) and ``cosine_similarity``.

    Args:
        data: Nested mapping ``data[utterance][speaker] -> embedding``.
        base: Key of the utterance in ``data`` to compare across speakers.

    Returns:
        ``(spksim, accsim)``:
        ``spksim`` is a DataFrame indexed and columned by ``speakers_ordered``
        holding the pairwise cosine similarity, with NaN on the diagonal.
        ``accsim`` is a DataFrame indexed and columned by the keys of
        ``acc2spk``; each cell is the mean of the matching block of
        ``spksim`` (NaN self-pairs are skipped by pandas' mean).
    """
    embeddings = data[base]

    # Size the matrix by speakers_ordered -- the list that is actually
    # iterated and used as axis labels -- so a stray directory picked up in
    # `speakers` cannot cause a shape mismatch.
    n_speakers = len(speakers_ordered)
    scores = np.full((n_speakers, n_speakers), np.nan)
    for row, spk1 in enumerate(speakers_ordered):
        for col, spk2 in enumerate(speakers_ordered):
            if row != col:  # self-similarity stays NaN
                scores[row, col] = cosine_similarity(embeddings[spk1], embeddings[spk2])
    spksim = pd.DataFrame(scores, index=speakers_ordered, columns=speakers_ordered)

    # Average per accent pair, selecting each block by speaker label rather
    # than assuming every accent owns exactly four consecutive rows.
    accents = list(acc2spk.keys())
    block_means = np.zeros((len(accents), len(accents)))
    for i, acc_row in enumerate(accents):
        for j, acc_col in enumerate(accents):
            block = spksim.loc[acc2spk[acc_row], acc2spk[acc_col]]
            block_means[i, j] = block.mean().mean()
    accsim = pd.DataFrame(block_means, index=accents, columns=accents)
    return spksim, accsim

In [None]:
# Number of utterances drawn (without replacement) from commonfiles.
sample_num = 100

# Fixed seed so the same utterances are drawn on every run.
np.random.seed(0)
sampled_bases = np.random.choice(commonfiles, sample_num, False)

# data[utterance][speaker] -> embedding returned by get_accent_embedding
data = {}
for b, base in enumerate(sampled_bases):
    per_speaker = {}
    data[base] = per_speaker
    for s, spk in enumerate(speakers):
        # Replace the previous progress lines instead of appending new ones.
        clear_output(wait=True)
        print(f"{b+1} / {sample_num}: {base}")
        print(f"{s+1} / {len(speakers)}: {spk}")
        path = basedir + f"{spk}/wav/{base}"
        per_speaker[spk] = get_accent_embedding(path)

In [None]:
# Average the accent-level similarity table over all sampled utterances.
arrays = []
for base in data:
    # spksim keeps the speaker-level table of the most recent utterance only.
    spksim, accsim = get_similarity_matrix(data, base)
    arrays += [accsim.values]
if not arrays:
    raise ValueError("`data` is empty: run the embedding extraction cell first")

# Build a fresh frame holding the element-wise mean, reusing the accent
# labels of the per-utterance table, instead of overwriting the last
# iteration's frame in place.
accsim = pd.DataFrame(
    np.array(arrays).mean(0),
    index=accsim.index,
    columns=accsim.columns,
)

In [None]:
# Value range used to scale the cell opacity; NaN cells are ignored so a
# single missing value does not disable the colouring of the whole table.
min_, max_ = np.nanmin(accsim.values), np.nanmax(accsim.values)

def heatmap_color(val):
    """Return the CSS background declaration for one table cell.

    The value is scaled linearly from ``[min_, max_]`` to an opacity in
    ``[0, 0.7]`` of a white overlay. NaN values, or a table whose values are
    all identical (zero range), get no styling instead of an invalid alpha.
    """
    span = max_ - min_
    if pd.isna(val) or not span > 0:
        return ''
    val = (val-min_) / span*0.7
    return f'background-color: rgba(255, 255, 255, {val});'

styler = accsim.style
# Styler.applymap is deprecated in favour of Styler.map in newer pandas;
# use map when it exists and fall back to applymap on older versions.
apply_elementwise = getattr(styler, "map", None) or styler.applymap
apply_elementwise(heatmap_color)