In [2]:
import numpy as np
import glob
import os
from tqdm import tqdm
import joblib
import torch
from itertools import groupby

from sklearn.cluster import MiniBatchKMeans

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# dataset_dir = "/mntcephfs/lab_data/shoinoue/Dataset/LibriTTS_R/features/"
dataset_dir = "/mntcephfs/data/audiow/shoinoue/Dataset/LibriTTS_R/features/"

# n_clusters = 500
# n_clusters = 1000
n_clusters = 300
max_iter = 100
init = "k-means++"
tol = 0.0
max_no_improvement = 100
batch_size = 10000
n_init = 20
reassignment_ratio = 0.0
percent = 0.1
modename = "train"

In [4]:
speakers = [os.path.basename(a) for a in glob.glob(dataset_dir + "*/*")]
speakers.sort()
files = []
for spk in speakers:
    files += glob.glob(dataset_dir + f"{modename}*/{spk}/*/*[0-9]_hubert.npy")
    # files += glob.glob(dataset_dir + f"*/{spk}/*/*[0-9]_hubert.npy")
files.sort()

data = []
for path in tqdm(files):
    a = np.load(path)
    np.random.seed(0)
    indices = np.random.choice(np.arange(a.shape[0]), np.ceil(a.shape[0]*percent).astype(int), False)
    data += [a[indices]]
data = np.concatenate(data, 0)

km_model = MiniBatchKMeans(
    n_clusters=n_clusters,
    init=init,
    max_iter=max_iter,
    batch_size=batch_size,
    verbose=1,
    compute_labels=False,
    tol=tol,
    max_no_improvement=max_no_improvement,
    init_size=None,
    n_init=n_init,
    reassignment_ratio=reassignment_ratio,
)

100%|██████████| 149623/149623 [08:29<00:00, 293.38it/s]


In [5]:
save = True
# km_path = "./ckpts/kmeans/L9_km500_LibriTTSR.bin"
# km_path = "./ckpts/kmeans/L9_km1000_LibriTTSR.bin"
km_path = "./ckpts/kmeans/L9_km300_LibriTTSR.bin"

km_model.fit(data)
if save:
    joblib.dump(km_model, km_path)

Init 1/20 with method k-means++
Inertia for init 1/20: 1974091.5313350703
Init 2/20 with method k-means++
Inertia for init 2/20: 1971751.2113952527
Init 3/20 with method k-means++
Inertia for init 3/20: 1976673.9487287058
Init 4/20 with method k-means++
Inertia for init 4/20: 1963272.4617036479
Init 5/20 with method k-means++
Inertia for init 5/20: 1971232.8862292916
Init 6/20 with method k-means++
Inertia for init 6/20: 1968749.6673402584
Init 7/20 with method k-means++
Inertia for init 7/20: 1970089.4161087433
Init 8/20 with method k-means++
Inertia for init 8/20: 1981070.7120631519
Init 9/20 with method k-means++
Inertia for init 9/20: 1970549.0527996325
Init 10/20 with method k-means++
Inertia for init 10/20: 1968848.9905688637
Init 11/20 with method k-means++
Inertia for init 11/20: 1974309.5447766865
Init 12/20 with method k-means++
Inertia for init 12/20: 1972694.4961303144
Init 13/20 with method k-means++
Inertia for init 13/20: 1972743.3763959818
Init 14/20 with method k-means

# Apply Kmeans

In [6]:
class ApplyKmeans():
    def __init__(self, km_path):
        self.km_model = joblib.load(km_path)
        self.C_np = self.km_model.cluster_centers_.transpose()
        self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)

        self.C = torch.from_numpy(self.C_np)
        self.Cnorm = torch.from_numpy(self.Cnorm_np)
        if torch.cuda.is_available():
            self.C = self.C.cuda()
            self.Cnorm = self.Cnorm.cuda()
    def discretize(self, data):
        # data: (frame, dim)
        
        x = torch.tensor(data, dtype=self.C.dtype)
        if torch.cuda.is_available():
            x = x.cuda()
            
        dist = (
            x.pow(2).sum(1, keepdim=True)
            - 2 * torch.matmul(x, self.C)
            + self.Cnorm
        )
        return dist.argmin(dim=1).cpu().numpy()
# km_path = "./ckpts/kmeans/L9_km500_LibriTTSR.bin"
# km_path = "./ckpts/kmeans/L9_km1000_LibriTTSR.bin"
km_path = "./ckpts/kmeans/L9_km300_LibriTTSR.bin"
km = ApplyKmeans(km_path)

In [7]:
remove_repetition = False

### CMU-ARCTIC
# hubert_dir = "/mntcephfs/lab_data/shoinoue/Dataset/CMU-ARCTIC/SLT/hubert/"
# hubert_dir = "/mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT/Hindi/hubert/"
# output_dir = "/mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT/Hindi/hubert_km500/"
# add = ""
# add2 = ""

### LibriTTS-R
hubert_dir = "/mntcephfs/data/audiow/shoinoue/Dataset/LibriTTS_R/features/"
output_dir = "/mntcephfs/data/audiow/shoinoue/Dataset/LibriTTS_R/features/"
add = "_hubert"
# add2 = "_km500"
# add2 = "_km1000"
add2 = "_km300"

for subdir, dirs, files in tqdm(os.walk(hubert_dir)):
    for file in files:
# for subdir, dirs, files in os.walk(hubert_dir):
    # for file in tqdm(files):
        if file[:2]!="._" and file[-(4+len(add)):]==f"{add}.npy":
            if subdir[-1]=="/":
                wav_path = subdir + file
            else:
                wav_path = subdir + "/" + file
            
            data = np.load(wav_path)
            out_feat = list(km.discretize(data))
            if remove_repetition:
                out_feat = [key for key, _group in groupby(out_feat)]
                
            savefile = output_dir + wav_path[len(hubert_dir):-4] + f"{add2}.npy"
            np.save(savefile, out_feat) 

3971it [05:08, 12.87it/s]


In [8]:
hubert_dir

'/mntcephfs/data/audiow/shoinoue/Dataset/LibriTTS_R/features/'