In [30]:
from data import LibriLightDataset
import json
import numpy as np
import torch

In [31]:
# Both splits are built from the same phone-alignment file and the same
# vocabulary file; only the `subset` argument differs.
_shared_dataset_kwargs = dict(
    identifier_to_phones_file_path="phones/librispeech_normalized_phones_no_bcl.json",
    vocab_file_path="vocabs/libri-light_9h.json",
)
train_dataset = LibriLightDataset(subset="9h", **_shared_dataset_kwargs)
test_dataset = LibriLightDataset(subset="1h", **_shared_dataset_kwargs)

In [32]:
# The 39 phoneme labels over which the TF-IDF vectors are built.
target_phonemes = {
    "aa", "ae", "ah", "aw", "ay", "b", "ch", "d", "dh", "dx",
    "eh", "axr", "ey", "f", "g", "bcl", "hh", "ih", "iy", "jh",
    "k", "el", "em", "en", "eng", "ow", "oy", "p", "r", "s",
    "sh", "t", "th", "uh", "uw", "v", "w", "y", "z",
}

In [33]:
# Enumerate the phonemes in sorted order. Iterating the set directly yields an
# order that depends on per-process string hashing, so the phoneme -> column
# mapping would not be reproducible across kernel restarts.
phone_to_idx = {phone: idx for idx, phone in enumerate(sorted(target_phonemes))}

In [34]:
def calculate_tf_idf(dataset: torch.utils.data.Dataset, phone_to_idx: dict, target_phones: set):
    """Compute one TF-IDF vector per dataset item over the target phonemes.

    Args:
        dataset: Sized, indexable collection; the last element of each item is
            read as that item's phone sequence.
        phone_to_idx: Maps a phoneme to its column. Columns are assumed to be
            ``0 .. len(phone_to_idx) - 1``.
        target_phones: Phonemes whose document frequency is counted; each must
            be a key of ``phone_to_idx``.

    Returns:
        ``float32`` array of shape ``(len(dataset), len(phone_to_idx))``.
        Term frequency is the phone count divided by the length of the item's
        phone sequence; inverse document frequency is ``log(1 / (df + 1e-8))``
        with ``df`` the fraction of items containing the phone. Items with an
        empty phone sequence get an all-zero row.
    """
    num_items = len(dataset)
    # Size the vectors from the mapping instead of a hard-coded 39.
    num_phones = len(phone_to_idx)
    df = np.zeros(num_phones, dtype=np.float32)
    tf = np.zeros((num_items, num_phones), dtype=np.float32)
    if num_items == 0:
        return tf

    for idx in range(num_items):
        print(f"{idx / num_items * 100:.2f}%", end="\r")
        phones = dataset[idx][-1]
        if len(phones) == 0:
            # Nothing to count; leave the row at zero rather than dividing by zero.
            continue
        unique_phones = set(phones)
        for target_phone in target_phones:
            if target_phone in unique_phones:
                df[phone_to_idx[target_phone]] += 1
        for phone in phones:
            # Skips None entries and any phone without a column, matching the
            # document-frequency pass which only looks at known phones.
            column = phone_to_idx.get(phone)
            if column is not None:
                tf[idx, column] += 1
        tf[idx] = tf[idx] / len(phones)

    df = df / num_items
    df += 1e-8  # keeps log(1 / df) finite for phones that never occur
    idf = np.log(1 / df)

    return tf * idf

In [35]:
def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    If either vector has zero norm the similarity is undefined; ``0.0`` is
    returned instead of the NaN a division by zero would produce, so that
    callers ranking candidates by similarity are not derailed by NaN
    comparisons.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [36]:
# Per-item TF-IDF matrices for both splits, computed train first, then test.
train_tf_idf, test_tf_idf = (
    calculate_tf_idf(split, phone_to_idx, target_phonemes)
    for split in (train_dataset, test_dataset)
)

99.65%

In [37]:
# Greedy one-to-one matching: for every test item, take the not-yet-used train
# item whose TF-IDF vector has the highest cosine similarity to it, then remove
# that train item from the pool so it cannot be picked again.
not_sampled_indices = set(range(len(train_dataset)))
sampled_indices = set()
for idx in range(len(test_dataset)):
    print(f"{idx / len(test_dataset) * 100:.2f}%", end="\r")
    query_vector = test_tf_idf[idx]
    best_match = max(
        not_sampled_indices,
        key=lambda candidate: cos_sim(query_vector, train_tf_idf[candidate]),
    )
    not_sampled_indices.remove(best_match)
    sampled_indices.add(best_match)


99.65%

In [38]:
# `sampled_indices` is already a set, so comparing its size with
# `set(sampled_indices)` can never fail. Check the properties the matching
# loop is supposed to guarantee: one distinct train item per test item, and
# no selected item left in the remaining pool.
assert len(sampled_indices) == len(test_dataset)
assert sampled_indices.isdisjoint(not_sampled_indices)

In [39]:
# View of train_dataset restricted to the selected indices (in the set's
# iteration order).
sampled_train_dataset = torch.utils.data.Subset(
    dataset=train_dataset,
    indices=[*sampled_indices],
)

In [40]:
import pickle
# Write the selected subset to "sampled_train_dataset.pkl" in the working directory.
# NOTE(review): this pickles the Subset wrapper together with the dataset object
# it references; unpickling presumably needs the `data` module importable — confirm.
with open("sampled_train_dataset.pkl", "wb") as f:
    pickle.dump(sampled_train_dataset, f)

In [41]:
# Accumulate element 2 of every selected item divided by 16000.
# NOTE(review): presumably element 2 is the sample count and 16000 the sample
# rate in Hz, making this a duration in seconds — confirm.
total_duration = sum(
    (sampled_train_dataset[idx][2] / 16000 for idx in range(len(sampled_train_dataset))),
    0.,
)

In [42]:
# Report the accumulated total divided by 60, labelled as minutes.
print(f"Total duration of sampled dataset: {total_duration / 60:.2f} min")

Total duration of sampled dataset: 63.47 min


In [43]:
# Display the raw accumulated total (the recorded output shows a 0-dim tensor).
total_duration

tensor(3808.1145)

In [46]:
# Inspect the first selected item; its last element is the phone list read by
# the TF-IDF functions.
sampled_train_dataset[0]

(1025,
 tensor([-0.0016, -0.0020, -0.0101,  ..., -0.0012, -0.0058, -0.0128]),
 tensor(125120),
 tensor([ 7, 23, 14,  7, 27, 25, 14,  7,  8, 27, 28,  8, 24, 24, 11,  6, 27, 11,
          6, 27, 14,  0,  7, 16, 14, 20, 20, 10, 27,  0, 19, 26, 11,  3, 15, 27,
          7, 23, 24, 19, 16, 15, 23, 27, 19,  3, 27,  7, 23,  8, 27, 19,  7, 23,
          8, 24, 27,  6, 11,  1,  8, 27, 11, 28, 27,  6, 23,  8, 27,  0, 23, 14,
          3, 15,  8,  6, 27, 23,  8, 24, 27, 26, 11,  3,  1, 27, 26, 14,  3, 10,
         27, 26, 19, 24,  8, 27,  7, 11, 26,  8,  6, 27,  7, 23,  8, 24,  8, 27,
         17, 19,  3, 13,  7, 27, 18,  8, 27, 14,  3, 10,  7, 23, 11,  3, 15, 27,
         20,  8, 28,  7,  4]),
 tensor(131),
 "that kate ferris is actually coming through on the other side if she changes her mind many more times there won't be anything left\n",
 6415,
 100596,
 18,
 ['dh',
  'ae',
  't',
  'k',
  'ey',
  't',
  'f',
  'eh',
  'r',
  'ih',
  's',
  'ih',
  'z',
  'ae',
  'k',
  'ch',
  'el',
  'iy',

In [52]:
def calculate_tf_idf_over_ds(dataset: torch.utils.data.Dataset, phone_to_idx: dict, target_phones: set):
    """Compute a single TF-IDF vector describing the whole dataset.

    Args:
        dataset: Sized, indexable collection; the last element of each item is
            read as that item's phone sequence.
        phone_to_idx: Maps a phoneme to its position in the output vector.
            Positions are assumed to be ``0 .. len(phone_to_idx) - 1``.
        target_phones: Phonemes whose document frequency is counted; each must
            be a key of ``phone_to_idx``.

    Returns:
        ``float32`` vector of length ``len(phone_to_idx)``. Term frequency is
        each phone's share of all counted phones in the dataset; inverse
        document frequency is ``log(1 / (df + 1e-8))`` with ``df`` the
        fraction of items containing the phone. An all-zero vector is returned
        when the dataset is empty or no known phone was counted.
    """
    num_items = len(dataset)
    # Size the vectors from the mapping instead of a hard-coded 39.
    num_phones = len(phone_to_idx)
    df = np.zeros(num_phones, dtype=np.float32)
    tf = np.zeros(num_phones, dtype=np.float32)

    for idx in range(num_items):
        print(f"{idx / num_items * 100:.2f}%", end="\r")
        phones = dataset[idx][-1]
        unique_phones = set(phones)
        for target_phone in target_phones:
            if target_phone in unique_phones:
                df[phone_to_idx[target_phone]] += 1
        for phone in phones:
            # Skips None entries and any phone without a position, matching the
            # document-frequency pass which only looks at known phones.
            column = phone_to_idx.get(phone)
            if column is not None:
                tf[column] += 1

    total_count = tf.sum()
    if num_items == 0 or total_count == 0:
        # Avoid 0 / 0: with nothing counted the vector stays all zeros.
        return tf

    tf = tf / total_count

    df = df / num_items
    df += 1e-8  # keeps log(1 / df) finite for phones that never occur
    idf = np.log(1 / df)

    return tf * idf

In [53]:
# Single dataset-level TF-IDF vector for the test split; used as the target
# profile when comparing candidate training subsets.
test_tf_idf_over_ds = calculate_tf_idf_over_ds(test_dataset, phone_to_idx, target_phonemes)

99.65%

In [55]:
# Greedily grow a training subset until its accumulated duration reaches
# `limit_duration`. Each step adds the train item that makes the subset's
# dataset-level TF-IDF vector most similar to the test split's vector.
# NOTE(review): every candidate triggers a full pass over the trial subset
# (one dataset read per item), so a step costs O(pool size * subset size);
# caching per-item phone counts once would avoid the repeated reads.
sampled_duration = 0.
limit_duration = 600  # presumably seconds, given the / 16000 below — confirm
not_sampled_indices = set(range(len(train_dataset)))
sampled_indices = set()
# Also stop when the candidate pool is exhausted: max() over no candidates raises.
while sampled_duration < limit_duration and not_sampled_indices:
    print(f"{sampled_duration / limit_duration * 100:.2f}%", end="\r")
    similarities = {}
    for idx in not_sampled_indices:
        # Candidate indices refer to train_dataset, so the trial subset must be
        # built on train_dataset; indexing the earlier sampled_train_dataset
        # with them selects the wrong items or runs out of range.
        trial_subset = torch.utils.data.Subset(train_dataset, list(sampled_indices | {idx}))
        trial_tf_idf = calculate_tf_idf_over_ds(trial_subset, phone_to_idx, target_phonemes)
        similarities[idx] = cos_sim(test_tf_idf_over_ds, trial_tf_idf)

    best_idx = max(similarities, key=similarities.get)
    sampled_indices.add(best_idx)
    not_sampled_indices.remove(best_idx)
    # float() keeps the running total a plain number even if element 2 is a tensor.
    sampled_duration += float(train_dataset[best_idx][2]) / 16000



NameError: name 'tf' is not defined

In [1]:
import numpy as np
# Scratch check of ndarray.copy(): `b` starts out as a copy of the zero array `a`.
a = np.zeros(3)
b = a.copy()

In [2]:
# Modify the first element of the copy.
b[0] = 1

In [4]:
# Display the modified copy.
b

array([1., 0., 0.])