In [1]:
import torchaudio
import json
import os
from preprocess import create_reverberated_data, create_noisy_data, create_noisy_data_parallel, create_reverberated_data_parallel
from tqdm import tqdm
import math
import torch

## VAD付きデータの準備

### 完了

In [1]:
# Data reorganization (the following cell writes the result to json/vad/train_nodup_sp.json)
# 1. Make the speaker a key inside the JSON as well
# 2. Attach VAD information

import json
from tqdm import tqdm

# Alignment file: the first space-separated field of each line is used as the
# utterance key, the remaining fields as that utterance's phone labels.
align_file_path = "/home/mimura/alignments/align.phones.per-frame.csj.sp"
with open(align_file_path, "r") as f:
    aligns = f.readlines()

align_dict = {}
for line in tqdm(aligns):
    fields = line.strip().split(" ")
    # Keys that do not already start with "sp" get the "sp1.0-" prefix
    # (presumably the unperturbed-speed variant — TODO confirm).
    utt_key = fields[0] if fields[0].startswith("sp") else "sp1.0-" + fields[0]
    align_dict[utt_key] = fields[1:]

with open(f"json/original/csj_train_nodup_sp.json", "r") as f:
    data_json = json.load(f)

# Snapshot of the utterance keys in file order, consumed by the next cell
original_keys = list(data_json)

100%|██████████| 1220436/1220436 [01:16<00:00, 16021.67it/s]


In [2]:
# Regroup the utterances per speaker and attach frame-level VAD labels.
# Layout of the output: result_json[speaker][utterance key] = utterance entry + "vad" list.
result_json = {}

# Number of utterances skipped because align_dict has no entry for them
counter = 0
for key in tqdm(original_keys):
    # Speaker ID = second "-"-separated field of the key, truncated at its first "_"
    speaker = key.split("-")[1].split("_")[0]
    # NOTE: the speaker entry is created before the alignment check, so a speaker
    # whose utterances all lack an alignment is kept with an empty dict.
    if speaker not in result_json:
        result_json[speaker] = {}
    # Check whether VAD information can be attached
    if key in align_dict:
        phonemes = align_dict[key]
        # Labels starting with "sp" map to 0, every other label maps to 1.
        # NOTE(review): presumably "sp" is the only non-speech label in the
        # alignment file — confirm against the phone set.
        data_json[key]["vad"] = [0 if phoneme.startswith("sp") else 1 for phoneme in phonemes]
        result_json[speaker][key] = data_json[key]
    else:
        counter += 1

# Report the share of utterances without an alignment once, instead of
# re-printing a progress line on every iteration.
if original_keys:
    print(f"ne: {counter / len(original_keys) * 100:.2f} % ({counter} / {len(original_keys)})")

with open("json/vad/train_nodup_sp.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

finish: 100.00 %, ne: 0.50 %

In [3]:
# Prepare the pre-training data and the test (adaptation) data
# (json/vad/pretrain.json, json/vad/adaptation.json)
import json
from tqdm import tqdm

with open("json/vad/train_nodup_sp.json", "r") as f:
    data_json = json.load(f)
speakers = list(data_json.keys())

# Speakers are split by position in file order: the first 90 % go to
# pre-training, the remainder to adaptation.
split_index = int(len(speakers) * 0.9)

# Build the pre-training data
pretrain_speakers = speakers[:split_index]
pretrain_json = {spk: data_json[spk] for spk in tqdm(pretrain_speakers)}
with open("json/vad/pretrain.json", "w") as f:
    json.dump(pretrain_json, f, indent=4, ensure_ascii=False)

# Build the test data
adaptation_speakers = speakers[split_index:]
adaptation_json = {spk: data_json[spk] for spk in tqdm(adaptation_speakers)}
with open("json/vad/adaptation.json", "w") as f:
    json.dump(adaptation_json, f, indent=4, ensure_ascii=False)


100%|██████████| 2387/2387 [00:00<00:00, 1928684.96it/s]
100%|██████████| 266/266 [00:00<00:00, 1417642.78it/s]


In [2]:
# Prepare the pre-training data (json/vad/noisy_pretrain.json)
# To keep the noise diverse, the pre-training data is allowed to use different
# noises within one speaker.

import multiprocessing
from queue import Empty
from preprocess import create_aligned_noisy_pretrain_data_parallel
import json

with open("json/original/musan.json", "r") as f:
    musan_data_json = json.load(f)
with open("json/original/demand.json", "r") as f:
    demand_data_json = json.load(f)

noise_data_jsons = [musan_data_json, demand_data_json]

with open("json/vad/pretrain.json", "r") as f:
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
result_queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Integer arithmetic yields contiguous, non-overlapping slices; the last
    # slice ends exactly at len(all_speakers).
    start = len(all_speakers) * i // NUM_PROCS
    end = len(all_speakers) * (i + 1) // NUM_PROCS
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=create_aligned_noisy_pretrain_data_parallel,
        args=(data_json, speakers, "./datasets/vad/noisy/pretrain", noise_data_jsons, result_queue),
    )
    p.start()
    jobs.append(p)

# Merge the partial results; one queue item is expected per worker.
# The queue is drained before join() so workers are not blocked on a full pipe.
# Polling with a timeout avoids waiting forever when a worker dies before
# delivering its result.
result_json = {}
pending = NUM_PROCS
while pending > 0:
    try:
        partial_json = result_queue.get(timeout=10)
    except Empty:
        crashed = [job.pid for job in jobs if job.exitcode not in (None, 0)]
        if crashed:
            for job in jobs:
                if job.is_alive():
                    job.terminate()
            raise RuntimeError(f"worker process(es) {crashed} exited abnormally before returning a result")
        continue
    result_json.update(partial_json)
    pending -= 1

for p in jobs:
    p.join()

# Every input speaker must appear in the merged result
assert len(result_json) == len(data_json)

with open("json/vad/noisy_pretrain.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

  0%|          | 0/150 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
# Prepare the test data used during pre-training (json/vad/noisy_pretrain_eval.json)
# For the test data, the same noise is used within a speaker as far as possible.
# The data created here is also used for adaptation.

import multiprocessing
from queue import Empty
from preprocess import create_aligned_noisy_pretrain_eval_data_parallel
import json

with open("json/original/chime3.json", "r") as f:
    chime3_data_json = json.load(f)

noise_data_jsons = [chime3_data_json]

with open("json/vad/adaptation.json", "r") as f:  # based on the adaptation data
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
result_queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Integer arithmetic yields contiguous, non-overlapping slices; the last
    # slice ends exactly at len(all_speakers).
    start = len(all_speakers) * i // NUM_PROCS
    end = len(all_speakers) * (i + 1) // NUM_PROCS
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=create_aligned_noisy_pretrain_eval_data_parallel,
        args=(data_json, speakers, "./datasets/vad/noisy/pretrain_eval", noise_data_jsons, result_queue),
    )
    p.start()
    jobs.append(p)

# Merge the partial results; one queue item is expected per worker.
# The queue is drained before join() so workers are not blocked on a full pipe.
# Polling with a timeout avoids waiting forever when a worker dies before
# delivering its result.
result_json = {}
pending = NUM_PROCS
while pending > 0:
    try:
        partial_json = result_queue.get(timeout=10)
    except Empty:
        crashed = [job.pid for job in jobs if job.exitcode not in (None, 0)]
        if crashed:
            for job in jobs:
                if job.is_alive():
                    job.terminate()
            raise RuntimeError(f"worker process(es) {crashed} exited abnormally before returning a result")
        continue
    result_json.update(partial_json)
    pending -= 1

for p in jobs:
    p.join()

# Every input speaker must appear in the merged result
assert len(result_json) == len(data_json)

with open("json/vad/noisy_pretrain_eval.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

100%|██████████| 1043010/1043010 [00:00<00:00, 1405080.42it/s]
100%|██████████| 115890/115890 [00:00<00:00, 1280695.92it/s]


In [None]:
# Prepare the test data used during pre-training, "ref" variant
# (json/vad/noisy_pretrain_eval_ref.json)
# Same speakers as the eval set (adaptation.json), but mixed with the DEMAND
# and MUSAN noise lists instead of CHiME-3.
# For the test data, the same noise is used within a speaker as far as possible.
# The data created here is also used for adaptation.

import multiprocessing
from queue import Empty
from preprocess import create_aligned_noisy_pretrain_eval_data_parallel
import json

with open("json/original/demand.json", "r") as f:
    demand_data_json = json.load(f)
with open("json/original/musan.json", "r") as f:
    musan_data_json = json.load(f)

noise_data_jsons = [demand_data_json, musan_data_json]

with open("json/vad/adaptation.json", "r") as f:
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
result_queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Integer arithmetic yields contiguous, non-overlapping slices; the last
    # slice ends exactly at len(all_speakers).
    start = len(all_speakers) * i // NUM_PROCS
    end = len(all_speakers) * (i + 1) // NUM_PROCS
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=create_aligned_noisy_pretrain_eval_data_parallel,
        args=(data_json, speakers, "./datasets/vad/noisy/pretrain_eval_ref", noise_data_jsons, result_queue),
    )
    p.start()
    jobs.append(p)

# Merge the partial results; one queue item is expected per worker.
# The queue is drained before join() so workers are not blocked on a full pipe.
# Polling with a timeout avoids waiting forever when a worker dies before
# delivering its result.
result_json = {}
pending = NUM_PROCS
while pending > 0:
    try:
        partial_json = result_queue.get(timeout=10)
    except Empty:
        crashed = [job.pid for job in jobs if job.exitcode not in (None, 0)]
        if crashed:
            for job in jobs:
                if job.is_alive():
                    job.terminate()
            raise RuntimeError(f"worker process(es) {crashed} exited abnormally before returning a result")
        continue
    result_json.update(partial_json)
    pending -= 1

for p in jobs:
    p.join()

# Every input speaker must appear in the merged result
assert len(result_json) == len(data_json)

with open("json/vad/noisy_pretrain_eval_ref.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

In [None]:
# VADのサブサンプリング
import json

from tqdm import tqdm


def vad_subsample(vad, kernel_size, stride):
    """Downsample a 0/1 VAD sequence by majority vote over sliding windows.

    Args:
        vad: Sequence of 0/1 labels.
        kernel_size: Number of labels in each window.
        stride: Step between the starts of consecutive windows.

    Returns:
        A list with one label per window: 1 when strictly more than half of
        the window's labels are 1, otherwise 0. The list is empty when the
        input is shorter than one window.
    """
    window_count = (len(vad) - kernel_size + stride) // stride
    windows = (
        vad[idx * stride : idx * stride + kernel_size]
        for idx in range(window_count)
    )
    return [int(sum(window) >= len(window) // 2 + 1) for window in windows]

# Add a "subsampled_vad" field to every utterance of each noisy dataset and
# write the result next to the input as *_with_subsampled_vad.json.
# The loop variable is deliberately not called `type`: at notebook scope that
# would shadow the builtin for every cell executed afterwards.
for dataset_name in ["pretrain", "pretrain_eval", "pretrain_eval_ref"]:

    path = f"./json/vad/noisy_{dataset_name}.json"
    result_path = f"./json/vad/noisy_{dataset_name}_with_subsampled_vad.json"

    with open(path, "r") as f:
        data_json = json.load(f)

    # The loaded dict is extended in place and then written back out
    for speaker, utterances in tqdm(data_json.items()):
        for key, entry in utterances.items():
            # Two passes with kernel 3 / stride 2 (presumably mirroring two
            # stride-2 subsampling layers in the model front-end — TODO confirm)
            entry["subsampled_vad"] = vad_subsample(vad_subsample(entry["vad"], 3, 2), 3, 2)

    with open(result_path, "w") as f:
        json.dump(data_json, f, indent=4, ensure_ascii=False)

In [None]:
# Prepare the adaptation data (json/vad/noisy_adaptation_with_subsampled_vad.json)
# Basically the same as the pre-training test data (noisy_pretrain_eval_with_subsampled_vad.json).
# However, all data sharing the same speaker + noise combination is concatenated
# to create relatively long audio data (author's note on the worker's behavior).

import multiprocessing
from queue import Empty
from preprocess import concat_pretrain_eval_with_subsampled_vad_parallel
import json

with open("json/vad/noisy_pretrain_eval_with_subsampled_vad.json", "r") as f:
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
result_queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Integer arithmetic yields contiguous, non-overlapping slices; the last
    # slice ends exactly at len(all_speakers).
    start = len(all_speakers) * i // NUM_PROCS
    end = len(all_speakers) * (i + 1) // NUM_PROCS
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=concat_pretrain_eval_with_subsampled_vad_parallel,
        args=(data_json, speakers, "./datasets/vad/noisy/adaptation", result_queue),
    )
    p.start()
    jobs.append(p)

# Merge the partial results; one queue item is expected per worker.
# The queue is drained before join() so workers are not blocked on a full pipe.
# Polling with a timeout avoids waiting forever when a worker dies before
# delivering its result.
result_json = {}
pending = NUM_PROCS
while pending > 0:
    try:
        partial_json = result_queue.get(timeout=10)
    except Empty:
        crashed = [job.pid for job in jobs if job.exitcode not in (None, 0)]
        if crashed:
            for job in jobs:
                if job.is_alive():
                    job.terminate()
            raise RuntimeError(f"worker process(es) {crashed} exited abnormally before returning a result")
        continue
    result_json.update(partial_json)
    pending -= 1

for p in jobs:
    p.join()

with open("json/vad/noisy_adaptation_with_subsampled_vad.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

In [None]:
# 1-path: prepare the adaptation data (json/vad/noisy_1_path_adaptation_with_subsampled_vad.json)
# Utterance sets of the eval data that share the same speaker and noise are
# grouped into arrays (author's note on the worker's behavior).

import multiprocessing
from queue import Empty
from preprocess import array_concat_pretrain_eval_with_subsampled_vad_parallel
import json

with open("json/vad/noisy_pretrain_eval_with_subsampled_vad.json", "r") as f:
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
result_queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Integer arithmetic yields contiguous, non-overlapping slices; the last
    # slice ends exactly at len(all_speakers).
    start = len(all_speakers) * i // NUM_PROCS
    end = len(all_speakers) * (i + 1) // NUM_PROCS
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=array_concat_pretrain_eval_with_subsampled_vad_parallel,
        args=(data_json, speakers, result_queue),
    )
    p.start()
    jobs.append(p)

# Merge the partial results; one queue item is expected per worker.
# The queue is drained before join() so workers are not blocked on a full pipe.
# Polling with a timeout avoids waiting forever when a worker dies before
# delivering its result.
result_json = {}
pending = NUM_PROCS
while pending > 0:
    try:
        partial_json = result_queue.get(timeout=10)
    except Empty:
        crashed = [job.pid for job in jobs if job.exitcode not in (None, 0)]
        if crashed:
            for job in jobs:
                if job.is_alive():
                    job.terminate()
            raise RuntimeError(f"worker process(es) {crashed} exited abnormally before returning a result")
        continue
    result_json.update(partial_json)
    pending -= 1

for p in jobs:
    p.join()

with open("json/vad/noisy_1_path_adaptation_with_subsampled_vad.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

In [1]:
# Create the develop data
import json

with open(f"json/vad/noisy_pretrain_with_subsampled_vad.json", "r") as f:
    data_json = json.load(f)

speakers = list(data_json.keys())

# The develop set is the first 0.5 % of the speakers, in file order.
# NOTE(review): these speakers come from the pre-training file itself, so the
# develop set overlaps the pre-training data unless they are excluded
# elsewhere — confirm.
develop_count = int(len(speakers) * 0.005)
develop_speakers = speakers[:develop_count]
develop_data_json = dict((spk, data_json[spk]) for spk in develop_speakers)

with open(f"json/vad/noisy_pretrain_develop_with_subsampled_vad.json", "w") as f:
    json.dump(develop_data_json, f, indent=4, ensure_ascii=False)

##  データ整備

KeyboardInterrupt: 