In [1]:
import torchaudio
import json
import os
from preprocess import create_reverberated_data, create_noisy_data, create_noisy_data_parallel, create_reverberated_data_parallel
from tqdm import tqdm
import math
import torch

In [14]:
from tqdm import tqdm
import json

def vad_subsample(vad, kernel_size, stride):
    """Shrink a VAD label sequence by majority vote over sliding windows.

    A window of ``kernel_size`` labels is taken every ``stride`` positions.
    Each window contributes a 1 when its sum reaches ``len(window) // 2 + 1``
    and a 0 otherwise.

    Args:
        vad: sequence of numeric labels (sliceable, summable).
        kernel_size: number of labels per window.
        stride: step between the starts of consecutive windows.

    Returns:
        A list of 0/1 ints, one per complete window. It is empty when the
        input is too short to hold a single window.
    """
    window_count = (len(vad) - kernel_size + stride) // stride
    majority = []
    for start in range(0, window_count * stride, stride):
        window = vad[start : start + kernel_size]
        votes_needed = len(window) // 2 + 1
        majority.append(1 if sum(window) >= votes_needed else 0)
    return majority

# Input JSON: a two-level mapping (the loops below call the levels "speaker"
# and "key") whose leaf entries each carry a "vad" list.
path = "./json/aligned/noisy_pretrain.json"
# Output JSON: the same content with a "subsampled_vad" list added to each entry.
result_path = "./json/aligned/noisy_pretrain_with_subsampled_vad.json"

with open(path, "r") as f:
    data_json = json.load(f)

# NOTE: this binds a second name to the same dict (no copy), so the writes to
# result_json below also modify data_json.
result_json = data_json

speakers = list(data_json.keys())
for speaker in tqdm(speakers):
    keys = list(data_json[speaker].keys())
    for key in keys:
        vad = data_json[speaker][key]["vad"]
        # Majority-vote subsampling (kernel_size=3, stride=2) applied twice in a row.
        # NOTE(review): presumably mirrors the downsampling of the model front-end -- confirm.
        subsampled_vad = vad_subsample(vad_subsample(vad, 3, 2), 3, 2)
        result_json[speaker][key]["subsampled_vad"] = subsampled_vad

with open(result_path, "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

In [2]:
# play audio in jupyter
import IPython.display
# Absolute path of the single wav file to listen to.
path = "/n/work3/mimura/data/musan/music/fma-western-art/music-fma-wa-0001.wav"


# Left as the last expression of the cell so that the notebook displays the Audio object.
IPython.display.Audio(path)


In [None]:
# create json

# Dataset splits that get_wav_path() has a wav directory for.
data_types = ["train_nodup_sp", "train_dev", "eval1", "eval2", "eval3"]
# Directory under which each split has its own "<split>/deltafalse/data.json".
DATA_JSON_PATH_PREFIX = "/home/shibutani/fs/ASR/espnet/egs/csj/asr1/dump/"


def get_data_json_path(data_type):
    """Return the path of the data.json file for the split ``data_type``.

    The result is ``DATA_JSON_PATH_PREFIX/<data_type>/deltafalse/data.json``.
    """
    relative_path = os.path.join(data_type, "deltafalse/data.json")
    return os.path.join(DATA_JSON_PATH_PREFIX, relative_path)
def get_wav_path(data_type, key):
    """Build the wav file path for utterance ``key`` of split ``data_type``.

    For "train_nodup_sp" the key has the form "<prefix>-<name>" and the file is
    ``<root>/<prefix>/<id>/<name>.wav``. For every other split the file is
    ``<root>/<id>/<key>.wav``. In both cases ``<id>`` is the part of the name
    before its first underscore.

    Raises:
        KeyError: if ``data_type`` has no registered wav directory.
        IndexError: if a "train_nodup_sp" key contains no "-".
    """
    wav_roots = {
        "train_nodup_sp": "/n/work3/mimura/data/csj/wav/sp",
        "train_dev": "/n/work3/mimura/data/csj/wav/wav.segments",
        "eval1": "/n/work3/mimura/data/csj/wav/wav.segments.testset",
        "eval2": "/n/work3/mimura/data/csj/wav/wav.segments.testset",
        "eval3": "/n/work3/mimura/data/csj/wav/wav.segments.testset",
    }
    root = wav_roots[data_type]

    if data_type != "train_nodup_sp":
        return os.path.join(root, key.split("_")[0], key + ".wav")

    # "<prefix>-<name>": the prefix becomes an extra directory level.
    pieces = key.split("-")
    prefix, name = pieces[0], pieces[1]
    return os.path.join(root, prefix, name.split("_")[0], name + ".wav")

# Split to index. It must be one of the splits get_wav_path() has a wav
# directory for. The previous value "train_nodup" is not one of them, so
# get_wav_path() raised KeyError on the very first utterance. A later cell
# reads json/csj_train_nodup_sp.json, which is the file this setting produces.
data_type = "train_nodup_sp"
data_json_path = get_data_json_path(data_type)
with open(data_json_path) as f:
    data = json.load(f)
utts = data["utts"]
keys = data["utts"].keys()

# utterance key -> {"wav_file_path", "sampling_rate", "audio_sec", "raw_transcript"}
result_json = {}

# Number of utterances skipped because their wav file does not exist.
error_counter = 0

for key in keys:
    # show progress
    if len(result_json) % 100 == 0:
        print(len(result_json) / len(keys) * 100, "%", end="\r")
    wav_file_path = get_wav_path(data_type, key)
    if not os.path.exists(wav_file_path):
        error_counter += 1
        continue
    # Read the audio metadata once and reuse it for both fields.
    wav_info = torchaudio.info(wav_file_path)
    sampling_rate = wav_info.sample_rate
    audio_sec = wav_info.num_frames / sampling_rate
    utt = utts[key]
    # Warn once per utterance (not once per output) when there are several outputs.
    if len(utt["output"]) > 1:
        print("Warning: utt has multiple outputs")
    # The transcript is the concatenation of the text of every output, in order.
    raw_transcript = "".join(output["text"] for output in utt["output"])
    result_json[key] = {
        "wav_file_path": wav_file_path,
        "sampling_rate": sampling_rate,
        "audio_sec": audio_sec,
        "raw_transcript": raw_transcript,
    }

# Avoid ZeroDivisionError when data.json contains no utterances.
error_rate = error_counter / len(keys) * 100 if len(keys) > 0 else 0.0
print(f"error rate: {error_rate}%")

with open(f"json/csj_{data_type}.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

In [6]:
# create json for noise
def _build_noise_json(wav_paths):
    """Return ``{index: metadata}`` for the wav files in ``wav_paths``.

    Each entry holds "wav_file_path", "sampling_rate", "audio_sec" and an empty
    "raw_transcript". The index is the position of the file in ``wav_paths``.
    The audio header is read once per file.
    """
    noise_json = {}
    for index, wav_path in enumerate(wav_paths):
        wav_info = torchaudio.info(wav_path)
        noise_json[index] = {
            "wav_file_path": wav_path,
            "sampling_rate": wav_info.sample_rate,
            "audio_sec": wav_info.num_frames / wav_info.sample_rate,
            "raw_transcript": "",
        }
    return noise_json


def _dump_json(obj, json_path):
    """Write ``obj`` to ``json_path`` with the formatting used throughout this notebook."""
    with open(json_path, "w") as f:
        json.dump(obj, f, indent=4, ensure_ascii=False)


# CHiME3: every entry directly inside the backgrounds directory.
chime3_dir_path = "/n/work3/shibutani/ASR/datasets/CHiME3/data/audio/16kHz/backgrounds"
chime3_file_paths = list(os.listdir(chime3_dir_path))
chime3_paths = [os.path.join(chime3_dir_path, path) for path in chime3_file_paths]
chime3_json = _build_noise_json(chime3_paths)
_dump_json(chime3_json, "json/chime3.json")


# DEMAND: every entry inside each sub-directory of the demand directory.
demand_dir_paths = [os.path.join("/n/work3/mimura/data/DEMAND/demand", dir_pre) for dir_pre in os.listdir("/n/work3/mimura/data/DEMAND/demand")]
demand_paths = []
for demand_dir_path in demand_dir_paths:
    for demand_file_path in os.listdir(demand_dir_path):
        demand_paths.append(os.path.join(demand_dir_path, demand_file_path))
demand_json = _build_noise_json(demand_paths)
_dump_json(demand_json, "json/demand.json")

# For musan, use the audio files listed in the following file:
# script.noise.musan.music_noise
musan_paths_path = "/n/work3/mimura/exp/multicond/script.noise.musan.music_noise"
with open(musan_paths_path, "r") as f:
    # One path per line; blank lines are skipped so they do not become empty paths.
    musan_paths = [line.strip() for line in f if line.strip()]
musan_json = _build_noise_json(musan_paths)
_dump_json(musan_json, "json/musan.json")

In [3]:
# Add reverberation (dev, eval)
# Dataset name; it selects the input file json/<NAME>.json and the output file
# json/reverberated_<NAME>.json, and is also passed to create_reverberated_data.
NAME = "csj_eval1"
with open(f"json/{NAME}.json", "r") as f:
    data_json = json.load(f)

# create_reverberated_data is imported from the project-local preprocess module;
# whatever it returns is written out as JSON below.
result_json = create_reverberated_data(data_json, NAME)
with open(f"json/reverberated_{NAME}.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

100%|██████████| 1272/1272 [10:21<00:00,  2.05it/s]


In [2]:
# Add noise (dev, eval)
import multiprocessing
# Noise sources for this run: two named wav files, wrapped in a one-element list
# because create_noisy_data_parallel is given a list of noise JSONs.
noise_data_json = {
    "cafeteria": {
        "wav_file_path": "/n/work3/shibutani/ASR/datasets/ku-real/noise/noise_cafeteria.wav"
    },
    "museum": {
        "wav_file_path": "/n/work3/shibutani/ASR/datasets/ku-real/noise/noise_museum.wav"
    }
}
noise_data_jsons = [noise_data_json]

# Dataset name; the input is the reverberated JSON written for the same name.
NAME = "csj_eval1"
with open(f"json/reverberated_{NAME}.json", "r") as f:
    data_json = json.load(f)

all_keys = list(data_json.keys())

jobs = []
# Each worker process hands its partial result back through this queue.
queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Split the key list into NUM_PROCS contiguous chunks; the last chunk is
    # extended to the end of the list so that no key is dropped by rounding.
    start = int(len(all_keys) / NUM_PROCS * i)
    end = int(len(all_keys) / NUM_PROCS * (i + 1))
    if i == NUM_PROCS - 1:
        end = len(all_keys)
    keys = all_keys[start:end]
    p = multiprocessing.Process(
        target=create_noisy_data_parallel, args=(data_json, keys, NAME, noise_data_jsons, queue)
    )
    p.start()
    jobs.append(p)

# concat result_jsons in queue
# Exactly one queue item is expected per worker; each item is merged with dict.update.
# The queue is drained before join() because joining a process that still has
# items buffered for a queue can block.
result_json = {}
for i in range(NUM_PROCS):
    result_json.update(queue.get())

for p in jobs:
    p.join()

# Sanity check: the merged result has as many entries as the input.
assert len(result_json) == len(data_json)

with open(f"json/noisy_{NAME}.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

100%|██████████| 80/80 [00:33<00:00,  2.39it/s]

 96%|█████████▌| 76/79 [00:33<00:01,  2.45it/s]
100%|██████████| 79/79 [00:33<00:00,  2.35it/s]
100%|██████████| 80/80 [00:33<00:00,  2.36it/s]
100%|██████████| 80/80 [00:33<00:00,  2.36it/s]
100%|██████████| 79/79 [00:34<00:00,  2.32it/s]
100%|██████████| 79/79 [00:34<00:00,  2.28it/s]
100%|██████████| 79/79 [00:34<00:00,  2.27it/s]
100%|██████████| 80/80 [00:34<00:00,  2.30it/s]
100%|██████████| 79/79 [00:34<00:00,  2.27it/s]
100%|██████████| 79/79 [00:35<00:00,  2.25it/s]
100%|██████████| 80/80 [00:35<00:00,  2.26it/s]
100%|██████████| 80/80 [00:35<00:00,  2.25it/s]
100%|██████████| 79/79 [00:35<00:00,  2.20it/s]
100%|██████████| 80/80 [00:36<00:00,  2.21it/s]


In [2]:
# Add the same kinds of noise as used for train to eval1, for use in a
# comparison (reference) experiment
import multiprocessing

# Noise metadata JSONs (chime3, musan, demand) loaded from the json/ directory.
with open("json/chime3.json", "r") as f:
    chime3_data_json = json.load(f)
with open("json/musan.json", "r") as f:
    musan_data_json = json.load(f)
with open("json/demand.json", "r") as f:
    demand_data_json = json.load(f)

noise_data_jsons = [chime3_data_json, musan_data_json, demand_data_json]

with open(f"json/reverberated_csj_eval1.json", "r") as f:
    data_json = json.load(f)

all_keys = list(data_json.keys())

jobs = []
# Each worker process hands its partial result back through this queue.
queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Split the key list into NUM_PROCS contiguous chunks; the last chunk is
    # extended to the end of the list so that no key is dropped by rounding.
    start = int(len(all_keys) / NUM_PROCS * i)
    end = int(len(all_keys) / NUM_PROCS * (i + 1))
    if i == NUM_PROCS - 1:
        end = len(all_keys)
    keys = all_keys[start:end]
    p = multiprocessing.Process(
        target=create_noisy_data_parallel, args=(data_json, keys, "csj_eval1_reference", noise_data_jsons, queue)
    )
    p.start()
    jobs.append(p)

# concat result_jsons in queue
# Exactly one queue item is expected per worker; each item is merged with dict.update.
# The queue is drained before join() because joining a process that still has
# items buffered for a queue can block.
result_json = {}
for i in range(NUM_PROCS):
    result_json.update(queue.get())

for p in jobs:
    p.join()

# Sanity check: the merged result has as many entries as the input.
assert len(result_json) == len(data_json)

with open(f"json/noisy_csj_eval1_reference.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

100%|██████████| 79/79 [01:41<00:00,  1.28s/it]
100%|██████████| 80/80 [01:42<00:00,  1.28s/it]
100%|██████████| 80/80 [01:44<00:00,  1.30s/it]
100%|██████████| 79/79 [01:45<00:00,  1.33s/it]
100%|██████████| 80/80 [01:46<00:00,  1.33s/it]
100%|██████████| 80/80 [01:47<00:00,  1.34s/it]
100%|██████████| 79/79 [01:47<00:00,  1.36s/it]
100%|██████████| 80/80 [01:47<00:00,  1.34s/it]
 82%|████████▏ | 65/79 [01:47<00:02,  5.09it/s]
100%|██████████| 79/79 [01:48<00:00,  1.37s/it]
100%|██████████| 79/79 [01:48<00:00,  1.37s/it]
100%|██████████| 79/79 [01:48<00:00,  1.38s/it]
100%|██████████| 80/80 [01:49<00:00,  1.37s/it]
100%|██████████| 79/79 [01:49<00:00,  1.39s/it]
100%|██████████| 79/79 [01:50<00:00,  1.39s/it]
100%|██████████| 80/80 [01:50<00:00,  1.39s/it]


## VAD付きデータを準備

In [1]:
# Data reorganization (json/aligned/train_nodup_sp.json)
# 1. Make the speaker a key inside the JSON as well
# 2. Attach VAD information

import json
from tqdm import tqdm

# Alignment file: one line per utterance, space-separated; the first field is the
# utterance key and the remaining fields are its labels.
# NOTE(review): judging by the file name these are per-frame phone labels -- confirm.
align_file_path = "/home/mimura/alignments/align.phones.per-frame.csj.sp"
with open(align_file_path, "r") as f:
    aligns = f.readlines()
# utterance key -> list of label strings
align_dict = {}
for align in tqdm(aligns):
    align = align.strip().split(" ")
    key = align[0]
    # Keys that do not already start with "sp" get the "sp1.0-" prefix so that
    # every key in align_dict starts with "sp".
    if not key.startswith("sp"):
        key = "sp1.0-" + key
    align_dict[key] = align[1:]

with open(f"json/csj_train_nodup_sp.json", "r") as f:
    data_json = json.load(f)

# Snapshot of the utterance keys, kept in its own variable for later cells.
original_keys = list(data_json.keys())

100%|██████████| 1220436/1220436 [01:16<00:00, 16021.67it/s]


In [2]:
# speaker -> {utterance key -> utterance entry with an added "vad" list}
result_json = {}

# Number of utterances that have no entry in align_dict (reported as "ne" below).
counter = 0
for i, key in enumerate(original_keys):
    print(f"finish: {i / len(original_keys) * 100:.2f} %, ne: {counter / len(original_keys) * 100:.2f} %", end="\r")
    # Keys look like "<prefix>-<speaker>_<rest>"; the speaker is the text between
    # the first "-" and the following "_".
    speaker = key.split("-")[1].split("_")[0]
    # The speaker entry is created before the alignment check, so a speaker with
    # no aligned utterance still appears with an empty dict.
    if speaker not in result_json:
        result_json[speaker] = {}
    # Check whether VAD information can be attached
    if key in align_dict:
        phonemes = align_dict[key]
        # Labels starting with "sp" are marked 0, all other labels are marked 1.
        # NOTE: this writes into data_json[key] in place.
        data_json[key]["vad"] = [0 if phoneme.startswith("sp") else 1 for phoneme in phonemes]
        result_json[speaker][key] = data_json[key]
    else:
        counter += 1
        #print(key)

with open(f"json/aligned_csj/train_nodup_sp.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

finish: 100.00 %, ne: 0.50 %

In [3]:
# Prepare the pre-training data and the test data (json/aligned/pretrain.json, dev.json)
# NOTE(review): the code below actually writes json/aligned_csj/pretrain.json and
# json/aligned_csj/adaptation.json -- the file names in the line above look outdated.
import json
from tqdm import tqdm
with open("json/aligned_csj/train_nodup_sp.json", "r") as f:
    data_json = json.load(f)
speakers = list(data_json.keys())
# Create the pre-training data
# The first 90% of the speakers, in the order they appear in the JSON.
pretrain_speakers = speakers[:int(len(speakers) * 0.9)]
pretrain_json = {}
for speaker in tqdm(pretrain_speakers):
    pretrain_json[speaker] = data_json[speaker]
with open("json/aligned_csj/pretrain.json", "w") as f:
    json.dump(pretrain_json, f, indent=4, ensure_ascii=False)
# Create the test data
# The remaining speakers, so the two sets do not share any speaker.
adaptation_speakers = speakers[int(len(speakers) * 0.9):]
adaptation_json = {}
for speaker in tqdm(adaptation_speakers):
    adaptation_json[speaker] = data_json[speaker]
with open("json/aligned_csj/adaptation.json", "w") as f:
    json.dump(adaptation_json, f, indent=4, ensure_ascii=False)


100%|██████████| 2387/2387 [00:00<00:00, 1928684.96it/s]
100%|██████████| 266/266 [00:00<00:00, 1417642.78it/s]


In [2]:
# Prepare the pre-training data (json/aligned_csj/noisy_pretrain.json)
# For the pre-training data, different noises are allowed within one speaker so
# that the noise is diverse

import multiprocessing
from preprocess import create_aligned_noisy_pretrain_data_parallel
import json

# Noise metadata JSONs (chime3, musan) loaded from the json/ directory.
with open("json/chime3.json", "r") as f:
    chime3_data_json = json.load(f)
with open("json/musan.json", "r") as f:
    musan_data_json = json.load(f)

noise_data_jsons = [chime3_data_json, musan_data_json]

with open(f"json/aligned_csj/pretrain.json", "r") as f:
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
# Each worker process hands its partial result back through this queue.
queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Split the speaker list into NUM_PROCS contiguous chunks; the last chunk is
    # extended to the end of the list so that no speaker is dropped by rounding.
    start = int(len(all_speakers) / NUM_PROCS * i)
    end = int(len(all_speakers) / NUM_PROCS * (i + 1))
    if i == NUM_PROCS - 1:
        end = len(all_speakers)
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=create_aligned_noisy_pretrain_data_parallel, args=(data_json, speakers, "./datasets/aligned_csj/noisy/pretrain", noise_data_jsons, queue)
    )
    p.start()
    jobs.append(p)

# concat result_jsons in queue
# Exactly one queue item is expected per worker; each item is merged with dict.update.
# The queue is drained before join() because joining a process that still has
# items buffered for a queue can block.
result_json = {}
for i in range(NUM_PROCS):
    result_json.update(queue.get())

for p in jobs:
    p.join()

# Sanity check: the merged result has as many top-level entries as the input.
assert len(result_json) == len(data_json)

with open(f"json/aligned_csj/noisy_pretrain.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

  0%|          | 0/150 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
# Prepare the test data (json/aligned_csj/noisy_adaptation.json)
# For the test data, use the same noise within one speaker as far as possible

import multiprocessing
from preprocess import create_aligned_noisy_adaptation_data_parallel
import json

# Noise metadata JSON (demand) loaded from the json/ directory.
with open("json/demand.json", "r") as f:
    demand_data_json = json.load(f)

noise_data_jsons = [demand_data_json]

with open(f"json/aligned_csj/adaptation.json", "r") as f:
    data_json = json.load(f)

all_speakers = list(data_json.keys())

jobs = []
# Each worker process hands its partial result back through this queue.
queue = multiprocessing.Queue()
NUM_PROCS = 16
for i in range(NUM_PROCS):
    # Split the speaker list into NUM_PROCS contiguous chunks; the last chunk is
    # extended to the end of the list so that no speaker is dropped by rounding.
    start = int(len(all_speakers) / NUM_PROCS * i)
    end = int(len(all_speakers) / NUM_PROCS * (i + 1))
    if i == NUM_PROCS - 1:
        end = len(all_speakers)
    speakers = all_speakers[start:end]
    p = multiprocessing.Process(
        target=create_aligned_noisy_adaptation_data_parallel, args=(data_json, speakers, "./datasets/aligned_csj/noisy/adaptation", noise_data_jsons, queue)
    )
    p.start()
    jobs.append(p)

# concat result_jsons in queue
# Exactly one queue item is expected per worker; each item is merged with dict.update.
# The queue is drained before join() because joining a process that still has
# items buffered for a queue can block.
result_json = {}
for i in range(NUM_PROCS):
    result_json.update(queue.get())

for p in jobs:
    p.join()

# Sanity check: the merged result has as many top-level entries as the input.
assert len(result_json) == len(data_json)

with open(f"json/aligned_csj/noisy_adaptation.json", "w") as f:
    json.dump(result_json, f, indent=4, ensure_ascii=False)

100%|██████████| 1043010/1043010 [00:00<00:00, 1405080.42it/s]
100%|██████████| 115890/115890 [00:00<00:00, 1280695.92it/s]


In [None]:
# VADのサブサンプリング
import json

from tqdm import tqdm


def vad_subsample(vad, kernel_size, stride):
    """Majority-vote downsampling of a VAD label sequence.

    Windows of ``kernel_size`` labels start every ``stride`` positions. A window
    yields 1 when its sum is at least ``len(window) // 2 + 1`` and 0 otherwise.

    Args:
        vad: sequence of numeric labels (sliceable, summable).
        kernel_size: window length.
        stride: distance between consecutive window starts.

    Returns:
        List with one 0/1 int per complete window (empty if none fits).
    """
    n_windows = (len(vad) - kernel_size + stride) // stride
    windows = (vad[k * stride : k * stride + kernel_size] for k in range(n_windows))
    return [int(sum(window) >= len(window) // 2 + 1) for window in windows]

# Add a "subsampled_vad" list to every entry of the noisy pretrain and adaptation
# JSONs and write each result to a new "*_with_subsampled_vad.json" file.
# The loop variable is deliberately not called `type`: at notebook top level that
# would rebind the builtin `type` to a string for every cell run afterwards.
for split_name in ["pretrain", "adaptation"]:

    path = f"./json/aligned_csj/noisy_{split_name}.json"
    result_path = f"./json/aligned_csj/noisy_{split_name}_with_subsampled_vad.json"

    with open(path, "r") as f:
        data_json = json.load(f)

    # NOTE: same dict object under a second name (no copy); the writes below
    # therefore also modify data_json.
    result_json = data_json

    speakers = list(data_json.keys())
    for speaker in tqdm(speakers):
        keys = list(data_json[speaker].keys())
        for key in keys:
            vad = data_json[speaker][key]["vad"]
            # Majority-vote subsampling (kernel_size=3, stride=2) applied twice in a row.
            subsampled_vad = vad_subsample(vad_subsample(vad, 3, 2), 3, 2)
            result_json[speaker][key]["subsampled_vad"] = subsampled_vad

    with open(result_path, "w") as f:
        json.dump(result_json, f, indent=4, ensure_ascii=False)

## Tokenizer

In [8]:
# Create the transcript file
with open("json/noisy_csj_train_nodup_sp.json", "r") as f:
    data_json = json.load(f)

# Collect the "raw_transcript" of every entry, in the order of the JSON keys.
transcripts = []
for key in tqdm(data_json.keys()):
    utt = data_json[key]
    transcript = utt["raw_transcript"]
    transcripts.append(transcript)
# Write one transcript per line; this file is the tokenizer training input.
with open("vocabs/csj_train_nodup_sp.txt", "w") as f:
    for transcript in transcripts:
        f.write(transcript + "\n")

100%|██████████| 1158900/1158900 [00:00<00:00, 1404385.87it/s]


In [6]:
# Create the tokenizer
from tokenizer import SentencePieceTokenizer

# Train a BPE model with 4096 tokens on the CSJ transcript file.
# SentencePieceTokenizer comes from the project-local tokenizer module.
SentencePieceTokenizer.create_model(
    transcription_file_path="vocabs/csj_train_nodup_sp.txt",
    model_prefix="vocabs/csj_train_nodup_sp_4096.bpe",
    num_tokens=4096,
    model_type="bpe",
    character_coverage=0.9995
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=vocabs/csj_train_nodup_sp.txt --model_prefix=vocabs/csj_train_nodup_sp_4096.bpe --vocab_size=4096 --character_coverage=0.9995 --model_type=bpe --control_symbols=<blank> --unk_id=1 --bos_id=2 --eos_id=3 --pad_id=4
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: vocabs/csj_train_nodup_sp.txt
  input_format: 
  model_prefix: vocabs/csj_train_nodup_sp_4096.bpe
  model_type: BPE
  vocab_size: 4096
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  control_symbols: <blank>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  tra

In [7]:
# Create the tokenizer
from tokenizer import SentencePieceTokenizer

# Train a character-level model on the yesno transcript file.
# SentencePieceTokenizer comes from the project-local tokenizer module.
SentencePieceTokenizer.create_model(
    transcription_file_path="vocabs/yesno_train_transcripts.txt",
    model_prefix="vocabs/yesno",
    num_tokens=1000,
    model_type="char",
    character_coverage=1
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=vocabs/yesno_train_transcripts.txt --model_prefix=vocabs/yesno --vocab_size=1000 --character_coverage=1 --model_type=char --control_symbols=<blank> --unk_id=1 --bos_id=2 --eos_id=3 --pad_id=4
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: vocabs/yesno_train_transcripts.txt
  input_format: 
  model_prefix: vocabs/yesno
  model_type: CHAR
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  control_symbols: <blank>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_