## Keyword spotting with triplet loss

In [1]:
from tensorflow.python.client import device_lib 
device_lib.list_local_devices()

2023-02-23 11:03:14.462535: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-23 11:03:16.085733: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 9336812770462961031
 xla_global_id: -1]

In [2]:
DEV = 'cuda' # select your device 'cpu' or 'cuda'

### Import Library

In [3]:
import torch
import torch.nn as nn
import torchaudio
from tqdm import tqdm
import pandas as pd
from pathlib import Path
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import IPython.display as ipd
from datetime import datetime
import os.path
import re
import hashlib

In [4]:
# !tar -zxvf google_speech_recognition_v1.tar.gz

## Dataset class
#### dataset으로는 google_speech_recognition_v1을 이용


In [5]:
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M

def which_set(filename, validation_percentage=10, testing_percentage=10):
  base_name = os.path.basename(filename)

  hash_name = re.sub(r'_nohash_.*$', '', base_name)

  hash_name_hashed = hashlib.sha1(hash_name.encode("utf-8")).hexdigest()
  percentage_hash = ((int(hash_name_hashed, 16) %
                      (MAX_NUM_WAVS_PER_CLASS + 1)) *
                     (100.0 / MAX_NUM_WAVS_PER_CLASS))
  if percentage_hash < validation_percentage:
    result = 'validation'
  elif percentage_hash < (testing_percentage + validation_percentage):
    result = 'testing'
  else:
    result = 'training'
  return result

In [6]:
which_set('0a7c2a8d_nohash_0.wav')

'training'

In [7]:
class MTATDataset:
  def __init__(self, dir_path, data_set = 'core', split='training', num_max_data=4000, sr=16000):
    self.dir = Path(dir_path)
        
    if data_set == 'core':
      sub_dir_ids = ["yes", "no", "up", "down", "left","right", "on", "off", "stop", "go","zero", "one", "two", "three", "four","five", "six", "seven", "eight", "nine"]

    if data_set == 'addition':
      sub_dir_ids = ['tree', 'bed', 'one', 'four', 'go', 'off', '_background_noise_', 'right', 'six', 'happy', 'sheila', 'five', 'down', 'cat', 'three', 'no', 'left', 'two', 'bird', 'on', 'marvin', 'zero', 'nine', 'dog', 'stop', 'house', 'seven', 'up', 'eight', 'wow', 'yes']
    
    self.file_dict = {}
    
    for sub in sub_dir_ids:
      path = self.dir/sub
      
      file_list = list(path.glob('**/*.wav'))
      file_list = [x for x in file_list if which_set(x)==split]
      
      # print(len(file_list))
      self.file_dict[sub] = file_list
    
    self.labels = sub_dir_ids
    self.sr = sr

  def __len__(self):
    return len(self.labels)
  

MTAT_DIR = Path('google_speech_recognition_v1/')

In [8]:
base_set = MTATDataset(MTAT_DIR)

In [17]:
# Dataset 구현 확인
sub_dir_ids = ["yes", "no", "up", "down", "left","right", "on", "off", "stop", "go","zero", "one", "two", "three", "four","five", "six", "seven", "eight", "nine"]
file_dict = {}
for sub in sub_dir_ids:
    path = Path('./google_speech_recognition_v1/')
    path = path/sub
    print("------------file_dir =", path,"--------------")
    file_list = list(path.glob('**/*.wav'))
    
    split_file_list = [x for x in file_list if which_set(x)=='testing']
    file_list = split_file_list
    print(len(file_list))
    file_dict[sub] = file_list

temp_file_path = file_dict['yes'][30]
print(temp_file_path)
wav_file = Path(temp_file_path)
waveform, sample_rate = torchaudio.load(wav_file, format='wav')
ipd.display(ipd.Audio(waveform, rate=16000))

------------file_dir = google_speech_recognition_v1/yes --------------
256
------------file_dir = google_speech_recognition_v1/no --------------
252
------------file_dir = google_speech_recognition_v1/up --------------
272
------------file_dir = google_speech_recognition_v1/down --------------
253
------------file_dir = google_speech_recognition_v1/left --------------
267
------------file_dir = google_speech_recognition_v1/right --------------
259
------------file_dir = google_speech_recognition_v1/on --------------
246
------------file_dir = google_speech_recognition_v1/off --------------
262
------------file_dir = google_speech_recognition_v1/stop --------------
249
------------file_dir = google_speech_recognition_v1/go --------------
251
------------file_dir = google_speech_recognition_v1/zero --------------
250
------------file_dir = google_speech_recognition_v1/one --------------
248
------------file_dir = google_speech_recognition_v1/two --------------
264
------------file_dir = 

In [28]:
class OnMemoryDataset(MTATDataset):
  def __init__(self, dir_path, data_set = 'core', split='training', num_max_data=4000, sr=16000):
    super().__init__(dir_path, data_set, split, num_max_data, sr)
    
    self.loaded_audios = self.load_audio()
    
  def load_audio(self):
    total_audio_datas = []

    for keys, values in self.file_dict.items():
      for i in values:
        waveform, sample_rate = torchaudio.load(i, format='wav')
        resampled_waveform = torchaudio.functional.resample(waveform[0], sample_rate, self.sr)
        total_audio_datas.append(resampled_waveform)

    return total_audio_datas

  def __getitem__(self, idx):

    return self.loaded_audios[idx], self.label_tensor[idx]

dummy_set = OnMemoryDataset(MTAT_DIR, split='train', num_max_data=50)
audio, label = dummy_set[10]
assert audio.ndim == 1, "Number of dimensions of audio tensor has to be 1. Use audio[0] or audio.mean(dim=0) to reduce it"
ipd.display(ipd.Audio(audio, rate=dummy_set.sr))
print(dummy_set.vocab[torch.where(label)])

IndexError: list index out of range

In [2]:
!nvidia-smi -q



Timestamp                                 : Thu Feb 23 14:41:41 2023
Driver Version                            : 528.49
CUDA Version                              : 12.0

Attached GPUs                             : 1
GPU 00000000:2B:00.0
    Product Name                          : NVIDIA GeForce RTX 3060
    Product Brand                         : GeForce
    Product Architecture                  : Ampere
    Display Mode                          : Enabled
    Display Active                        : Enabled
    Persistence Mode                      : Enabled
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : WDDM
        Pending                           : WDDM
    Serial Number                         : N/A
    GPU UUID                              : GPU-30e81568-116

In [16]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0


In [18]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.3953, 0.2537, 0.8155],
        [0.1708, 0.4165, 0.2770],
        [0.9159, 0.1907, 0.8065],
        [0.9414, 0.2669, 0.0441],
        [0.7146, 0.3097, 0.8394]])


In [17]:
import torch
torch.cuda.is_available()

False

In [10]:
torch.cuda.is_available()

False

In [27]:
values

NameError: name 'values' is not defined

In [25]:
base_set.dir
for key, value in base_set.file_dict.items():
    print(key)
    for i in value:
        waveform, sample_rate = torchaudio.load(i, format='wav')
        print('check')

yes
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
