## E25 - Spectrogram classification 모델 구현
### 1. 데이터 처리와 분류
#### 라벨 데이터 처리하기

In [1]:
import numpy as np
import os

data_path = os.getenv("HOME")+'/aiffel/speech_recognition/data/speech_wav_8000.npz'
speech_data = np.load(data_path)

In [14]:
speech_data.files

['wav_vals', 'label_vals']

In [17]:
print("Wave data shape:", speech_data["wav_vals"].shape)
print("Label data shape:", speech_data["label_vals"].shape)

Wave data shape: (50620, 8000)
Label data shape: (50620, 1)


In [18]:
target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

label_value = target_list
label_value.append('unknown')
label_value.append('silence')

new_label_value = dict()
for i, l in enumerate(label_value):
    new_label_value[l] = i
label_value = new_label_value

In [19]:
temp = []
for v in speech_data["label_vals"]:
    temp.append(label_value[v[0]])
label_data = np.array(temp)

#### train, test 분리

In [20]:
from sklearn.model_selection import train_test_split

sr = 8000
train_wav, test_wav, train_label, test_label = train_test_split(speech_data["wav_vals"], 
                                                                label_data, 
                                                                test_size=0.1,
                                                                shuffle=True)
print(train_wav)

train_wav = train_wav.reshape([-1, sr, 1]) # add channel for CNN
test_wav = test_wav.reshape([-1, sr, 1])

[[ 3.2947338e-03  4.7987588e-03  1.4206454e-04 ...  2.6489818e-03
   2.1685450e-03 -1.7849552e-03]
 [ 2.4449188e-04  3.1953814e-04  1.6096933e-04 ... -4.3812816e-04
  -6.9981499e-04 -6.5533165e-04]
 [-8.1074348e-04 -2.2321553e-03 -1.3870993e-03 ... -1.9992180e-03
  -1.2605948e-03 -1.5686379e-03]
 ...
 [ 6.5565930e-04  1.8049298e-03  2.3677533e-03 ... -2.5520814e-03
  -2.5446941e-03 -3.6315045e-03]
 [ 3.2935701e-03  4.7346000e-03  8.2463746e-05 ...  2.5994417e-03
   2.0828587e-03 -1.8811021e-03]
 [ 1.6217533e-01  3.3195928e-02 -2.5768925e-03 ... -2.2015899e-01
  -1.0517484e-01 -4.4664241e-02]]


In [21]:
print("train data : ", train_wav.shape)
print("train labels : ", train_label.shape)
print("test data : ", test_wav.shape)
print("test labels : ", test_label.shape)

train data :  (45558, 8000, 1)
train labels :  (45558,)
test data :  (5062, 8000, 1)
test labels :  (5062,)


In [22]:
train_wav[0]

array([[ 0.00329473],
       [ 0.00479876],
       [ 0.00014206],
       ...,
       [ 0.00264898],
       [ 0.00216855],
       [-0.00178496]], dtype=float32)

### 2. 학습을 위한 하이퍼파라미터 설정

In [23]:
batch_size = 128
max_epochs = 10

# the save point
checkpoint_dir = os.getenv('HOME')+'/aiffel/speech_recognition/models/wav'

checkpoint_dir

'/home/aiffel0042/aiffel/speech_recognition/models/wav'

### 3. 데이터셋 구성

In [24]:
def one_hot_label(wav, label):
    label = tf.one_hot(label, depth=12)
    return wav, label

In [25]:
import tensorflow as tf

# for train
train_dataset = tf.data.Dataset.from_tensor_slices((train_wav, train_label))
train_dataset = train_dataset.map(one_hot_label)
train_dataset = train_dataset.repeat().batch(batch_size=batch_size)
print(train_dataset)

# for test
test_dataset = tf.data.Dataset.from_tensor_slices((test_wav, test_label))
test_dataset = test_dataset.map(one_hot_label)
test_dataset = test_dataset.batch(batch_size=batch_size)
print(test_dataset)

<BatchDataset shapes: ((None, 8000, 1), (None, 12)), types: (tf.float32, tf.float32)>
<BatchDataset shapes: ((None, 8000, 1), (None, 12)), types: (tf.float32, tf.float32)>


### 4. 2차원 Spectrogram을 처리하는 모델 구성

In [26]:
import librosa

def wav2spec(wav, fft_size=258): # spectrogram shape을 맞추기위해서 size 변형
    D = np.abs(librosa.stft(wav, n_fft=fft_size))
    return D

In [27]:
spec = wav2spec(data)
print("Waveform shape : ",data.shape)
print("Spectrogram shape : ",spec.shape)

NameError: name 'data' is not defined