# 语音指令识别

这部分是训练使用的代码，包括8个部分：

- 1、数据读取
- 2、数据增强
- 3、提取mfcc特征
- 4、提取logmelspectrum特征
- 5、vgg训练
- 6、resnet训练
- 7、densenet训练
- 8、集成三个网络模型

In [1]:
import warnings
warnings.filterwarnings("ignore")

# 通用
import numpy as np
import pandas 
import matplotlib.pyplot as plt
import os

# 语音处理
import librosa
import librosa.display
import IPython.display as ipd
import webrtcvad

from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from tools import get_data, preprocess_mel, preprocess_mfcc

from vggnet19 import vgg_net
from resnet50 import ResNet50
from densenet import DenseNet
from warmup_cosdecay import WarmUpCosineDecayScheduler

## 1. 数据读取

In [2]:
# Load the train split, then the test split. Later cells use each result as
# (list of per-class clip lists, list of class names), where every clip is a
# [sample, sr] pair.
train_split = get_data('dataset/train/')
raw_train_data, train_label_names = train_split

test_split = get_data('dataset/test/')
raw_test_data, test_label_names = test_split

## 2. 数据增强

In [3]:
# 包括增加噪声、time shift、time stretch、pitch shift

def add_noise(x, w=0.006):
    """Return ``x`` with additive Gaussian noise.

    Args:
        x: Audio samples (array-like, any shape).
        w: Scale applied to the standard-normal noise; ``0`` leaves the
            values of ``x`` unchanged.

    Returns:
        ``x + w * noise`` where ``noise`` is drawn from N(0, 1) with the
        same shape as ``x``.
    """
    # Size the noise by the full shape of x rather than len(x): identical for
    # 1-D signals, and multi-channel input no longer fails on broadcasting.
    noise = np.random.normal(loc=0, scale=1, size=np.shape(x))
    return x + w * noise

def time_shift(x, shift):
    """Circularly rotate ``x`` by ``shift`` positions.

    ``shift`` may be a float; it is truncated toward zero with ``int``
    before being handed to ``np.roll``. Elements rolled off one end
    reappear at the other.
    """
    offset = int(shift)
    rolled = np.roll(x, offset)
    return rolled

def time_stretch(x, rate):
    """Time-stretch ``x`` by ``rate`` via ``librosa.effects.time_stretch``.

    Args:
        x: Audio time series.
        rate: Stretch factor forwarded to librosa (per the librosa docs,
            ``rate > 1`` speeds the signal up and ``rate < 1`` slows it down).

    Returns:
        The stretched audio time series returned by librosa.
    """
    # `rate` is keyword-only in librosa >= 0.10; passing it by keyword works
    # on both old and new librosa releases.
    return librosa.effects.time_stretch(x, rate=rate)

def pitch_shift(x, sr, n_steps, bins_per_octave):
    """Pitch-shift ``x`` via ``librosa.effects.pitch_shift``.

    Args:
        x: Audio time series.
        sr: Sampling rate of ``x``.
        n_steps: Number of steps to shift by (negative shifts down).
        bins_per_octave: How many steps make up one octave.

    Returns:
        The pitch-shifted audio time series returned by librosa.
    """
    # `sr` and `n_steps` are keyword-only in librosa >= 0.10; keywords work
    # on both old and new librosa releases.
    return librosa.effects.pitch_shift(
        x,
        sr=sr,
        n_steps=n_steps,
        bins_per_octave=bins_per_octave,
    )

In [4]:
# Produce four augmented variants of every training clip. The result keeps
# the same per-class grouping and class order as raw_train_data.
new_train_datas = []
for class_clips in raw_train_data:
    augmented = []
    for sample, sr in class_clips:
        noisy = add_noise(sample)
        shifted = time_shift(sample, np.random.uniform(2000, 4000))
        stretched = time_stretch(sample, np.random.uniform(1.5, 2.5))

        # Coin flip: shift the pitch up or down by 6 steps at 12 steps per
        # octave, i.e. half an octave either way.
        n_steps = 6 if np.random.randint(0, 2) == 0 else -6
        pitched = pitch_shift(sample, sr, n_steps, 12)

        for variant in (noisy, shifted, stretched, pitched):
            augmented.append([variant, sr])

    new_train_datas.append(augmented)

In [5]:
# Add the augmented clips to the training set, class by class. This extends
# the lists in place, so re-running the cell appends the clips again.
for class_index, class_clips in enumerate(raw_train_data):
    class_clips.extend(new_train_datas[class_index])

## 3. Delta2mfcc

In [6]:
# Word -> class id. 'go', 'off' and 'on' all map to 0, so the six words
# collapse into four distinct class ids.
label_dit = {
    'go': 0,
    'left': 1,
    'off': 0,
    'on': 0,
    'right': 2,
    'stop': 3,
}

# Run preprocess_mfcc on every training clip and emit one int32 label per clip.
delta_mfcc_train_data = []
delta_mfcc_train_labels = []
for i, class_clips in enumerate(raw_train_data):
    delta_mfcc_train_data.extend([preprocess_mfcc(clip) for clip in class_clips])
    class_id = label_dit[train_label_names[i]]
    delta_mfcc_train_labels.extend(np.full(len(class_clips), class_id, dtype=np.int32))

# Shuffle features and labels in unison. This matters: the lists are built
# class by class, and Keras' validation_split (used by vgg.fit) takes the
# last fraction of the arrays before any shuffling.
delta_mfcc_train_data, delta_mfcc_train_labels = shuffle(
    delta_mfcc_train_data,
    delta_mfcc_train_labels,
)

# Same feature/label construction for the test split; it is left unshuffled.
delta_mfcc_test_data = []
delta_mfcc_test_labels = []
for i, class_clips in enumerate(raw_test_data):
    delta_mfcc_test_data.extend([preprocess_mfcc(clip) for clip in class_clips])
    class_id = label_dit[test_label_names[i]]
    delta_mfcc_test_labels.extend(np.full(len(class_clips), class_id, dtype=np.int32))

In [7]:
# Convert the Python lists of features and labels into ndarrays.
delta_mfcc_train_data, delta_mfcc_train_labels = (
    np.array(delta_mfcc_train_data),
    np.array(delta_mfcc_train_labels),
)
delta_mfcc_test_data, delta_mfcc_test_labels = (
    np.array(delta_mfcc_test_data),
    np.array(delta_mfcc_test_labels),
)

# Cell output: shapes of (train data, train labels, test data, test labels).
tuple(
    arr.shape
    for arr in (
        delta_mfcc_train_data,
        delta_mfcc_train_labels,
        delta_mfcc_test_data,
        delta_mfcc_test_labels,
    )
)

((11960, 40, 32, 1), (11960,), (289, 40, 32, 1), (289,))

## 4. logmelspectrum

In [8]:
# Word -> class id. 'go', 'off' and 'on' all map to 0, so the six words
# collapse into four distinct class ids.
label_dit = {
    'go': 0,
    'left': 1,
    'off': 0,
    'on': 0,
    'right': 2,
    'stop': 3,
}

# Run preprocess_mel on every training clip and emit one int32 label per clip.
mel_train_data = []
mel_train_labels = []
for i, class_clips in enumerate(raw_train_data):
    mel_train_data.extend([preprocess_mel(clip) for clip in class_clips])
    class_id = label_dit[train_label_names[i]]
    mel_train_labels.extend(np.full(len(class_clips), class_id, dtype=np.int32))

# Shuffle features and labels in unison. This matters: the lists are built
# class by class, and Keras' validation_split (used by the fit calls) takes
# the last fraction of the arrays before any shuffling.
mel_train_data, mel_train_labels = shuffle(
    mel_train_data,
    mel_train_labels,
)

# Same feature/label construction for the test split; it is left unshuffled.
mel_test_data = []
mel_test_labels = []
for i, class_clips in enumerate(raw_test_data):
    mel_test_data.extend([preprocess_mel(clip) for clip in class_clips])
    class_id = label_dit[test_label_names[i]]
    mel_test_labels.extend(np.full(len(class_clips), class_id, dtype=np.int32))

In [9]:
# Convert the Python lists of features and labels into ndarrays.
mel_train_data, mel_train_labels = (
    np.array(mel_train_data),
    np.array(mel_train_labels),
)
mel_test_data, mel_test_labels = (
    np.array(mel_test_data),
    np.array(mel_test_labels),
)

# Cell output: shapes of (train data, train labels, test data, test labels).
tuple(
    arr.shape
    for arr in (
        mel_train_data,
        mel_train_labels,
        mel_test_data,
        mel_test_labels,
    )
)

((11960, 120, 32, 1), (11960,), (289, 120, 32, 1), (289,))

## 5. VGG19

In [10]:
# (40, 32, 1) is the per-sample shape of the delta-MFCC arrays; the second
# argument is presumably the number of output classes (label_dit has four
# distinct ids) -- confirm against vgg_net's signature.
mfcc_input_shape = (40, 32, 1)
vgg = vgg_net(mfcc_input_shape, 4)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor



In [11]:
# Learning-rate schedule callback, configured by keyword.
# NOTE(review): with 11960 samples, validation_split=0.2 and batch_size=32 a
# full epoch is about 299 steps, so total_steps=2000 is reached after ~7 of
# the 50 requested epochs; what the scheduler does past total_steps depends
# on WarmUpCosineDecayScheduler -- confirm.
schedule_config = dict(
    learning_rate_base=0.01,
    total_steps=2000,
    warmup_learning_rate=4e-06,
    warmup_steps=100,
    hold_base_rate_steps=0,
)
warm_up_lr = WarmUpCosineDecayScheduler(**schedule_config)

optimizer = keras.optimizers.Adam()

# Stop once val_loss has not improved for 3 consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
callbacks = [early_stopping, warm_up_lr]

# Labels are integer class ids, hence the sparse categorical loss.
vgg.compile(
    optimizer=optimizer,
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)

In [19]:
# Train VGG on the delta-MFCC features; 20% of the training arrays are held
# out for validation, which drives the EarlyStopping callback.
vgg.fit(
    delta_mfcc_train_data,
    delta_mfcc_train_labels,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
)

In [40]:
# vgg.save_weights('vgg_net_weight/vgg19_net')

In [14]:
# Restore VGG weights from the checkpoint prefix named in the commented-out
# save_weights call; the evaluation that follows uses these weights.
vgg.load_weights('vgg_net_weight/vgg19_net')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f5374f4eb70>

In [15]:
# Score VGG on the test split; given the compile settings (loss plus the
# 'accuracy' metric) the displayed result is [loss, accuracy].
vgg.evaluate(delta_mfcc_test_data, delta_mfcc_test_labels)



[0.5885787816608653, 0.8546713]

## 6. ResNet50

In [16]:
# (120, 32, 1) is the per-sample shape of the log-mel arrays; four classes
# matches the number of distinct ids in label_dit.
mel_input_shape = (120, 32, 1)
resnet = ResNet50(input_shape=mel_input_shape, classes=4)

In [17]:
# Learning-rate schedule callback, configured by keyword.
# NOTE(review): with 11960 samples, validation_split=0.2 and batch_size=32 a
# full epoch is about 299 steps, so total_steps=2000 is reached after ~7 of
# the 50 requested epochs; what the scheduler does past total_steps depends
# on WarmUpCosineDecayScheduler -- confirm.
schedule_config = dict(
    learning_rate_base=0.01,
    total_steps=2000,
    warmup_learning_rate=4e-06,
    warmup_steps=200,
    hold_base_rate_steps=0,
)
warm_up_lr = WarmUpCosineDecayScheduler(**schedule_config)

optimizer = keras.optimizers.Adam()

# Stop once val_loss has not improved for 3 consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
callbacks = [early_stopping, warm_up_lr]

# Labels are integer class ids, hence the sparse categorical loss.
resnet.compile(
    optimizer=optimizer,
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)

In [18]:
# Train ResNet on the log-mel features; 20% of the training arrays are held
# out for validation, which drives the EarlyStopping callback.
resnet.fit(
    mel_train_data,
    mel_train_labels,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
)

In [28]:
# resnet.save_weights('resnet_weight/resnet')

In [20]:
# Restore ResNet weights from the checkpoint prefix named in the commented-out
# save_weights call; the evaluation that follows uses these weights.
resnet.load_weights('resnet_weight/resnet')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f52a7e5de80>

In [21]:
# Score ResNet on the test split; given the compile settings (loss plus the
# 'accuracy' metric) the displayed result is [loss, accuracy].
resnet.evaluate(mel_test_data, mel_test_labels)



[0.4048273695252552, 0.8685121]

## 7. DenseNet

In [22]:
# (120, 32, 1) is the per-sample shape of the log-mel arrays; the leading 4
# is presumably the number of output classes -- confirm against DenseNet's
# signature.
mel_input_shape = (120, 32, 1)
densenet_mel = DenseNet(4, mel_input_shape, dropout_rate=0.4)

In [23]:
# Learning-rate schedule callback, configured by keyword.
# NOTE(review): with 11960 samples, validation_split=0.2 and batch_size=32 a
# full epoch is about 299 steps, so total_steps=2000 is reached after ~7 of
# the 50 requested epochs; what the scheduler does past total_steps depends
# on WarmUpCosineDecayScheduler -- confirm.
schedule_config = dict(
    learning_rate_base=0.01,
    total_steps=2000,
    warmup_learning_rate=4e-06,
    warmup_steps=200,
    hold_base_rate_steps=0,
)
warm_up_lr = WarmUpCosineDecayScheduler(**schedule_config)

optimizer = keras.optimizers.Adam()

# Stop once val_loss has not improved for 3 consecutive epochs.
early_stopping = keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')
callbacks = [early_stopping, warm_up_lr]

# Labels are integer class ids, hence the sparse categorical loss.
densenet_mel.compile(
    optimizer=optimizer,
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)

In [24]:
# Train DenseNet on the log-mel features; 20% of the training arrays are held
# out for validation, which drives the EarlyStopping callback.
densenet_mel.fit(
    mel_train_data,
    mel_train_labels,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
)

In [25]:
# densenet_mel.save_weights('densenet/densenet')

In [26]:
# Restore DenseNet weights from the checkpoint prefix named in the
# commented-out save_weights call; the evaluation that follows uses these weights.
densenet_mel.load_weights('densenet/densenet')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f529c88fac8>

In [27]:
# Score DenseNet on the test split; given the compile settings (loss plus the
# 'accuracy' metric) the displayed result is [loss, accuracy].
densenet_mel.evaluate(mel_test_data, mel_test_labels)



[0.5699630165698206, 0.8512111]

## 8. 集成

In [29]:
# Check that both feature pipelines produced the same test labels in the same
# order; the ensemble scores all three models against mel_test_labels.
# np.array_equal also fails cleanly (returns False) when the shapes differ,
# instead of erroring inside an element-wise comparison.
assert np.array_equal(delta_mfcc_test_labels, mel_test_labels), (
    "delta-MFCC and log-mel test labels differ"
)

In [30]:
# Ensemble: add the three models' prediction arrays element-wise, take the
# argmax along axis 1 as the predicted class, and report the fraction of test
# samples where it equals the true label.
ensemble_scores = (
    vgg.predict(delta_mfcc_test_data)
    + resnet.predict(mel_test_data)
    + densenet_mel.predict(mel_test_data)
)
ensemble_pred = ensemble_scores.argmax(axis=1)
np.mean(ensemble_pred == mel_test_labels)

0.9065743944636678