# Speech Recognition: CNN for Spoken Language
畳み込みニューラルネットワークによる音声認識  
- https://chsasank.github.io/spoken-language-understanding.html

<img src="SR.png">

要点: 波形の形状のパターンを学習することで、畳み込みによって特徴を抽出する

### Process 手順
- 音声データ（Dataset: UrbanSound8K）
- 波形獲得（.wav）
- 特徴抽出（転移学習: ）
- モデル構築（CNN）

### Terminology 専門用語
- メル周波数ケプストラム係数（MFCC）

## Step 1. Data Preparation

In [24]:
import keras
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D
from keras.models import Sequential
import librosa
import librosa.display
import numpy as np
import pandas as pd
import random

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset metadata CSV (expected in the working directory) into a DataFrame.
sounds = pd.read_csv('UrbanSound8K.csv')
# Preview the first three rows.
sounds.head(3)

In [None]:
# Keep only clips that last at least 3 seconds.
# The duration is end - start; with the operands reversed the difference is
# never >= 3 and every row would be filtered out.
sounds = sounds[sounds["end"] - sounds["start"] >= 3]
# A bare `sounds.set_index("fsID")` call used to sit here; its result was
# never assigned, so it had no effect and has been dropped.
# Keep only the columns used downstream; copy so that columns can be added
# later without writing into a view of the original frame.
sounds = sounds[["slice_file_name", "classID", "fold"]].copy()
sounds.head()

In [None]:
# Show (rows, columns) of the filtered metadata table.
sounds.shape

In [None]:

# Absolute path to a single example clip (machine-specific).
path1 = "/Users/akr712/Desktop/音声認識/UrbanSound8K/audio/fold6/135160-8-0-0.wav"
# Load the audio data, limited to a 3.0 second duration.
y, sr = librosa.load(path1, duration=3.0)
# Compute the mel spectrogram of the clip.
ps = librosa.feature.melspectrogram(y=y, sr=sr)
# Display the spectrogram's shape.
ps.shape

In [25]:
# Plot the example spectrogram with a mel-scaled y-axis and a time x-axis.
librosa.display.specshow(ps, y_axis="mel", x_axis="time", cmap="summer")

'/Users/akr712/Desktop/音声認識'

In [None]:
# children
# NOTE(review): path2 is the same file as path1 in the earlier cell; if a
# different class was meant to be shown here, the path needs updating — confirm.
path2 = "/Users/akr712/Desktop/音声認識/UrbanSound8K/audio/fold6/135160-8-0-0.wav"
# Load the audio (3.0 second duration), compute its mel spectrogram and plot it.
y, sr = librosa.load(path2, duration=3.0)
ps2 = librosa.feature.melspectrogram(y=y, sr=sr)
librosa.display.specshow(ps2, y_axis="mel", x_axis="time", cmap="summer")

In [None]:
# children
# NOTE(review): this cell is identical to the preceding cell, and path2 is the
# same file as path1; presumably a different clip was intended — confirm.
path2 = "/Users/akr712/Desktop/音声認識/UrbanSound8K/audio/fold6/135160-8-0-0.wav"
# Load the audio (3.0 second duration), compute its mel spectrogram and plot it.
y, sr = librosa.load(path2, duration=3.0)
ps2 = librosa.feature.melspectrogram(y=y, sr=sr)
librosa.display.specshow(ps2, y_axis="mel", x_axis="time", cmap="summer")

In [None]:
# Placeholder for human-readable class names (not filled in yet).
labels = ["", ]
# Root directory of the audio files (machine-specific).
base_path = "/Users/akr712/Desktop/音声認識/UrbanSound8K/audio/"
# Build the full path of every clip. The per-fold directories are named
# "fold<N>" (e.g. ".../audio/fold6/135160-8-0-0.wav"), so the literal "fold"
# prefix must precede the fold number; without it the paths do not exist.
sounds["path"] = (
    base_path
    + "fold" + sounds["fold"].astype("str")
    + "/" + sounds["slice_file_name"].astype("str")
)
sounds.head()

![](http://dkopczyk.quantee.co.uk/wp-content/uploads/2018/08/wav-768x132.png)

In [26]:
# Collect one mel spectrogram per clip, keyed by the clip's slice file name.
speech_datas = {}
for row in sounds.itertuples():
    # Access columns by name: positionally, row[0] is the index and row[3] is
    # the "fold" column, not the file path, so loading row[3] cannot work.
    # The 2.97 s duration is presumably chosen so that the spectrogram has
    # 128 time frames with librosa's defaults — TODO confirm.
    y, sr = librosa.load(row.path, duration=2.97)
    ps = librosa.feature.melspectrogram(y=y, sr=sr)
    # Skip clips whose spectrogram is not exactly 128 x 128.
    if ps.shape != (128, 128):
        continue
    speech_datas[row.slice_file_name] = ps

'/Users/akr712/Desktop/音声認識'

In [None]:
# Number of clips that produced a 128 x 128 spectrogram.
len(speech_datas)

In [None]:
# Shuffle the dataset so that the classes are mixed.
from random import shuffle

# Shuffle (key, spectrogram) pairs together. Shuffling only the values and
# re-zipping them with the unshuffled keys would attach each spectrogram to
# the wrong key and destroy the feature/label correspondence.
speech_items = list(speech_datas.items())
shuffle(speech_items)
# dict preserves insertion order, so iteration now follows the shuffled order.
speech_datas = dict(speech_items)
# Kept so that any cell still referring to this name continues to work.
melspectrograms = list(speech_datas.values())

### Reshape Image Data into 3D for 2D CNN 

In [None]:
# Reshape each spectrogram to (128, 128, 1) and build the train / test sets.

from keras.utils import to_categorical

# speech_datas is keyed by slice file name, which cannot be one-hot encoded;
# look the class ID up from the metadata table instead.
class_ids = dict(zip(sounds["slice_file_name"], sounds["classID"]))

x_train = []
y_train = []
x_test = []
y_test = []  # was a duplicated `x_test = []`, which left y_test undefined
counter = 0

for file_name, ps in speech_datas.items():
    # One-hot encode the class ID over the 10 classes.
    label = to_categorical(class_ids[file_name], 10)
    # Add a trailing channel axis for the 2D CNN.
    ps = ps.reshape((128, 128, 1))

    # The first 7000 samples are used for training, the remainder for testing.
    if counter < 7000:
        x_train.append(ps)
        y_train.append(label)
    else:
        x_test.append(ps)
        y_test.append(label)
    counter += 1

# Stack into single arrays: Keras interprets a plain Python list of arrays as
# one array per model input rather than as a batch.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

print(len(x_train), len(x_test))

## Step 2. Build CNN Model
### CNN: Convolutional Neural Networks
The basic architecture of CNN includes:
1. Convolutional Layer – uses a convolution operator to filter the input signal and extract additional image features
2. Activation Function – applies a non-linear function, such as a rectifier (ReLU), to the outputs of the convolutional layer
3. Pooling Layer – performs a downsampling operation that reduces the size of its input using a max() or sum() operation
4. Fully-Connected Layer – each neuron in the previous layer is connected to every neuron in the next layer; the last such layer produces the outputs of the neural network.

![](https://bookdown.org/wshuyi/dive-into-data-science-practically/assets/2018-06-27-11-12-04-076004.png)

In [8]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization,Conv2D, MaxPooling2D

In [None]:
# Modeling: three convolutional blocks followed by a dense classifier head,
# built with the Keras functional API.
features_shape = (128, 128, 1)

inputs = Input(shape=features_shape)

# Block 1
# The input shape is already fixed by `inputs`, so no input_shape argument is
# passed to the layer (it is redundant in the functional API).
o = Conv2D(24, (5, 5), strides=(1, 1))(inputs)
o = MaxPooling2D(pool_size=(4, 2), strides=(4, 2))(o)
# o = Activation('relu')(o)  # currently disabled
o = BatchNormalization()(o)

# Block 2
o = Conv2D(48, (5, 5), padding="valid")(o)
o = MaxPooling2D((4, 2), strides=(4, 2))(o)
# o = Activation('relu')(o)  # currently disabled
o = BatchNormalization()(o)

# Block 3
o = Conv2D(48, (5, 5), padding="valid")(o)
o = Activation("relu")(o)

# Flatten the feature maps into a vector for the dense layers.
o = Flatten()(o)
# o = Dropout(rate=0.5)(o)  # currently disabled

# Dense layer
o = Dense(64, activation="relu")(o)
o = BatchNormalization()(o)
o = Dropout(rate=0.5)(o)

# Predictions: softmax over the 10 classes.
outputs = Dense(10, activation="softmax")(o)

model = Model(inputs, outputs)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (labels are
# one-hot encoded), and accuracy as the reported metric.
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])

## Step 3. Training CNN for Classification

In [None]:
# Train for 12 epochs with batches of 128, evaluating on the held-out test
# split after each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(x_train, y_train, 
                    epochs=12, batch_size=128,
                    validation_data=(x_test, y_test))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss values recorded during training.
training_loss = history.history['loss']
test_loss = history.history['val_loss']

# Epochs are numbered from 1 on the x-axis.
epoch_count = range(1, 1 + len(training_loss))

# Training loss as a dashed red line, test loss as a solid blue line.
plt.plot(epoch_count, training_loss, 'r--', label='Training Loss')
plt.plot(epoch_count, test_loss, 'b-', label='Test Loss')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
# The history key for the accuracy metric depends on the Keras version:
# older releases record 'acc' / 'val_acc', newer ones 'accuracy' /
# 'val_accuracy'. Pick whichever is present instead of hard-coding one.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
training_acc = history.history[acc_key]
test_acc = history.history['val_' + acc_key]

# Epochs are numbered from 1 on the x-axis.
epoch_count = range(1, len(training_acc) + 1)

# Training accuracy as a dashed red line, test accuracy as a solid blue line.
plt.plot(epoch_count, training_acc, 'r--')
plt.plot(epoch_count, test_acc, 'b-')
plt.legend(['Training Accuracy', 'Test Accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Evaluate on the held-out test split; score[0] is the loss and score[1] the
# accuracy metric configured in model.compile.
score = model.evaluate(x_test, y_test)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

#### 今後: Data Augmentation を使った場合に分類の精度が向上するか試したい