[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rbg-research/AI-Training/blob/main/voice-analytics/voice-analytics-deep-learning/session-1/Tutorial-4.ipynb)

# Audio Classification - Common Voice Corpus - Gender Prediction

In [None]:
!pip3 install datasets

In [None]:
!pip3 install librosa

In [None]:
!pip3 install matplotlib

In [None]:
!pip3 install pandas

In [None]:
!pip3 install scikit-learn

In [None]:
!pip3 install tensorflow

In [1]:
import random
# Seeds only Python's built-in `random` module.
# NOTE(review): NumPy and TensorFlow keep their own random state and are not
# seeded by this call - confirm whether they should be seeded as well so that
# model initialisation and training are repeatable.
random.seed(7)

### 1. Data Preparation

In [2]:
import datasets
import pandas as pd

In [3]:
# Load the Tamil ('ta') configuration of the Common Voice corpus.
dataset = datasets.load_dataset('common_voice', 'ta')

# Keep direct handles on the train and test splits.
train, test = dataset["train"], dataset["test"]


Reusing dataset common_voice (/home/ubuntu/.cache/huggingface/datasets/common_voice/ta/6.1.0/078d412587e9efeb0ae2e574da99c31e18844c496008d53dc5c60f4159ed639b)


  0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
# Gender values that cannot be used as a binary male/female target:
# the empty string (no gender recorded) and the "other" category.
# The original train filter compared against "others", which never matches the
# "other" value the test filter removes, so both splits now share one list.
EXCLUDED_GENDER_LABELS = ("", "other")


def _gender_frame(split):
    """Return a DataFrame of (path, label) rows for `split` with unusable labels removed.

    `split` is one split of the loaded dataset; its "path" and "gender"
    columns become the "path" and "label" columns of the frame. Rows whose
    label is listed in EXCLUDED_GENDER_LABELS are dropped.
    """
    # making a dataframe for faster processing
    frame = pd.DataFrame.from_dict({"path": split["path"], "label": split["gender"]})
    return frame[~frame["label"].isin(EXCLUDED_GENDER_LABELS)]


# audio files and ground truth labels of the train corpus
train_df = _gender_frame(dataset["train"])
train_files, train_labels = list(train_df["path"]), list(train_df["label"])

# same steps for the test corpus
test_df = _gender_frame(dataset["test"])
test_files, test_labels = list(test_df["path"]), list(test_df["label"])


In [5]:
# number of training samples available for each class
for gender in ("male", "female"):
    print(f"number of train samples per class '{gender}':", train_labels.count(gender))

number of train samples per class 'male': 885
number of train samples per class 'female': 770


In [6]:
# number of test samples available for each class
for gender in ("male", "female"):
    print(f"number of test samples per class '{gender}':", test_labels.count(gender))

number of test samples per class 'male': 1073
number of test samples per class 'female': 166


### 2. Feature Extraction

In [7]:
import librosa
import numpy as np

In [8]:
def feature_chromagram(waveform, sample_rate):
    """Return the chromagram of `waveform`, averaged into a 1-D feature array."""
    # The STFT is computed explicitly here and its magnitude handed to
    # chroma_stft through the `S` argument.
    magnitude = np.abs(librosa.stft(waveform))
    chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
    # Transpose, then average along axis 0, collapsing the chromagram matrix
    # into a single feature array.
    return np.mean(chroma_frames.T, axis=0)

def feature_melspectrogram(waveform, sample_rate):
    """Return the 128-band mel spectrogram of `waveform`, averaged into a 1-D feature array."""
    # 8 kHz is used as the upper frequency bound, which should be enough for
    # most speech classification tasks.
    mel_frames = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=128,
        fmax=8000,
    )
    # Transpose, then average along axis 0 to obtain one value per mel band.
    return np.mean(mel_frames.T, axis=0)

def feature_mfcc(waveform, sample_rate):
    """Return 40 MFCCs of `waveform`, averaged into a 1-D feature array."""
    # n_mfcc=40 requests 40 coefficients.
    mfcc_frames = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40)
    # Transpose, then average along axis 0 to obtain one value per coefficient.
    return np.mean(mfcc_frames.T, axis=0)

In [9]:
def get_features(file, sample_rate=16000):
    """Load one sound file and return its concatenated feature vector.

    Parameters
    ----------
    file : path of the audio file, passed straight to `librosa.load`.
    sample_rate : sampling rate requested from `librosa.load` (its `sr`
        argument). Defaults to 16000, the value this function previously
        hard-coded.

    Returns
    -------
    1-D NumPy array: the chromagram, mel-spectrogram and MFCC feature arrays
    stacked horizontally, in that order.
    """
    # load an individual soundfile; the rate returned by the loader is the one
    # handed to the feature extractors
    waveform, sample_rate = librosa.load(file, sr=sample_rate)

    # compute features of soundfile
    chromagram = feature_chromagram(waveform, sample_rate)
    melspectrogram = feature_melspectrogram(waveform, sample_rate)
    mfc_coefficients = feature_mfcc(waveform, sample_rate)

    # use np.hstack to stack the feature arrays horizontally into one vector
    return np.hstack((chromagram, melspectrogram, mfc_coefficients))

In [10]:
def get_feature_matrix(files, labels):
    """Extract features for each file and return (features, labels) as NumPy arrays.

    `files` and `labels` are paired with `zip`, so only as many entries as the
    shorter of the two are used.
    """
    pairs = list(zip(files, labels))
    feature_rows = [get_features(path) for path, _ in pairs]
    paired_labels = [label for _, label in pairs]
    return np.array(feature_rows), np.array(paired_labels)

In [11]:
# Run feature extraction over every file of both corpora.
# Note: `train_labels` / `test_labels` are rebound here from Python lists to the
# NumPy arrays returned by `get_feature_matrix`.
train_features, train_labels = get_feature_matrix(train_files, train_labels)
test_features, test_labels = get_feature_matrix(test_files, test_labels)























































































































































In [12]:
train_features.shape

(1655, 180)

In [13]:
test_features.shape

(1239, 180)

### 3. Train, Validation and Test

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [15]:
# Fit a single encoder on the labels of both corpora so that train and test
# share one label -> integer mapping.
encoder = LabelEncoder()
all_labels = train_labels.tolist() + test_labels.tolist()
encoder.fit(all_labels)

encoded_train_labels, encoded_test_labels = (
    encoder.transform(train_labels),
    encoder.transform(test_labels),
)

In [16]:

# Hold out 20% of the training features/labels, with a fixed random_state.
# NOTE(review): despite their names, X_test / y_test are carved out of the
# training corpus and are passed as `validation_data` to `model.fit`; the
# separate test corpus lives in `test_features` / `encoded_test_labels`.
X_train, X_test, y_train, y_test = train_test_split(
    train_features, 
    encoded_train_labels, 
    test_size=0.2, 
    random_state=69
)


In [17]:
# No-op self-assignments: the test corpus features and encoded labels are used
# exactly as computed above, without any further splitting.
test_features = test_features
encoded_test_labels = encoded_test_labels

### 4. Neural Network Classifier

In [18]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder

##### Baseline Network

In [19]:
# baseline model
def create_baseline(input_dim=None):
    """Build and compile the single-hidden-layer baseline classifier.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        `train_features` matrix is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    A compiled `Sequential` model: Dense(60, relu) -> Dense(1, sigmoid), with
    binary cross-entropy loss, the adam optimizer and the accuracy metric.
    """
    if input_dim is None:
        input_dim = train_features.shape[1]

    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=input_dim, activation='relu'))  # the only hidden layer
    model.add(Dense(1, activation='sigmoid'))  # single sigmoid output unit
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [20]:
model = create_baseline()

2021-10-25 01:53:17.021225: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-10-25 01:53:17.079107: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-25 01:53:17.079676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:00:1e.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-10-25 01:53:17.079920: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-10-25 01:53:17.081841: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-10-25 01:53:17.083010: I tensorflow/stream_executor/platform/default/d

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 60)                10860     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 61        
Total params: 10,921
Trainable params: 10,921
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Train for up to 100 epochs in mini-batches of 4; X_test / y_test (the split
# held out from the training corpus) are used as validation data each epoch.
model.fit(X_train, y_train, batch_size=4, epochs=100, verbose=1, validation_data=(X_test, y_test))

Epoch 1/100


2021-10-25 01:53:18.237553: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<tensorflow.python.keras.callbacks.History at 0x7fae4043c8b0>

In [23]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# `predict_classes` is deprecated; for a sigmoid output the replacement
# recommended by TensorFlow is to threshold `predict` at 0.5. `ravel` turns the
# column of predictions into a flat vector matching `y_true`.
y_pred = (model.predict(test_features) > 0.5).astype("int32").ravel()
y_true = encoded_test_labels
# Metrics are computed on the separate test corpus (not on X_test / y_test,
# which is the validation split of the training corpus).
print(f'Test Set Accuracy score =  {100*accuracy_score(y_true, y_pred):.3f}%')
print(f'Test Set Precision score =  {100*precision_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set Recall score =  {100*recall_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set F-score score =  {100*f1_score(y_true, y_pred, average="macro"):.3f}%')

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Test Set Accuracy score =  68.765%
Test Set Precision score =  47.700%
Test Set Recall score =  46.576%
Test Set F-score score =  46.6%


##### Simple U_network

In [24]:
# deeper fully connected model
def create_3_layer(input_dim=None):
    """Build and compile the deeper fully connected classifier.

    The function name is kept for existing callers; the network actually has
    five hidden ReLU layers of width 64, 128, 256, 128 and 64, followed by a
    single sigmoid output unit.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        `train_features` matrix is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    A compiled `Sequential` model using binary cross-entropy loss, the adam
    optimizer and the accuracy metric.
    """
    if input_dim is None:
        input_dim = train_features.shape[1]

    # create model
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))  # first hidden layer
    # remaining hidden layers: widen to 256 units, then narrow back down to 64
    for units in (128, 256, 128, 64):
        model.add(Dense(units, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [25]:
model = create_3_layer()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 64)                11584     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 94,145
Trainable params: 94,145
Non-trainable params: 0
__________________________________________________

In [26]:
# Same training setup as the baseline: up to 100 epochs, mini-batches of 4,
# with X_test / y_test (held out from the training corpus) as validation data.
model.fit(X_train, y_train, batch_size=4, epochs=100, verbose=1, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fae402013a0>

In [27]:
# `predict_classes` is deprecated; for a sigmoid output the replacement
# recommended by TensorFlow is to threshold `predict` at 0.5. `ravel` turns the
# column of predictions into a flat vector matching `y_true`.
y_pred = (model.predict(test_features) > 0.5).astype("int32").ravel()
y_true = encoded_test_labels
# Metrics are computed on the separate test corpus (not on X_test / y_test,
# which is the validation split of the training corpus).
print(f'Test Set Accuracy score =  {100*accuracy_score(y_true, y_pred):.3f}%')
print(f'Test Set Precision score =  {100*precision_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set Recall score =  {100*recall_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set F-score score =  {100*f1_score(y_true, y_pred, average="macro"):.3f}%')

Test Set Accuracy score =  67.635%
Test Set Precision score =  47.708%
Test Set Recall score =  46.433%
Test Set F-score score =  46.4%


##### Residual Network

![An example Residual Block](src/residual.png)

In [28]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    """Apply one 1-D residual block to `x` and halve its length with max pooling.

    The main path runs `conv_num` kernel-size-3 convolutions with `filters`
    channels; the skip path is a kernel-size-1 convolution of the block input.
    Both paths are added, activated, and max-pooled (pool size 2, stride 2).
    """
    layers = keras.layers

    # Skip path: kernel-size-1 convolution producing `filters` channels.
    shortcut = layers.Conv1D(filters, 1, padding="same")(x)

    # Every convolution except the last is followed directly by the activation.
    out = x
    for _ in range(conv_num - 1):
        out = layers.Conv1D(filters, 3, padding="same")(out)
        out = layers.Activation(activation)(out)

    # The last convolution is activated only after the skip path is added.
    out = layers.Conv1D(filters, 3, padding="same")(out)
    merged = layers.Add()([out, shortcut])
    activated = layers.Activation(activation)(merged)
    return layers.MaxPool1D(pool_size=2, strides=2)(activated)


def build_model(input_shape, num_classes):
    """Assemble the residual 1-D convolutional classifier (not compiled).

    Five residual blocks are followed by average pooling, flattening, two
    ReLU dense layers (256 and 128 units) and a sigmoid output layer with
    `num_classes` units.
    """
    layers = keras.layers
    inputs = layers.Input(shape=input_shape, name="input")

    # (filters, conv_num) for each residual block, applied in order.
    block_specs = ((16, 2), (32, 2), (64, 3), (128, 3), (128, 3))
    x = inputs
    for filters, conv_num in block_specs:
        x = residual_block(x, filters, conv_num)

    x = layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = layers.Flatten()(x)
    for units in (256, 128):
        x = layers.Dense(units, activation="relu")(x)

    outputs = layers.Dense(num_classes, activation="sigmoid", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [29]:
# Derive the input length from the feature matrix instead of hard-coding 180,
# matching how the dense models size their input; the trailing 1 is the single
# channel expected by the Conv1D layers, and the final 1 is the output unit count.
model = build_model((train_features.shape[1], 1), 1)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 180, 1)]     0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 180, 16)      64          input[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 180, 16)      0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 180, 16)      784         activation[0][0]                 
______________________________________________________________________________________________

In [30]:
# Same loss / optimizer / metric configuration as the dense models.
model.compile(
    loss="binary_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)

In [31]:
model_save_filename = "model.h5"

# Early stopping with a patience of 10 epochs; the best weights are restored
# into the model when training stops.
# NOTE(review): no `monitor` is given here, so EarlyStopping uses its library
# default (val_loss per the Keras docs - confirm for the installed version),
# while the checkpoint below tracks val_accuracy.
earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
# Checkpoint written to `model_save_filename`, keeping only the best model as
# judged by val_accuracy.
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

In [32]:
# Train for up to 100 epochs in mini-batches of 4 with the early-stopping and
# checkpoint callbacks; X_test / y_test (held out from the training corpus)
# are used as validation data.
model.fit(X_train, y_train, batch_size=4, epochs=100, verbose=1, validation_data=(X_test, y_test),
         callbacks=[earlystopping_cb, mdlcheckpoint_cb],)

Epoch 1/100


2021-10-25 01:55:03.781245: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


<tensorflow.python.keras.callbacks.History at 0x7fadc85e7e20>

In [33]:
# Set the ground truth explicitly so this cell does not depend on `y_true`
# left behind by an earlier evaluation cell.
y_true = encoded_test_labels
# Threshold the sigmoid outputs at 0.5 and flatten them to a 1-D label vector.
y_pred = model.predict(test_features)
y_pred = np.where(y_pred >= 0.5, 1, 0).flatten()
# Metrics are computed on the separate test corpus (not on X_test / y_test,
# which is the validation split of the training corpus).
print(f'Test Set Accuracy score =  {100*accuracy_score(y_true, y_pred):.3f}%')
print(f'Test Set Precision score =  {100*precision_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set Recall score =  {100*recall_score(y_true, y_pred, average="macro"):.3f}%')
print(f'Test Set F-score score =  {100*f1_score(y_true, y_pred, average="macro"):.3f}%')

Test Set Accuracy score =  83.777%
Test Set Precision score =  65.196%
Test Set Recall score =  65.428%
Test Set F-score score =  65.3%
