In [None]:
import os

# Select the Keras backend BEFORE anything imports keras: the backend is
# chosen when the package is first imported, so assigning this variable
# afterwards has no effect on the running kernel.
os.environ["KERAS_BACKEND"] = "tensorflow"

import shutil
import numpy as np
import tensorflow as tf
import keras
from pathlib import Path
from IPython.display import display, Audio

# Directory whose sub-directories each hold the .wav samples of one class.
root = "../data/train/audio/audio/"
def create_dataset(audio_paths, labels):
    """Pair every decoded audio file with its label in a single dataset.

    Args:
        audio_paths: Sequence of audio file paths; sliced one element per path
            and decoded with ``decode_file``.
        labels: Sequence of labels, sliced in the same order as ``audio_paths``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(audio, label)`` tuples.
    """
    # Decode lazily and in parallel; AUTOTUNE lets tf.data pick the parallelism.
    decoded = tf.data.Dataset.from_tensor_slices(audio_paths).map(
        lambda file_path: decode_file(file_path),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    targets = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((decoded, targets))

def decode_file(path):
    """Read the file at ``path`` and decode it as WAV audio.

    The decoder is asked for one channel and ``sample_rate`` samples, where
    ``sample_rate`` is the module-level global looked up at call time.

    Args:
        path: Path of the WAV file to load.

    Returns:
        The decoded audio tensor; the sample rate reported by the decoder is
        discarded.
    """
    raw_bytes = tf.io.read_file(path)
    waveform, _decoded_rate = tf.audio.decode_wav(raw_bytes, 1, sample_rate)
    return waveform

def add_noise(audio, noises=None, scale=0.5):
    """Mix a randomly chosen noise clip into every item of ``audio``.

    Args:
        audio: Batch of audio tensors; its first axis is used as the batch
            axis and axis 1 as the axis the amplitude ratio is repeated along.
        noises: Tensor of noise clips indexed along axis 0, or ``None`` to
            return ``audio`` unchanged.
            # NOTE(review): presumably each clip has the same shape as one
            # audio item so the final addition broadcasts -- confirm.
        scale: Extra multiplier applied to the rescaled noise.

    Returns:
        ``audio`` with noise added, or ``audio`` itself when ``noises`` is
        ``None``.
    """
    if noises is None:
        return audio

    # One random noise index per batch item.
    batch_len = tf.shape(audio)[0]
    picks = tf.random.uniform((batch_len,), 0, noises.shape[0], dtype=tf.int32)
    chosen = tf.gather(noises, picks, axis=0)

    # Ratio between the peak of each audio item and the peak of its noise clip,
    # repeated along axis 1 so it can be multiplied element-wise.
    ratio = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(chosen, axis=1)
    ratio = tf.repeat(tf.expand_dims(ratio, axis=1), tf.shape(audio)[1], axis=1)

    return audio + chosen * ratio * scale

def fft(audio):
    """Return the magnitude of the first half of the FFT of ``audio``.

    Args:
        audio: Tensor whose trailing axis has size 1 (it is squeezed away) and
            whose axis 1, after the squeeze, has a statically known length.

    Returns:
        A real tensor with the trailing axis restored, holding the absolute
        value of the first ``length // 2`` FFT coefficients along axis 1.
    """
    samples = tf.squeeze(audio, axis=-1)
    # tf.signal.fft needs complex input: use the samples as the real part.
    as_complex = tf.complex(real=samples, imag=tf.zeros_like(samples))
    spectrum = tf.signal.fft(tf.cast(as_complex, tf.complex64))
    spectrum = tf.expand_dims(spectrum, axis=-1)
    half = samples.shape[1] // 2
    return tf.math.abs(spectrum[:, :half, :])

# Get audio file paths and corresponding labels.
# Only real sub-directories of `root` count as classes, so stray entries such
# as '.DS_Store' no longer inflate len(class_names) (the size of the model's
# output layer) or end up in class_names.json. Sorting makes the label ids and
# the later seeded shuffle independent of the order os.listdir happens to use.
class_names = sorted(
    entry
    for entry in os.listdir(root)
    if entry != '.DS_Store' and (Path(root) / entry).is_dir()
)
audio_paths = []
labels = []
for label, name in enumerate(class_names):
    dir_path = Path(root) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in sorted(os.listdir(dir_path))
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    # Every sample in this directory gets the directory's index in class_names.
    labels += [label] * len(speaker_sample_paths)


# Fraction of the samples held out for validation
validation = 0.1
# Seed shared by every shuffle in this notebook
shuffle = 43
# Number of samples decode_file requests from tf.audio.decode_wav per clip
sample_rate = 16000

# Shuffle the data. Two generators created from the same seed apply the same
# permutation to both equally long lists, so paths and labels stay aligned.
rng = np.random.RandomState(shuffle)
rng.shuffle(audio_paths)
rng = np.random.RandomState(shuffle)
rng.shuffle(labels)

# Split data into training and validation sets.
# Slice with an explicit index: with `[:-num_val_samples]`, a validation count
# of 0 (fewer than 1/validation samples) would turn into `[:-0]`, leaving the
# training set empty and putting every sample into validation.
num_val_samples = int(validation * len(audio_paths))
split_index = len(audio_paths) - num_val_samples
train_audio_paths = audio_paths[:split_index]
train_labels = labels[:split_index]
valid_audio_paths = audio_paths[split_index:]
valid_labels = labels[split_index:]



scale = 0.5 # for noise addition
batch_size = 128
epochs = 100

# Training pipeline: decode -> shuffle -> batch -> FFT magnitude -> prefetch
train_ds = (
    create_dataset(train_audio_paths, train_labels)
    .shuffle(buffer_size=batch_size * 8, seed=shuffle)
    .batch(batch_size)
    .map(lambda wav, lbl: (fft(wav), lbl), num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same stages, with a fixed batch size of 32
valid_ds = (
    create_dataset(valid_audio_paths, valid_labels)
    .shuffle(buffer_size=32 * 8, seed=shuffle)
    .batch(32)
    .map(lambda wav, lbl: (fft(wav), lbl), num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

def residual_block(x, filters, conv_num=3, activation="relu", dropout_rate=0.2):
    """Apply a 1-D convolutional residual block followed by max pooling.

    Args:
        x: Input tensor of the block.
        filters: Number of filters used by every convolution in the block.
        conv_num: Total number of kernel-size-3 convolutions on the main path.
        activation: Activation applied after each inner convolution and after
            the residual addition.
        dropout_rate: Dropout rate applied after each inner activation.

    Returns:
        The block output after ``MaxPool1D(pool_size=2, strides=2)``.
    """
    layers = keras.layers

    # Kernel-size-1 projection of the input so it can be added to the main path.
    shortcut = layers.Conv1D(filters, 1, padding="same")(x)

    out = x
    for _ in range(conv_num - 1):
        out = layers.Conv1D(filters, 3, padding="same")(out)
        out = layers.Activation(activation)(out)
        out = layers.Dropout(dropout_rate)(out)

    # The last convolution has no activation of its own; it is applied after the add.
    out = layers.Conv1D(filters, 3, padding="same")(out)
    out = layers.Add()([out, shortcut])
    out = layers.Activation(activation)(out)
    return layers.MaxPool1D(pool_size=2, strides=2)(out)

def build_model(input_shape, num_classes, dropout_rate=0.2):
    """Assemble the residual 1-D CNN classifier.

    Args:
        input_shape: Shape of one input sample, without the batch axis.
        num_classes: Number of units in the softmax output layer.
        dropout_rate: Dropout rate used in the residual blocks and the dense head.

    Returns:
        An uncompiled ``keras.models.Model``.
    """
    # (filters, conv_num) for each residual block, in order.
    block_specs = ((16, 2), (32, 2), (64, 3), (128, 3), (128, 3))

    inputs = keras.layers.Input(shape=input_shape, name="input")
    features = inputs
    for block_filters, block_convs in block_specs:
        features = residual_block(
            features, block_filters, block_convs, dropout_rate=dropout_rate
        )

    # Classification head
    features = keras.layers.AveragePooling1D(pool_size=3, strides=3)(features)
    features = keras.layers.Flatten()(features)
    features = keras.layers.Dense(256, activation="relu")(features)
    features = keras.layers.Dropout(dropout_rate)(features)
    features = keras.layers.Dense(128, activation="relu")(features)
    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(
        features
    )
    return keras.models.Model(inputs=inputs, outputs=outputs)

# Build and compile the model. The input has sample_rate // 2 steps because
# fft() keeps only the first half of the spectrum.
dropout_rate = 0.2
model = build_model(
    input_shape=(sample_rate // 2, 1),
    num_classes=len(class_names),
    dropout_rate=dropout_rate,
)
model.compile(
    optimizer="Adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Callbacks: stop early after 10 epochs without improvement (restoring the
# best weights) and keep the checkpoint with the best validation accuracy.
earlystopping_cb = keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    "speaker_recognition_model.h5",
    monitor="val_accuracy",
    save_best_only=True,
)

# Train the model
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

# Report the metrics on the validation set
print(model.evaluate(valid_ds))

# Persist the class names so a predicted index can be mapped back to a name
import json
with open("class_names.json", "w") as names_file:
    json.dump(class_names, names_file)

# Persist the trained model
model.save("model2.h5")


In [16]:
import wave
import numpy as np
from scipy.signal import resample

# Read the WAV file; the context manager closes it even if reading fails.
with wave.open('../sample/irani/4.wav', 'r') as wav_file:
    frame_rate = wav_file.getframerate()
    channels = wav_file.getnchannels()
    sample_width = wav_file.getsampwidth()
    frames = wav_file.readframes(wav_file.getnframes())

# The conversion below interprets the bytes as 16-bit PCM; refuse anything else
# instead of silently producing garbage samples.
if sample_width != 2:
    raise ValueError(
        f"Expected 16-bit PCM audio, got sample width of {sample_width} bytes"
    )

# Scale the little-endian int16 samples to floats in [-1, 1), the range
# tf.audio.decode_wav produces for the training data.
waveform = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0
if waveform.size == 0:
    raise ValueError("The WAV file contains no audio frames")

# Samples of multi-channel files are interleaved; keep the first channel only.
waveform = waveform.reshape(-1, channels)[:, 0]

# Bring the clip to the training sampling rate when the file uses another one.
if frame_rate != sample_rate:
    resampled_length = int(round(len(waveform) * sample_rate / frame_rate))
    waveform = resample(waveform, resampled_length)

# Trim or zero-pad to exactly `sample_rate` samples, the length decode_file
# requests for every training clip.
waveform = waveform[:sample_rate]
waveform = np.pad(waveform, (0, sample_rate - len(waveform))).astype(np.float32)

# Apply the same FFT transform the training pipeline maps over its batches.
# The model was fitted on these magnitudes, not on raw waveforms.
target_length = sample_rate // 2  # Model expects input of length 8000
spectrum = fft(tf.constant(waveform[np.newaxis, :, np.newaxis]))
audio_input = spectrum[0].numpy()  # shape: (target_length, 1)
assert audio_input.shape == (target_length, 1), audio_input.shape

prediction = model.predict(np.expand_dims(audio_input, axis=0))
predicted_label_index = np.argmax(prediction)
predicted_speaker = class_names[predicted_label_index]
print("Predicted Speaker:", predicted_speaker)


Predicted Speaker: irani
