Step 1: Setup and Imports

In [3]:
# Step 1: Setup and Imports
import logging
import os

import librosa
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import soundfile as sf
import tensorflow as tf
from scipy.signal import butter, lfilter, sosfilt
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.models import load_model

# Configure the root logger at INFO so the logger.info(...) messages emitted
# while loading audio for feature extraction are shown in the cell output.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions defined in later cells.
logger = logging.getLogger(__name__)


Step 2: Audio Preprocessing


In [4]:
# Step 2: Audio Preprocessing
def butter_bandpass(lowcut, highcut, fs, order=5, output='ba'):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz; must be strictly below ``fs / 2``.
        fs: Sampling rate of the signal in Hz.
        order: Butterworth order passed to ``scipy.signal.butter``.
        output: Filter representation forwarded to ``scipy.signal.butter``.
            The default ``'ba'`` keeps the original ``(b, a)`` return value;
            ``'sos'`` returns second-order sections.

    Returns:
        ``(b, a)`` coefficient arrays for ``output='ba'``, or whatever
        ``scipy.signal.butter`` returns for the requested ``output``.

    Raises:
        ValueError: Propagated from ``scipy.signal.butter`` when the
            normalised band edges are not strictly between 0 and 1.
    """
    nyquist = 0.5 * fs
    # butter() expects edges normalised to the Nyquist frequency.
    low = lowcut / nyquist
    high = highcut / nyquist
    return butter(order, [low, high], btype='band', output=output)


def bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` between ``lowcut`` and ``highcut`` Hz.

    The filter is applied as cascaded second-order sections instead of a
    single (b, a) polynomial. A band-pass design doubles the effective order,
    and with a low edge far below the sampling rate the polynomial form loses
    precision; SciPy's documentation recommends the SOS form for this reason.

    Args:
        data: 1-D signal (filtering runs along the last axis).
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz; must be strictly below ``fs / 2``.
        fs: Sampling rate of ``data`` in Hz.
        order: Butterworth order.

    Returns:
        The filtered signal, same shape as ``data``.
    """
    sos = butter_bandpass(lowcut, highcut, fs, order=order, output='sos')
    return sosfilt(sos, data)

def preprocess_audio(audio_path, output_path):
    """Denoise, band-limit and peak-normalise one recording, then save it.

    Args:
        audio_path: Path of the recording to read (loaded at its native
            sampling rate).
        output_path: Path the processed audio is written to, at the same
            sampling rate as the input.
    """
    y, sr = librosa.load(audio_path, sr=None)
    y_reduced = nr.reduce_noise(y=y, sr=sr, prop_decrease=0.9)

    # The band edges must lie strictly below Nyquist. Because the file is
    # loaded at its native rate, a fixed 8 kHz edge makes the filter design
    # raise ValueError for recordings sampled at 16 kHz or less. Keep 8 kHz
    # whenever it is valid and otherwise fall back to just under Nyquist.
    nyquist = 0.5 * sr
    highcut = 8000 if nyquist > 8000 else 0.95 * nyquist

    y_equalized = bandpass_filter(y_reduced, lowcut=50, highcut=highcut, fs=sr, order=6)
    y_normalized = librosa.util.normalize(y_equalized)
    sf.write(output_path, y_normalized, sr)
    print(f"Processed and saved: {output_path}")

# Directory paths
input_dir = "D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/recordings"
output_dir = "D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings"

# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# folder already exists and cannot race with another process creating it.
os.makedirs(output_dir, exist_ok=True)

# Preprocess all audio files. os.listdir() returns entries in arbitrary order,
# so sort them to make the run (and its printed log) reproducible.
for filename in sorted(os.listdir(input_dir)):
    if filename.endswith(".wav"):
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)
        preprocess_audio(input_path, output_path)


  y, sr = librosa.load(audio_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_All_work_and_no_play_makes_Jack_a_dull_boy..wav
Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_A_big_black_bear_sat_on_a_big_black_rug..wav


  y, sr = librosa.load(audio_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Bright_vases_are_kept_on_the_wooden_shelf..wav
Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Choose_a_sentence_to_record.wav
Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Current_Sentence_to_Record.wav
Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_How_are_you_today.wav
Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_How_much_wood_would_a_woodchuck_chuck_if_a_woodchuck_could_chuck_wood_.wav
Processed and saved: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Peter_Piper_picked_a_peck_of_pickled_peppers..wav
Processed and saved: D:

Step 3: Data Augmentation and Feature Extraction


In [5]:
# Step 3: Data Augmentation and Feature Extraction
def augment_audio(y, sr):
    """Return the input signal together with three perturbed copies.

    Args:
        y: 1-D audio signal.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        A list ``[original, noisy, shifted, pitched]`` where
        - ``noisy`` adds Gaussian noise scaled by 0.005,
        - ``shifted`` is ``y`` rolled circularly by ``sr // 10`` samples,
        - ``pitched`` is ``y`` pitch-shifted up by 2 semitone steps.
    """
    noisy = y + 0.005 * np.random.randn(len(y))
    # np.roll wraps around: samples pushed off the end re-enter at the start.
    shifted = np.roll(y, sr // 10)
    pitched = librosa.effects.pitch_shift(y, sr=sr, n_steps=2.0)
    return [y, noisy, shifted, pitched]

def extract_features(audio_paths, sr=22050, n_mels=128, max_len=500):
    """Turn audio files into fixed-size, standardised log-mel spectrograms.

    Every file is loaded, expanded into its augmented variants with
    ``augment_audio``, converted to a dB-scaled mel spectrogram, standardised
    to zero mean / unit variance and then cropped or zero-padded along the
    time axis to ``max_len`` frames.

    Args:
        audio_paths: Sequence of audio file paths.
        sr: Sampling rate requested from ``librosa.load``.
        n_mels: Number of mel bands.
        max_len: Number of time frames kept per spectrogram.

    Returns:
        ``(X, y)`` where both names refer to the same array of shape
        ``(len(audio_paths), n_variants, n_mels, max_len)``. The target is a
        placeholder: the features are returned twice.
    """
    per_file = []
    for audio_path in audio_paths:
        # Use a separate name for the loaded rate so the ``sr`` argument is
        # not overwritten for the files that follow.
        y, file_sr = librosa.load(audio_path, sr=sr)
        logger.info(f"Audio loaded: {audio_path}, sample rate: {file_sr}, length: {len(y)}")

        variants = []
        for y_aug in augment_audio(y, file_sr):
            mel_spec = librosa.feature.melspectrogram(y=y_aug, sr=file_sr, n_mels=n_mels)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

            # A constant spectrogram (e.g. a silent clip) has zero spread;
            # dividing by it would fill the features with NaN.
            spread = np.std(mel_spec_db)
            if spread == 0:
                spread = 1.0
            mel_spec_db = (mel_spec_db - np.mean(mel_spec_db)) / spread

            n_frames = mel_spec_db.shape[1]
            if n_frames > max_len:
                mel_spec_db = mel_spec_db[:, :max_len]
            else:
                # Pad on the right of the time axis only.
                mel_spec_db = np.pad(
                    mel_spec_db,
                    ((0, 0), (0, max_len - n_frames)),
                    mode='constant',
                    constant_values=(0,),
                )
            variants.append(mel_spec_db)
        per_file.append(variants)

    if per_file:
        # Nested lists of equally shaped arrays stack directly into
        # (files, variants, n_mels, max_len), whatever the variant count is.
        mel_specs = np.array(per_file)
    else:
        # Nothing to stack: keep the four-variant layout augment_audio
        # produces so the result has the same rank as a non-empty one.
        mel_specs = np.zeros((0, 4, n_mels, max_len))
    return mel_specs, mel_specs  # Dummy return for both X and y

# Load preprocessed audio files and extract features
audio_folder = "D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings"
# os.listdir() gives no ordering guarantee. Sorting fixes the sample order, so
# the tail that Keras' validation_split holds out is the same on every run.
audio_paths = sorted(
    os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith('.wav')
)
X_train, y_train = extract_features(audio_paths)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


INFO:__main__:Audio loaded: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_All_work_and_no_play_makes_Jack_a_dull_boy..wav, sample rate: 22050, length: 123039
INFO:__main__:Audio loaded: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_A_big_black_bear_sat_on_a_big_black_rug..wav, sample rate: 22050, length: 150822
INFO:__main__:Audio loaded: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Bright_vases_are_kept_on_the_wooden_shelf..wav, sample rate: 22050, length: 124362
INFO:__main__:Audio loaded: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Choose_a_sentence_to_record.wav, sample rate: 22050, length: 101871
INFO:__main__:Audio loaded: D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/preprocessed_recordings\test_Current_Sentence_to_Record.wav, sample rate: 2205

X_train shape: (20, 4, 128, 500)
y_train shape: (20, 4, 128, 500)


Step 4: Model Training


In [6]:
# Step 4: Model Training
def train_tts_model(X_train, y_train):
    """Build, compile and fit the convolutional spectrogram regression model.

    Args:
        X_train: Feature array whose axes 2 and 3 are (n_mels, frames); all
            leading axes are flattened into the sample axis before fitting.
        y_train: Target array whose last two axes are (n_mels, frames).

    Returns:
        The fitted Keras model with the weights of the final epoch. The
        checkpoint callback separately writes its best model to
        ``best_model.keras`` in the current working directory.
    """
    input_shape = (X_train.shape[2], X_train.shape[3], 1)
    # The network ends in a Reshape to (n_mels, frames), so targets are fed in
    # exactly that shape instead of relying on Keras to drop a trailing
    # singleton channel axis when computing the loss.
    target_shape = (y_train.shape[-2], y_train.shape[-1])

    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.01)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.01)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=regularizers.l2(0.01)),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dropout(0.5),
        # One linear unit per output spectrogram cell, folded back into 2-D.
        layers.Dense(target_shape[0] * target_shape[1], activation='linear'),
        layers.Reshape(target_shape)
    ])

    initial_learning_rate = 0.001
    # NOTE(review): with staircase=True the rate only drops after every
    # 100000 optimizer steps; confirm that this is reachable for the dataset
    # size and epoch count used here.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True
    )
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
    ]

    model.fit(
        X_train.reshape((-1, *input_shape)),
        y_train.reshape((-1, *target_shape)),
        validation_split=0.2,
        epochs=100,
        batch_size=32,
        callbacks=callbacks
    )
    return model

model_path = "D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/models/test_tts_model.keras"
# Create the destination folder before training: otherwise a missing directory
# is only discovered by model.save(), after the whole training run has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model = train_tts_model(X_train, y_train)
model.save(model_path)


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14s/step - loss: 11.5582 - val_loss: 4.9843
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13s/step - loss: 4.5344 - val_loss: 2.8532
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step - loss: 2.9337 - val_loss: 3.1971
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - loss: 3.3368 - val_loss: 3.4983
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - loss: 3.5029 - val_loss: 3.0799
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8s/step - loss: 3.0104 - val_loss: 2.3287
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9s/step - loss: 2.2599 - val_loss: 1.7101
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 17s/step - loss: 1.6898 - val_loss: 1.4117
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

Step 5: Model Loading and Audio Synthesis


In [10]:
# Step 5: Model Loading and Audio Synthesis with Audio Playback
from IPython.display import Audio, display

def load_tts_model(model_path):
    """Load a saved Keras model from disk.

    Args:
        model_path: Filesystem path of the saved model.

    Returns:
        The model object returned by ``load_model``.

    Raises:
        FileNotFoundError: If nothing exists at ``model_path``.
    """
    # Fail early with a clear message instead of letting the loader complain.
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found at {model_path}")
    return load_model(model_path)

def text_to_sequence(text, max_len=500, n_mels=128):
    """Encode text as a spectrogram-shaped array of character code points.

    Args:
        text: String to encode.
        max_len: Length of the last axis; longer text is truncated and
            shorter text is followed by zeros.
        n_mels: Size of the middle axis.

    Returns:
        A float array of shape ``(1, n_mels, max_len)`` that is zero
        everywhere except row 0, which holds ``ord(c)`` for each character.
    """
    code_points = [ord(ch) for ch in text][:max_len]
    grid = np.zeros((1, n_mels, max_len))
    # Only the leading cells of the first row carry data; the zero-initialised
    # remainder serves as the padding.
    grid[0, 0, :len(code_points)] = code_points
    return grid

def griffin_lim(mel_spec, n_iter=100):
    """Reconstruct a waveform from a mel spectrogram with Griffin-Lim.

    Args:
        mel_spec: 2-D mel spectrogram passed to
            ``librosa.feature.inverse.mel_to_stft``.
        n_iter: Number of Griffin-Lim iterations.

    Returns:
        The waveform returned by ``librosa.griffinlim``.
    """
    # Map mel bands back to linear-frequency bins, cast to complex64 as the
    # original implementation does, then estimate phase iteratively.
    linear_spec = librosa.feature.inverse.mel_to_stft(mel_spec).astype(np.complex64)
    return librosa.griffinlim(linear_spec, n_iter=n_iter, hop_length=512)

def generate_audio(text, model_path):
    """Synthesize a waveform for ``text`` with the model saved at ``model_path``.

    Args:
        text: Text to synthesize.
        model_path: Path of the saved model; forwarded to ``load_tts_model``.

    Returns:
        The waveform produced by ``griffin_lim`` from the model's prediction.

    Raises:
        FileNotFoundError: Propagated from ``load_tts_model`` when the model
            file does not exist.
    """
    tts_model = load_tts_model(model_path)
    predicted = tts_model.predict(text_to_sequence(text))
    # predict() returns a batch; the single encoded text yields one item.
    return griffin_lim(predicted[0])

text = "Hello, this is a test."
result_path = "D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/synthesized/test_synthesized_audio.wav"
# Single source for the playback/export rate; 22050 Hz is the default rate
# extract_features uses when loading the training audio.
SAMPLE_RATE = 22050

# sf.write cannot create missing folders, so make sure the target exists
# before running the (slow) synthesis.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
audio = generate_audio(text, model_path)
sf.write(result_path, audio, SAMPLE_RATE)
print(f"Generated audio saved to {result_path}")

# Display audio player
display(Audio(audio, rate=SAMPLE_RATE))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Generated audio saved to D:/NEU/NUwork/Voice Cloning Web Application Project/VoiceCloner/media/synthesized/test_synthesized_audio.wav
