<a href="https://colab.research.google.com/github/rabebsdiri45/tts-spongebob-voice-clone/blob/main/wavs_preproccessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code prepares the Spongebob dataset for use in training a TTS model. The dataset consists of WAV files containing Spongebob's voice,
 which were collected for this project. The code performs tasks such as file formatting, preprocessing, and data cleaning to ensure that
  the dataset is properly formatted and ready for use in training the TTS model.


**Install Coqui TTS**

In [None]:
!sudo apt-get install espeak-ng
!git clone https://github.com/coqui-ai/TTS
!pip install TTS

In [None]:
# Mount Google Drive at /content/drive; the later cells read and write below /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Report the sample rate of every WAV file in the dataset folder.
import os
import librosa

# Folder that holds the WAV files to inspect
folder_path = '/content/drive/MyDrive/VoiceCloning/datasets/wavs'

# Only entries whose name ends in .wav are inspected
wav_names = (entry for entry in os.listdir(folder_path) if entry.endswith('.wav'))
for file_name in wav_names:
    file_path = os.path.join(folder_path, file_name)
    # sr=None asks librosa not to resample, so `sr` is the rate stored in the file
    audio, sr = librosa.load(file_path, sr=None)
    print(f'File name: {file_name}, Sample rate: {sr}')


The sample rate of the files is 32000 Hz.

In [None]:
import os
ds_name = "vits-vctk-narrator-22k-ds" #@param {type:"string"}
output_directory = "traineroutput" #@param {type:"string"}
upload_dir = "wavs" #@param {type:"string"}
#MODEL_FILE = "/root/.local/share/tts/tts_models--en--ljspeech--vits/model_file.pth" #@param {type:"string"}
MODEL_FILE = "/root/.local/share/tts/tts_models--en--vctk--vits/model_file.pth" #@param {type:"string"}
upload_dir = "/content/drive/MyDrive/" + upload_dir
RUN_NAME = "VITS-en" #@param {type:"string"}


OUT_PATH = "/content/drive/MyDrive/"+ds_name+"/traineroutput/"
!mkdir $upload_dir
!mkdir /content/drive/MyDrive/$ds_name

In [None]:
#@title
# Change into the upload directory and list its contents.
%cd $upload_dir
!ls -al

**SNR :SIGNAL-TO-NOISE RATIO**



Signal-to-noise ratio (SNR) is a measure of the strength of a signal relative to the background noise present in a system. It is typically expressed as a ratio of the power of the signal to the power of the noise, and is often measured in decibels (dB).

A higher SNR indicates that the signal is stronger relative to the noise, which can lead to improved signal quality and more accurate measurements or analysis.

In [None]:
import os
import glob
import subprocess
import IPython
import soundfile as sf
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool
from matplotlib import pylab as plt
%matplotlib inline

In [None]:
# Download the WADA-SNR archive, unpack it, and step one directory up.
# NOTE(review): the archive is saved and extracted under .../VoiceCloning/WadaSNR, while the
# %cd below targets .../VoiceCloning/TTS/WadaSNR - confirm which location is intended, and that
# the target folders exist before this cell runs (nothing in this cell creates them).
!curl http://www.cs.cmu.edu/~robust/archive/algorithms/WADA_SNR_IS_2008/WadaSNR.tar.gz --output /content/drive/MyDrive/VoiceCloning/WadaSNR/WadaSNR.tar.gz
%cd /content/drive/MyDrive/VoiceCloning/TTS/WadaSNR
!tar -xvf /content/drive/MyDrive/VoiceCloning/WadaSNR/WadaSNR.tar.gz -C /content/drive/MyDrive/VoiceCloning/WadaSNR
%cd ..

In [None]:
# Set the meta parameters
# Root folder that is searched recursively for .wav files.
# NOTE(review): "[dataset]" looks like a placeholder to be replaced with the real folder name - confirm.
DATA_PATH = "/content/drive/MyDrive/VoiceCloning/[dataset]" 
# Number of worker processes for the SNR computation; 1 selects the serial loop.
NUM_PROC = 1
# Folder expected to contain WadaSNR/Exe/WADASNR (used by compute_file_snr).
CURRENT_PATH = '/content/drive/MyDrive/VoiceCloning' 

Set Up SNR Evaluation

In [None]:
def compute_file_snr(file_path):
    """Estimate the SNR of one audio file with the WADA-SNR executable.

    The file is first converted with FFMPEG to 16 kHz mono 16-bit PCM (or simply
    copied when it is already sampled at 16 kHz) into a temporary ``*_tmp.wav``
    next to the original. That temporary file is passed to WADASNR and removed
    afterwards, even when one of the steps fails.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        Tuple ``(snr, file_path)``; ``snr`` is the float parsed from the
        third-from-last whitespace-separated token of the WADASNR output.

    Raises:
        subprocess.CalledProcessError: If ffmpeg/cp or WADASNR exits non-zero.
        RuntimeError: If the WADASNR output cannot be parsed; the message is the
            command line that was run.
    """
    # Only the header is needed for the sample rate; avoid decoding the audio.
    sr = sf.info(file_path).samplerate
    # splitext always yields a distinct temp path; str.replace could return the
    # input path itself (no ".wav" in it), which the cleanup would then delete.
    new_file = os.path.splitext(file_path)[0] + "_tmp.wav"
    if sr != 16000:
        prepare = ["ffmpeg", "-i", file_path, "-ac", "1", "-acodec", "pcm_s16le",
                   "-y", "-ar", "16000", new_file]
    else:
        prepare = ["cp", file_path, new_file]
    wada = [
        f"{CURRENT_PATH}/WadaSNR/Exe/WADASNR",
        "-i", new_file,
        "-t", f"{CURRENT_PATH}/WadaSNR/Exe/Alpha0.400000.txt",
        "-ifmt", "mswav",
    ]
    try:
        # Argument lists (no shell) keep paths with quotes or spaces intact.
        subprocess.run(prepare, check=True)
        output = subprocess.check_output(wada)
        try:
            snr = float(output.split()[-3].decode("utf-8"))
        except (IndexError, ValueError) as exc:
            raise RuntimeError(" ".join(wada)) from exc
    finally:
        if os.path.exists(new_file):
            os.remove(new_file)
    return snr, file_path

Confirm the amount of wav files the dataset has

In [None]:
import glob
# Collect every .wav file below DATA_PATH, including sub-folders (recursive "**" pattern).
wav_files = glob.glob(f"{DATA_PATH}/**/*.wav", recursive=True)
print(f" > Number of wav files {len(wav_files)}")


Processes Dataset for SNR

In [None]:
# Compute an (snr, file_path) tuple for every wav file, serially or with a process pool.
if NUM_PROC == 1:
    file_snrs = []
    # Iterating the list itself (not enumerate) lets tqdm know the total and draw a progress bar.
    for wav_file in tqdm(wav_files, total=len(wav_files)):
        file_snrs.append(compute_file_snr(wav_file))
else:
    with Pool(NUM_PROC) as pool:
        file_snrs = list(tqdm(pool.imap(compute_file_snr, wav_files), total=len(wav_files)))

Calculate the average SNR (higher is better; maximum 100)

In [None]:
# SNR value of every processed file, in processing order
snrs = [entry[0] for entry in file_snrs]

# Positions whose SNR is NaN, and the wav files at those positions
error_idxs = np.where(np.isnan(snrs))[0]
error_files = [wav_files[bad_pos] for bad_pos in error_idxs]

# Keep only the entries with a usable SNR, then split them into parallel lists
file_snrs = [entry for pos, entry in enumerate(file_snrs) if pos not in error_idxs]
snrs = [entry[0] for entry in file_snrs]
file_names = [entry[1] for entry in file_snrs]
# Indices that order the files by ascending SNR
file_idxs = np.argsort(snrs)


print(f" > Average SNR of the dataset:{np.mean(snrs)}")

In [None]:
def output_snr_with_audio(idx):
    """Print the path and SNR of one file and display an audio player for it.

    ``idx`` is a position in ``file_idxs`` (the argsort of ``snrs``), so 0 selects
    the lowest SNR and negative values count back from the highest.
    """
    ranked = file_idxs[idx]
    path = file_names[ranked]
    samples, rate = sf.read(path)
    # For two-dimensional (multi-channel) data keep only the first channel
    mono = samples[:, 0] if len(samples.shape) == 2 else samples
    print(f" > {path} - snr:{snrs[ranked]}")
    IPython.display.display(IPython.display.Audio(mono, rate=rate))


Calculate Worst & Best Performing Audio Files

In [None]:
# find the files with the worst (lowest) SNR
N = 10  # number of files to fetch
# Clamp to the number of available files so a small dataset does not raise IndexError.
for i in range(min(N, len(file_idxs))):
    output_snr_with_audio(i)

In [None]:
# find the best recordings (highest SNR)
N = 10  # number of files to fetch
# Clamp to the number of available files so a small dataset does not raise IndexError.
for i in range(min(N, len(file_idxs))):
    output_snr_with_audio(-i-1)

**Demucs on Uploaded Samples**

Samples will be moved to subdirectory named backup

vocals.wav will be renamed to [samplefile].wav, resampled to 22.05 kHz mono, and moved back to the sample working directory.

Final output is mono 22khz wav.

In [None]:
!pip install demucs

In [None]:
# Working directory and file extension substituted into the Demucs shell script.
S_D_WORKDIR = "/content/drive/MyDrive/wavs/" 
S_D_EXT = "wav" 

In [None]:
# Build and save a shell script that isolates the vocals of every sample with Demucs.
# Steps: lower-case the file extensions, move the samples to ./backup, run Demucs
# on the backups, convert each vocals.wav to 22050 Hz mono named after its sample,
# move it back to the working directory, and remove the "separated" folder.
#
# The script text is written as raw strings so the backslashes of the `rename`
# expression stay literal without relying on invalid Python escape sequences, and
# the shebang sits on the very first line of the file.
S_D_SCRIPT = r"""#!/bin/bash
EXT='""" + S_D_EXT + r"""'
WORK_DIR='""" + S_D_WORKDIR + r"""'
rename 's/\.([^.]+)$/.\L$1/' "$WORK_DIR"/*
cd "$WORK_DIR" || exit 1
mkdir -p backup
mv *.$EXT backup
demucs --two-stems=vocals -n htdemucs_ft backup/*.$EXT
cd separated/htdemucs_ft || exit 1
for dir in */; do
    echo "$dir"
    cd "$dir"
    dir="${dir%/}"
    ffmpeg -i vocals.wav -ar 22050 -ac 1 "$dir".wav
    mv "$dir".wav "$WORK_DIR"
    cd ..
done
cd "$WORK_DIR"
rm -rf separated
"""
with open('/content/drive/MyDrive/script-demucs-samples.sh', 'w') as script_file:
  script_file.write(S_D_SCRIPT)

In [None]:
# Run the Demucs script written by the previous cell.
!sh /content/drive/MyDrive/script-demucs-samples.sh

**NORMALIZE-AUDIO**

Normalize the files to -27 dBFS.

In [None]:
!apt-get install normalize-audio

In [None]:
# File extension and working directory substituted into the normalization shell script.
N_EXT = "wav" 
N_WORKDIR = "/content/drive/MyDrive/wavs" 


In [None]:
#@title
N_SCRIPT="""
#!/bin/bash
EXT='"""+N_EXT+"""'
WORK_DIR='"""+N_WORKDIR+"""'
rename 's/\.([^.]+)$/.\L$1/' "$WORK_DIR"/*
cd "$WORK_DIR"
normalize-audio -a -27dBFS *.$EXT
"""
with open('/content/drive/MyDrive/script-normalize.sh', 'w') as file:
  file.write(N_SCRIPT)

In [None]:
# Run the normalization script written by the previous cell.
!sh /content/drive/MyDrive/script-normalize.sh

In [None]:
!pip install librosa  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import librosa
import os

# Path to the folder containing the audio files
folder_path = "/content/drive/MyDrive/wavs/backup"
# Files whose median RMS energy is below this value are reported as silent
SILENCE_RMS_THRESHOLD = 0.01

for file_name in os.listdir(folder_path):
    if file_name.endswith('.wav'):
        # Load audio file, resampled to 16 kHz
        file_path = os.path.join(folder_path, file_name)
        audio, sr = librosa.load(file_path, sr=16000)

        # Compute the root mean square (RMS) energy of the audio
        rms = librosa.feature.rms(y=audio)[0]

        # Compute the median value of the RMS energy
        median_rms = np.median(rms)

        # Print the median RMS energy
        print(f"File: {file_name} - Median RMS energy: {median_rms}")
        if median_rms < SILENCE_RMS_THRESHOLD:
            # The loop variable is `file_name`; the previous `filename` is not
            # defined in this cell and raised NameError on a fresh kernel.
            print(f"The audio file {file_name} contains white silence.")

File: LJ001-0090.wav - Median RMS energy: 0.08819131553173065
File: LJ002-0145.wav - Median RMS energy: 0.0844791978597641
File: LJ002-0107.wav - Median RMS energy: 0.11311400681734085
File: LJ001-0076.wav - Median RMS energy: 0.08749743551015854
File: LJ002-0204.wav - Median RMS energy: 0.08874053508043289
File: LJ003-0040.wav - Median RMS energy: 0.08424635231494904
File: LJ001-0006.wav - Median RMS energy: 0.10674095898866653
File: LJ002-0186.wav - Median RMS energy: 0.07650020718574524
File: LJ001-0151.wav - Median RMS energy: 0.08721309900283813
File: LJ002-0025.wav - Median RMS energy: 0.10257001966238022
File: LJ002-0036.wav - Median RMS energy: 0.0812864601612091
File: LJ003-0278.wav - Median RMS energy: 0.09330209344625473
File: LJ001-0101.wav - Median RMS energy: 0.101053886115551
File: LJ001-0178.wav - Median RMS energy: 0.10626619309186935
File: LJ003-0123.wav - Median RMS energy: 0.07805871963500977
File: LJ003-0174.wav - Median RMS energy: 0.08261675387620926
File: LJ003-

In [None]:
import os
import wave
import math

# Sum the duration of every .wav file in the backup folder and report whole minutes.
directory = "/content/drive/MyDrive/wavs/backup"  
total_duration = 0

for filename in os.listdir(directory):
    if filename.endswith(".wav"):
        # The context manager closes each file; otherwise one handle per clip stays open.
        with wave.open(os.path.join(directory, filename), 'r') as wav_file:
            frames = wav_file.getnframes()
            rate = wav_file.getframerate()
        # duration in seconds = number of frames / frames per second
        duration = frames / float(rate)
        total_duration += duration

total_duration_minutes = math.floor(total_duration / 60)
print("Total duration in minutes: ", total_duration_minutes)

Total duration in minutes:  105


**Split Audio Samples with SOX**

In [None]:
#@title
%cd /content
!sudo apt install sox

In [None]:
# Working directory and file extension substituted into the SoX splitting shell script.
SP_WORKDIR = "/content/drive/MyDrive/wavs/" 
SP_EXT = "wav" 

In [None]:
# Build and save a shell script that splits the samples with SoX.
# Steps: lower-case the file extensions, move the samples to ./backup-presplit,
# split each one on silence into backup-presplit/splits, re-split those pieces with
# `trim 0 9` into splits/resplit, delete pieces smaller than 35k, move the rest
# back to the working directory, and remove the intermediate "splits" folder.
#
# The script text is written as raw strings so the backslashes of the `rename`
# expression stay literal without relying on invalid Python escape sequences, and
# the shebang sits on the very first line of the file.
SP_SCRIPT = r"""#!/bin/bash
EXT='""" + SP_EXT + r"""'
WORK_DIR='""" + SP_WORKDIR + r"""'
rename 's/\.([^.]+)$/.\L$1/' "$WORK_DIR"/*
cd "$WORK_DIR" || exit 1
mkdir -p backup-presplit
mv *.$EXT backup-presplit
cd backup-presplit || exit 1
rm -rf splits
mkdir splits
for FILE in *.$EXT; do sox "$FILE" splits/"$FILE" --show-progress silence 1 0.2 0.1% 1 0.2 0.1% : newfile : restart ; done
cd splits
mkdir resplit
for FILE in *.$EXT; do sox "$FILE" resplit/"$FILE" --show-progress trim 0 9 : newfile : restart ; done
cd resplit
find . -name "*.$EXT" -type f -size -35k -delete
mv *.$EXT "$WORK_DIR"
cd "$WORK_DIR"
# The intermediate folder lives inside backup-presplit, not directly in WORK_DIR.
rm -rf backup-presplit/splits
ls -al
"""
with open('/content/drive/MyDrive/script-split-samples.sh', 'w') as script_file:
  script_file.write(SP_SCRIPT)

In [None]:
# Run the splitting script written by the previous cell.
!sh /content/drive/MyDrive/script-split-samples.sh

**Install OpenAI Whisper STT+Translation (https://github.com/openai/whisper)**

**Run Whisper on generated audio clips.**
Transcripts will be formatted for the VCTK-style dataset and placed in the<br>
->dataset directory<br>
---->txt<br>
-------->speakername<br>


datasetfolder<br>
->txt<br>
---->nameone<br>
---->nametwo<br>
->wav48_silence_trimmed<br>
---->nameone<br>
---->nametwo<br>

**Select Whisper STT model. Large-v2 slowest, most accurate but most memory usage**

In [None]:
# Name of the Whisper checkpoint passed to whisper.load_model in the next cell.
whisper_model = "large-v2" #@param ["large-v2", "large-v1", "medium.en", "small.en", "base.en"]

In [None]:
#@title
import whisper
import os, os.path
import glob
import pandas as pd

from pathlib import Path


#model = whisper.load_model("medium.en")
model = whisper.load_model(whisper_model)

In [None]:
# Speaker folder name used below wav48_silence_trimmed/ and txt/ in the dataset directory.
newspeakername = "spongebob" 


In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Transcribe every "<clip>_mic1.flac" of the speaker with Whisper and write the text
# to "<dataset>/txt/<speaker>/<clip>.txt".

# Decoding options handed to model.transcribe for every clip.
options = dict(language="english", beam_size=5, best_of=5)
transcribe_options = dict(task="transcribe", **options)

#@title
wavs = '/content/drive/MyDrive/'+ds_name+'/wav48_silence_trimmed/'+newspeakername

paths = glob.glob(os.path.join(wavs, '*.flac'))
print(len(paths))
all_filenames = []
transcript_text = []

# Transcript folder of this speaker; exist_ok makes re-running the cell silent
# instead of printing a "File exists" error for each level.
txt_dir = '/content/drive/MyDrive/'+ds_name+'/txt/'+newspeakername+'/'
os.makedirs(txt_dir, exist_ok=True)

# Suffix that separates the clip name from the extension in the flac file names
MIC_SUFFIX = "_mic1"

for filepath in paths:
    base = os.path.basename(filepath)
    all_filenames.append(base)
    result = model.transcribe(filepath, **transcribe_options)
    # Single-line transcript without leading whitespace
    output = result["text"].lstrip().replace("\n", "")
    print(output)
    # "<clip>_mic1.flac" -> "<clip>". splitext matches how the orphan scan further
    # down derives names, and the suffix is only removed when it is really there
    # (a blind [:-5] would truncate differently named files).
    thefile = os.path.splitext(base)[0]
    if thefile.endswith(MIC_SUFFIX):
        thefile = thefile[:-len(MIC_SUFFIX)]
    print(thefile)
    outfilepath = txt_dir + thefile + '.txt'
    with open(outfilepath, 'w', encoding='utf-8') as indfile:
        indfile.write(output)

**Check the dataset for empty transcript files. Bad file sets will be moved to a folder called badfiles in your dataset directory.**

checks for empty transcript files and transcripts containing characters not matching this regex pattern: 
[A-Z0-9a-zŽžÀ-ÿ]

Moves orphaned flac files and mistranscriptions to badfiles directory

In [None]:
# Dataset folder (below /content/drive/MyDrive/) and speaker name inspected by the transcript check.
check_datasetbase = "newdatasetspong" 
check_speakername = "spongebob" 

In [None]:
#@title
import re
import os, os.path
import glob
import shutil
rootpath="/content/drive/MyDrive/"
ds_name2=check_datasetbase
newspeakername2=check_speakername
flac = rootpath+ds_name2+'/wav48_silence_trimmed/'+newspeakername2+'/'
# wav = rootpath+ds_name2+'/wavs/'
wav = '/content/drive/MyDrive/wavs'
txt = rootpath+ds_name2+'/txt/'+newspeakername2+'/'
outfilepath = rootpath+ds_name2+'/txt/'+newspeakername2
backup_path = rootpath+ds_name2+'/badfiles/'
if not os.path.exists(backup_path):
    os.makedirs(backup_path)
searchstr = "(?i)(?!(?![×Þß÷þø])[a-zÀ-ÿ])\S"
txtfiles = glob.glob(os.path.join(txt, '*.txt'))
EMPTY_TRANSCRIPTS = []
for txts in txtfiles:
    with open(txts, 'r', encoding='utf-8') as outfile:
        state = os.stat(txts).st_size == 0
        if state==True:
            print(str(txts))
            basename_without_ext = os.path.splitext(os.path.basename(txts))[0]
            print(basename_without_ext)
            bad_wav = basename_without_ext+".wav"
            bad_txt = basename_without_ext+".txt"
            bad_flac = basename_without_ext+"_mic1.flac"
#            print(bad_wav)
#            print(bad_txt)
            if os.path.exists(wav+bad_wav):
                bad_wav_file_size = os.path.getsize(wav+bad_wav)
            if os.path.exists(txt+bad_txt):
                bad_txt_file_size = os.path.getsize(txt+bad_txt)
            if os.path.exists(flac+bad_flac):
                bad_flac_file_size = os.path.getsize(flac+bad_flac)
#            print(bad_wav_file_size)
#            print(bad_txt_file_size)
            if os.path.exists(wav+bad_wav):
                shutil.move(wav+bad_wav, backup_path+bad_wav)
            if os.path.exists(txt+bad_txt):
                shutil.move(txt+bad_txt, backup_path+bad_txt)
            if os.path.exists(flac+bad_flac):
                shutil.move(flac+bad_flac, backup_path+bad_flac)
        if state==False:
            for line in outfile:
                match = re.search('^((?![A-Z0-9a-zŽžÀ-ÿ]).)*$', line)
                if match:
                    print(line)
                    basename_without_ext = os.path.splitext(os.path.basename(txts))[0]
                    print(basename_without_ext)
                    bad_wav = basename_without_ext+".wav"
                    bad_txt = basename_without_ext+".txt"
                    bad_flac = basename_without_ext+"_mic1.flac"
                    if os.path.exists(wav+bad_wav):
                        bad_wav_file_size = os.path.getsize(wav+bad_wav)
                    if os.path.exists(txt+bad_txt):
                        bad_txt_file_size = os.path.getsize(txt+bad_txt)
                    if os.path.exists(flac+bad_flac):
                        bad_flac_file_size = os.path.getsize(flac+bad_flac)
                    if os.path.exists(wav+bad_wav):
                        shutil.move(wav+bad_wav, backup_path+bad_wav)
                    if os.path.exists(txt+bad_txt):
                        shutil.move(txt+bad_txt, backup_path+bad_txt)
                    if os.path.exists(flac+bad_flac):
                        shutil.move(flac+bad_flac, backup_path+bad_flac)

**Scan for orphaned transcriptions or audio files.  Orphaned files moved to missing directory.**

In [None]:
#@title
import re
import os, os.path
import glob
import shutil

rootpath="/content/drive/MyDrive/"
ds_name2=or_datasetbase
newspeakername2=or_speakername
flac = rootpath+ds_name2+'/wav48_silence_trimmed/'+newspeakername2+'/'
# wav = rootpath+ds_name2+'/wavs/'
wav = '/content/drive/MyDrive/wavs'
txt = rootpath+ds_name2+'/txt/'+newspeakername2+'/'
outfilepath = rootpath+ds_name2+'/txt/'+newspeakername2
mis_path = rootpath+ds_name2+'/missing/'+newspeakername2+'/'
backup_path = rootpath+ds_name2+'/badfiles/'+newspeakername2+'/'

if not os.path.exists(mis_path):
	os.makedirs(mis_path)

txtfiles = glob.glob(os.path.join(txt, '*.txt'))
flacfiles = glob.glob(os.path.join(flac, '*.flac'))

EMPTY_TRANSCRIPTS = []
for txts in txtfiles:
	basename_without_ext = os.path.splitext(os.path.basename(txts))[0]
	wav_file = basename_without_ext+".wav"
	txt_file = basename_without_ext+".txt"
	flac_file = basename_without_ext+"_mic1.flac"
	if not os.path.exists(str(flac + flac_file)):
		print("flac " + flac_file + " missing")
		if os.path.exists(txt+txt_file):
			print(str(txt+txt_file))
			shutil.move(txt+txt_file, mis_path+txt_file)

for flacs in flacfiles:
	flac_base = os.path.splitext(os.path.basename(flacs))[0]
	flac_base = flac_base[:-5]
	wav_file = flac_base+".wav"
	txt_file = flac_base+".txt"
	flac_file = flac_base+"_mic1.flac"
	if not os.path.exists(str(txt + txt_file)):
		print("text " + txt_file + " missing")
		if os.path.exists(flac+flac_file):
			print(str(flac+flac_file))
			shutil.move(flac+flac_file, mis_path+flac_file)

**Now the data is ready to be used**