In [1]:
import os
import librosa
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
import keras
import tensorflow as tf

In [2]:
# Load the dataset manifest CSV; later cells read its 'audio path' and 'prompt' columns.
df = pd.read_csv('dataset/dataset_paths.csv')

In [3]:
def load_audio(audio_path):
    """Load an audio file with librosa, reporting (not raising) failures.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        tuple: ``(audio, sr)`` exactly as returned by ``librosa.load`` on
        success, or ``(None, None)`` when the file could not be loaded.
        A 2-tuple is returned in both cases so that callers can always
        unpack the result with ``audio, sr = load_audio(path)``.
    """
    try:
        audio, sr = librosa.load(audio_path)
    except Exception as e:
        # Best-effort loading: log the problem and signal failure to the
        # caller instead of aborting a long batch run.
        print(f"Error in loading audio file {audio_path}: {str(e)}")
        return None, None
    return audio, sr

In [4]:
def preprocess_audio(audio, sr, n_mfcc=13):
    """Compute MFCC features for an audio signal.

    Args:
        audio: Audio samples as returned by ``load_audio``, or ``None`` if
            loading failed.
        sr: Sampling rate of ``audio``.
        n_mfcc: Number of MFCC coefficients to compute (default 13).

    Returns:
        The result of ``librosa.feature.mfcc`` for the signal, or ``None``
        when ``audio`` is ``None`` so callers can skip unreadable files.
    """
    # A failed load is propagated as None instead of crashing inside librosa.
    if audio is None:
        return None
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

In [5]:
# Sanity check: load the first file listed in the manifest's 'audio path' column.
audio, sr = load_audio(df['audio path'][0])

In [6]:
# Sanity check: compute MFCC features for the sample clip loaded above.
mfcc = preprocess_audio(audio, sr = sr)

In [7]:
# Function to clean and preprocess text transcriptions (you can add more text cleaning steps)
def clean_and_preprocess_text(text):
    """Normalize a transcription to lowercase letters and single spaces.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is dropped, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace removed.

    Word boundaries are preserved as plain spaces. Substituting a
    ``<SPACE>`` marker before filtering would have the marker itself removed
    by the character filter, gluing all words together.

    Args:
        text (str): Raw transcription.

    Returns:
        str: Cleaned transcription containing only ``a``-``z`` and ``' '``.
    """
    lowered = text.lower()
    # Keep whitespace for now so word boundaries survive the filter.
    letters_and_whitespace = re.sub(r'[^a-z\s]', '', lowered)
    # split()/join collapses any whitespace run (including gaps left by
    # removed punctuation) to one space and trims both ends.
    return ' '.join(letters_and_whitespace.split())

In [8]:
# Model vocabulary: the 26 lowercase ASCII letters followed by the space character.
characters = list("abcdefghijklmnopqrstuvwxyz ")

# Forward lookup (character -> integer id); the empty string is the out-of-vocabulary token.
char_to_num = keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Inverse lookup (integer id -> character) built from the forward layer's vocabulary.
num_to_char = keras.layers.StringLookup(
    invert=True,
    oov_token="",
    vocabulary=char_to_num.get_vocabulary(),
)

# Report the resulting vocabulary and its size.
print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size ={char_to_num.vocabulary_size()})"
)

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' '] (size =28)


In [9]:
# Data preprocessing pipeline
def data_preprocessing_pipeline(df):
    """Convert every (audio, transcription) row of ``df`` into saved arrays.

    For each row, the audio file is loaded and turned into MFCC features and
    the transcription is cleaned and encoded as integer character ids. Both
    are written as ``<stem>.npy`` under ``preprocessed/audio`` and
    ``preprocessed/text`` respectively, where ``<stem>`` is the audio file
    name up to its first dot.

    Rows whose audio cannot be loaded or featurized are skipped.

    Args:
        df: DataFrame with an ``'audio path'`` column (audio file paths) and
            a ``'prompt'`` column (transcription text).
    """
    audio_dir = os.path.join('preprocessed', 'audio')
    text_dir = os.path.join('preprocessed', 'text')
    # np.save does not create missing directories, so make sure they exist.
    os.makedirs(audio_dir, exist_ok=True)
    os.makedirs(text_dir, exist_ok=True)

    rows = zip(df['audio path'], df['prompt'])
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for audio_path, transcription in tqdm(rows, total=len(df)):
        # Load audio; tolerate loaders that signal failure with either a bare
        # None or a (None, None) pair.
        loaded = load_audio(audio_path)
        if loaded is None:
            continue
        audio, sr = loaded
        if audio is None:
            continue

        mfccs = preprocess_audio(audio, sr)
        if mfccs is None:
            continue

        # Clean the transcription and split it into individual characters.
        cleaned_transcription = clean_and_preprocess_text(transcription)
        label = tf.strings.unicode_split(cleaned_transcription, input_encoding="UTF-8")

        # Convert text to integer sequence
        integer_sequence = char_to_num(label)

        # Derive the file stem independent of the path separator used in the
        # manifest (Windows '\\' or POSIX '/').
        base_name = audio_path.replace('\\', '/').split('/')[-1]
        file_name = base_name.split('.')[0]

        np.save(os.path.join(audio_dir, f"{file_name}.npy"), mfccs)
        np.save(os.path.join(text_dir, f"{file_name}.npy"), integer_sequence)

In [10]:
# Execute the data preprocessing pipeline over the manifest rows; it writes
# .npy files under preprocessed/audio and preprocessed/text.
data_preprocessing_pipeline(df)

0it [00:00, ?it/s]

7816it [05:57, 21.85it/s]


In [11]:
import pandas as pd

# Build an index CSV that pairs each preprocessed audio array with its text array.
audio_dir = os.path.join('preprocessed', 'audio')
text_dir = os.path.join('preprocessed', 'text')

afile_names = []
ffile_names = []

# The pipeline saves both arrays under the same file name, so pair them by
# name. Listing the two directories independently is unsafe: os.listdir
# returns entries in arbitrary order, and a missing file on either side would
# shift or break the pairing.
for file_name in sorted(os.listdir(audio_dir)):
    text_path = os.path.join(text_dir, file_name)
    if not os.path.isfile(text_path):
        # No matching transcription array; skip rather than mis-pair.
        continue
    afile_names.append(os.path.join(audio_dir, file_name))
    ffile_names.append(text_path)

# NOTE: this rebinds `df`, replacing the manifest loaded earlier in the notebook.
df = pd.DataFrame({"audio path":afile_names, "text path":ffile_names})
df.to_csv('preprocessed/file_paths.csv')