This Jupyter notebook contains the preprocessing steps required to implement a text-to-speech GAN model.

# Import Modules

In [2]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize

# Import regular expressions module for processing text
import re

# Import the Natural Language Toolkit (nltk) for text processing
import nltk
import unicodedata
from nltk.corpus import cmudict

# Fetch every NLTK resource this notebook relies on, one download call per name.
for resource in ('omw-1.4', 'punkt', 'wordnet', 'stopwords',
                 'averaged_perceptron_tagger', 'cmudict'):
    nltk.download(resource)

# Import stopwords from nltk
from nltk.corpus import stopwords

# Import NumPy for numerical computing
import numpy as np
import string

import librosa
import os

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


# Download data from kaggle

In [3]:
#steps found in "https://www.kaggle.com/general/74235"
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download showmik50/ljspeech-sr16k-dataset

Downloading ljspeech-sr16k-dataset.zip to /content
100% 2.18G/2.18G [01:41<00:00, 25.1MB/s]
100% 2.18G/2.18G [01:41<00:00, 23.1MB/s]


In [4]:
!unzip -q "/content/ljspeech-sr16k-dataset.zip" -d "/content/"

# Load Text Data

In [5]:
# Read the transcript table; later cells refer to it as `df`.
df = pd.read_csv('metadata.csv')
# Print the first transcript (row label 0) as a quick sanity check of the load.
print(df.loc[0, 'sentence'])

Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition


# Clean text data

In [7]:
def preprocess_text(text):
    """Normalise one raw transcript.

    Every character in ``string.punctuation`` is deleted, the text is
    lower-cased and tokenised with ``word_tokenize``; the tokens are then
    joined with single spaces and a final period is appended.

    Args:
        text: The raw sentence as a string.

    Returns:
        The cleaned sentence, always ending in ".".
    """
    # Translation table that maps each punctuation character to "deleted".
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = text.translate(punctuation_remover).lower()

    # Tokenise, then rebuild the sentence with single-space separators.
    words = word_tokenize(lowered)
    return "{}.".format(" ".join(words))

# Clean every entry of the "sentence" column and keep the result in a new column.
df["preprocessed_sentences"] = df["sentence"].map(preprocess_text)
# Show the first cleaned sentence for comparison with the raw one.
print(df.loc[0, 'preprocessed_sentences'])

printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the exhibition.


# Represent text by phonemes

In [9]:
# Load the CMU Pronouncing Dictionary (via NLTK's `cmudict` corpus) into a dict.
# Downstream code looks words up in this mapping and uses the first listed
# pronunciation of each word.
phonetics = cmudict.dict()

def text_to_phonemes(text, pronunciations=None):
  """Convert a sentence into a flat list of phonemes.

  The original loop iterated over ``text`` directly; for a sentence string
  that visits single characters rather than words. The sentence is now split
  on whitespace first, and punctuation attached to a word (such as a trailing
  ".") is stripped before the lookup.

  Args:
    text: A sentence string, or an already tokenised iterable of words.
    pronunciations: Optional mapping ``word -> list of pronunciations``
      (each pronunciation being a list of phoneme strings). Defaults to the
      module-level ``phonetics`` dictionary.

  Returns:
    A list with the phonemes of the first pronunciation of every word found
    in the dictionary, in sentence order. Words that are not in the
    dictionary are skipped.
  """
  if pronunciations is None:
    pronunciations = phonetics

  # A plain string must be split into words; other iterables are used as given.
  words = text.split() if isinstance(text, str) else text

  phonemes = []
  for word in words:
    # Lower-case and drop leading/trailing punctuation so "exhibition." still matches.
    key = word.lower().strip(string.punctuation)
    entries = pronunciations.get(key)
    if entries:
      # Use the first listed pronunciation of the word.
      phonemes.extend(entries[0])
  return phonemes

# Convert phonemes to a numerical sequence

In [10]:
def phonemes_to_sequence(phonemes, vocabulary=None):
    """Encode a list of phonemes as a list of integer ids.

    Ids are assigned by enumerating the *sorted* set of distinct phonemes in
    the vocabulary. Sorting makes the mapping reproducible between kernel
    sessions; enumerating a bare ``set`` of strings does not guarantee that.

    Args:
        phonemes: Iterable of phoneme strings to encode.
        vocabulary: Optional iterable containing every phoneme that may occur.
            Defaults to the module-level ``all_pronounciations`` list.

    Returns:
        A list of integers, one per input phoneme.

    Raises:
        KeyError: If a phoneme is not part of the vocabulary.
    """
    if vocabulary is None:
        # Fall back to the corpus-wide phoneme list built elsewhere in the notebook.
        vocabulary = all_pronounciations

    # Map each distinct phoneme to a stable integer id.
    phoneme_to_int = {phoneme: index for index, phoneme in enumerate(sorted(set(vocabulary)))}

    # Convert each phoneme to its corresponding integer
    return [phoneme_to_int[phoneme] for phoneme in phonemes]

In [None]:
# Convert every cleaned sentence into its list of phonemes.
word_pronounciation = df["preprocessed_sentences"].apply(text_to_phonemes)

# Flatten all per-sentence phoneme lists into one corpus-wide list.
# A new list is built on purpose: binding the name to the first row's list and
# extending it in place would overwrite that row with the phonemes of the
# whole corpus.
all_pronounciations = [
    phoneme
    for sentence_phonemes in word_pronounciation
    for phoneme in sentence_phonemes
]

# Encode each sentence's phonemes as integers.
numerical_representation = word_pronounciation.apply(phonemes_to_sequence)

## Audio

In [None]:
# Define Sampling Rate
# NOTE(review): the dataset name suggests 16 kHz audio; with sr=22050 librosa
# resamples every clip on load - confirm this is intended.
sr = 22050
# Define number of FFT points
n_fft = 2048
# Define number of samples between frames
hop_length = 512
# Define number of Mel frequency bins
n_mels = 128
# Directory that holds the extracted audio clips
wav_dir = '/content/wavs'

# Load audio files and transform them into normalized mel-spectrograms
mel_spec_norm_all = []
# Sort the listing so the spectrogram order is deterministic, and skip
# anything that is not a regular file (e.g. sub-directories).
all_files = sorted(
    name for name in os.listdir(wav_dir)
    if os.path.isfile(os.path.join(wav_dir, name))
)
for i in all_files:
  # Build the path with a separator between the directory and the file name.
  audio, _ = librosa.load(os.path.join(wav_dir, i), sr=sr)
  # Extract the mel-spectrogram
  mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmax=sr/2)
  # Convert the mel-spectrogram to decibels
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
  # Normalize between 0 and 1; a constant spectrogram would otherwise divide by zero.
  db_min = mel_spec_db.min()
  db_range = mel_spec_db.max() - db_min
  if db_range > 0:
    mel_spec_norm = (mel_spec_db - db_min) / db_range
  else:
    mel_spec_norm = np.zeros_like(mel_spec_db)
  # Keep the result; all spectrograms are held in memory in this list.
  mel_spec_norm_all.append(mel_spec_norm)
