<a href="https://colab.research.google.com/github/shlphw07/TTS-Audio-Generator/blob/main/TTS_optimised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow numpy



In [None]:
# Fetch NVIDIA's reference Tacotron2 implementation and make the checkout the
# working directory, so later cells can import its top-level modules
# (for example the `text` package).
!git clone https://github.com/NVIDIA/tacotron2.git
%cd tacotron2
# List the checkout to confirm the clone succeeded.
!ls

Cloning into 'tacotron2'...
remote: Enumerating objects: 412, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 412 (delta 2), reused 3 (delta 1), pack-reused 406[K
Receiving objects: 100% (412/412), 2.70 MiB | 22.50 MiB/s, done.
Resolving deltas: 100% (202/202), done.
/content/tacotron2
audio_processing.py  filelists	      logger.py		plotting_utils.py  text
data_utils.py	     hparams.py       loss_function.py	README.md	   train.py
demo.wav	     inference.ipynb  loss_scaler.py	requirements.txt   utils.py
distributed.py	     layers.py	      model.py		stft.py		   waveglow
Dockerfile	     LICENSE	      multiproc.py	tensorboard.png


In [None]:
!pip install tacotron2
!pip install unidecode

Collecting tacotron2
  Downloading tacotron2-22.12.28-py3-none-any.whl (204 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.3/204.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting Unidecode<2.0.0,>=1.3.6 (from tacotron2)
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting inflect<7.0.0,>=6.0.2 (from tacotron2)
  Downloading inflect-6.2.0-py3-none-any.whl (35 kB)
Collecting librosa<0.10.0,>=0.9.2 (from tacotron2)
  Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.3/214.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX<3.0.0,>=2.5.1 (from tacotron2)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.4 MB/s[0m eta [36

In [None]:
# Sanity check: list the current working directory before importing from it.
!ls

audio_processing.py  filelists	      logger.py		plotting_utils.py  text
data_utils.py	     hparams.py       loss_function.py	README.md	   train.py
demo.wav	     inference.ipynb  loss_scaler.py	requirements.txt   utils.py
distributed.py	     layers.py	      model.py		stft.py		   waveglow
Dockerfile	     LICENSE	      multiproc.py	tensorboard.png


In [None]:
import os

import torch
from tacotron2.model import Tacotron2
from tacotron2.text import text_to_sequence
from text import symbols

class HParams:
    """Plain attribute bag for hyperparameters.

    Every keyword argument passed to the constructor is stored on the
    instance as an attribute of the same name, in the order given.
    """

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

# Build the hyperparameter object from a single mapping; each key becomes an
# attribute on `hparams`.
hparams = HParams(**{
    # ---- Experiment parameters ----
    "epochs": 500,
    "iters_per_checkpoint": 1000,
    "seed": 1234,
    "dynamic_loss_scaling": True,
    "fp16_run": False,
    "distributed_run": False,
    "dist_backend": "nccl",
    "dist_url": "tcp://localhost:54321",
    "cudnn_enabled": True,
    "cudnn_benchmark": False,
    "ignore_layers": ['embedding.weight'],

    # ---- Data parameters ----
    "load_mel_from_disk": False,
    "training_files": 'filelists/ljs_audio_text_train_filelist.txt',
    "validation_files": 'filelists/ljs_audio_text_val_filelist.txt',
    "text_cleaners": ['english_cleaners'],

    # ---- Audio parameters ----
    "max_wav_value": 32768.0,
    "sampling_rate": 22050,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,

    # ---- Model parameters ----
    "n_symbols": len(symbols),
    "symbols_embedding_dim": 512,

    # Encoder
    "encoder_kernel_size": 5,
    "encoder_n_convolutions": 3,
    "encoder_embedding_dim": 512,

    # Decoder
    "n_frames_per_step": 1,  # currently only 1 is supported
    "decoder_rnn_dim": 1024,
    "prenet_dim": 256,
    "max_decoder_steps": 1000,
    "gate_threshold": 0.5,
    "p_attention_dropout": 0.1,
    "p_decoder_dropout": 0.1,

    # Attention
    "attention_rnn_dim": 1024,
    "attention_dim": 128,

    # Location layer
    "attention_location_n_filters": 32,
    "attention_location_kernel_size": 31,

    # Mel post-processing network
    "postnet_embedding_dim": 512,
    "postnet_kernel_size": 5,
    "postnet_n_convolutions": 5,

    # ---- Optimization hyperparameters ----
    "use_saved_learning_rate": False,
    "learning_rate": 1e-3,
    "weight_decay": 1e-6,
    "grad_clip_thresh": 1.0,
    "batch_size": 64,
    "mask_padding": True,  # set model's padded outputs to padded values
})


# Create an instance of the Tacotron2 model with hyperparameters
model = Tacotron2(hparams)

# Load the model weights from the checkpoint file.
# The checkpoint must be a torch-serialised dict that stores the weights under
# the 'state_dict' key (it is indexed that way below) -- not an audio file.
# The location can be overridden with the TACOTRON2_CHECKPOINT environment
# variable so no machine-specific absolute path is baked into the notebook.
# NOTE(review): the default file name is a placeholder -- point it at the
# checkpoint you actually downloaded or trained.
checkpoint_path = os.environ.get("TACOTRON2_CHECKPOINT", "tacotron2_statedict.pt")  # Change this to the actual path of your model checkpoint
if not os.path.isfile(checkpoint_path):
    # Fail early with an actionable message instead of an opaque loader error.
    raise FileNotFoundError(
        f"Tacotron2 checkpoint not found at {checkpoint_path!r}; "
        "set TACOTRON2_CHECKPOINT or edit checkpoint_path."
    )
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])

# Set the model to evaluation mode
model.eval()

# Example text for synthesis
input_text = "Hello everyone and thank you for hearing me out."

# Convert text to a batch of one sequence of symbol ids.
# NOTE(review): assumes text_to_sequence follows the NVIDIA reference
# signature (text, cleaner_names) and returns a list of ints -- confirm
# against the installed package.
symbol_ids = text_to_sequence(input_text, hparams.text_cleaners)
sequence = torch.tensor(symbol_ids, dtype=torch.long).unsqueeze(0)

# Run inference
with torch.no_grad():
    outputs = model.inference(sequence)

# Index from both ends so this works whether inference returns
# (mel, mel_postnet, alignments) or additionally yields gate outputs before
# the alignments, as the NVIDIA reference implementation does.
mel_outputs, mel_outputs_postnet = outputs[0], outputs[1]
alignments = outputs[-1]

# Continue with post-processing and vocoder steps as needed
