# DL Based Emotional Text to Speech

In this demo, we provide an interface to generate emotional speech from user inputs for both the emotional label and the text.

The models that are trained are [Tacotron](https://github.com/Emotional-Text-to-Speech/tacotron_pytorch) and [DC-TTS](https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts).

Further information about our approaches and *exactly how* we developed this demo can be found [here](https://github.com/Emotional-Text-to-Speech/dl-for-emo-tts).

---
---


## Download the required code and install the dependencies

- Make sure you have clicked on ```Open in Playground``` to be able to run the cells. Set your runtime to ```GPU```. This can be done with the following steps:
  - Click on ```Runtime``` on the menubar above
  - Select ```Change runtime type```
  - Select ```GPU``` from the ```Hardware accelerator``` dropdown and save.
- Run the cell below. It will automatically create the required directory structure. In order to run the cell, click on the **arrow** that is on the left column of the cell (hover over the ```[]``` symbol). Optionally, you can also press ```Shift + Enter ```




In [None]:
# ! git clone https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts
# ! git clone --recursive https://github.com/Emotional-Text-to-Speech/tacotron_pytorch.git
# ! cd "tacotron_pytorch/" && pip install -e .
# ! pip install unidecode
# ! pip install gdown
# ! mkdir trained_models

# import gdown
# url = 'https://drive.google.com/uc?id=1QDqQ28-9HKwbk-tvbuxdVgZWqP4esAWi'
# output = 'trained_models/angry_dctts.pth'
# gdown.download(url, output, quiet=False)
# url = 'https://drive.google.com/file/d/1m8Fa_Vsxio21U75V63KPUNVEKM6lUSAX/view?usp=sharing'
# output = 'trained_models/neutral_dctts.pth'
# gdown.download(url, output, quiet=False)
# url = 'https://drive.google.com/file/d/12kwVgdKlcwQJpM7vahF80eVu62Trqrda/view?usp=sharing'
# output = 'trained_models/ssrn.pth'
# gdown.download(url, output, quiet=False)
# url = 'https://drive.google.com/file/d/1mZSD-xJ4aqmTYBMgEzyGCZpEJ24hL4Zm/view?usp=sharing'
# output = 'trained_models/disgust_tacotron.pth'
# gdown.download(url, output, quiet=False)
# url = 'https://drive.google.com/file/d/10LUfP-HCxavX94t7pgF-OInrx2WZm97q/view?usp=sharing'
# output = 'trained_models/amused_tacotron.pth'
# gdown.download(url, output, quiet=False)
# url = 'https://drive.google.com/file/d/1s38HkH0rXBnJJY3CT8SFw64xP7yCMJwv/view?usp=sharing'
# output = 'trained_models/sleepiness_tacotron.pth'
# gdown.download(url, output, quiet=False)



## Set up the required code

- Run the cell below. It imports the required modules, instantiates the models and defines the helper functions used by the demo. In order to run the cell, click on the **arrow** in the left column of the cell (hover over the ```[]``` symbol). Optionally, you can also press ```Shift + Enter```.

In [None]:
# %tensorflow_version 1.x
# IPython magic: brings the numpy/matplotlib names (including `rcParams`, used
# on the next line) into the notebook namespace and renders figures inline.
%pylab inline
# Default (width, height) for every matplotlib figure created afterwards.
rcParams["figure.figsize"] = (10,5)

import os
import sys
import numpy as np
# Put the two cloned repositories on the module search path. The bare imports
# further down (text2mel, ssrn, audio, utils, text, synthesis, ...) are
# presumably resolved through these entries -- verify against the repo layout.
sys.path.append('pytorch-dc-tts/')
sys.path.append('pytorch-dc-tts/models')
sys.path.append("tacotron_pytorch/")
sys.path.append("tacotron_pytorch/lib/tacotron")

# For the DC-TTS
import torch
from text2mel import Text2Mel
from ssrn import SSRN
from audio import save_to_wav, spectrogram2wav
from utils import get_last_checkpoint_file_name, load_checkpoint_test, save_to_png, load_checkpoint
from datasets.emovdb import vocab, get_test_data

# For the Tacotron
from text import text_to_sequence, symbols
# from util import audio

from tacotron_pytorch import Tacotron
from synthesis import tts as _tts

# For Audio/Display purposes
import matplotlib.pyplot as plt
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import display
# from google.colab import widgets
# from google.colab import output
import warnings
# Silence every warning so that library noise does not clutter the cell output.
warnings.filterwarnings('ignore')


# The notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)
# DC-TTS text-to-mel network in eval mode. No weights are loaded here; the demo
# cell loads an emotion-specific checkpoint into it before synthesis.
text2mel = Text2Mel(vocab).eval()

# DC-TTS SSRN network in eval mode, with its weights read from the checkpoint.
# NOTE(review): the third argument is presumably the optimizer slot, unused
# for inference -- confirm against utils.load_checkpoint.
ssrn = SSRN().eval()
load_checkpoint('trained_models/ssrn.pth', ssrn, None)

# Tacotron network. Only the architecture is built here; the demo cell loads
# the state dict of the selected emotion into it.
model = Tacotron(n_vocab=len(symbols),
                 embedding_dim=256,
                 mel_dim=80,
                 linear_dim=1025,
                 r=5,
                 padding_idx=None,
                 use_memory_mask=False,
                 )

def visualize(alignment, spectrogram, Emotion):
    """Plot the attention alignment and the spectrogram of one utterance.

    For the emotions 'Disgust', 'Amused' and 'Sleepiness' both plots are
    stacked in a single figure and the spectrogram is transposed before being
    drawn. For any other emotion each plot gets its own figure and the
    spectrogram is drawn as given.

    Args:
        alignment: 2-D attention matrix; it is transposed before plotting.
        spectrogram: 2-D spectrogram handed to ``librosa.display.specshow``.
        Emotion: emotion label that selects the layout described above.
    """
    label_fontsize = 16
    stacked_layout = Emotion in ('Disgust', 'Amused', 'Sleepiness')

    # Alignment plot: top half of a shared figure, or a figure of its own.
    plt.figure(figsize=(10, 6))
    if stacked_layout:
        plt.subplot(2, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)

    # Spectrogram plot: bottom half of the shared figure, or a second figure.
    if stacked_layout:
        plt.subplot(2, 1, 2)
        spec_to_show = spectrogram.T
    else:
        plt.figure(figsize=(10, 6))
        spec_to_show = spectrogram
    librosa.display.specshow(spec_to_show, sr=fs, hop_length=hop_length, x_axis="time", y_axis="linear")
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)

def tts_dctts(text2mel, ssrn, text, max_steps=210):
    """Synthesize ``text`` with the DC-TTS pipeline (Text2Mel, then SSRN).

    The mel spectrogram is generated step by step: on every iteration
    ``text2mel`` is called with the encoded text and the frames produced so
    far. Decoding stops as soon as the input position with the highest
    attention weight at the last decoder step holds the ``'E'`` (end of
    sentence) symbol, or after ``max_steps`` iterations.

    Args:
        text2mel: Text2Mel network, called as
            ``text2mel(L, Y, monotonic_attention=True)``.
        ssrn: SSRN network, called as ``ssrn(Y)``.
        text: sentence to synthesize.
        max_steps: upper bound on the number of decoding iterations. Defaults
            to 210, the value that used to be hard-coded.

    Returns:
        A ``(waveform, attention, mel)`` tuple: the result of
        ``spectrogram2wav`` on the SSRN output, and the first batch entry of
        the attention matrix and of the mel spectrogram as numpy arrays.

    Raises:
        ValueError: if ``max_steps`` is smaller than 1 (no attention matrix
            would be produced).
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1, got %r" % (max_steps,))

    sentences = [text]

    max_N = len(text)
    L = torch.from_numpy(get_test_data(sentences, max_N))
    # Single all-zero frame that is prepended to the generated frames on every
    # step (80 rows -- presumably the number of mel channels; confirm).
    zeros = torch.from_numpy(np.zeros((1, 80, 1), np.float32))
    Y = zeros
    A = None
    eos_index = vocab.index('E')  # loop invariant, looked up once

    for _ in range(max_steps):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        # Input position attended most strongly at the latest decoder step.
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == eos_index:  # EOS
            break

    _, Z = ssrn(Y)
    Y = Y.cpu().detach().numpy()
    A = A.cpu().detach().numpy()
    Z = Z.cpu().detach().numpy()

    return spectrogram2wav(Z[0, :, :].T), A[0, :, :], Y[0, :, :]


def tts_tacotron(model, text):
    """Synthesize ``text`` with the given Tacotron ``model``.

    Delegates to the ``tts`` function imported from ``synthesis`` as ``_tts``.

    Returns:
        The ``(waveform, alignment, spectrogram)`` triple produced by ``_tts``.
    """
    audio, attention, spec = _tts(model, text)
    return audio, attention, spec

def present(waveform, Emotion, figures=False):
    """Play the synthesized audio and, optionally, plot its figures.

    Args:
        waveform: audio samples handed to ``IPython.display.Audio``.
        Emotion: emotion label, forwarded to ``visualize``.
        figures: ``(alignment, spectrogram)`` pair to plot. Any falsy value
            (``False``, ``None``, an empty tuple) skips the plots.
    """
    # The former `figures != False` test was true for None and for an empty
    # tuple, which then crashed on `figures[0]`; a truthiness test covers both.
    if figures:
        visualize(figures[0], figures[1], Emotion)
    IPython.display.display(Audio(waveform, rate=fs))


# Sample rate: passed as `sr` to librosa's specshow and as `rate` to the
# IPython Audio player.
fs = 20000 #20000
# Hop length passed to librosa's specshow when plotting spectrograms.
hop_length = 250
# Upper bound on the number of decoder steps of the Tacotron model.
model.decoder.max_decoder_steps = 200

## Run the Demo

- Select an ```Emotion``` from the dropdown and enter the ```Text``` that you want to be generated.
- Run the cell below. It will synthesize speech for the selected emotion and text. In order to run the cell, click on the **arrow** in the left column of the cell (hover over the ```[]``` symbol). Optionally, you can also press ```Shift + Enter```.

**Play the speech with the generated audio player and view the required plots by clicking on their respective tabs!**




In [None]:
import re
from collections import Counter

def clean_text(text):
    """Normalize ``text`` and drop stop words and repeated words.

    The text is stripped of every character that is neither a word character
    nor whitespace (underscores therefore survive), lower-cased and split on
    whitespace. Stop words are removed, and of every remaining word only the
    first occurrence is kept, so the original word order is preserved.

    Args:
        text: the raw input string.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    # Define the set of stop words
    stop_words = {"a", "an", "the", "and", "or", "but", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "can", "could", "may", "might", "must", "ought"}

    # Remove special characters and punctuation, then lowercase for consistency
    normalized = re.sub(r'[^\w\s]', '', text).lower()

    # Keep the first occurrence of each non-stop word. The earlier
    # `Counter(...) == 1` test removed *every* occurrence of a repeated word
    # instead of de-duplicating it.
    seen = set()
    cleaned_words = []
    for word in normalized.split():
        if word in stop_words or word in seen:
            continue
        seen.add(word)
        cleaned_words.append(word)

    # Join words back to a single string
    return ' '.join(cleaned_words)

In [None]:
#@title Select the emotion and type the text

%pylab inline

Emotion = "Sleepiness" #@param ["Neutral", "Angry", "Disgust", "Sleepiness", "Amused"]
Text = input()#@param {type:"string"}
clean_text(Text)

    
wav, align, mel = None, None, None

if Emotion == "Neutral":
  load_checkpoint('trained_models/'+Emotion.lower()+'_dctts.pth', text2mel, None)
  wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion == "Angry":
  load_checkpoint_test('trained_models/'+Emotion.lower()+'_dctts.pth', text2mel, None)
  wav, align, mel = tts_dctts(text2mel, ssrn, Text)
  # wav = wav.T
elif Emotion == "Disgust" or Emotion == "Amused" or Emotion == "Sleepiness":
  checkpoint = torch.load('trained_models/'+Emotion.lower()+'_tacotron.pth', map_location=torch.device('cpu'))
  model.load_state_dict(checkpoint["state_dict"])
  wav, align, mel = tts_tacotron(model, Text)

present(wav, Emotion, (align,mel))

