Cell 1: Install necessary libraries

In [12]:
# These commands are for installing required libraries if running the code on a local machine.
# These libraries include HuggingFace Transformers, Gradio, Timm (PyTorch image models), Inflect for handling singular/plural, 
# and Phonemizer for converting text to phonemes.

!pip install transformers  # For HuggingFace Transformers (Text-to-Speech model)
!pip install gradio        # For creating web-based interfaces (not essential in this example)
!pip install timm          # PyTorch image models, not required for TTS but used in advanced models
!pip install inflect        # For text processing (e.g., converting numbers to words)
!pip install phonemizer     # For converting text to phonemes (improves speech generation accuracy)


Collecting timm
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
Downloading timm-1.0.9-py3-none-any.whl (2.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.3/2.3 MB 10.9 MB/s eta 0:00:00
Installing collected packages: timm
Successfully installed timm-1.0.9
Collecting inflect
  Downloading inflect-7.3.1-py3-none-any.whl.metadata (21 kB)
Collecting typeguard>=4.0.1 (from inflect)
  Downloading typeguard-4.3.0-py3-none-any.whl.metadata (3.7 kB)
Downloading inflect-7.3.1-py3-none-any.whl (34 kB)
Downloading typeguard-4.3.0-py3-none-any.whl (35 kB)
Installing collected packages: typeguard, inflect
Successfully installed inflect-7.3.1 typeguard-4.3.0
Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.2.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-1.2.1-py3-none-any.

Cell 2: Install espeak-ng and py-espeak-ng on a Linux machine

In [14]:
# These are required if you want to use py-espeak-ng for TTS on Linux only.
# These commands install the `espeak-ng` library and the Python wrapper for it.

sudo apt-get update                   # Update the package lists
sudo apt-get install espeak-ng         # Install the espeak-ng package for speech synthesis
pip install py-espeak-ng               # Install the Python wrapper for espeak-ng


SyntaxError: invalid syntax (566090914.py, line 4)

Cell 3: Suppress warning messages and import the pipeline

In [None]:
# Reduce log noise from the Transformers library so it does not clutter the cell outputs.
from transformers.utils import logging
logging.set_verbosity_error()  # Only ERROR-level (and more severe) messages are shown; warnings and info are hidden

# Import the `pipeline` function from the HuggingFace Transformers library.
# It is used in a later cell to build the text-to-speech pipeline.
from transformers import pipeline


Cell 4: Initialize the text-to-speech pipeline

In [None]:
# Create a Text-to-Speech (TTS) pipeline using the pre-trained "vits-ljs" model
# from Kakao Enterprise.
# A local copy in "./models/kakao-enterprise/vits-ljs" is used when that directory exists.
# Otherwise the same model is requested by its Hugging Face Hub id (downloaded on first use),
# so the cell no longer fails on a machine that lacks the local folder.
import os

local_model_dir = "./models/kakao-enterprise/vits-ljs"
hub_model_id = "kakao-enterprise/vits-ljs"
model_source = local_model_dir if os.path.isdir(local_model_dir) else hub_model_id

narrator = pipeline("text-to-speech", model=model_source)


Cell 5: Define the text to be narrated

In [None]:
# Sample passage that the TTS model will read aloud. It describes a tool, built by
# AI researchers, that measures the carbon emitted by cloud servers during model training.
# The adjacent literals are concatenated into one long line framed by a leading and a
# trailing newline.
text = (
    "\n"
    "Researchers at the Allen Institute for AI, "
    "HuggingFace, Microsoft, the University of Washington, "
    "Carnegie Mellon University, and the Hebrew University of "
    "Jerusalem developed a tool that measures atmospheric "
    "carbon emitted by cloud servers while training machine "
    "learning models. After a model’s size, the biggest variables "
    "were the server’s location and time of day it was active."
    "\n"
)


Cell 6: Generate speech from the text

In [None]:
# Run the TTS pipeline (`narrator`) on the sample `text` to synthesize speech.
# The result is kept in `narrated_text`; the playback cell reads its "audio" and
# "sampling_rate" entries.
narrated_text = narrator(text)


Cell 7: Play the generated audio

In [None]:
# IPython's `Audio` widget renders an in-notebook player for the synthesized speech.
from IPython.display import Audio as IPythonAudio

# The pipeline result is indexed like a dictionary: take the first clip stored under
# "audio" and the playback rate stored under "sampling_rate".
first_clip = narrated_text["audio"][0]
clip_rate = narrated_text["sampling_rate"]

# Leaving the widget as the cell's last expression makes the notebook display the player.
IPythonAudio(first_clip, rate=clip_rate)
