In [4]:
!pip install gTTS pyttsx3 torch torchaudio transformers soundfile



In [5]:
from IPython import get_ipython
from IPython.display import display, Audio
from gtts import gTTS
import io
import pyttsx3
import torch
import torchaudio
from transformers import pipeline
import soundfile as sf

# %%
# It's generally better to have installation steps separate and only run them once.
# If you need to ensure libraries are installed, you can run this cell before the rest.
# !pip install gTTS pyttsx3 torch torchaudio transformers soundfile

# %%
# Define the text_to_speech_gtts function using gTTS
def text_to_speech_gtts(text):
    """
    Converts text to speech using gTTS and returns an IPython.display.Audio object.
    """
    tts = gTTS(text=text, lang='te', slow=True )
    # Save the audio to a BytesIO object instead of a file
    fp = io.BytesIO()
    tts.write_to_fp(fp)
    fp.seek(0) # Rewind the BytesIO object to the beginning
    # Create an Audio object. gTTS typically outputs at 24000 Hz.
    return Audio(fp.read(), rate=24000)

# Simple conversion
audio = text_to_speech_gtts("హలో వరల్డ్! ఇది శ్రీకాంత్ మద్దుల.")
display(audio)

In [9]:
from IPython import get_ipython
from IPython.display import display, Audio
# from gtts import gTTS # Not used for emotional TTS
# import pyttsx3 # Not typically used for emotional TTS
import io
import torch
import torchaudio
from transformers import VitsModel, AutoTokenizer # Example using VitsModel, check model requirements
import soundfile as sf
import numpy as np
import os # Import os for accessing environment variables

# %%
# Ensure necessary libraries are installed
# !pip install torch torchaudio transformers soundfile numpy

# %%
def text_to_speech_emotional_transformers(text, emotion="neutral", hf_token=None):
    """
    Converts text to speech using a transformers model that supports emotions.
    Requires a model trained for expressive speech.
    NOTE: Finding a suitable model, especially for specific languages and emotions,
          can be challenging and requires exploring the Hugging Face Hub.
          The model and parameters below are illustrative.

    Args:
        text (str): The text to synthesize.
        emotion (str): The desired emotion (model-dependent).
        hf_token (str, optional): Your Hugging Face authentication token, if needed
                                  for private models or rate limits. Defaults to None.
    """
    # --- Model Selection and Loading ---
    # You need to find a model on Hugging Face Hub that supports emotional TTS.
    # The model's documentation will explain how to control emotion.
    # THIS IS A PLACEHOLDER. YOU MUST REPLACE THIS WITH A REAL MODEL NAME.
    # Search the Hugging Face Hub (https://huggingface.co/models?library=transformers&sort=downloads&search=text%20to%20speech)
    # for models, specifically looking for models trained for expressive or multi-speaker speech
    # that might offer emotional control or distinct speaker styles.
    # Example of a potential model (might not support specific emotions or Telugu):
    # model_name = "facebook/mms-tts-tel" # MMS model for Telugu (may not support emotions)
    # model_name = "espnet/tts_vits_ljspeech" # Example VITS model (often trained on specific datasets like LJSpeech)

    #model_name = "Telugu-LLM-Labs/telugu_alpaca_yahma_cleaned_filtered_romanized" # Replacing the placeholder with an actual (though perhaps not emotional) Telugu model
    model_name = "facebook/mms-tts-tel"
    try:
        # Load the model and tokenizer, passing the token if provided
        # The exact classes (VitsModel, AutoTokenizer) might vary based on the model architecture
        # For facebook/mms-tts models, VitsModel and AutoTokenizer are often used.
        # If the model is not a VitsModel, this line will also fail.
        model = VitsModel.from_pretrained(model_name, token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

        # Ensure the model is in evaluation mode
        model.eval()

        print(f"Successfully loaded model: {model_name}")

    except Exception as e:
        print(f"Error loading transformers model '{model_name}': {e}")
        print("Please ensure you have installed the correct libraries and the model name is valid.")
        print("If it's a private model, ensure you have a valid token and pass it.")
        print("Also, double-check if this model is actually a Text-to-Speech model (like VitsModel) and not just a language model.")
        print("Check the model page on Hugging Face Hub for details.")
        return None

    # --- Emotional Control (Model-Specific) ---
    # This part is HIGHLY dependent on the specific model you are using.
    # Different models use different methods for controlling emotion:
    # - Some might use a simple 'emotion' parameter (less common)
    # - Some might require a speaker embedding or a reference audio sample
    # - Some might use style tokens or latent variables
    # - Some might simply not support explicit emotional control.

    # The following is based on how facebook/mms-tts models *might* be used,
    # but specific emotional control is NOT guaranteed. These models are often
    # trained on large datasets and provide a standard voice.
    # Emotional control might require a different, more specialized model.

    inputs = tokenizer(text, return_tensors="pt")

    # The facebook/mms-tts models usually don't have a direct 'emotion' parameter.
    # If you used a multi-speaker model trained with emotionally labeled data,
    # you might select a speaker ID associated with 'angry'.

    # Example using a hypothetical speaker ID for "angry" (unlikely to work with facebook/mms-tts)
    # speaker_id_for_angry = None # You would need to know the speaker IDs from the model's config or documentation
    # if emotion == "angry" and model_name == "YourMultiSpeakerEmotionalModel": # Replace with your model name
    #      speaker_id_for_angry = 5 # HYPOTHETICAL speaker ID for an angry speaker

    try:
         with torch.no_grad():
              # Pass speaker_id if you are using a multi-speaker model and know the ID
              # If the model doesn't support speaker_id, remove this parameter
              # synthesis = model(**inputs, speaker_id=speaker_id_for_angry).waveform.squeeze(0) if speaker_id_for_angry is not None else model(**inputs).waveform.squeeze(0)
              synthesis = model(**inputs).waveform.squeeze(0) # Basic synthesis for models without explicit emotion/speaker control

         # Check if the model supports specific emotional control based on emotion parameter (unlikely for most)
         # This part is commented out as direct emotion parameter is rare
         # if 'emotion' in model.parameters and emotion != "neutral":
         #      print(f"Note: Model {model_name} likely does not support direct '{emotion}' parameter.")


    except Exception as e:
         print(f"Error during synthesis with model {model_name}: {e}")
         print("This could be due to incorrect input format or model not supporting the attempted parameters (like speaker_id).")
         return None

    # --- Convert and Return Audio ---
    if synthesis is None:
        return None

    # Convert tensor to numpy array and ensure correct data type (int16 for WAV)
    # If synthesis is float, convert to int16
    if synthesis.dtype == torch.float32:
         audio_np = (synthesis * 32767).cpu().numpy().astype(np.int16)
    else: # Assuming it's already in a suitable integer format
         audio_np = synthesis.cpu().numpy().astype(np.int16)


    # Save to a BytesIO object in WAV format
    audio_bytes = io.BytesIO()
    try:
        # Use the model's sampling rate
        sf.write(audio_bytes, audio_np, model.config.sampling_rate, format='WAV')
        audio_bytes.seek(0)
        # Create and return IPython.display.Audio object
        return Audio(audio_bytes.read(), rate=model.config.sampling_rate)
    except Exception as e:
        print(f"Error writing audio data to BytesIO: {e}")
        return None

# %%
# Example Usage:

# --- How to get and use your Hugging Face Token ---
# 1. Go to Hugging Face website (huggingface.co) and log in.
# 2. Click your profile picture -> 'Settings'.
# 3. Click 'Access Tokens' in the left-hand menu.
# 4. Click 'New token', give it a name (e.g., "Colab TTS"), and select the 'read' Role.
# 5. Click 'Generate a token'.
# 6. COPY the token string displayed. Keep this token secret!
#
# In Colab, you can store this token securely using Environment Variables:
# - Click the '🔑 Secrets' icon on the left sidebar in Colab.
# - Click '+ New secret'.
# - For Key, enter `HF_TOKEN` (or any name you prefer, but `HF_TOKEN` is conventional).
# - For Value, paste the token string you copied from Hugging Face.
# - Make sure the 'Notebook access' toggle is ON for this secret.
#
# The code below attempts to read the token from the environment variable 'HF_TOKEN'.
# Alternatively, you could paste the token string directly here, but this is less secure:
# my_hf_token = "paste_your_token_here" # NOT RECOMMENDED FOR SENSITIVE TOKENS

my_hf_token = os.getenv("HF_TOKEN") # Read token from Colab secrets (recommended)

if my_hf_token is None:
    print("Hugging Face token not found in environment variables. Model loading might fail for private models.")
    print("Please add your HF_TOKEN as a secret in Colab's left sidebar.")

# Attempt to synthesize with a hypothetical "angry" emotion
# Pass the token to the function
print("Attempting to synthesize with 'angry' emotion (requires a suitable model):")
# We are still passing emotion="angry", but the function's ability to use it
# depends *entirely* on the chosen model. With facebook/mms-tts-tel, it likely won't
# produce an angry voice, but rather the standard voice of that model.
audio_angry = text_to_speech_emotional_transformers("నేను కోపంగా ఉన్నాను!", emotion="angry", hf_token=my_hf_token)
if audio_angry:
    display(audio_angry)
else:
    print("Failed to synthesize angry speech.")


# Synthesize with default settings (if the model was loaded)
print("\nSynthesizing with default settings (if model loaded successfully):")
# Even if we pass emotion="neutral", if the model doesn't support emotional control,
# the output will be the standard voice.
audio_default = text_to_speech_emotional_transformers("నేను కోపంగా ఉన్నాను!", emotion="neutral", hf_token=my_hf_token)
if audio_default:
     display(audio_default)
else:
    print("Failed to synthesize default speech.")

Hugging Face token not found in environment variables. Model loading might fail for private models.
Please add your HF_TOKEN as a secret in Colab's left sidebar.
Attempting to synthesize with 'angry' emotion (requires a suitable model):
Error loading transformers model 'facebook/mms-tts-tel': There was a specific connection error when trying to load facebook/mms-tts-tel:
401 Client Error: Unauthorized for url: https://huggingface.co/facebook/mms-tts-tel/resolve/main/config.json (Request ID: Root=1-6830182c-721ba4cd6d285e645f00078a;161d1009-d9a2-4a54-91a0-4c6dd1058c21)

Invalid credentials in Authorization header
Please ensure you have installed the correct libraries and the model name is valid.
If it's a private model, ensure you have a valid token and pass it.
Also, double-check if this model is actually a Text-to-Speech model (like VitsModel) and not just a language model.
Check the model page on Hugging Face Hub for details.
Failed to synthesize angry speech.

Synthesizing with defa

In [8]:
# Attempt to synthesize with a hypothetical "angry" emotion
# This requires replacing "SomeOrg/some-emotional-tts-model" with a real model name
# that supports emotional synthesis and potentially adjusting how emotion is specified
# based on the model's documentation.
print("Attempting to synthesize with 'angry' emotion (requires a suitable model):")
# We are still passing emotion="angry", but the function's ability to use it
# depends *entirely* on the chosen model. With facebook/mms-tts-tel, it likely won't
# produce an angry voice, but rather the standard voice of that model.
my_hf_token = os.getenv("HF_TOKEN")  # Read token from Colab secrets (recommended)

audio_angry = text_to_speech_emotional_transformers("హలో వరల్డ్! నేను శ్రీకాంత్ మద్దులని.!", hf_token=my_hf_token)
if audio_angry:
    display(audio_angry)
else:
    print("Failed to synthesize angry speech.")


# Synthesize with default settings (if the model was loaded)
print("\nSynthesizing with default settings (if model loaded successfully):")
# Even if we pass emotion="neutral", if the model doesn't support emotional control,
# the output will be the standard voice.
audio_default = text_to_speech_emotional_transformers("హలో వరల్డ్! నేను శ్రీకాంత్ మద్దులని.!", hf_token=my_hf_token)
if audio_default:
     display(audio_default)
else:
    print("Failed to synthesize default speech.")

Attempting to synthesize with 'angry' emotion (requires a suitable model):
Error loading transformers model 'facebook/mms-tts-tel': There was a specific connection error when trying to load facebook/mms-tts-tel:
401 Client Error: Unauthorized for url: https://huggingface.co/facebook/mms-tts-tel/resolve/main/config.json (Request ID: Root=1-683017d5-30eaf5f0182541a245a56858;e201d88a-fc9e-4c4f-8d35-db8da7cb7544)

Invalid credentials in Authorization header
Please ensure you have installed the correct libraries and the model name is valid.
If it's a private model, ensure you have a valid token and pass it.
Also, double-check if this model is actually a Text-to-Speech model (like VitsModel) and not just a language model.
Check the model page on Hugging Face Hub for details.
Failed to synthesize angry speech.

Synthesizing with default settings (if model loaded successfully):
Error loading transformers model 'facebook/mms-tts-tel': There was a specific connection error when trying to load fa