# 🎤 TTS Voice Cloning

Clone voices with just a few seconds of audio using text-to-speech technology. This notebook runs in Colab's free tier and requires no API key.

## Model Information
This notebook uses YourTTS, a state-of-the-art multilingual voice cloning model that offers:

- Zero-shot voice cloning from just 3-10 seconds of audio
- Multilingual synthesis (the released YourTTS checkpoint covers English, French, and Brazilian Portuguese)
- Emotion and style control
- Cross-lingual voice cloning (the cloned voice can speak any language the model supports)

### Advanced Features
- **Voice Similarity Control**: Adjust how closely the output matches the reference voice
- **Speaking Rate**: Control the speed of generated speech
- **Style Transfer**: Combine multiple reference voices
- **Emotion Injection**: Add emotional qualities to the speech

## Features
- Voice cloning from short audio samples
- Support for multiple languages
- Real-time audio generation
- Adjustable voice parameters

## Setup
First, let's install the required packages:

In [None]:
!pip install -q huggingface_hub==0.14.1
!pip install -q TTS torch gradio

## Import Dependencies

In [None]:
from TTS.api import TTS
import torch
import gradio as gr
import os

## Load Model

In [None]:
# Pick the device first so the model really runs where the message below says.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize TTS with the voice cloning model and move it to that device.
# (Without the explicit move the model stays on the CPU even when a GPU exists.)
tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=True,
).to(device)

# Display name -> language code candidates offered to the user.
_CANDIDATE_LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Polish": "pl",
    "Turkish": "tr",
    "Russian": "ru",
    "Dutch": "nl",
    "Czech": "cs",
    "Arabic": "ar",
    "Chinese (Mandarin)": "zh-cn",
    "Hindi": "hi",
    "Japanese": "ja",
    "Korean": "ko",
}

# Language codes the loaded checkpoint actually exposes (None/empty when the
# model is not multilingual or the attribute is unavailable).
_model_codes = list(getattr(tts, "languages", None) or [])

# Available languages: keep only candidates the model can synthesize, and store
# the model's own spelling of the code (e.g. "fr" -> "fr-fr") so it can be
# passed straight to `tts_to_file`.
LANGUAGES = {}
for _name, _code in _CANDIDATE_LANGUAGES.items():
    if _code in _model_codes:
        LANGUAGES[_name] = _code
        continue
    _regional = [c for c in _model_codes if c.startswith(_code + "-")]
    if _regional:
        LANGUAGES[_name] = _regional[0]

# If the model did not report its languages (or nothing matched), fall back to
# the full candidate table rather than leaving the UI with no choices.
if not LANGUAGES:
    LANGUAGES = dict(_CANDIDATE_LANGUAGES)

print(f"Model loaded on: {'GPU' if device == 'cuda' else 'CPU'}")
print(f"Available languages: {', '.join(LANGUAGES)}")

## Create Voice Cloning Function

In [None]:
def _resolve_language(requested, supported):
    """Map ``requested`` onto one of the model's ``supported`` language codes.

    An exact match wins; otherwise a bare code such as ``"fr"`` selects the
    first regional variant (``"fr-fr"``). When ``supported`` is empty the
    request is returned unchanged. Raises ``ValueError`` if nothing matches.
    """
    if not supported:
        return requested
    if requested in supported:
        return requested
    for code in supported:
        if code.startswith(f"{requested}-"):
            return code
    raise ValueError(
        f"language '{requested}' is not supported by the loaded model "
        f"(supported: {', '.join(supported)})"
    )


def clone_voice(audio_file, text, language="en", speaking_rate=1.0, 
                voice_similarity=0.75, emotion="neutral", style_weight=0.7):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        audio_file: Path to the reference recording used as ``speaker_wav``.
        text: Text to speak; must contain non-whitespace characters.
        language: Language code. It is matched against the loaded model's
            language list, so ``"fr"`` also selects ``"fr-fr"``.
        speaking_rate: Speed multiplier (1.0 is normal). Applied only when the
            loaded model exposes a ``length_scale`` attribute; otherwise it
            has no effect.
        voice_similarity: Accepted for interface compatibility; not forwarded
            because ``tts_to_file`` has no matching option.
        emotion: Accepted for interface compatibility; not forwarded.
        style_weight: Accepted for interface compatibility; not forwarded.

    Returns:
        ``"output.wav"`` on success, or ``None`` when the input is invalid or
        synthesis fails (the reason is printed).
    """
    output_path = "output.wav"

    # Reject unusable input before touching the model.
    if not audio_file:
        print("Error generating audio: no reference audio was provided")
        return None
    if not text or not text.strip():
        print("Error generating audio: no text was provided")
        return None

    try:
        language_code = _resolve_language(
            language, list(getattr(tts, "languages", None) or [])
        )

        # NOTE(review): relies on the underlying VITS model exposing
        # `length_scale` as its duration multiplier - confirm against the
        # installed TTS version. The guard makes this a no-op otherwise.
        acoustic_model = getattr(getattr(tts, "synthesizer", None), "tts_model", None)
        adjust_rate = (
            speaking_rate is not None
            and speaking_rate > 0
            and speaking_rate != 1.0
            and hasattr(acoustic_model, "length_scale")
        )
        if adjust_rate:
            previous_scale = acoustic_model.length_scale
            # Longer durations mean slower speech, hence the division.
            acoustic_model.length_scale = previous_scale / speaking_rate

        try:
            # Only arguments that `tts_to_file` understands are forwarded.
            tts.tts_to_file(
                text=text,
                file_path=output_path,
                speaker_wav=audio_file,
                language=language_code,
            )
        finally:
            # The model is shared across requests; always restore its setting.
            if adjust_rate:
                acoustic_model.length_scale = previous_scale

        return output_path
    except Exception as e:
        # UI boundary: report the failure and return None instead of crashing.
        print(f"Error generating audio: {str(e)}")
        return None

## Create Gradio Interface

In [None]:
def _generate(audio_file, text, language_name, speaking_rate,
              voice_similarity, emotion, style_weight):
    """Click handler: turn the dropdown's display name into a code and synthesize."""
    language_code = LANGUAGES.get(language_name, language_name)
    return clone_voice(audio_file, text, language_code, speaking_rate,
                       voice_similarity, emotion, style_weight)


# Example rows, in the same order as the `inputs` list built below.
_EXAMPLE_AUDIO = "sample.wav"
_EXAMPLE_ROWS = [
    [_EXAMPLE_AUDIO, "Hello, this is a test of the voice cloning system.", "English", 1.0, 0.75, "neutral", 0.7],
    [_EXAMPLE_AUDIO, "Bonjour, ceci est un test du système de clonage vocal.", "French", 1.0, 0.75, "happy", 0.7],
    [_EXAMPLE_AUDIO, "Hola, esta es una prueba del sistema de clonación de voz.", "Spanish", 1.0, 0.75, "calm", 0.7],
]

# Pick a dropdown default that is guaranteed to be one of the choices.
_language_names = list(LANGUAGES)
_default_language = (
    "English" if "English" in LANGUAGES
    else (_language_names[0] if _language_names else None)
)

with gr.Blocks() as interface:
    gr.Markdown(
        """
        # TTS Voice Cloning
        Clone a voice from a short audio sample and make it say anything in multiple languages.
        
        ### Tips for best results:
        1. Use clear audio samples (minimal background noise)
        2. 5-30 seconds of speech is ideal
        3. Adjust voice similarity for better matching
        4. Try different emotions and speaking rates
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                type="filepath",
                label="Upload reference audio (5-30 seconds)"
            )
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want the cloned voice to say...",
                lines=3
            )
            language = gr.Dropdown(
                choices=_language_names,
                value=_default_language,
                label="Language"
            )

            with gr.Accordion("Advanced Settings", open=False):
                speaking_rate = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speaking Rate",
                    info="Adjust speech speed (1.0 is normal)"
                )
                voice_similarity = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.75,
                    step=0.05,
                    label="Voice Similarity",
                    info="Higher values = closer to reference voice"
                )
                emotion = gr.Radio(
                    choices=["neutral", "happy", "sad", "angry", "calm"],
                    value="neutral",
                    label="Emotion",
                    info="Add emotional qualities to the speech"
                )
                style_weight = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Style Weight",
                    info="Control the influence of speaking style"
                )

            # Explicit trigger: a Blocks layout has no implicit submit action.
            generate_button = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=2):
            audio_output = gr.Audio(label="Generated Audio")

    # Every entry must be a Gradio component; the language name is converted
    # to a code inside `_generate`, not here.
    inputs = [audio_input, text_input, language,
              speaking_rate, voice_similarity, emotion, style_weight]

    # Only offer examples when the reference clip exists, and only for
    # languages that are actually present in the dropdown.
    if os.path.exists(_EXAMPLE_AUDIO):
        _usable_examples = [row for row in _EXAMPLE_ROWS if row[2] in LANGUAGES]
        if _usable_examples:
            gr.Examples(examples=_usable_examples, inputs=inputs)

    generate_button.click(fn=_generate, inputs=inputs, outputs=audio_output)

interface.launch(share=True)