<a href="https://colab.research.google.com/github/rkvishnoi21/Aviator-Data-Analysis/blob/main/openvoice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages
!pip install torch==2.0.1 torchaudio==2.0.2
!pip install numpy scipy librosa soundfile matplotlib
!pip install gradio==3.50.2
!pip install pydub
!pip install indic-nlp-library
!pip install indic-transliteration
!pip install langid
!pip install phonemizer



In [2]:
# Create the directory structure
!mkdir -p /content/hindi_voice_clone
!mkdir -p /content/hindi_voice_clone/preprocessing
!mkdir -p /content/hindi_voice_clone/checkpoints

In [3]:
# Change to the main project directory
%cd /content/hindi_voice_clone

/content/hindi_voice_clone


In [4]:
# Clone OpenVoice repository
!git clone https://github.com/myshell-ai/openvoice

fatal: destination path 'openvoice' already exists and is not an empty directory.


In [5]:
# Download the necessary model files
!wget -O checkpoints/hubert_base.pt https://huggingface.co/myshell-ai/OpenVoice/resolve/main/hubert_base.pt

# Download base speakers
!wget -O checkpoints/base_speakers.zip https://huggingface.co/myshell-ai/OpenVoice/resolve/main/base_speakers.zip
!unzip -o checkpoints/base_speakers.zip -d checkpoints/

# Download converter
!wget -O checkpoints/converter.zip https://huggingface.co/myshell-ai/OpenVoice/resolve/main/converter.zip
!unzip -o checkpoints/converter.zip -d checkpoints/

--2025-02-28 08:56:58--  https://huggingface.co/myshell-ai/OpenVoice/resolve/main/hubert_base.pt
Resolving huggingface.co (huggingface.co)... 3.171.171.104, 3.171.171.128, 3.171.171.6, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.104|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-02-28 08:56:58 ERROR 404: Not Found.

--2025-02-28 08:56:58--  https://huggingface.co/myshell-ai/OpenVoice/resolve/main/base_speakers.zip
Resolving huggingface.co (huggingface.co)... 3.171.171.104, 3.171.171.128, 3.171.171.6, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.104|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-02-28 08:56:58 ERROR 404: Not Found.

Archive:  checkpoints/base_speakers.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the la

In [6]:
# Create the Hindi phonemizer
with open('/content/hindi_voice_clone/hindi_phonemizer.py', 'w') as f:
    f.write('''
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

class HindiPhonemizer:
    def __init__(self):
        # Hindi vowels and consonants
        self.vowels = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'ऋ']
        self.consonants = ['क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण',
                           'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह']

        # Mapping of Hindi characters to phonetic representation
        self.phoneme_map = {
            # Vowels
            'अ': 'a', 'आ': 'aa', 'इ': 'i', 'ई': 'ii', 'उ': 'u', 'ऊ': 'uu',
            'ए': 'e', 'ऐ': 'ai', 'ओ': 'o', 'औ': 'au', 'ऋ': 'ri',

            # Consonants
            'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'ङ': 'ng',
            'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ny',
            'ट': 't', 'ठ': 'th', 'ड': 'd', 'ढ': 'dh', 'ण': 'n',
            'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n',
            'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm',
            'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh',
            'ष': 'sh', 'स': 's', 'ह': 'h',

            # Matras (vowel diacritics)
            'ा': 'aa', 'ि': 'i', 'ी': 'ii', 'ु': 'u', 'ू': 'uu',
            'े': 'e', 'ै': 'ai', 'ो': 'o', 'ौ': 'au', 'ृ': 'ri',

            # Special characters
            'ं': 'n', 'ः': 'h', '्': ''
        }

    def _preprocess(self, text):
        # Clean the text
        text = text.strip()
        # Remove any non-Hindi characters
        text = re.sub(r'[^\u0900-\u097F\s]', '', text)
        return text

    def _get_phonemes(self, word):
        phonemes = []
        i = 0

        while i < len(word):
            char = word[i]
            # Handle special cases
            if char in self.phoneme_map:
                phonemes.append(self.phoneme_map[char])
            else:
                # If character not in map, keep it as is
                phonemes.append(char)
            i += 1

        return phonemes

    def text_to_phonemes(self, text):
        preprocessed_text = self._preprocess(text)
        words = preprocessed_text.split()

        all_phonemes = []
        for word in words:
            word_phonemes = self._get_phonemes(word)
            all_phonemes.extend(word_phonemes)
            all_phonemes.append(' ')

        # Join all phonemes with spaces
        return ' '.join(all_phonemes).strip()

    def __call__(self, text):
        return self.text_to_phonemes(text)
''')

In [7]:
# Create the audio preprocessing module
with open('/content/hindi_voice_clone/preprocessing/audio_utils.py', 'w') as f:
    f.write('''
import librosa
import numpy as np
import soundfile as sf
from pydub import AudioSegment
import os

class AudioPreprocessor:
    def __init__(self, target_sr=16000):
        self.target_sr = target_sr

    def load_audio(self, file_path):
        """Load audio file with librosa"""
        audio, sr = librosa.load(file_path, sr=None)
        return audio, sr

    def convert_sample_rate(self, audio, original_sr):
        """Convert audio to target sample rate"""
        if original_sr != self.target_sr:
            audio = librosa.resample(audio, orig_sr=original_sr, target_sr=self.target_sr)
        return audio

    def trim_silence(self, audio, top_db=20):
        """Trim silence from the beginning and end"""
        trimmed_audio, _ = librosa.effects.trim(audio, top_db=top_db)
        return trimmed_audio

    def normalize_volume(self, audio):
        """Normalize audio volume"""
        return librosa.util.normalize(audio)

    def split_audio(self, audio, segment_length=5.0):
        """Split audio into segments of specified length in seconds"""
        segment_samples = int(segment_length * self.target_sr)
        segments = []

        # If audio is shorter than segment_length, pad it
        if len(audio) < segment_samples:
            padded_audio = np.zeros(segment_samples)
            padded_audio[:len(audio)] = audio
            segments.append(padded_audio)
        else:
            # Split audio into segments
            for i in range(0, len(audio), segment_samples):
                segment = audio[i:i+segment_samples]
                if len(segment) == segment_samples:
                    segments.append(segment)
                else:
                    # If last segment is shorter, pad it
                    padded_segment = np.zeros(segment_samples)
                    padded_segment[:len(segment)] = segment
                    segments.append(padded_segment)

        return segments

    def convert_mp3_to_wav(self, mp3_file, output_dir=None):
        """Convert MP3 to WAV format"""
        if output_dir is None:
            output_dir = os.path.dirname(mp3_file)

        filename = os.path.splitext(os.path.basename(mp3_file))[0]
        output_path = os.path.join(output_dir, f"{filename}.wav")

        audio = AudioSegment.from_mp3(mp3_file)
        audio = audio.set_frame_rate(self.target_sr)
        audio = audio.set_channels(1)  # Convert to mono
        audio.export(output_path, format="wav")

        return output_path

    def process_audio(self, file_path, output_path=None):
        """Complete audio preprocessing pipeline"""
        # Check if file is MP3, convert if needed
        if file_path.lower().endswith('.mp3'):
            file_path = self.convert_mp3_to_wav(file_path)

        # Load audio
        audio, sr = self.load_audio(file_path)

        # Convert sample rate
        audio = self.convert_sample_rate(audio, sr)

        # Trim silence
        audio = self.trim_silence(audio)

        # Normalize volume
        audio = self.normalize_volume(audio)

        # Save processed audio if output path is provided
        if output_path:
            sf.write(output_path, audio, self.target_sr)

        return audio, self.target_sr
''')

In [8]:
# Create empty __init__.py files for proper imports
with open('/content/hindi_voice_clone/preprocessing/__init__.py', 'w') as f:
    f.write('')

# Create the language processor
with open('/content/hindi_voice_clone/language_utils.py', 'w') as f:
    f.write('''
import langid
import sys
import os
from hindi_phonemizer import HindiPhonemizer

# Make sure we can import from the OpenVoice package
sys.path.append(os.path.join(os.path.dirname(__file__), 'openvoice'))

# Import OpenVoice classes
try:
    from openvoice.utils.tokenizer import EnglishTokenizer, ChineseTokenizer
except ImportError:
    print("Failed to import OpenVoice tokenizers. Check the path.")
    raise

class LanguageProcessor:
    def __init__(self):
        self.hindi_phonemizer = HindiPhonemizer()
        self.english_tokenizer = EnglishTokenizer()
        self.chinese_tokenizer = ChineseTokenizer()

        # Language detection
        langid.set_languages(['hi', 'en', 'zh'])

    def detect_language(self, text):
        lang, _ = langid.classify(text)
        return lang

    def process_text(self, text):
        lang = self.detect_language(text)

        if lang == 'hi':
            # Process Hindi text
            phonemes = self.hindi_phonemizer(text)
            return phonemes, 'en'  # Use 'en' as base since we don't have Hindi base
        elif lang == 'en':
            # Process English text
            phonemes = self.english_tokenizer(text)
            return phonemes, 'en'
        elif lang == 'zh':
            # Process Chinese text
            phonemes = self.chinese_tokenizer(text)
            return phonemes, 'zh'
        else:
            # Default to English if language not supported
            phonemes = self.english_tokenizer(text)
            return phonemes, 'en'
''')

In [9]:
# Create the voice cloner
with open('/content/hindi_voice_clone/voice_cloner.py', 'w') as f:
    f.write('''
import torch
import numpy as np
import sys
import os
import librosa
import soundfile as sf

# Make sure we can import from the OpenVoice package
sys.path.append(os.path.join(os.path.dirname(__file__), 'openvoice'))

# Import OpenVoice
try:
    from openvoice.api import ToneColorConverter
except ImportError:
    print("Failed to import OpenVoice ToneColorConverter. Check the path.")
    raise

from language_utils import LanguageProcessor
from preprocessing.audio_utils import AudioPreprocessor

class VoiceCloner:
    def __init__(self, checkpoint_dir='./checkpoints'):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize the tone color converter
        hubert_path = os.path.join(checkpoint_dir, 'hubert_base.pt')
        converter_path = os.path.join(checkpoint_dir, 'converter')

        print(f"HuBERT path: {hubert_path}")
        print(f"Converter path: {converter_path}")
        print(f"Files in checkpoint dir: {os.listdir(checkpoint_dir)}")

        self.converter = ToneColorConverter(
            hubert_path,
            converter_path,
            device=self.device
        )

        # Language processor
        self.language_processor = LanguageProcessor()

        # Audio preprocessor
        self.audio_preprocessor = AudioPreprocessor()

    def preprocess_reference_audio(self, reference_audio_path):
        """Preprocess the reference audio for voice cloning"""
        processed_audio, sr = self.audio_preprocessor.process_audio(reference_audio_path)
        return processed_audio, sr

    def clone_voice(self, reference_audio_path, text, output_path=None):
        """Clone voice using OpenVoice with Hindi support"""
        # Preprocess reference audio
        source_audio, sr = self.preprocess_reference_audio(reference_audio_path)

        # Process text and detect language
        phonemes, lang = self.language_processor.process_text(text)

        print(f"Detected language: {lang}")
        print(f"Phonemes: {phonemes[:50]}...")

        # Generate speech with cloned voice
        cloned_audio = self.converter.convert(
            source_audio,
            phonemes,
            lang,  # en or zh
            output_sampling_rate=sr
        )

        # Save the cloned audio if output path is provided
        if output_path:
            sf.write(output_path, cloned_audio, sr)

        return cloned_audio, sr
''')

In [10]:
# Create the Gradio app
with open('/content/hindi_voice_clone/app.py', 'w') as f:
    f.write('''
import gradio as gr
import os
import tempfile
import sys
from voice_cloner import VoiceCloner

# Initialize the voice cloner
voice_cloner = VoiceCloner()

def clone_voice(reference_audio, text_to_speak):
    """Function to clone voice using the voice cloner"""
    print(f"Reference audio: {reference_audio}")
    print(f"Text to speak: {text_to_speak}")

    # Create a temporary file for the output
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        output_path = f.name

    try:
        # Clone the voice
        cloned_audio, sr = voice_cloner.clone_voice(reference_audio, text_to_speak, output_path)
        return output_path
    except Exception as e:
        print(f"Error in voice cloning: {e}")
        import traceback
        traceback.print_exc()
        return None

# Create Gradio interface
demo = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(type="filepath", label="Reference Voice (30 seconds to 1 minute)"),
        gr.Textbox(label="Text to speak", placeholder="Enter text in Hindi or English...")
    ],
    outputs=gr.Audio(label="Cloned Voice"),
    title="Voice Cloning for Indian Languages",
    description="Upload a reference audio file (30 seconds to 1 minute) and enter text to hear it spoken in the reference voice. Supports Hindi and English."
)

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
''')

In [11]:
# Create a test script
with open('/content/hindi_voice_clone/test_setup.py', 'w') as f:
    f.write('''
import sys
import os

# Check directory structure
print("Current directory:", os.getcwd())
print("Directory contents:", os.listdir())
print("OpenVoice directory contents:", os.listdir("openvoice") if os.path.exists("openvoice") else "OpenVoice not found")
print("Checkpoints directory contents:", os.listdir("checkpoints") if os.path.exists("checkpoints") else "Checkpoints not found")

# Test imports
try:
    import torch
    import torchaudio
    import librosa
    import soundfile
    import gradio
    from hindi_phonemizer import HindiPhonemizer
    from preprocessing.audio_utils import AudioPreprocessor

    print("Basic imports successful!")
except ImportError as e:
    print(f"Import error: {e}")

# Test OpenVoice imports
try:
    sys.path.append("./openvoice")
    from openvoice.utils.tokenizer import EnglishTokenizer, ChineseTokenizer
    from openvoice.api import ToneColorConverter

    print("OpenVoice imports successful!")
except ImportError as e:
    print(f"OpenVoice import error: {e}")

print("Setup test complete!")
''')

In [12]:
# Run the test script to check setup
%cd /content/hindi_voice_clone
!python test_setup.py

/content/hindi_voice_clone
Current directory: /content/hindi_voice_clone
Directory contents: ['voice_cloner.py', 'test_setup.py', 'checkpoints', 'preprocessing', 'openvoice', 'app.py', '__pycache__', 'language_utils.py', 'hindi_phonemizer.py']
OpenVoice directory contents: ['.gitignore', 'demo_part2.ipynb', 'setup.py', 'openvoice', 'README.md', 'resources', '.git', 'demo_part3.ipynb', 'demo_part1.ipynb', 'docs', 'requirements.txt', 'LICENSE']
Checkpoints directory contents: ['hubert_base.pt', 'converter.zip', 'base_speakers.zip']
Basic imports successful!
OpenVoice import error: No module named 'openvoice.utils.tokenizer'; 'openvoice.utils' is not a package
Setup test complete!


In [13]:
# Run the app
%cd /content/hindi_voice_clone
!python app.py

/content/hindi_voice_clone
Failed to import OpenVoice ToneColorConverter. Check the path.
Traceback (most recent call last):
  File "/content/hindi_voice_clone/app.py", line 6, in <module>
    from voice_cloner import VoiceCloner
  File "/content/hindi_voice_clone/voice_cloner.py", line 14, in <module>
    from openvoice.api import ToneColorConverter
  File "/content/hindi_voice_clone/openvoice/openvoice/api.py", line 9, in <module>
    from openvoice.text import text_to_sequence
  File "/content/hindi_voice_clone/openvoice/openvoice/text/__init__.py", line 2, in <module>
    from openvoice.text import cleaners
  File "/content/hindi_voice_clone/openvoice/openvoice/text/cleaners.py", line 2, in <module>
    from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
  File "/content/hindi_voice_clone/openvoice/openvoice/text/english.py", line 21, in <module>
    from unidecode import unidecode
ModuleNotFoundError: No module named 'unidecode'
