In [None]:
# ===================================================================
# COMPLETE VOICE AGENT WITH MURF WEBSOCKET - COLAB READY (V5)
# - Uses Colab register_callback for stable voice input -
# ===================================================================

# --- CELL 1: DEPENDENCY INSTALLATION ---
# NOTE: Run this cell block entirely!

# Install system dependencies (ffmpeg is needed by pydub, portaudio for pyaudio).
# -y answers apt prompts automatically and -qq keeps its output quiet.
!apt install ffmpeg portaudio19-dev -y -qq

# Install Python libraries, including the crucial nest-asyncio fix, Google STT, and SciPy.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
!pip install google-genai websockets pyaudio pydub numpy nest-asyncio google-cloud-speech scipy -q

# --- CELL 2: COMPLETE CODE ---

import os
import io
import re
import json
import base64
import asyncio
import wave
import struct
import numpy as np
import nest_asyncio
from typing import Literal, Optional, List, Dict
from google import genai
from google.genai import types
from IPython.display import display, Markdown, Audio as IPAudio, HTML
from google.colab import userdata, output
from pydub import AudioSegment
import pyaudio
import websockets
# NEW IMPORTS FOR SPEECH-TO-TEXT (STT) and WAV handling
from google.cloud import speech_v1p1beta1 as speech
from google.oauth2 import service_account
from scipy.io.wavfile import write as write_wav

# ===================================================================
# INITIALIZATION
# ===================================================================

print("üöÄ Initializing Voice Agent System...\n")

# Initialize Gemini.
# The key is read from Colab's secret store and exported via the environment
# (presumably so genai.Client() can pick it up without an explicit argument).
# GEMINI_CLIENT is always bound afterwards: a client on success, None otherwise.
try:
    gemini_key = userdata.get('GEMINI_API_KEY')
    if gemini_key:
        os.environ['GEMINI_API_KEY'] = gemini_key
        GEMINI_CLIENT = genai.Client()
        print("‚úÖ Gemini initialized")
    else:
        GEMINI_CLIENT = None
        print("‚ùå GEMINI_API_KEY not found")
except Exception as e:
    GEMINI_CLIENT = None
    print(f"‚ùå Gemini error: {e}")

# Get Murf API Key.
# MURF_API_KEY is always bound afterwards: the key string or None.
try:
    MURF_API_KEY = userdata.get('MURF_API_KEY')
    if MURF_API_KEY:
        print("‚úÖ Murf API key loaded")
    else:
        print("‚ö†Ô∏è ¬†MURF_API_KEY not found - TTS will not work")
        MURF_API_KEY = None
except Exception as e:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
    # and surface the underlying reason (missing secret, access not granted, ...).
    MURF_API_KEY = None
    print(f"‚ö†Ô∏è ¬†MURF_API_KEY not found: {e}")

# Fix for Colab: Allow asyncio.run to be called from a running event loop
try:
    nest_asyncio.apply()
    print("‚úÖ nest_asyncio applied (Fixes TTS failure)")
except Exception as e:
    print(f"‚ùå nest_asyncio error: {e}")

# Global variables for callback communication:
# audio_callback() stores the browser payload in CALLBACK_DATA and sets
# CALLBACK_EVENT; record_audio_colab() resets both and waits on the event.
CALLBACK_DATA = {}
CALLBACK_EVENT = asyncio.Event()

print("="*60 + "\n")

# ===================================================================
# STORY GENERATION
# ===================================================================

def generate_story(topic: str, language: str, age_mode: str, length_min: int):
    """Ask Gemini for a story in the tagged line format consumed by parse_story.

    Args:
        topic: Subject of the story, inserted verbatim into the prompt.
        language: Language the story should be written in.
        age_mode: Target audience label, inserted verbatim into the prompt.
        length_min: Requested length in minutes; three segments are requested
            per minute, with a minimum of four.

    Returns:
        The stripped response text, or None when the client is not initialised
        or the request fails (the failure is printed, not raised).
    """
    if not GEMINI_CLIENT:
        print("‚ùå Gemini not initialized")
        return None

    segments = max(4, length_min * 3)

    system_prompt = " ".join([
        "You are a storyteller. Generate stories in tagged format.",
        "Each line: <EMOTION:tag> <SFX:effect> Story text.",
        "First line: <TITLE:title>",
    ])

    user_prompt = f"""
Create a {language} story about: '{topic}'
Audience: {age_mode}
Length: {length_min} minutes (~{segments} segments)

Format:
<TITLE:Story Title>
<EMOTION:calm> <SFX:wind> Story text here.
<EMOTION:happy> <SFX:None> More story text.

Emotions: calm, happy, adventurous, sad, mystery, excitement, cinematic
Generate {segments} segments. Keep each segment 2-3 sentences.
"""

    try:
        # Config construction stays inside the try so a bad config is reported
        # the same way as a failed request.
        request_config = types.GenerateContentConfig(
            system_instruction=system_prompt,
            temperature=0.7,
            max_output_tokens=4096
        )
        reply = GEMINI_CLIENT.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=user_prompt,
            config=request_config
        )
        return reply.text.strip()
    except Exception as e:
        print(f"‚ùå Story generation failed: {e}")
        return None

def parse_story(raw_text: str):
    """Split tagged story text into a title and a list of segment dicts.

    Expected input (tags are matched case-insensitively):
        <TITLE:Some title>
        <EMOTION:calm> <SFX:wind> Story text.

    Args:
        raw_text: Raw model output. None or blank text is treated as empty.

    Returns:
        (title, segments). title is "Untitled" for empty input and
        "Untitled Story" when the first non-blank line has no <TITLE:...> tag
        (that line is then parsed as a segment). Each segment is a dict with
        keys 'id', 'emotion', 'sfx' and 'text'; 'id' is the index of the line
        among the non-title lines, so ids skip lines that do not match.
    """
    if not raw_text:
        return "Untitled", []

    lines = [l.strip() for l in raw_text.split('\n') if l.strip()]

    if not lines:
        return "Untitled", []

    title_match = re.match(r'<TITLE:(.*?)>', lines[0], re.IGNORECASE)
    title = title_match.group(1).strip() if title_match else "Untitled Story"
    story_lines = lines[1:] if title_match else lines

    # Case-insensitive like the title pattern, so "<emotion:...>" lines are
    # not silently dropped.
    segment_pattern = re.compile(
        r'<EMOTION:(.*?)>\s*<SFX:(.*?)>\s*(.*)', re.IGNORECASE
    )

    segments = []
    for i, line in enumerate(story_lines):
        match = segment_pattern.match(line)
        if match:
            emotion, sfx, text = (part.strip() for part in match.groups())
            segments.append({
                'id': i,
                'emotion': emotion,
                'sfx': sfx,
                'text': text
            })

    return title, segments

# ===================================================================
# COLAB AUDIO RECORDING FUNCTION (FIXED with register_callback)
# ===================================================================

def audio_callback(data):
    """Receive the recording payload from the browser and wake the waiter.

    Stores `data` in the module-level CALLBACK_DATA and sets CALLBACK_EVENT.
    """
    global CALLBACK_DATA
    CALLBACK_DATA = data
    # Only the event's state changes, so no `global` declaration is needed for it.
    CALLBACK_EVENT.set()

def record_audio_colab(filename='audio.wav', duration_sec=8, sample_rate=24000):
    """Record from the browser microphone and save a 16-bit mono PCM WAV file.

    The browser's MediaRecorder produces a compressed container (typically
    WebM or Ogg), not WAV, so the received bytes are decoded with pydub/ffmpeg
    and re-exported as real WAV at `sample_rate`.

    Args:
        filename: Path of the WAV file to write.
        duration_sec: Recording length in seconds.
        sample_rate: Sample rate (Hz) of the WAV file that is written.

    Returns:
        `filename` on success; None on timeout, missing data, or when the
        recording cannot be decoded.

    NOTE(review): this waits on CALLBACK_EVENT from inside the running cell and
    relies on nest_asyncio; confirm the kernel can deliver the JS callback
    while the cell is still executing.
    """
    global CALLBACK_DATA

    # 1. Reset communication variables
    CALLBACK_DATA = {}
    CALLBACK_EVENT.clear()

    # 2. Register the Python function to be called by JavaScript
    output.register_callback('notebook.audio_callback', audio_callback)

    print(f"üé§ Recording {duration_sec} seconds via Colab widget...")
    # The status <div> precedes the <script> so the element exists when the
    # script looks it up. The filename is passed through json.dumps so quotes
    # or backslashes in it cannot break the JavaScript string literal.
    display(HTML("""
        <div id="record-status">Click RUN to start recording.</div>
        <script>
            const record = async (duration, filename) => {
                const stream = await navigator.mediaDevices.getUserMedia({audio: true});
                const mediaRecorder = new MediaRecorder(stream);
                const audioChunks = [];

                mediaRecorder.ondataavailable = event => audioChunks.push(event.data);

                const data = new Promise(resolve => {
                    mediaRecorder.onstop = () => {
                        // Release the microphone once recording is finished.
                        stream.getTracks().forEach(track => track.stop());
                        // Label the blob with the recorder's real container type.
                        const audioBlob = new Blob(audioChunks, {type: mediaRecorder.mimeType});
                        const fileReader = new FileReader();
                        fileReader.onload = () => resolve(fileReader.result.split(',')[1]);
                        fileReader.readAsDataURL(audioBlob);
                    };
                    mediaRecorder.start();
                    setTimeout(() => mediaRecorder.stop(), duration * 1000);
                });

                const base64Data = await data;

                // Call the registered Python function back with the data
                google.colab.kernel.invokeFunction('notebook.audio_callback', [
                    {'record_data': base64Data, 'record_filename': filename}
                ], {});
            };

            // Display recording status, then start recording
            document.querySelector('#record-status').innerText = 'üî¥ Recording... Speak Now! (Wait for completion message)';
            record(%d, %s);
        </script>
    """ % (duration_sec, json.dumps(filename))))

    # 3. Wait for the JavaScript callback to signal data readiness (asyncio.run is safe due to nest_asyncio)
    try:
        asyncio.run(asyncio.wait_for(CALLBACK_EVENT.wait(), timeout=duration_sec + 5))
    except asyncio.TimeoutError:
        print("‚ùå Recording timed out while waiting for user input.")
        return None

    # 4. Process data from the global variable
    if 'record_data' not in CALLBACK_DATA:
        print("‚ùå Colab recording failed or data not received.")
        return None

    try:
        encoded_audio = base64.b64decode(CALLBACK_DATA['record_data'])
        # Let ffmpeg detect the container, then write genuine PCM WAV so that
        # downstream WAV readers and LINEAR16 speech recognition can use it.
        clip = AudioSegment.from_file(io.BytesIO(encoded_audio))
        clip = clip.set_frame_rate(sample_rate).set_channels(1).set_sample_width(2)
        clip.export(filename, format="wav")
    except Exception as e:
        print(f"‚ùå Colab recording failed or data not received. ({e})")
        return None

    print(f"‚úÖ Colab recording complete and saved as {filename}")
    return filename

# ===================================================================
# SPEECH-TO-TEXT (STT)
# ===================================================================

def transcribe_audio(file_path: str, language_code: str = "en-IN",
                     sample_rate_hertz: int = 24000) -> Optional[str]:
    """Transcribe a local audio file using Google Cloud Speech-to-Text.

    Args:
        file_path: Path of the audio file; sent as LINEAR16 content.
        language_code: Language tag passed to the recognizer.
        sample_rate_hertz: Sample rate declared to the recognizer; it should
            match the rate the file was recorded at.

    Returns:
        The transcript of the whole file, or None when nothing was recognised
        or the request failed (failures are printed, not raised).
    """
    print("üß† Transcribing audio with Google STT...")
    try:
        # 1. Load the audio file content
        with open(file_path, "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)

        # 2. Configure the transcription request
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, # WAV format
            sample_rate_hertz=sample_rate_hertz,
            language_code=language_code,
        )

        # 3. Initialize and run the client (requires authentication)
        stt_client = speech.SpeechClient()
        response = stt_client.recognize(config=config, audio=audio)

        # 4. Each result covers a consecutive portion of the audio, so join the
        #    top alternative of every result instead of keeping only the first.
        pieces = [
            result.alternatives[0].transcript.strip()
            for result in response.results
            if result.alternatives
        ]
        transcript = " ".join(piece for piece in pieces if piece)

        if not transcript:
            print("‚ùå STT failed: No speech detected in the audio.")
            return None

        print(f"‚úÖ Transcription: '{transcript}'")
        return transcript

    except Exception as e:
        print(f"‚ùå Transcription failed (Is Google STT API enabled and authenticated?): {e}")
        return None

# ===================================================================
# MURF WEBSOCKET TTS (REAL IMPLEMENTATION)
# ===================================================================

# Maps a story emotion tag (lower-case) to the Murf voice "style" that
# murf_tts_websocket puts into its voice_config message.
# Every tag currently resolves to 'Conversation', so the emotion does not yet
# change the request. NOTE(review): confirm which styles the chosen voice supports
# before differentiating these entries.
EMOTION_TO_STYLE = {
    'calm': 'Conversation',
    'happy': 'Conversation',
    'adventurous': 'Conversation',
    'sad': 'Conversation',
    'mystery': 'Conversation',
    'excitement': 'Conversation',
    'cinematic': 'Conversation',
    'default': 'Conversation'
}

async def murf_tts_websocket(text: str, emotion: str = "calm",
                             voice_id: str = "Anisha") -> bytes:
    """Synthesise `text` through the Murf streaming WebSocket endpoint.

    Sends one voice_config message and one text message (with "end": True),
    then collects base64 "audio" payloads until a message with a truthy
    "final" field arrives.

    Args:
        text: Text to speak.
        emotion: Emotion tag looked up in EMOTION_TO_STYLE; unknown or missing
            values fall back to 'Conversation'.
        voice_id: Murf voice identifier.

    Returns:
        The received audio with the leading 44 bytes removed (presumably the
        WAV header of the stream - the caller treats the result as raw PCM).

    Raises:
        Exception: if MURF_API_KEY is not set; connection/protocol errors are
            printed and re-raised.
    """
    if not MURF_API_KEY:
        raise Exception("MURF_API_KEY not set")

    WS_URL = (
        f"wss://global.api.murf.ai/v1/speech/stream-input?"
        f"api-key={MURF_API_KEY}&"
        f"model=FALCON&"
        f"sample_rate=24000&"
        f"channel_type=MONO&"
        f"format=WAV"
    )

    style = EMOTION_TO_STYLE.get((emotion or "").lower(), 'Conversation')

    # bytearray avoids re-copying the whole buffer on every chunk.
    pcm = bytearray()
    # Bytes of the 44-byte header still to be dropped. Counting them (instead
    # of a one-shot flag) keeps the skip correct when the first message is
    # shorter than, or exactly as long as, the header.
    header_bytes_left = 44

    try:
        async with websockets.connect(WS_URL) as ws:
            # Send voice configuration
            voice_config = {
                "voice_config": {
                    "voiceId": voice_id,
                    "multiNativeLocale": "en-IN",
                    "style": style,
                    "rate": 0,
                    "pitch": 0,
                    "variation": 1
                }
            }
            await ws.send(json.dumps(voice_config))

            # Send text
            await ws.send(json.dumps({"text": text, "end": True}))

            # Receive audio chunks
            while True:
                data = json.loads(await ws.recv())

                if "audio" in data:
                    audio_bytes = base64.b64decode(data["audio"])

                    if header_bytes_left:
                        skip = min(header_bytes_left, len(audio_bytes))
                        audio_bytes = audio_bytes[skip:]
                        header_bytes_left -= skip

                    pcm.extend(audio_bytes)

                if data.get("final"):
                    break

        return bytes(pcm)

    except Exception as e:
        print(f"‚ùå Murf WebSocket error: {e}")
        raise

def text_to_speech_sync(text: str, emotion: str) -> AudioSegment:
    """Synchronous wrapper around murf_tts_websocket.

    Runs the coroutine with asyncio.run (the notebook applies nest_asyncio so
    this works inside an already running loop) and wraps the returned bytes as
    16-bit, 24 kHz, mono audio.

    Returns:
        The spoken audio, or - as a deliberate best-effort fallback - silence
        lasting 50 ms per character of `text` when synthesis fails.
    """
    sample_width = 2  # 16-bit
    try:
        audio_bytes = asyncio.run(murf_tts_websocket(text, emotion))

        # Drop a trailing partial sample: a stray odd byte would otherwise make
        # the AudioSegment constructor reject the whole segment.
        usable = len(audio_bytes or b"") // sample_width * sample_width
        if not usable:
            raise Exception("No audio received")

        # Convert raw PCM to AudioSegment
        return AudioSegment(
            data=audio_bytes[:usable],
            sample_width=sample_width,
            frame_rate=24000,
            channels=1
        )

    except Exception as e:
        print(f"‚ö†Ô∏è ¬†TTS failed: {e}, using silence")
        duration = len(text) * 50
        return AudioSegment.silent(duration=duration, frame_rate=24000)

# ===================================================================
# REAL-TIME AUDIO CLEANUP
# ===================================================================

class RealTimeAudio:
    """Optional PyAudio playback helper plus resource cleanup (not used for recording)."""

    def __init__(self):
        """Set the playback format and try to open PyAudio.

        self.p is None when PyAudio cannot be initialised; playback is then skipped.
        """
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 24000  # Match Murf output
        # Initialize pyaudio only if needed for playback or cleanup
        try:
            self.p = pyaudio.PyAudio()
        except Exception:
            # Best effort: no audio backend available. Exception (not a bare
            # except) so KeyboardInterrupt/SystemExit still propagate.
            self.p = None
        self.frames = []

    def play_audio_realtime(self, audio_segment: AudioSegment):
        """Stream `audio_segment` to the default output device with a progress bar.

        The segment is converted to RATE / CHANNELS / 16-bit first. Does
        nothing (besides a message) when PyAudio is unavailable.
        """
        if not self.p:
            print("üîä PyAudio not initialized. Skipping real-time playback.")
            return

        print("üîä Playing audio in real-time...")

        audio_segment = audio_segment.set_frame_rate(self.RATE).set_channels(self.CHANNELS).set_sample_width(2)
        raw_data = audio_segment.raw_data

        stream = self.p.open(format=self.FORMAT, channels=self.CHANNELS, rate=self.RATE, output=True)
        chunk_size = self.CHUNK * 2  # CHUNK frames of 2 bytes each

        try:
            for i in range(0, len(raw_data), chunk_size):
                stream.write(raw_data[i:i+chunk_size])

                # Redraw the 30-cell bar every 20 chunks
                if i % (chunk_size * 20) == 0:
                    progress = int((i / len(raw_data)) * 30)
                    bar = "‚ñà" * progress + "‚ñë" * (30 - progress)
                    print(f"\r ¬† [{bar}]", end="", flush=True)

            print("\r ¬† [" + "‚ñà"*30 + "]")
            stream.stop_stream()
        finally:
            # Always release the device, even if a write fails mid-playback.
            stream.close()
        print("‚úÖ Playback complete!")

    def cleanup(self):
        """Terminate PyAudio if it was opened; safe to call more than once."""
        if self.p:
            self.p.terminate()
            self.p = None

# ===================================================================
# COMPLETE AUDIOBOOK PIPELINE
# ===================================================================

def create_audiobook(topic: str, language: str = "English",
                     age_mode: str = "kids", length_min: int = 2):
    """Run the full pipeline: story text -> per-segment speech -> MP3 file.

    Steps: generate the story, parse it into segments, synthesise each segment
    (followed by 800 ms of silence), export everything to
    output/<title>.mp3 and display an audio player.

    Returns:
        Path of the exported MP3, or None when no story or no segments were
        produced.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"üé¨ CREATING {age_mode.upper()} AUDIOBOOK")
    print(f" ¬† Topic: {topic}")
    print(f" ¬† Length: {length_min} minutes")
    print(f"{rule}\n")

    # 1. Generate story
    print("üìù Step 1: Generating story with Gemini...")
    story_text = generate_story(topic, language, age_mode, length_min)
    if not story_text:
        return None

    # 2. Parse
    print("\nüìñ Step 2: Parsing story structure...")
    title, parts = parse_story(story_text)
    total_parts = len(parts)
    print(f" ¬† ‚úì Title: {title}")
    print(f" ¬† ‚úì Segments: {total_parts}")

    if not parts:
        print("‚ùå No segments found")
        return None

    # Preview of the first three segments
    print("\n ¬† Preview:")
    for part in parts[:3]:
        print(f" ¬† ¬† ¬†[{part['id']}] {part['emotion']}: {part['text'][:50]}...")
    if total_parts > 3:
        print(f" ¬† ¬† ¬†... and {total_parts-3} more segments")

    # 3. Generate TTS for each segment
    print(f"\nüéôÔ∏è ¬†Step 3: Generating audio with Murf WebSocket (Voice: Anisha)...")

    book = AudioSegment.empty()

    for position, part in enumerate(parts, start=1):
        print(f" ¬† [{position}/{total_parts}] {part['emotion']}: ", end="", flush=True)

        try:
            spoken = text_to_speech_sync(part['text'], part['emotion'])
            book += spoken

            # Pause between segments
            book += AudioSegment.silent(duration=800, frame_rate=24000)

            seconds = len(spoken) / 1000.0
            print(f"‚úì ({seconds:.1f}s)")

        except Exception as e:
            print(f"‚úó Error: {e}")

    # 4. Export
    os.makedirs("output", exist_ok=True)
    stem = title.replace(' ', '_').replace('/', '-')[:50]
    mp3_path = f"output/{stem}.mp3"

    print(f"\nüíæ Step 4: Exporting audiobook...")
    book.export(mp3_path, format="mp3", bitrate="128k")

    total_duration = len(book) / 1000.0
    print(f" ¬† ‚úì File: {mp3_path}")
    print(f" ¬† ‚úì Duration: {total_duration:.1f}s ({total_duration/60:.1f} min)")

    # 5. Display player
    print(f"\nüéâ SUCCESS! Playing audiobook...")
    display(IPAudio(mp3_path, autoplay=False))

    return mp3_path

# ===================================================================
# NEW PIPELINE FUNCTION (VOICE PROMPT)
# ===================================================================

def create_audiobook_from_voice_prompt(language: str = "English", age_mode: str = "kids",
                                       length_min: int = 2, record_seconds: int = 8):
    """Run the voice-driven pipeline: record a topic -> transcribe -> create_audiobook.

    Args:
        language: Story language passed to create_audiobook.
        age_mode: Audience label passed to create_audiobook.
        length_min: Requested story length in minutes.
        record_seconds: How long to record the spoken topic.

    Returns:
        Path of the generated audiobook, or None when recording,
        transcription or story creation fails.
    """
    # A. VOICE INPUT & RECORDING (Using Colab Native Recorder)
    # No PyAudio object is created here: recording goes through the browser,
    # so there is nothing to open or clean up.
    print(f"\n{'='*60}")
    print("üó£Ô∏è ¬†START: Please state the topic for your story in a clear voice.")
    print(f" ¬† Recording will start below and last {record_seconds} seconds.")
    print(f"{'='*60}")

    recorded_file = record_audio_colab(
        filename="user_prompt.wav",
        duration_sec=record_seconds,
        sample_rate=24000
    )

    if not recorded_file:
        print("‚ùå Recording failed, stopping.")
        return None

    # B. TRANSCRIPTION (STT) - use the path the recorder actually returned
    topic_prompt = transcribe_audio(recorded_file)

    if not topic_prompt:
        print("‚ùå Cannot proceed without a topic.")
        return None

    # C. STORY GENERATION & TTS (Reuse existing function)
    print(f"\n‚úÖ Topic received: '{topic_prompt}'")
    return create_audiobook(
        topic=topic_prompt,
        language=language,
        age_mode=age_mode,
        length_min=length_min
    )

# ===================================================================
# USAGE EXAMPLES
# ===================================================================

def example_1_kids_story():
    """Example 1: prompt for topic, language, audience and length, then build the audiobook.

    The length answer is converted with int(), so a non-numeric answer raises ValueError.
    """
    topic = input("Enter the topic:")
    language = input("Enter the language:")
    age_mode = input("kids or teen or adult:")
    length_min = int(input("time duration:"))
    return create_audiobook(
        topic=topic,
        language=language,
        age_mode=age_mode,
        length_min=length_min,
    )

def example_2_record_audio():
    """Example 2: Record 10 seconds of audio and play it back.

    Returns:
        The recording's path, or None when recording failed.
    """
    audio_sys = RealTimeAudio()
    recording_file = "my_voice.wav"

    # try/finally so PyAudio is released even if loading or playback raises.
    try:
        file = record_audio_colab(filename=recording_file, duration_sec=10, sample_rate=24000)
        if not file:
            return None

        # from_file lets ffmpeg detect the real container instead of insisting
        # on WAV, so a browser recording in another format still loads.
        audio_seg = AudioSegment.from_file(file)

        print(f"\nüìä Audio Info:")
        print(f" ¬† Duration: {len(audio_seg)/1000:.1f}s")
        print(f" ¬† Sample Rate: {audio_seg.frame_rate}Hz")

        audio_sys.play_audio_realtime(audio_seg)

        print("\nüñ•Ô∏è ¬†HTML5 Player (Guaranteed Playback):")
        display(IPAudio(file, autoplay=False))

        return file
    finally:
        audio_sys.cleanup()

def example_3_teen_story():
    """Example 3: two-minute teen adventure story on a fixed topic."""
    settings = {
        "topic": "friends discovering a mysterious portal in an ancient library",
        "age_mode": "teen",
        "length_min": 2,
    }
    return create_audiobook(**settings)

def example_4_test_tts():
    """Example 4: synthesise one fixed sentence with Murf and play it.

    Returns:
        "test_tts.mp3" on success, None if anything in the test raises.
    """
    print("üß™ Testing Murf TTS WebSocket (Voice: Anisha)...")

    test_text = "Hello! This is a test of the Murf text to speech system using the Anisha Indian English voice. It should sound clear!"
    mp3_name = "test_tts.mp3"

    try:
        audio = text_to_speech_sync(test_text, "happy")
        print(f"‚úÖ TTS Success! Generated {len(audio)/1000:.1f}s of audio")

        audio.export(mp3_name, format="mp3")
        display(IPAudio(mp3_name, autoplay=True))
    except Exception as e:
        print(f"‚ùå TTS Test Failed: {e}")
        return None

    return mp3_name

def example_5_voice_story():
    """Example 5: record a spoken topic and turn it into a one-minute kids story."""
    settings = {"age_mode": "kids", "length_min": 1}
    return create_audiobook_from_voice_prompt(**settings)

# ===================================================================
# READY TO USE
# ===================================================================

# Banner listing the example entry points defined above.
# NOTE(review): the text for example 1 says "Default Topic", but
# example_1_kids_story() prompts for topic, language, audience and length.
print("\n" + "="*60)
print("‚ú® VOICE AGENT READY! (Using Anisha - Indian English)")
print("="*60)
print("\nüìö Available Examples:")
print(" ¬†1. example_1_kids_story() ¬† - Create 2min kids story (Default Topic)")
print(" ¬†2. example_2_record_audio() - Record 10s & playback (Colab-native recording)")
print(" ¬†3. example_3_teen_story() ¬† - Create 2min teen story")
print(" ¬†4. example_4_test_tts() ¬† ¬† - Test Murf TTS directly (Anisha voice test)")
print(" ¬†5. example_5_voice_story() ¬†- **Record voice prompt and generate story**")
print("\nüí° Quick Start:")
print(" ¬†result = example_5_voice_story()")
print("="*60 + "\n")

# This call is live: running the cell immediately starts example 1 and waits
# for keyboard input. Comment it out to only define the functions.
result = example_1_kids_story()

In [None]:
# Colab cell (code)
# Refresh the apt package index quietly, then install ffmpeg (used by the
# conversion cell below).
!apt-get update -qq
!apt-get install -y ffmpeg
# NOTE(review): versions are not pinned.
!pip install soundfile librosa

In [None]:
# Colab cell (code)
from google.colab import files
# `uploaded` is only inspected through .keys() here; presumably it maps each
# uploaded filename to its content - verify against the Colab docs.
uploaded = files.upload()  # Use the chooser that appears to upload sample_voice_raw.wav/mp3
print("Uploaded:", uploaded.keys())

In [None]:
import os, glob, subprocess
import shlex

# Name of the normalized file this cell produces
output_file = "sample_voice.wav"

# Find uploaded file: prefer anything named like the expected sample ...
files = glob.glob('*sample_voice_raw*')
if not files:
    # ... otherwise fall back to any audio file (including .mpeg uploads),
    # but never this cell's own output: on a re-run it would be picked up as
    # input, and ffmpeg cannot read and overwrite the same file.
    files = [
        candidate
        for candidate in glob.glob('*.wav') + glob.glob('*.mp3') + glob.glob('*.mpeg')
        if candidate != output_file
    ]

# Check if any suitable file was found
if not files:
    raise FileNotFoundError("No suitable audio file found. Please upload a .wav, .mp3, or .mpeg file or ensure 'sample_voice_raw' is in the filename.")

input_file = files[0]
print("Using input:", input_file)

# Convert to mono 22050Hz wav and normalize loudness.
# The command is an argument list run without a shell, so quotes, spaces or
# shell metacharacters in an uploaded filename cannot alter the command.
cmd = [
    "ffmpeg", "-y",
    "-i", input_file,
    "-ar", "22050",
    "-ac", "1",
    "-af", "loudnorm=I=-16:TP=-1.5:LRA=11",
    output_file,
]
print(shlex.join(cmd))
subprocess.run(cmd, check=True)
print("Converted ->", output_file)

In [None]:
# Colab cell (code)
import IPython.display as ipd
from scipy.io import wavfile
import numpy as np

# Load the converted sample, report its rate/length and show a player.
sr, wav = wavfile.read("sample_voice.wav")
print("Sample rate:", sr, "Length (s):", len(wav)/sr)
ipd.display(ipd.Audio("sample_voice.wav"))

# Crude SNR-ish check (very approximate): overall RMS divided by the 5th
# percentile of absolute amplitude, with a tiny offset to avoid dividing by zero.
samples = wav.astype(float)
rms = np.sqrt(np.mean(samples**2))
noise_floor = np.percentile(np.abs(samples), 5) + 1e-9
print("RMS:", rms, "Est. noise-floor:", noise_floor, "Ratio:", rms/noise_floor)

In [None]:
from pydub import AudioSegment

input_file = "sample_voice.wav"     # change filename if needed
output_file = "processed_voice.wav"

# Load, then normalize the format in one chain: 16 kHz, mono, 16-bit PCM.
audio = (
    AudioSegment.from_file(input_file)
    .set_frame_rate(16000)
    .set_channels(1)
    .set_sample_width(2)
)

# Write the result as WAV
audio.export(output_file, format="wav")

print("Preprocessed file saved as:", output_file)

In [None]:
#!/usr/bin/env python3
"""
OpenVoice Complete Setup Script
"""

import sys
import subprocess
import os

def run_command(command, check=True):
    """Run `command` through the shell, echo its output, and report success.

    Args:
        command: Shell command line.
        check: When True a non-zero exit is reported as a failed command;
            when False the exit status is only reflected in the return value.

    Returns:
        True if the command exited with status 0, otherwise False.
    """
    try:
        completed = subprocess.run(
            command, shell=True, check=check, capture_output=True, text=True
        )
    except subprocess.CalledProcessError as failure:
        print(f"Command failed: {command}")
        print(f"Error: {failure.stderr}")
        return False

    print(completed.stdout)
    if completed.stderr:
        print(f"Stderr: {completed.stderr}")
    return completed.returncode == 0

def main():
    """Clone OpenVoice into /content, install dependencies, and verify the import.

    Steps:
        1. Clone the repository if /content/OpenVoice does not exist.
        2. pip-install the listed dependencies (failures are reported, not fatal).
        3. Install the repository in development mode when it has a setup.py,
           and put the repository on sys.path.
        4. Print the directory layout and try to import openvoice.se_extractor.

    Returns nothing. All progress and errors are printed; success is only
    announced when the import actually worked.
    """
    import importlib
    import importlib.util

    print("OpenVoice Complete Setup Script")
    print("=" * 60)

    # Step 1: Clone OpenVoice repository if not exists
    print("\n[1/4] Checking/Cloning OpenVoice repository...")
    openvoice_path = "/content/OpenVoice"

    if not os.path.exists(openvoice_path):
        print("Cloning OpenVoice repository...")
        run_command("git clone https://github.com/myshell-ai/OpenVoice.git /content/OpenVoice")
    else:
        print(f"‚úì OpenVoice repository found at {openvoice_path}")

    # A failed clone leaves nothing to list or install; stop with a message
    # instead of crashing in os.listdir below.
    if not os.path.isdir(openvoice_path):
        print(f"Error during setup: {openvoice_path} is missing (git clone failed?)")
        return

    # Check directory structure
    print("\nChecking directory structure...")
    for item in os.listdir(openvoice_path):
        print(f"  - {item}")

    # Step 2: Install dependencies
    print("\n[2/4] Installing dependencies...")
    dependencies = [
        "pypinyin",
        "cn2an",
        "jieba",
        "numpy",
        "scipy",
        "torch",
        "torchaudio",
        "librosa",
        "matplotlib",
        "tqdm"
    ]

    # "<python> -m pip" installs into the interpreter running this script,
    # which a bare "pip" on PATH does not guarantee.
    for dep in dependencies:
        print(f"\nInstalling {dep}...")
        run_command(f'"{sys.executable}" -m pip install {dep}', check=False)

    # Step 3: Install OpenVoice package
    print("\n[3/4] Installing OpenVoice as package...")

    # Try to install in development mode
    if os.path.exists(os.path.join(openvoice_path, "setup.py")):
        print("Found setup.py, installing in development mode...")
        run_command(f'cd "{openvoice_path}" && "{sys.executable}" -m pip install -e .', check=False)
    else:
        print("No setup.py found, adding to path manually...")

    # Add to Python path
    if openvoice_path not in sys.path:
        sys.path.insert(0, openvoice_path)
        print(f"‚úì Added {openvoice_path} to Python path")

    # Step 4: Test imports
    print("\n[4/4] Testing imports...")

    print("\nTesting import paths...")
    print(f"Current sys.path entries containing 'OpenVoice':")
    for path in sys.path:
        if 'OpenVoice' in path:
            print(f"  - {path}")

    print("\nTrying imports...")

    try:
        # First check what's in the OpenVoice directory
        print(f"\nContents of OpenVoice directory:")
        for item in os.listdir(openvoice_path):
            full_path = os.path.join(openvoice_path, item)
            if os.path.isdir(full_path):
                print(f"  üìÅ {item}/")
                subitems = os.listdir(full_path)
                for subitem in subitems[:3]:  # Show first 3 items
                    print(f"    - {subitem}")
                if len(subitems) > 3:
                    print(f"    ... and {len(subitems) - 3} more")
            else:
                print(f"  üìÑ {item}")

        # Try to find the actual openvoice module
        print("\nSearching for Python modules...")
        for root, _dirs, files in os.walk(openvoice_path):
            if "__init__.py" in files:
                relative_path = root.replace(openvoice_path, "").lstrip("/")
                if relative_path:
                    print(f"  Found Python package: {relative_path}")

        # Try importing with the correct path
        print("\nTrying to import...")

        # Add the parent directory too
        parent_path = "/content"
        if parent_path not in sys.path:
            sys.path.insert(0, parent_path)

        # Check if openvoice module exists
        if importlib.util.find_spec("openvoice") is None:
            print("openvoice module not found in standard locations")
            print("\nTrying manual import...")

            # A package is importable when its PARENT directory is on
            # sys.path, so add the parent of every "openvoice" package found.
            for root, _dirs, files in os.walk(openvoice_path):
                if os.path.basename(root) == "openvoice" and "__init__.py" in files:
                    package_parent = os.path.dirname(root)
                    if package_parent not in sys.path:
                        sys.path.insert(0, package_parent)
                    print(f"Added module path: {package_parent}")
            importlib.invalidate_caches()

        # Final attempt to import; only claim success if it really worked.
        try:
            importlib.import_module("openvoice.se_extractor")
        except Exception as import_error:
            print(f"Import failed: {import_error}")
            print("\nTroubleshooting steps:")
            print("1. Check the directory structure above")
            print("2. The 'openvoice' directory should contain __init__.py")
            print("3. If structure is different, adjust the import path")
        else:
            print("‚úì Successfully imported: from openvoice import se_extractor")
            print("\n‚úÖ OpenVoice setup completed!")

            # Show usage example
            print("\n" + "=" * 60)
            print("Usage Example:")
            print("=" * 60)
            print("""
# Add to your script:
import sys
sys.path.insert(0, '/content/OpenVoice')

# Then try importing
try:
    from openvoice import se_extractor
    from openvoice.api import ToneColorConverter
    print("OpenVoice loaded successfully!")
except ImportError as e:
    print(f"Import error: {e}")
    print("Trying alternative import...")
    import openvoice.se_extractor as se_extractor
            """)

    except Exception as e:
        print(f"Error during setup: {e}")

if __name__ == "__main__":
    main()

In [None]:
# Enable Git LFS, then clone the OpenVoice repository from Hugging Face into
# ./openvoice_checkpoints (later cells copy the converter files out of it).
!git lfs install
!git clone https://huggingface.co/myshell-ai/OpenVoice openvoice_checkpoints

In [None]:
# List every config.json and checkpoint.pth inside the cloned checkpoint repo.
!find openvoice_checkpoints -name "config.json" -o -name "checkpoint.pth"

In [None]:
# Copy the converter files to checkpoints/converter, the location the loader
# cell reads config.json and checkpoint.pth from.
!mkdir -p checkpoints/converter
!cp openvoice_checkpoints/checkpoints/converter/* checkpoints/converter/

In [None]:
# Confirm the converter files were copied.
!ls -l checkpoints/converter

In [None]:
import sys
sys.path.insert(0, '/content/OpenVoice')

import torch
from openvoice.api import ToneColorConverter

# Directory holding config.json and checkpoint.pth for the converter
ckpt_converter = "checkpoints/converter"

# Prefer the first CUDA device when one is available
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print("Using device:", device)

# Build the converter from its config, then load the weights
tone_color_converter = ToneColorConverter(f"{ckpt_converter}/config.json", device=device)
tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")

print("‚úì ToneColorConverter loaded successfully!")

In [None]:
import torch
import os
from openvoice import se_extractor

# Reference audio produced by the preprocessing cell
audio_file = "processed_voice.wav"

if os.path.exists(audio_file):
    print("Found:", audio_file)

    # Extract the speaker embedding with the already-loaded converter
    # (vad=True as in the original call; 'processed' is the working directory).
    target_se, audio_name = se_extractor.get_se(
        audio_file,
        tone_color_converter,
        target_dir='processed',
        vad=True
    )

    # Persist the embedding for the conversion cell
    torch.save(target_se, "speaker_embedding.pt")

    print("\n‚úÖ Speaker embedding extracted!")
    print("Saved as: speaker_embedding.pt")
    print("Shape:", target_se.shape)
else:
    print("‚ùå processed_voice.wav not found in this directory!")

In [None]:
import os
import torch
from openvoice import se_extractor

# Reuse the already-loaded tone_color_converter and device
# Make sure these exist from previous steps:
# - tone_color_converter
# - device

# 1. Paths
source_audio = "download (3).mp3"      # audio whose content you want to convert
embedding_path = "speaker_embedding.pt"   # your saved target voice embedding
output_path = "converted_voice.wav"       # output file

# 2. Basic checks
if not os.path.exists(source_audio):
    raise FileNotFoundError(f"Source audio not found: {source_audio}")

if not os.path.exists(embedding_path):
    raise FileNotFoundError(f"Speaker embedding not found: {embedding_path}")

print("‚úì Found source audio and embedding")

# 3. Load target speaker embedding.
# map_location makes the load work even when the embedding was saved on a GPU
# runtime and this runtime only has a CPU (or the reverse).
tgt_se = torch.load(embedding_path, map_location=device).to(device)
print("‚úì Loaded target speaker embedding with shape:", tgt_se.shape)

# 4. Get source speaker embedding (from the same file for now)
src_se, _ = se_extractor.get_se(
    source_audio,
    tone_color_converter,
    target_dir='processed_src',
    vad=True
)
src_se = src_se.to(device)
print("‚úì Extracted source speaker embedding with shape:", src_se.shape)

# 5. Run conversion
print("\nüéß Converting voice...")
tone_color_converter.convert(
    audio_src_path=source_audio,
    src_se=src_se,
    tgt_se=tgt_se,
    output_path=output_path,
    message="Converting voice to target speaker..."
)

print(f"\n‚úÖ Conversion complete! Saved as: {output_path}")

In [None]:
from IPython.display import Audio, display

# Show a player for the file written by the conversion cell (no autoplay).
output_path = "converted_voice.wav"
display(Audio(output_path, autoplay=False))