In [1]:
# @title Setup: CSM Voice Service

# @markdown ## 1. Install Required Dependencies
# @markdown Run this cell to install all necessary packages for CSM voice generation

# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install torch==2.4.0 torchaudio==2.4.0 transformers==4.49.0 huggingface_hub==0.28.1 tokenizers==0.21.0 moshi==0.2.2 torchtune==0.4.0 torchao==0.9.0
%pip install git+https://github.com/SesameAILabs/silentcipher@master
%pip install flask flask-socketio

# ffmpeg is a system package, so it is installed with apt rather than pip.
!apt-get update && apt-get install -y ffmpeg
!ffmpeg -version



[0mCollecting git+https://github.com/SesameAILabs/silentcipher@master
  Cloning https://github.com/SesameAILabs/silentcipher (to revision master) to /tmp/pip-req-build-e54ub9ga
  Running command git clone --filter=blob:none --quiet https://github.com/SesameAILabs/silentcipher /tmp/pip-req-build-e54ub9ga
  Resolved https://github.com/SesameAILabs/silentcipher to commit d46d7d0893a583d8968ab3a6626e2289faec9152
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting librosa>=0.10.0 (from silentcipher==1.0.5)
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting SoundFile>=0.12.1 (from silentcipher==1.0.5)
  Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting Flask>=2.2.5 (from silentcipher==1.0.5)
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting blinker>=1.9 (from Flask>=2.2.5->si

In [4]:
# Verify CUDA availability
import os
import sys

import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: CUDA not available. Please ensure you're using a GPU runtime.")

# Disable lazy compilation in Mimi as specified in CSM setup
os.environ["NO_TORCH_COMPILE"] = "1"

# Clone the CSM Repository if it doesn't exist yet
if not os.path.exists('csm'):
    !git clone https://github.com/SesameAILabs/csm.git

# Add CSM to Python path instead of changing directory.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
csm_path = os.path.join(os.getcwd(), 'csm')
if csm_path not in sys.path:
    sys.path.append(csm_path)
print("CSM repository prepared and added to Python path")


CUDA available: True
CUDA device: NVIDIA RTX A5000
CSM repository prepared and added to Python path


In [6]:
# @markdown This sets up local directories for saving audio and session data

# Local storage replaces Google Drive: one folder for generated audio,
# one for per-generation session metadata.
!mkdir -p ./psychoanalyst-assistant/generated_audio ./psychoanalyst-assistant/session_data

print("Local storage directories created successfully")

Local storage directories created successfully


In [7]:

# @markdown ## 4. Hugging Face Authentication
# @markdown You'll need to authenticate with Hugging Face to access the models

import os
from getpass import getpass

from huggingface_hub import login

# @markdown Provide your Hugging Face token (create one at https://huggingface.co/settings/tokens)
# The token is taken from the HF_TOKEN environment variable so that it never
# has to be written into (and saved with) the notebook.
hf_token = os.environ.get("HF_TOKEN", "").strip()

# Fall back to an interactive, non-echoing prompt when no token is configured.
if not hf_token:
    hf_token = getpass("Enter your Hugging Face token: ")

login(token=hf_token)
print("Authentication completed")


Authentication completed


In [12]:

# Install silentcipher if not already installed.
# %pip (instead of !pip) targets the environment of the running kernel.
%pip install git+https://github.com/SesameAILabs/silentcipher@master


Collecting git+https://github.com/SesameAILabs/silentcipher@master
  Cloning https://github.com/SesameAILabs/silentcipher (to revision master) to /tmp/pip-req-build-7kt8oftb
  Running command git clone --filter=blob:none --quiet https://github.com/SesameAILabs/silentcipher /tmp/pip-req-build-7kt8oftb
  Resolved https://github.com/SesameAILabs/silentcipher to commit d46d7d0893a583d8968ab3a6626e2289faec9152
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting librosa>=0.10.0 (from silentcipher==1.0.5)
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting SoundFile>=0.12.1 (from silentcipher==1.0.5)
  Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting Flask>=2.2.5 (from silentcipher==1.0.5)
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting blinker>=1.9 (from Flask>=2.2.5->silent

In [11]:
# ============================================
# @title CSM Voice Generation Wrapper

import os
import sys
import torch
import torchaudio
import json
import uuid
from datetime import datetime
import numpy as np
from pathlib import Path

# Install silentcipher if not already installed.
# %pip (instead of !pip) targets the environment of the running kernel, so the
# package is importable by the CSM import below.
%pip install git+https://github.com/SesameAILabs/silentcipher@master

# Add the CSM directory to path properly (only once, so re-runs stay idempotent)
csm_path = os.path.join(os.getcwd(), 'csm')
if csm_path not in sys.path:
    sys.path.append(csm_path)

# Import CSM modules with correct path
from csm.generator import load_csm_1b, Segment

class CSMVoiceService:
    """Wrapper for the CSM voice generation service with persistence.

    Generated audio is written as WAV files to ``<storage_dir>/generated_audio``
    and one JSON metadata file per generation is written to
    ``<storage_dir>/session_data``.
    """

    def __init__(self, storage_dir='./psychoanalyst-assistant'):
        """Initialize the CSM voice service.

        Args:
            storage_dir (str): Root directory for generated audio and metadata.
                The two sub-directories are created if they do not exist.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Initializing CSM Voice Service on {self.device}...")

        # The model is loaded lazily by load_model().
        self.generator = None
        self.storage_dir = Path(storage_dir)
        self.audio_dir = self.storage_dir / 'generated_audio'
        self.session_dir = self.storage_dir / 'session_data'

        # Ensure directories exist
        self.audio_dir.mkdir(parents=True, exist_ok=True)
        self.session_dir.mkdir(parents=True, exist_ok=True)

    def load_model(self):
        """Load the CSM model on first use and return the cached generator."""
        if self.generator is None:
            print("Loading CSM 1B model...")
            self.generator = load_csm_1b(device=self.device)
            print("Model loaded successfully")
        return self.generator

    def _build_context_segments(self, context, target_sample_rate):
        """Convert context dicts into ``Segment`` objects with resampled mono audio.

        Args:
            context (list): Dicts with 'text', 'speaker' and 'audio_path' keys.
                Anything that is not a non-empty list yields no segments.
            target_sample_rate (int): Sample rate the audio is resampled to.

        Returns:
            list: One ``Segment`` per usable context entry, in input order.
        """
        segments = []
        if not context or not isinstance(context, list):
            return segments

        required_keys = ('text', 'speaker', 'audio_path')
        for index, entry in enumerate(context):
            if not isinstance(entry, dict) or not all(key in entry for key in required_keys):
                # Report the drop instead of silently ignoring the entry.
                print(f"Skipping context segment {index}: expected a dict with 'text', 'speaker' and 'audio_path'")
                continue

            audio_path = entry['audio_path']
            if not os.path.exists(audio_path):
                print(f"Skipping context segment {index}: audio file not found: {audio_path}")
                continue

            audio_tensor, sample_rate = torchaudio.load(audio_path)
            # Average over the channel axis so stereo files also become 1-D;
            # for a single-channel file this is the same as squeeze(0).
            audio_tensor = torchaudio.functional.resample(
                audio_tensor.mean(dim=0),
                orig_freq=sample_rate,
                new_freq=target_sample_rate
            )

            segments.append(
                Segment(
                    text=entry['text'],
                    speaker=entry['speaker'],
                    audio=audio_tensor
                )
            )
        return segments

    def generate_voice(self, text, speaker_id=0, context=None, max_audio_length_ms=10000):
        """Generate voice audio from text.

        Args:
            text (str): The text to convert to speech
            speaker_id (int): Speaker identifier (0 or 1)
            context (list): Optional list of previous conversation segments,
                each a dict with 'text', 'speaker' and 'audio_path'
            max_audio_length_ms (int): Maximum audio length in milliseconds

        Returns:
            dict: Information about the generated audio ('id', 'timestamp',
                'text', 'speaker', 'audio_path', 'sample_rate', 'duration_ms')

        Raises:
            Exception: Any error raised while generating or saving is printed
                and re-raised unchanged.
        """
        generator = self.load_model()

        # Process context if provided
        processed_context = self._build_context_segments(context, generator.sample_rate)

        # Generate the audio
        try:
            audio = generator.generate(
                text=text,
                speaker=speaker_id,
                context=processed_context,
                max_audio_length_ms=max_audio_length_ms
            )

            # Save the audio to disk
            audio_id = str(uuid.uuid4())[:8]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{timestamp}_{audio_id}.wav"
            audio_path = self.audio_dir / filename

            # A leading channel axis is added before saving.
            torchaudio.save(
                str(audio_path),
                audio.unsqueeze(0).cpu(),
                generator.sample_rate
            )

            # Log the generation
            metadata = {
                'id': audio_id,
                'timestamp': timestamp,
                'text': text,
                'speaker': speaker_id,
                'audio_path': str(audio_path),
                'sample_rate': generator.sample_rate,
                'duration_ms': len(audio) * 1000 / generator.sample_rate
            }

            # Save metadata
            with open(self.session_dir / f"{audio_id}_metadata.json", 'w') as f:
                json.dump(metadata, f, indent=2)

            return metadata

        except Exception as e:
            print(f"Error generating voice: {e}")
            raise

    def create_conversation_context(self, texts, speaker_ids, max_audio_length_ms=10000):
        """Create a conversation context from texts and speaker ids.

        Each utterance is generated with all previously generated utterances
        as context.

        Args:
            texts (list): List of utterance texts
            speaker_ids (list): List of speaker identifiers matching texts
            max_audio_length_ms (int): Maximum audio length per utterance,
                forwarded to generate_voice

        Returns:
            list: Context segments for use in generate_voice

        Raises:
            ValueError: If texts and speaker_ids differ in length.
        """
        if len(texts) != len(speaker_ids):
            raise ValueError("Number of texts must match number of speaker IDs")

        context = []

        for text, speaker_id in zip(texts, speaker_ids):
            # Generate audio for this segment; a copy is passed so the callee
            # never sees the entry appended below.
            metadata = self.generate_voice(
                text=text,
                speaker_id=speaker_id,
                context=context.copy(),
                max_audio_length_ms=max_audio_length_ms
            )

            # Add this segment to the context
            context.append({
                'text': text,
                'speaker': speaker_id,
                'audio_path': metadata['audio_path']
            })

        return context


Collecting git+https://github.com/SesameAILabs/silentcipher@master
  Cloning https://github.com/SesameAILabs/silentcipher (to revision master) to /tmp/pip-req-build-0nj6byv5
  Running command git clone --filter=blob:none --quiet https://github.com/SesameAILabs/silentcipher /tmp/pip-req-build-0nj6byv5
  Resolved https://github.com/SesameAILabs/silentcipher to commit d46d7d0893a583d8968ab3a6626e2289faec9152
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting librosa>=0.10.0 (from silentcipher==1.0.5)
  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting SoundFile>=0.12.1 (from silentcipher==1.0.5)
  Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting Flask>=2.2.5 (from silentcipher==1.0.5)
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting blinker>=1.9 (from Flask>=2.2.5->silent

ModuleNotFoundError: No module named 'silentcipher'

In [None]:

# ============================================
# @title Flask API for Voice Service

from flask import Flask, request, jsonify
import threading
import base64
import io

class VoiceServiceAPI:
    """Simple Flask API for the CSM Voice Service.

    Routes:
        POST /api/generate -- generate speech for a JSON request body.
        GET  /api/health   -- report device and whether the model is loaded.
    """

    def __init__(self, voice_service, port=5000):
        """Initialize the API server.

        Args:
            voice_service: Object providing ``generate_voice``, ``device`` and
                ``generator`` (as CSMVoiceService does).
            port (int): TCP port the server listens on once started.
        """
        self.voice_service = voice_service
        self.port = port
        self.app = Flask("CSM Voice Service API")
        self._server_thread = None
        self.setup_routes()

    def setup_routes(self):
        """Set up the API routes."""

        @self.app.route("/api/generate", methods=["POST"])
        def generate_voice():
            # silent=True returns None for a missing, non-JSON or malformed
            # body, so those requests get the JSON 400 response below.
            data = request.get_json(silent=True)
            if not isinstance(data, dict) or "text" not in data:
                return jsonify({"error": "Text is required"}), 400

            # Extract parameters with defaults
            text = data["text"]
            if not isinstance(text, str) or not text.strip():
                return jsonify({"error": "Text is required"}), 400

            speaker_id = data.get("speaker_id", 0)
            if isinstance(speaker_id, bool) or not isinstance(speaker_id, int):
                return jsonify({"error": "speaker_id must be an integer"}), 400

            max_length_ms = data.get("max_length_ms", 10000)
            if (
                isinstance(max_length_ms, bool)
                or not isinstance(max_length_ms, (int, float))
                or max_length_ms <= 0
            ):
                return jsonify({"error": "max_length_ms must be a positive number"}), 400

            # Process context if provided.
            # NOTE(review): context entries carry client-supplied 'audio_path'
            # values that the voice service opens from the local filesystem;
            # restrict them to a trusted directory before exposing this API
            # to untrusted clients.
            context = data.get("context", None)

            try:
                # Generate voice
                result = self.voice_service.generate_voice(
                    text=text,
                    speaker_id=speaker_id,
                    context=context,
                    max_audio_length_ms=max_length_ms
                )

                # Encode audio as base64 if requested
                if data.get("include_audio_data", False):
                    audio_path = result["audio_path"]
                    with open(audio_path, "rb") as audio_file:
                        audio_data = audio_file.read()
                    result["audio_base64"] = base64.b64encode(audio_data).decode("utf-8")

                return jsonify(result)
            except Exception as e:
                return jsonify({"error": str(e)}), 500

        @self.app.route("/api/health", methods=["GET"])
        def health_check():
            return jsonify({
                "status": "ok",
                "device": self.voice_service.device,
                "model_loaded": self.voice_service.generator is not None
            })

    def start(self, debug=False, host="0.0.0.0"):
        """Start the API server in a separate daemon thread.

        Calling ``start`` again while this instance's server thread is alive
        returns that thread instead of spawning a second server on the same
        port. A different instance bound to the same port is not detected
        here.

        Args:
            debug (bool): Passed to ``Flask.run``.
            host (str): Interface to bind. The default binds all interfaces;
                pass "127.0.0.1" to accept local connections only.

        Returns:
            threading.Thread: The thread running the Flask server.
        """
        running = getattr(self, "_server_thread", None)
        if running is not None and running.is_alive():
            print(f"API server already running on port {self.port}")
            return running

        thread = threading.Thread(
            target=self.app.run,
            kwargs={
                "host": host,
                "port": self.port,
                "debug": debug,
                # The reloader is disabled because the server runs in a
                # background thread, not as the main program.
                "use_reloader": False
            }
        )
        # Daemon thread: it does not keep the interpreter alive on shutdown.
        thread.daemon = True
        thread.start()
        self._server_thread = thread
        # Printed once the thread is launched; a bind failure is reported
        # later by Flask from within the thread.
        print(f"API server started on port {self.port}")
        return thread


In [10]:
# Make the cloned CSM repository importable as top-level modules
# (`models`, `generator`) for the custom loader below.
import os
import sys

# The clone lives in ./csm relative to the working directory; '/content/csm'
# is kept for Colab. Each entry is added at most once so re-running the cell
# does not grow sys.path.
for _csm_dir in (os.path.join(os.getcwd(), 'csm'), '/content/csm'):
    if _csm_dir not in sys.path:
        sys.path.append(_csm_dir)

def custom_load_csm_model(device=None):
    """Custom implementation to load the CSM 1B model properly.

    Builds the model from an explicit configuration, downloads ``ckpt.pt``
    from the ``sesame/csm-1b`` Hugging Face repository, loads the weights and
    wraps the model in a ``Generator``.

    Args:
        device (str | None): Device to load the model on. When omitted, "cuda"
            is used if available, otherwise "cpu".

    Returns:
        Generator: The ready-to-use CSM generator.
    """
    import torch
    from models import Model, ModelArgs
    from generator import Generator
    from huggingface_hub import hf_hub_download

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    print("Creating model configuration...")
    # Updated configuration to match the checkpoint dimensions
    config = ModelArgs(
        backbone_flavor="llama-1B",
        decoder_flavor="llama-100M",
        text_vocab_size=128256,  # Updated from 32000
        audio_vocab_size=2051,   # Updated from 1024
        audio_num_codebooks=32
    )

    print("Initializing model with config...")
    model = Model(config)

    print("Downloading model weights...")
    model_file = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
    print(f"Downloaded model weights to {model_file}")

    print("Loading weights into model...")
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs.
    state_dict = torch.load(model_file, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)

    print(f"Moving model to {device}...")
    model = model.to(device=device, dtype=torch.bfloat16)

    print("Setting up model caches...")
    model.setup_caches(1)

    print("Setting up generator...")
    generator = Generator(model)

    print("CSM model loaded successfully!")
    return generator

# Replace the load_model method
def load_model(self):
    """Return the service's generator, loading it on first use.

    The custom CSM loader is tried first. If it raises, the error and its
    traceback are printed and the Edge-TTS fallback is tried instead; if the
    fallback also raises, that error is printed and re-raised.
    """
    import traceback

    # Already loaded: nothing to do.
    if self.generator is not None:
        return self.generator

    print("Loading CSM 1B model using custom loader...")
    try:
        self.generator = custom_load_csm_model()
        print("Model loaded successfully")
    except Exception as csm_error:
        print(f"Error loading model: {csm_error}")
        traceback.print_exc()

        # If CSM fails, try the Edge-TTS fallback
        print("\nFalling back to Edge-TTS...")
        try:
            self.generator = create_edge_tts_fallback()
            print("Fallback TTS loaded successfully")
        except Exception as fallback_error:
            print(f"Error loading fallback TTS: {fallback_error}")
            traceback.print_exc()
            raise

    return self.generator

# Create an Edge-TTS fallback that mimics the CSM generator API
def create_edge_tts_fallback():
    """Create a fallback generator using Edge-TTS.

    Installs the ``edge-tts`` package with pip if it cannot be imported.

    Returns:
        EdgeTTSGenerator: Object exposing ``sample_rate`` and
            ``generate(text, speaker, context, max_audio_length_ms)`` like the
            CSM generator.
    """
    # Install Edge-TTS if needed
    import subprocess, sys
    try:
        import edge_tts
    except ImportError:
        print("Installing Edge-TTS...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "edge-tts"])
        import edge_tts

    import asyncio
    import concurrent.futures
    import os
    import tempfile
    import torchaudio

    class EdgeTTSGenerator:
        """A Generator-like interface using Edge-TTS."""

        def __init__(self):
            self.sample_rate = 24000
            self.voices = {
                0: "en-US-AriaNeural",  # Female voice
                1: "en-US-GuyNeural"    # Male voice
            }
            print("Edge-TTS fallback initialized")

        def generate(self, text, speaker=0, context=None, max_audio_length_ms=10000):
            """Generate speech for the given text.

            Args:
                text (str): Text to synthesize.
                speaker (int): Key into ``self.voices``; unknown values use voice 0.
                context: Accepted for API compatibility; not used.
                max_audio_length_ms (int): Accepted for API compatibility; not used.

            Returns:
                torch.Tensor: Mono waveform at ``self.sample_rate``.
            """
            voice = self.voices.get(speaker, self.voices[0])

            # Create a temporary file for the audio.
            # NOTE(review): Edge-TTS is understood to save MP3 data, so the
            # suffix is ".mp3" to let the audio loader pick the right decoder
            # -- confirm against the installed edge-tts version.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
                temp_path = temp_file.name

            # Run the Edge-TTS synthesis (requires async)
            async def run_tts():
                communicate = edge_tts.Communicate(text, voice)
                await communicate.save(temp_path)

            try:
                try:
                    asyncio.get_running_loop()
                except RuntimeError:
                    # No event loop in this thread: run the coroutine directly.
                    asyncio.run(run_tts())
                else:
                    # A loop is already running (e.g. inside a Jupyter kernel),
                    # where asyncio.run() would raise; use a worker thread
                    # with its own loop and wait for it.
                    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                        pool.submit(asyncio.run, run_tts()).result()

                # Load the audio file
                waveform, sample_rate = torchaudio.load(temp_path)
            finally:
                # Clean up the temporary file even when synthesis or loading fails
                os.unlink(temp_path)

            waveform = waveform.mean(dim=0)  # Convert to mono

            # Resample if needed
            if sample_rate != self.sample_rate:
                waveform = torchaudio.functional.resample(
                    waveform, orig_freq=sample_rate, new_freq=self.sample_rate
                )

            return waveform

    return EdgeTTSGenerator()

# Update the method in the class: replaces CSMVoiceService.load_model with the
# module-level load_model defined above.
# Requires the cell that defines CSMVoiceService to have run successfully
# first; otherwise this line raises NameError.
CSMVoiceService.load_model = load_model

NameError: name 'CSMVoiceService' is not defined

In [9]:

# ============================================
# @title Test Voice Generation

# Simple test function
def test_voice_generation(voice_service=None):
    """Generate one test utterance and return its metadata.

    Args:
        voice_service: Service to use. When omitted a new CSMVoiceService is
            created (which loads the model again).

    Returns:
        dict: The metadata returned by ``generate_voice``.
    """
    import sys

    print("Testing CSM Voice Generation...")

    # Initialize the voice service
    if voice_service is None:
        voice_service = CSMVoiceService()

    # Load the model
    voice_service.load_model()

    # Generate a simple utterance
    test_text = "Hello, I'm your therapeutic assistant based on Karen Horney's psychoanalytic framework. How are you feeling today?"

    result = voice_service.generate_voice(
        text=test_text,
        speaker_id=0
    )

    print(f"Generated audio saved to: {result['audio_path']}")

    # Play the audio if in Colab. An IN_COLAB global is honoured when some
    # cell defined it; otherwise Colab is detected from the loaded modules,
    # so this function no longer fails with NameError on a fresh kernel.
    in_colab = globals().get("IN_COLAB", "google.colab" in sys.modules)
    if in_colab:
        from IPython.display import Audio, display
        display(Audio(result['audio_path']))

    return result


In [10]:

# ============================================
# @title Create a Demo Conversation

def create_demo_conversation(voice_service=None):
    """Generate a five-turn demo conversation and return its context.

    Args:
        voice_service: Service to use. When omitted a new CSMVoiceService is
            created (which loads the model again).

    Returns:
        list: The context segments returned by ``create_conversation_context``.
    """
    import sys

    print("Creating a demo conversation...")

    # Initialize the voice service
    if voice_service is None:
        voice_service = CSMVoiceService()

    # Define a conversation between assistant (speaker 0) and user (speaker 1)
    texts = [
        "Hello, I'm your therapeutic assistant based on Karen Horney's psychoanalytic framework. How are you feeling today?",  # Assistant
        "I've been feeling anxious lately, especially at work.",  # User
        "I understand. According to Karen Horney's framework, anxiety often stems from basic conflict between opposing forces in our personality. Can you tell me more about when this anxiety appears?",  # Assistant
        "Mostly in meetings when I have to present my ideas to the team.",  # User
        "That suggests what Horney would call a 'moving toward' pattern, where anxiety appears when seeking approval from others. Let's explore this further."  # Assistant
    ]

    speaker_ids = [0, 1, 0, 1, 0]  # Alternating between assistant and user

    # Create the conversation context
    context = voice_service.create_conversation_context(texts, speaker_ids)

    print(f"Created conversation with {len(context)} segments")

    # Play the last response if in Colab. An IN_COLAB global is honoured when
    # some cell defined it; otherwise Colab is detected from the loaded
    # modules, so this function no longer fails with NameError on a fresh kernel.
    in_colab = globals().get("IN_COLAB", "google.colab" in sys.modules)
    if in_colab and context:
        from IPython.display import Audio, display
        last_segment = context[-1]
        display(Audio(last_segment['audio_path']))

    return context


In [11]:

# ============================================
# @title Start API Service

def start_api_service(port=8080, voice_service=None):
    """Create a VoiceServiceAPI, start it in the background and print usage notes.

    Args:
        port (int): Port the API listens on; also used in the printed
            instructions.
        voice_service: Service to expose. When omitted a new CSMVoiceService
            is created.

    Returns:
        VoiceServiceAPI: The started API object.
    """
    print("Starting Voice Service API...")

    # Initialize the voice service
    if voice_service is None:
        voice_service = CSMVoiceService()

    # Initialize and start the API
    api = VoiceServiceAPI(voice_service, port=port)
    api.start()

    # Instructions for accessing the API
    print(f"\nAPI is running on port {port}")
    print("\nTo generate voice, send a POST request to /api/generate with JSON body:")
    print("""{
    "text": "Your text to convert to speech",
    "speaker_id": 0,
    "max_length_ms": 10000,
    "include_audio_data": true
}""")

    print("\nIn Colab, you'll need to use ngrok to expose the API to the internet.")
    print("Run the following commands in a new cell:")
    print("!pip install pyngrok")
    print("from pyngrok import ngrok")
    print(f"url = ngrok.connect({port})")
    # Deliberately not an f-string: the braces are printed literally as code
    # for the user to copy.
    print("print(f'API available at: {url}')")

    return api

# Run one of the entry points below; uncomment the one you want.
# test_voice_generation()
# create_demo_conversation()
# NOTE(review): this starts an API server on port 8080. A later cell calls
# start_api_service() again and its output reports "Address already in use" --
# confirm that only one of the two calls is meant to run per kernel session.
start_api_service()

Testing CSM Voice Generation...
Initializing CSM Voice Service on cuda...
Loading CSM 1B model using custom loader...
Creating model configuration...
Initializing model with config...
Downloading model weights...
Downloaded model weights to /root/.cache/huggingface/hub/models--sesame--csm-1b/snapshots/03ab46ff5cfdcc783cc76fcf9ea6fd0838503093/ckpt.pt
Loading weights into model...


  state_dict = torch.load(model_file, map_location="cuda")


Moving model to GPU...




Setting up model caches...
Setting up generator...


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

(…)nizer-e351c8d8-checkpoint125.safetensors:   0%|          | 0.00/385M [00:00<?, ?B/s]

ckpt path or config path does not exist! Downloading the model from the Hugging Face Hub...


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

dec_c.ckpt:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

dec_m_0.ckpt:   0%|          | 0.00/9.54M [00:00<?, ?B/s]

dec_m_0.ckpt:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

enc_c.ckpt:   0%|          | 0.00/170k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

opt.ckpt:   0%|          | 0.00/23.4M [00:00<?, ?B/s]

dec_c.ckpt:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

enc_c.ckpt:   0%|          | 0.00/185k [00:00<?, ?B/s]

opt.ckpt:   0%|          | 0.00/23.4M [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

  self.enc_c.load_state_dict(self.convert_dataparallel_to_normal(torch.load(os.path.join(ckpt_dir, "enc_c.ckpt"), map_location=self.device)))
  self.dec_c.load_state_dict(self.convert_dataparallel_to_normal(torch.load(os.path.join(ckpt_dir, "dec_c.ckpt"), map_location=self.device)))
  m.load_state_dict(self.convert_dataparallel_to_normal(torch.load(os.path.join(ckpt_dir, f"dec_m_{i}.ckpt"), map_location=self.device)))


CSM model loaded successfully!
Model loaded successfully
Generated audio saved to: /content/drive/MyDrive/psychoanalyst-assistant/generated_audio/20250319_204509_d3f43773.wav


Creating a demo conversation...
Initializing CSM Voice Service on cuda...
Loading CSM 1B model using custom loader...
Creating model configuration...
Initializing model with config...
Downloading model weights...
Downloaded model weights to /root/.cache/huggingface/hub/models--sesame--csm-1b/snapshots/03ab46ff5cfdcc783cc76fcf9ea6fd0838503093/ckpt.pt
Loading weights into model...
Moving model to GPU...




Setting up model caches...
Setting up generator...
ckpt path or config path does not exist! Downloading the model from the Hugging Face Hub...


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

CSM model loaded successfully!
Model loaded successfully
Created conversation with 5 segments


[{'text': "Hello, I'm your therapeutic assistant based on Karen Horney's psychoanalytic framework. How are you feeling today?",
  'speaker': 0,
  'audio_path': '/content/drive/MyDrive/psychoanalyst-assistant/generated_audio/20250319_204632_ef5db903.wav'},
 {'text': "I've been feeling anxious lately, especially at work.",
  'speaker': 1,
  'audio_path': '/content/drive/MyDrive/psychoanalyst-assistant/generated_audio/20250319_204642_b17a1a0e.wav'},
 {'text': "I understand. According to Karen Horney's framework, anxiety often stems from basic conflict between opposing forces in our personality. Can you tell me more about when this anxiety appears?",
  'speaker': 0,
  'audio_path': '/content/drive/MyDrive/psychoanalyst-assistant/generated_audio/20250319_204714_18059f3c.wav'},
 {'text': 'Mostly in meetings when I have to present my ideas to the team.',
  'speaker': 1,
  'audio_path': '/content/drive/MyDrive/psychoanalyst-assistant/generated_audio/20250319_204728_0a59cc1e.wav'},
 {'text': "T

In [12]:
# Starts the voice API on port 8080 and displays the returned VoiceServiceAPI.
# NOTE(review): the recorded output of this cell ends with "Address already in
# use" for port 8080 -- confirm no other server is bound to it before running.
start_api_service()

Starting Voice Service API...
Initializing CSM Voice Service on cuda...
API server started on port 8080

API is running on port 8080

To generate voice, send a POST request to /api/generate with JSON body:
{
    "text": "Your text to convert to speech",
    "speaker_id": 0,
    "max_length_ms": 10000,
    "include_audio_data": true
}

In Colab, you'll need to use ngrok to expose the API to the internet.
Run the following commands in a new cell:
!pip install pyngrok
from pyngrok import ngrok
url = ngrok.connect(8080)
print(f'API available at: {url}')
 * Serving Flask app 'CSM Voice Service API'
 * Debug mode: off


<__main__.VoiceServiceAPI at 0x7fac2caa4510>

Address already in use
Port 8080 is in use by another program. Either identify and stop that program, or start the server with a different port.
