In [1]:
# Create a virtual environment named csm_venv in the current working directory
!python -m venv csm_venv

# Activate the venv for this single shell invocation, upgrade pip, install the
# dependencies (torch/torchaudio/transformers are version-pinned; silentcipher is
# installed from the moving `master` branch of its GitHub repository), and register
# the venv as a Jupyter kernel displayed as "Python (CSM Environment)".
# NOTE(review): `!` runs the command in a subshell, so the activation presumably does
# not change the kernel that is currently running — switch the notebook to the
# "Python (CSM Environment)" kernel before running the import cells below; confirm.
!source csm_venv/bin/activate && \
pip install --upgrade pip && \
pip install torch==2.4.0 torchaudio==2.4.0 transformers==4.49.0 && \
pip install git+https://github.com/SesameAILabs/silentcipher@master && \
pip install jupyter ipykernel && \
python -m ipykernel install --user --name=csm_venv --display-name="Python (CSM Environment)"

Collecting pip
  Using cached pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Using cached pip-25.0.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.0.1
Collecting torch==2.4.0
  Using cached torch-2.4.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchaudio==2.4.0
  Using cached torchaudio-2.4.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting transformers==4.49.0
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from torch==2.4.0)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch==2.4.0)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy (from torch==2.4.0)
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx

In [1]:
# Smoke test: confirm that the silentcipher package installed above is importable
# from the kernel this notebook is running on.
import silentcipher
print("silentcipher successfully imported!")

silentcipher successfully imported!


In [3]:
# Install the remaining dependencies (all version-pinned) into csm_venv.
# The venv is re-activated here because each `!` command runs in its own shell.
!source csm_venv/bin/activate && \
pip install torchtune==0.4.0 moshi==0.2.2 torchao==0.9.0

Collecting torchtune==0.4.0
  Using cached torchtune-0.4.0-py3-none-any.whl.metadata (19 kB)
Collecting moshi==0.2.2
  Using cached moshi-0.2.2-py3-none-any.whl
Collecting torchao==0.9.0
  Using cached torchao-0.9.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (14 kB)
Collecting datasets (from torchtune==0.4.0)
  Using cached datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece (from torchtune==0.4.0)
  Using cached sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tiktoken (from torchtune==0.4.0)
  Using cached tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting blobfile>=2 (from torchtune==0.4.0)
  Using cached blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)
Collecting omegaconf (from torchtune==0.4.0)
  Using cached omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting Pillow>=9.4.0 (from torchtune==0.4.

In [4]:
# Make the local CSM checkout importable and verify that its public entry points load.
import os
import sys

# The checkout is expected in ./csm relative to the notebook's working directory.
# NOTE(review): putting the checkout itself on sys.path presumably lets modules inside
# it import each other by bare name — confirm against the csm repository layout.
csm_path = os.path.join(os.getcwd(), 'csm')
if csm_path not in sys.path:
    sys.path.append(csm_path)

# Try importing from CSM
try:
    # First try the direct import
    from csm.generator import load_csm_1b, Segment
    print("Successfully imported CSM modules!")
except ImportError as first_error:
    print(f"Error importing CSM modules: {first_error}")
    # If that fails, try with modified imports
    print("Trying alternate import approach...")
    try:
        # `from csm.generator import ...` needs the directory that CONTAINS csm/ on
        # the path. Guard the append (like the one above) so re-running the cell
        # does not pile up duplicate sys.path entries.
        csm_parent = os.path.dirname(csm_path)
        if csm_parent not in sys.path:
            sys.path.append(csm_parent)
        # Try importing with the module name prefix
        from csm.generator import load_csm_1b, Segment
        print("Successfully imported CSM modules with alternate approach!")
    except ImportError as second_error:
        # Failure is only reported, not raised: later cells that need CSM will fail.
        print(f"All import attempts failed: {second_error}")

Successfully imported CSM modules!


In [6]:
# Test CSM Voice Service using code from the original notebook
import torch
import torchaudio
import time
from pathlib import Path
import os
import sys
import json
import uuid
from datetime import datetime
import numpy as np

# Create the output directory (relative to the current working directory) that the
# cells below write generated audio into; exist_ok=True keeps this cell re-runnable.
output_dir = Path("test_outputs")
output_dir.mkdir(exist_ok=True)

# Create the CSM Voice Service class (simplified from original)
class CSMVoiceService:
    """Wrapper for the CSM voice generation service.

    The CSM generator is loaded lazily on first use. Every generated clip is
    written as a ``.wav`` file into ``storage_dir`` and described by a metadata
    dictionary returned to the caller.
    """

    def __init__(self, storage_dir='./test_outputs'):
        """Initialize the CSM voice service.

        Args:
            storage_dir: Directory that receives the generated ``.wav`` files.
                It is created, including missing parent directories, if needed.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Initializing CSM Voice Service on {self.device}...")

        self.generator = None
        self.storage_dir = Path(storage_dir)
        # parents=True so a nested storage location does not fail on first use.
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def load_model(self):
        """Load the CSM model on first call and return the cached generator.

        Returns:
            The generator object produced by ``load_csm_1b`` for ``self.device``.
        """
        if self.generator is None:
            print("Loading CSM 1B model...")
            # Imported lazily so the class can be defined before CSM is importable.
            from csm.generator import load_csm_1b
            self.generator = load_csm_1b(device=self.device)
            print("Model loaded successfully")
        return self.generator

    def generate_voice(self, text, speaker_id=0, context=None, max_audio_length_ms=10000):
        """Generate voice audio from text and save it as a WAV file.

        Args:
            text: Text to synthesize.
            speaker_id: Speaker index forwarded to the generator.
            context: Accepted for interface compatibility. NOTE: context conversion
                is not implemented in this simplified service, so the generator is
                always called with an empty context list.
            max_audio_length_ms: Upper bound on the generated audio length, forwarded
                to the generator.

        Returns:
            dict with keys ``id``, ``timestamp``, ``text``, ``speaker``,
            ``audio_path``, ``sample_rate`` and ``duration_ms``.

        Raises:
            Exception: Any error raised during generation or saving is printed and
                then re-raised unchanged.
        """
        generator = self.load_model()

        # Important: the generator expects a list, never None, for its context.
        processed_context = []

        # Generate the audio
        try:
            result = generator.generate(
                text=text,
                speaker=speaker_id,
                context=processed_context,
                max_audio_length_ms=max_audio_length_ms
            )
            # generate() may hand back the waveform directly or wrapped in a tuple.
            # Indexing a bare 1-D tensor with [0] would keep only its first sample,
            # so unwrap only real containers.
            audio = result[0] if isinstance(result, (tuple, list)) else result

            # torchaudio.save requires a [channels, samples] tensor.
            waveform = audio.unsqueeze(0) if audio.dim() == 1 else audio

            # Save the audio to disk
            audio_id = str(uuid.uuid4())[:8]
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{timestamp}_{audio_id}.wav"
            audio_path = self.storage_dir / filename

            torchaudio.save(
                str(audio_path),
                waveform.cpu(),
                generator.sample_rate
            )

            # Calculate audio length in milliseconds from the sample count
            audio_length = waveform.shape[-1] * 1000 / generator.sample_rate

            # Create metadata
            metadata = {
                'id': audio_id,
                'timestamp': timestamp,
                'text': text,
                'speaker': speaker_id,
                'audio_path': str(audio_path),
                'sample_rate': generator.sample_rate,
                'duration_ms': audio_length
            }

            print(f"Generated audio saved to {audio_path}")
            print(f"Audio duration: {audio_length/1000:.2f}s")

            return metadata

        except Exception as e:
            print(f"Error generating voice: {e}")
            raise

In [8]:
# Initialize the service and run one generation end to end, printing the type and
# shape of everything the generator returns so the save path can be debugged.
voice_service = CSMVoiceService()

# Test with a simple therapeutic statement
test_text = "I notice patterns in how you relate to others that may connect to Karen Horney's concept of moving toward people. How does seeking approval affect your sense of self?"

print("Generating test voice...")
try:
    # Call the generator directly (instead of generate_voice) to inspect its raw output
    generator = voice_service.load_model()

    print("Calling generator.generate()...")
    audio = generator.generate(
        text=test_text,
        speaker=0,
        context=[],  # Empty list for context
        max_audio_length_ms=10000
    )

    # Print audio type and shape information
    print(f"Type of returned value: {type(audio)}")
    if isinstance(audio, tuple):
        print(f"Tuple length: {len(audio)}")
        audio = audio[0]  # Extract audio from tuple if needed

    print(f"Audio tensor shape: {audio.shape}")
    print(f"Audio tensor dtype: {audio.dtype}")

    # Ensure audio is correctly shaped for torchaudio.save (requires [channels, samples])
    if len(audio.shape) == 1:
        # Convert to [1, samples] for mono audio
        audio = audio.unsqueeze(0)
    print(f"Reshaped audio tensor: {audio.shape}")

    # Save with explicit shape control
    audio_path = "test_outputs/debug_output.wav"
    print(f"Saving audio to {audio_path}...")

    # Ensure the tensor is on the CPU before it is written to disk
    audio = audio.cpu()
    torchaudio.save(
        audio_path,
        audio,
        generator.sample_rate
    )

    print(f"Successfully saved audio to {audio_path}")

    # Try to play audio if in notebook
    try:
        from IPython.display import Audio, display
        display(Audio(audio_path))
        print("Audio player displayed above")
    except Exception:
        # Best effort only. Catch Exception rather than using a bare `except:` so
        # KeyboardInterrupt / SystemExit still propagate.
        print("Could not display audio player")

except Exception as e:
    # Debug cell: report the failure with a full traceback instead of stopping the notebook
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()


# Note: This cell may emit PyTorch deprecation warnings about torch.load and about tensor transposition (tensor.T).
# Possible fixes if a future PyTorch release turns these warnings into errors:
# 1. For torch.load warnings: pass weights_only=True when loading the checkpoints.
# 2. For tensor.T warnings: use tensor.mT to transpose the last two dimensions, or tensor.permute() to reorder dimensions explicitly.
# Both warnings originate in the silentcipher dependency, so fully resolving them may require a custom fork.

Initializing CSM Voice Service on cuda...
Generating test voice...
Loading CSM 1B model...
ckpt path or config path does not exist! Downloading the model from the Hugging Face Hub...


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Model loaded successfully
Calling generator.generate()...


  self.enc_c.load_state_dict(self.convert_dataparallel_to_normal(torch.load(os.path.join(ckpt_dir, "enc_c.ckpt"), map_location=self.device)))
  self.dec_c.load_state_dict(self.convert_dataparallel_to_normal(torch.load(os.path.join(ckpt_dir, "dec_c.ckpt"), map_location=self.device)))
  m.load_state_dict(self.convert_dataparallel_to_normal(torch.load(os.path.join(ckpt_dir, f"dec_m_{i}.ckpt"), map_location=self.device)))


Type of returned value: <class 'torch.Tensor'>
Audio tensor shape: torch.Size([240000])
Audio tensor dtype: torch.float32
Reshaped audio tensor: torch.Size([1, 240000])
Saving audio to test_outputs/debug_output.wav...
Successfully saved audio to test_outputs/debug_output.wav


Audio player displayed above
