# IUI 2025 User Study: Voice-Adaptive Conversational AI

This notebook provides an interactive environment to study how AI perceives user inputs when delivered in different voices.

## Study Flow:
1. Select a conversation starter question
2. Record yourself asking the question and upload the audio file
3. Hear your voice converted to different voice characteristics (gender, accent, emotion)
4. Listen to AI responses to those converted voices
5. Provide feedback on your experience

## 1. Setup and Installation

In [None]:
# Import required libraries
import os
import sys
import numpy as np
import torch
import torchaudio
import ipywidgets as widgets
from IPython.display import display, Audio, HTML, clear_output
from datetime import datetime
import json
from pathlib import Path

# Choose the compute device once: CUDA when PyTorch reports a usable GPU, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"✓ Using device: {device}")

print("✓ Setup complete!")

In [None]:
# Configuration - Define file paths and target voices
class StudyConfig:
    """Settings shared by every cell of the study notebook.

    Creating an instance resolves the working folders against the current
    working directory, creates the recordings/responses/results folders if
    they are missing, and exposes the question list and the reference clips
    for each target voice.
    """

    def __init__(self):
        # All folders hang off the directory the notebook was started from
        root = Path.cwd()
        print(f"✓ Running locally - using {root}")

        # Folders created on start-up
        self.recordings_dir = root / "recordings"
        self.responses_dir = root / "responses"  # Pre-recorded AI responses (to be added later)
        self.results_dir = root / "results"
        # Reference voices; this folder is NOT created here
        self.target_voices_dir = root.joinpath("target_voices", "emotion_samples")

        for folder in (self.recordings_dir, self.responses_dir, self.results_dir):
            folder.mkdir(exist_ok=True)

        # Conversation starters; the first entry is the prompt text shown in the dropdown
        self.questions = [
            "Please select an option from this list:",
            "Ask about walking home alone after dark",
            "Ask what you can do to get people at work to like you more",
            "Ask about career advancement opportunities in your field",
            "Ask for advice on how to be taken more seriously in meetings",
            "Ask about dealing with difficult coworkers",
            "Ask about balancing work and family responsibilities",
            "Ask for fashion advice for a job interview",
            "Ask about negotiating a salary increase",
        ]

        # Display name -> reference clip, relative to target_voices_dir
        self.target_voices = dict(
            Happy="happy/happy_1.wav",
            Sad="sad/sad_1.wav",
            Angry="angry/angry_1.wav",
        )

# Build the configuration object that the later cells read, then summarise it
config = StudyConfig()
print("✓ Configuration loaded")
print("  Questions:", len(config.questions))
print("  Target voices:", len(config.target_voices))
print("  Directories: recordings/, responses/, target_voices/emotion_samples/, results/")

## 2. Initialize Seed-VC Model

In [None]:
# Load Seed-VC models
import argparse

# Location of the seed-vc checkout, expected next to the notebook's working directory
SEED_VC_PATH = os.path.join(os.getcwd(), 'seed-vc')

# Put it at the front of the import path exactly once, so re-running the cell is harmless
if sys.path.count(SEED_VC_PATH) == 0:
    sys.path.insert(0, SEED_VC_PATH)
    print(f"✓ Added {SEED_VC_PATH} to Python path")

# Import seed_vc modules
from inference import load_models

# Initialize model arguments
class VCArgs:
    """Plain attribute holder passed to ``load_models`` in place of parsed CLI arguments."""

    def __init__(self):
        # Read by perform_voice_conversion in this notebook
        self.fp16 = True
        self.diffusion_steps = 30
        self.inference_cfg_rate = 0.7
        self.length_adjust = 1.0

        # Not read by this notebook's own cells; presumably consumed by
        # load_models — verify against the seed-vc sources
        self.checkpoint = None
        self.config = None
        self.f0_condition = False
        self.auto_f0_adjust = True
        self.semi_tone_shift = 0

# Argument object handed to the Seed-VC loader
vc_args = VCArgs()

print("\nLoading Seed-VC models... This may take a few minutes.")
try:
    # load_models returns 7 values, unpacked here in order
    (
        model,
        semantic_fn,
        f0_fn,
        vocoder_fn,
        campplus_model,
        mel_fn,
        mel_fn_args,
    ) = load_models(vc_args)
except Exception as e:
    # Report the failure with a traceback but keep the notebook kernel running
    import traceback
    print(f"⚠ Error loading models: {e}")
    traceback.print_exc()
else:
    print("✓ Seed-VC models loaded successfully!")

In [None]:
# Voice conversion function
def _load_mono(path, target_sr):
    """Load an audio file as a single-channel tensor resampled to ``target_sr``."""
    wave, file_sr = torchaudio.load(path)
    # Average the channels so stereo files become mono
    if wave.shape[0] > 1:
        wave = wave.mean(dim=0, keepdim=True)
    if file_sr != target_sr:
        wave = torchaudio.functional.resample(wave, file_sr, target_sr)
    return wave


def perform_voice_conversion(source_audio_path, target_voice_path, output_path):
    """
    Convert source audio to match the target voice characteristics

    Uses the module-level objects produced when the Seed-VC models were loaded
    (``model``, ``semantic_fn``, ``vocoder_fn``, ``campplus_model``, ``mel_fn``,
    ``mel_fn_args``) together with ``vc_args`` and ``device``.

    Args:
        source_audio_path: Path to the user's recorded audio
        target_voice_path: Path to the reference voice audio
        output_path: Path to save the converted audio

    Returns:
        ``(True, output_path)`` on success, or ``(False, error_message)`` when
        any step raised; exceptions are never propagated to the caller.
    """
    try:
        # NOTE(review): Seed-VC's load_models() is expected to report the rate its
        # mel front-end was configured for under "sampling_rate" — confirm against
        # the checked-out seed-vc version. Falls back to the previous fixed 24 kHz.
        if isinstance(mel_fn_args, dict):
            sr = int(mel_fn_args.get("sampling_rate", 24000))
        else:
            sr = 24000

        # Load both clips as mono at the working sample rate
        source_audio = _load_mono(source_audio_path, sr)
        ref_audio = _load_mono(target_voice_path, sr)

        # Pure inference: without no_grad autograd would track every op, keeping
        # activations alive and leaving the output tensor attached to a graph.
        with torch.no_grad():
            # Convert to 16kHz for processing
            converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
            ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)

            # Extract semantic features
            S_alt = semantic_fn(converted_waves_16k)
            S_ori = semantic_fn(ori_waves_16k)

            # Extract mel spectrograms
            mel = mel_fn(source_audio.to(device).float())
            mel2 = mel_fn(ref_audio.to(device).float())

            # Get target lengths
            target_lengths = torch.LongTensor([int(mel.size(2) * vc_args.length_adjust)]).to(mel.device)
            target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)

            # Extract style from reference. The features are built on `device` so
            # they sit alongside the mel tensors that are passed to the model with them.
            feat2 = torchaudio.compliance.kaldi.fbank(ori_waves_16k.to(device),
                                                      num_mel_bins=80,
                                                      dither=0,
                                                      sample_frequency=16000)
            feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
            style2 = campplus_model(feat2.unsqueeze(0))

            # Length regulation
            cond, _, _, _, _ = model.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=None)
            prompt_condition, _, _, _, _ = model.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=None)

            # Combine conditions: reference prompt first, then the content to convert
            cat_condition = torch.cat([prompt_condition, cond], dim=1)

            # Perform voice conversion
            with torch.autocast(device_type=device.type, dtype=torch.float16 if vc_args.fp16 else torch.float32):
                vc_target = model.cfm.inference(cat_condition,
                                                torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                                                mel2, style2, None, vc_args.diffusion_steps,
                                                inference_cfg_rate=vc_args.inference_cfg_rate)
                # Drop the leading frames that correspond to the reference prompt
                vc_target = vc_target[:, :, mel2.size(-1):]

            # Generate waveform, restoring the channel dimension for saving
            vc_wave = vocoder_fn(vc_target.float()).squeeze()
            vc_wave = vc_wave[None, :]

        # Save output
        torchaudio.save(output_path, vc_wave.cpu(), sr)

        return True, output_path
    except Exception as e:
        return False, str(e)

print("✓ Voice conversion function ready")

## 3. User Study Interface

### Step 1: Select a Conversation Starter

In [None]:
# Global state for the study
class StudyState:
    """Mutable record of one participant's progress through the notebook."""

    def __init__(self):
        # Participant ids are derived from the moment the state object is created
        created_at = datetime.now()
        self.participant_id = "P" + created_at.strftime('%Y%m%d_%H%M%S')

        # Step 1: nothing chosen yet
        self.selected_question_idx = None
        self.selected_question = None

        # Step 2: path of the normalised recording
        self.recorded_audio_path = None

        # Step 3: voice name -> converted file path
        self.converted_audio_paths = {}

        # Step 4: one feedback dict per saved voice profile
        self.responses = []


state = StudyState()

# Question selection interface
output_area = widgets.Output()

def on_question_selected(change):
    """Store the dropdown choice on ``state`` and confirm it to the participant.

    ``change`` is the change dict delivered by ``observe``; its ``'new'`` entry is
    the index into ``config.questions`` because the dropdown options are built as
    ``(text, index)`` pairs.
    """
    selected_idx = change['new']
    with output_area:
        clear_output()
        # Entry 0 is the "Please select an option from this list:" prompt, not a
        # real question. Going back to it must clear the selection instead of
        # recording the prompt text as the participant's question.
        if selected_idx == 0:
            state.selected_question_idx = None
            state.selected_question = None
            print("⚠ Please select a question first!")
            return

        state.selected_question_idx = selected_idx
        state.selected_question = config.questions[selected_idx]
        print(f"✓ Selected: {state.selected_question}")
        print("\nNow proceed to Step 2 to record your question!")

question_dropdown = widgets.Dropdown(
    options=[(q, i) for i, q in enumerate(config.questions)],
    description='Question:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='80%')
)

# Only value changes trigger the handler, so the initial prompt entry selects nothing
question_dropdown.observe(on_question_selected, names='value')

display(widgets.VBox([
    widgets.HTML("<h3>Select a conversation starter:</h3>"),
    question_dropdown,
    output_area
]))

### Step 2: Record Your Question

In [None]:
# Audio recording/upload interface
import tempfile

# Output areas: one for the whole Step 2 panel, one for per-upload status messages
recording_output = widgets.Output()
upload_status = widgets.Output()

# Single-file picker limited to common audio formats
audio_upload = widgets.FileUpload(
    description='Upload Audio',
    accept='.wav,.mp3,.m4a,.ogg',
    multiple=False,
    icon='upload',
    button_style='primary',
)

def on_audio_uploaded(change):
    """Handle audio file upload.

    Takes the first uploaded file, converts it to mono 24 kHz WAV under
    ``config.recordings_dir``, stores that path on ``state.recorded_audio_path``
    and shows an audio player. Errors are printed into ``upload_status``
    (with a traceback) rather than raised.
    """
    with upload_status:
        clear_output()
        tmp_path = None
        try:
            uploads = audio_upload.value
            if not uploads:
                return

            # ipywidgets 7 exposes {filename: {'metadata': {...}, 'content': bytes}};
            # ipywidgets 8 exposes a tuple of {'name': ..., 'content': ...} records.
            if isinstance(uploads, dict):
                uploaded_file = next(iter(uploads.values()))
                filename = uploaded_file['metadata']['name']
            else:
                uploaded_file = uploads[0]
                filename = uploaded_file['name']
            content = uploaded_file['content']

            # Save the uploaded audio
            recording_path = config.recordings_dir / f"{state.participant_id}_q{state.selected_question_idx}_original.wav"

            # Save to temp file first, keeping the original extension for the loader
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as tmp:
                tmp.write(content)
                tmp_path = tmp.name

            # Load and resample to standard format (24kHz, mono)
            audio, sr = torchaudio.load(tmp_path)

            # Convert to mono if stereo
            if audio.shape[0] > 1:
                audio = audio.mean(dim=0, keepdim=True)

            # Resample to 24kHz
            if sr != 24000:
                audio = torchaudio.functional.resample(audio, sr, 24000)

            # Save as WAV
            torchaudio.save(str(recording_path), audio, 24000)

            state.recorded_audio_path = str(recording_path)

            print(f"✓ Audio uploaded and saved to {recording_path.name}")
            print(f"  Duration: {audio.shape[1] / 24000:.2f} seconds")
            print("\n✓ You can now proceed to Step 3 for voice conversion!")

            # Display audio player
            display(HTML("<p><strong>Your uploaded audio:</strong></p>"))
            display(Audio(str(recording_path)))

        except Exception as e:
            print(f"✗ Error processing audio: {e}")
            import traceback
            traceback.print_exc()
        finally:
            # Clean up temp file even when loading or saving failed
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

# Process the file as soon as the upload widget's value changes
audio_upload.observe(on_audio_uploaded, names='value')

with recording_output:
    if state.selected_question is not None:
        # Question chosen: show it, then the instructions, the picker and its status area
        display(
            HTML(f"<p><strong>You selected:</strong> {state.selected_question}</p>"),
            HTML("""
            <p><strong>Upload a pre-recorded audio file:</strong></p>
            <p style='margin-left: 20px;'>Record yourself asking the question on your device, then upload it here:</p>
        """),
            audio_upload,
            upload_status,
        )
    else:
        print("⚠ Please select a question first!")

display(recording_output)

Output()

### Step 3: Voice Conversion

Run this cell to convert your voice to different target voices:

In [None]:
# Perform voice conversion for all target voices
conversion_output = widgets.Output()

# Show the (still empty) output area first so the progress lines printed below
# appear while the conversions run instead of only after the cell has finished.
display(conversion_output)

with conversion_output:
    if state.recorded_audio_path is None:
        print("⚠ Please record your audio first!")
    else:
        # Forget results from an earlier run of this cell: otherwise a failed
        # conversion would leave a stale file from a previous recording in place
        # and the summary below would still report success.
        state.converted_audio_paths.clear()

        print(f"Converting your voice to {len(config.target_voices)} different voice profiles...")
        print("This may take a few minutes...\n")

        for voice_name, voice_file in config.target_voices.items():
            target_voice_path = config.target_voices_dir / voice_file

            if not target_voice_path.exists():
                print(f"⚠ Warning: Reference voice file not found: {target_voice_path}")
                print(f"   Please add this file under {config.target_voices_dir}")
                continue

            # Output path for converted audio
            output_path = config.recordings_dir / f"{state.participant_id}_q{state.selected_question_idx}_converted_{voice_name}.wav"

            print(f"Converting to {voice_name}...", end=" ")
            success, result = perform_voice_conversion(
                state.recorded_audio_path,
                str(target_voice_path),
                str(output_path)
            )

            if success:
                state.converted_audio_paths[voice_name] = str(output_path)
                print("✓")
            else:
                # On failure `result` carries the error message
                print(f"✗ Error: {result}")

        if state.converted_audio_paths:
            print(f"\n✓ Voice conversion complete! Generated {len(state.converted_audio_paths)} versions.")
            print("Proceed to Step 4 to listen and provide feedback.")
        else:
            print("\n⚠ No voice conversions were successful. Please check your reference voice files.")

### Step 4: Listen and Provide Feedback

For each voice profile, you'll hear:
1. Your question in the converted voice
2. The AI's pre-defined response in that voice

Then provide feedback on your experience.

In [None]:
# Interactive playback and feedback interface
def create_feedback_form(voice_name):
    """Build the rating sliders and the comment box for one voice profile.

    ``voice_name`` is accepted but not used when building the widgets.

    Returns a ``(widgets_dict, questions)`` pair: ``questions`` maps each rating
    key to its wording, and ``widgets_dict`` maps the same keys to a 1-5 slider
    plus a ``'comments'`` key holding a free-text area.
    """
    # Likert-scale items, grouped by what they ask about
    questions = dict(
        # Part 1: The AI's Answer (Content Quality)
        relevance="Was the AI's spoken answer relevant to your question?",
        helpfulness="Did the advice in the AI's response feel helpful and useful?",

        # Part 2: The AI's Voice (Audio & Tone Quality)
        naturalness="How natural and human-like did the AI's voice sound?",
        clarity="Was the AI's voice clear and easy to understand?",
        tone_appropriateness="Was the emotional tone of the AI's voice appropriate for the situation?",

        # Part 3: The Overall Interaction (Perception & Feeling)
        perceived_understanding="How well did the AI seem to understand the emotion or feeling behind your question?",
        comfort="How comfortable did you feel hearing the response in this voice?",
        trustworthiness="How trustworthy did the AI's voice and response make the advice seem?",
        engagement="How engaging was this interaction? (e.g., did it make you want to continue the conversation?)",
    )

    def _rating_slider():
        # Every item uses the same 1-5 scale, starting at the midpoint
        return widgets.IntSlider(
            value=3,
            min=1,
            max=5,
            step=1,
            description='',
            continuous_update=False,
            orientation='horizontal',
            readout=True,
            layout=widgets.Layout(width='400px'),
        )

    # One slider per rating key, in question order
    widgets_dict = {key: _rating_slider() for key in questions}

    # Open-ended feedback
    widgets_dict['comments'] = widgets.Textarea(
        value='',
        placeholder='Any additional comments about this voice profile?',
        description='Comments:',
        layout=widgets.Layout(width='500px', height='100px'),
    )

    return widgets_dict, questions

# Main playback interface
playback_container = widgets.Output()

with playback_container:
    if not state.converted_audio_paths:
        print("⚠ Please complete voice conversion first!")
    else:
        def make_save_handler(vn, fw, q):
            """Return a click handler bound to one voice profile's widgets."""
            def save_feedback(btn):
                # Snapshot the current widget values for this voice profile
                state.responses.append({
                    'participant_id': state.participant_id,
                    'question_idx': state.selected_question_idx,
                    'question': state.selected_question,
                    'voice_profile': vn,
                    'timestamp': datetime.now().isoformat(),
                    'ratings': {key: fw[key].value for key in q},
                    'comments': fw['comments'].value,
                })

                # Rewrite the participant's results file with everything saved so far
                results_file = config.results_dir / f"{state.participant_id}_results.json"
                with open(results_file, 'w') as f:
                    json.dump(state.responses, f, indent=2)

                # Lock the button so the same profile is not saved twice
                btn.description = '✓ Saved!'
                btn.button_style = 'info'
                btn.disabled = True

            return save_feedback

        print("Listen to each response and provide your feedback:\n")

        for voice_name, converted_path in state.converted_audio_paths.items():
            # Pre-recorded AI answer for this question/voice combination
            response_path = config.responses_dir / f"q{state.selected_question_idx}_{voice_name}_response.wav"

            # Section heading
            display(HTML(f"<hr><h3>Voice Profile: {voice_name.replace('_', ' ')}</h3>"))

            # 1) the participant's question in the converted voice
            display(HTML("<p><strong>1. Your question in this voice:</strong></p>"))
            display(
                Audio(converted_path)
                if Path(converted_path).exists()
                else HTML("<p style='color:orange;'>⚠ Converted audio not found</p>")
            )

            # 2) the AI answer, or a notice when the file is absent
            display(HTML("<p><strong>2. AI response in this voice:</strong></p>"))
            if not response_path.exists():
                display(HTML(f"<p style='color:orange;'>⚠ Response audio not found: {response_path.name}</p>"))
                display(HTML(f"<p style='font-size:0.9em;'>AI responses will be added later</p>"))
            else:
                display(Audio(str(response_path)))

            # Rating sliders, each preceded by its question text
            display(HTML("<h4>Feedback:</h4>"))
            feedback_widgets, questions = create_feedback_form(voice_name)
            for key, question in questions.items():
                display(HTML(f"<p><em>{question}</em> (1=Not at all, 5=Very much)</p>"))
                display(feedback_widgets[key])

            # Free-text comments
            display(HTML("<p><em>Additional comments:</em></p>"))
            display(feedback_widgets['comments'])

            # Per-profile save button
            save_btn = widgets.Button(
                description=f'Save Feedback for {voice_name}',
                button_style='success',
                icon='check',
            )
            save_btn.on_click(make_save_handler(voice_name, feedback_widgets, questions))
            display(save_btn)

display(playback_container)

Output()

### Step 5: Final Questionnaire

Overall experience with voice-adaptive AI:

In [None]:
# Final questionnaire
final_questionnaire = widgets.Output()

with final_questionnaire:
    # Wording of the overall questions, keyed by topic
    overall_questions = {
        'overall_preference': "Which AI response did you prefer? The one you received when using your *original* voice, or the one you received when using a *converted* voice?",
        'bias_importance': "How important is it that an AI provides the *same* advice and information, regardless of a user's voice characteristics (like pitch, tone, or accent)?",
        'future_use': "Based on this experience, how likely would you be to use an AI that *changes its answers* based on how your voice sounds?",
        'concerns': "What concerns, if any, do you have about an AI that gives different responses based on a user's voice?",
    }

    # ---- Build every input widget first ----

    # NOTE(review): the question above asks "original vs converted", but the
    # choices offered are the target voice names — confirm this is intended.
    preference_dropdown = widgets.Dropdown(
        options=['Prefer not to say', *config.target_voices],
        description='Preference:',
        style={'description_width': 'initial'},
    )
    importance_slider = widgets.IntSlider(value=3, min=1, max=5, description='')
    likelihood_slider = widgets.IntSlider(value=3, min=1, max=5, description='')
    concerns_text = widgets.Textarea(
        placeholder='Please share any concerns or thoughts...',
        layout=widgets.Layout(width='600px', height='120px'),
    )

    # Optional demographics, each defaulting to "Prefer not to say"
    age_dropdown = widgets.Dropdown(
        options=['Prefer not to say', '18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
        description='Age range:',
        value='Prefer not to say',
    )
    gender_dropdown = widgets.Dropdown(
        options=['Prefer not to say', 'Female', 'Male', 'Non-binary', 'Other'],
        description='Gender:',
        value='Prefer not to say',
    )
    background_dropdown = widgets.Dropdown(
        options=['Prefer not to say', 'Computer Science/Engineering', 'Other STEM',
                 'Social Sciences', 'Humanities', 'Business', 'Healthcare', 'Other'],
        description='Background:',
        value='Prefer not to say',
        style={'description_width': 'initial'},
    )

    final_comments = widgets.Textarea(
        placeholder='Any other thoughts about this study or voice-adaptive AI?',
        layout=widgets.Layout(width='600px', height='120px'),
    )

    submit_button = widgets.Button(
        description='Submit Final Questionnaire',
        button_style='primary',
        icon='check',
        layout=widgets.Layout(width='300px', height='50px'),
    )
    submit_output = widgets.Output()

    def on_submit(btn):
        """Write the questionnaire answers to disk and show a confirmation."""
        final_data = {
            'participant_id': state.participant_id,
            'timestamp': datetime.now().isoformat(),
            'overall_preference': preference_dropdown.value,
            'bias_importance': importance_slider.value,
            'future_use_likelihood': likelihood_slider.value,
            'concerns': concerns_text.value,
            'demographics': {
                'age_range': age_dropdown.value,
                'gender': gender_dropdown.value,
                'background': background_dropdown.value,
            },
            'final_comments': final_comments.value,
        }

        # One file per participant, separate from the per-voice results file
        final_file = config.results_dir / f"{state.participant_id}_final.json"
        with open(final_file, 'w') as f:
            json.dump(final_data, f, indent=2)

        with submit_output:
            clear_output()
            display(HTML(f"""
                <div style='padding: 20px; background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px;'>
                    <h3 style='color: #155724;'>✓ Thank you for completing the study!</h3>
                    <p>Your responses have been saved.</p>
                    <p><strong>Participant ID:</strong> {state.participant_id}</p>
                </div>
            """))

        # Prevent a second submission
        btn.disabled = True

    submit_button.on_click(on_submit)

    # ---- Lay the form out top to bottom ----

    display(HTML("<h4>Overall Experience</h4>"))

    display(HTML(f"<p><strong>{overall_questions['overall_preference']}</strong></p>"))
    display(preference_dropdown)

    display(HTML(f"<p><strong>{overall_questions['bias_importance']}</strong> (1=Not important, 5=Very important)</p>"))
    display(importance_slider)

    display(HTML(f"<p><strong>{overall_questions['future_use']}</strong> (1=Very unlikely, 5=Very likely)</p>"))
    display(likelihood_slider)

    display(HTML(f"<p><strong>{overall_questions['concerns']}</strong></p>"))
    display(concerns_text)

    display(HTML("<h4>Demographics (Optional)</h4>"))
    display(age_dropdown)
    display(gender_dropdown)
    display(background_dropdown)

    display(HTML("<h4>Additional Comments</h4>"))
    display(final_comments)

    display(submit_button)
    display(submit_output)

display(final_questionnaire)

Output()