# IUI 2025 User Study: Voice-Adaptive Conversational AI

This notebook provides an interactive environment to study how AI perceives user inputs when delivered in different voices.

## Study Flow:
1. Select a conversation starter question
2. Record yourself asking the question
3. Hear your voice converted to different voice characteristics (gender, accent, emotion)
4. Listen to AI responses to those converted voices
5. Provide feedback on your experience

## 1. Setup and Installation

In [6]:
# Import required libraries
import os
import sys
import numpy as np
import torch
import torchaudio
import ipywidgets as widgets
from IPython.display import display, Audio, HTML, clear_output
from datetime import datetime
import json
from pathlib import Path

# Choose the compute device: CUDA when torch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the choice so the operator can confirm the environment
print(f"‚úì Using device: {device}")
print("‚úì Setup complete!")

‚úì Using device: cpu
‚úì Setup complete!


In [7]:
# Configuration - Define file paths and target voices
class StudyConfig:
    """Paths, conversation starters and target voices used by the study.

    Args:
        base_dir: Root folder that holds ``recordings/``, ``responses/``,
            ``target_voices/emotion_samples/`` and ``results/``. When omitted
            the current working directory is used (the previous behaviour).
            Accepts a ``str`` or ``Path``.

    Side effects:
        Prints the root folder and creates ``recordings/``, ``responses/`` and
        ``results/`` under it if they are missing. The target-voice folder is
        only referenced, never created.
    """

    def __init__(self, base_dir=None):
        # Use current working directory unless the caller supplies a root
        base_dir = Path.cwd() if base_dir is None else Path(base_dir)
        self.base_dir = base_dir
        print(f"‚úì Running locally - using {base_dir}")
        
        # Directories for audio files
        self.recordings_dir = base_dir / "recordings"
        self.responses_dir = base_dir / "responses"  # Pre-recorded AI responses (to be added later)
        self.target_voices_dir = base_dir / "target_voices" / "emotion_samples"  # Reference voices
        self.results_dir = base_dir / "results"
        
        # Create directories if they don't exist; parents=True so a root
        # folder that does not exist yet is created as well
        for dir_path in [self.recordings_dir, self.responses_dir, self.results_dir]:
            dir_path.mkdir(parents=True, exist_ok=True)
        
        # Conversation starters. Entry 0 is the dropdown prompt, not a question.
        self.questions = [
            "Please select an option from this list:",
            "Ask about walking home alone after dark",
            "Ask what you can do to get people at work to like you more",
            "Ask about career advancement opportunities in your field",
            "Ask for advice on how to be taken more seriously in meetings",
            "Ask about dealing with difficult coworkers",
            "Ask about balancing work and family responsibilities",
            "Ask for fashion advice for a job interview",
            "Ask about negotiating a salary increase"
        ]
        
        # Target voices - using emotion samples.
        # Values are paths relative to self.target_voices_dir.
        self.target_voices = {
            "Happy": "happy/happy_1.wav",
            "Sad": "sad/sad_1.wav",
            "Angry": "angry/angry_1.wav",
        }

# Build the shared configuration object that the later cells read from
config = StudyConfig()

# Summarise what was loaded in one write; the printed text is unchanged
print(
    f"‚úì Configuration loaded",
    f"  Questions: {len(config.questions)}",
    f"  Target voices: {len(config.target_voices)}",
    f"  Directories: recordings/, responses/, target_voices/emotion_samples/, results/",
    sep="\n",
)

‚úì Running locally - using /Users/shreeharshabs/Desktop/iui_2025
‚úì Configuration loaded
  Questions: 9
  Target voices: 3
  Directories: recordings/, responses/, target_voices/emotion_samples/, results/


## 2. User Study Interface

### Step 1: Select a Conversation Starter

In [8]:
# Global state for the study
class StudyState:
    """Mutable per-session state shared by the notebook's cells."""

    def __init__(self):
        # Participant ID is "P" followed by the creation time (second resolution)
        created_at = datetime.now()
        self.participant_id = "P" + created_at.strftime('%Y%m%d_%H%M%S')

        # Set by the question dropdown in Step 1
        self.selected_question_idx = None
        self.selected_question = None

        # Set once a recording has been saved in Step 2
        self.recorded_audio_path = None

        # voice name -> converted file path, filled in by Step 3
        self.converted_audio_paths = dict()

        # Per-voice feedback records appended in Step 4
        self.responses = list()
        
state = StudyState()

# Question selection interface
output_area = widgets.Output()

# config.questions[0] is the "Please select an option from this list:" prompt,
# so that index must never be stored as the participant's chosen question.
PLACEHOLDER_QUESTION_IDX = 0

def on_question_selected(change):
    """Store the chosen conversation starter in ``state`` and confirm it.

    Args:
        change: Change dict delivered by the dropdown's ``observe`` hook;
            ``change['new']`` is the index of the chosen entry in
            ``config.questions``.

    Selecting the placeholder entry clears any previous selection instead of
    recording the prompt text as a question.
    """
    chosen_idx = change['new']
    with output_area:
        clear_output()
        if chosen_idx == PLACEHOLDER_QUESTION_IDX:
            # Going back to the prompt means "nothing selected"
            state.selected_question_idx = None
            state.selected_question = None
            print("‚ö† Please select a question first!")
            return
        state.selected_question_idx = chosen_idx
        state.selected_question = config.questions[chosen_idx]
        print(f"‚úì Selected: {state.selected_question}")
        print("\nNow proceed to Step 2 to record your question!")

# Each option is (label, value): the label is the question text, the value its index
question_dropdown = widgets.Dropdown(
    options=[(q, i) for i, q in enumerate(config.questions)],
    description='Question:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='80%')
)

# Only react to changes of the selected value, not to other trait changes
question_dropdown.observe(on_question_selected, names='value')

display(widgets.VBox([
    widgets.HTML("<h3>Select a conversation starter:</h3>"),
    question_dropdown,
    output_area
]))

VBox(children=(HTML(value='<h3>Select a conversation starter:</h3>'), Dropdown(description='Question:', layout‚Ä¶

### Step 2: Record Your Question

In [9]:
# Audio recording interface with reliable WAV save
from IPython.display import HTML
import base64

recording_output = widgets.Output()

# Hidden widget to shuttle base64 WAV from JS to Python
b64_box = widgets.Textarea(value='', layout=widgets.Layout(display='none'))
# The JS below finds the widget through this CSS class
b64_box.add_class('recording-b64')

with recording_output:
    if state.selected_question is None:
        print("‚ö† Please select a question first!")
    else:
        display(HTML(f"<p><strong>You selected:</strong> {state.selected_question}</p>"))
        display(HTML("<p>Click the button below to record yourself asking this question:</p>"))
        
        # WebAudio-based recorder (PCM -> WAV), no MediaRecorder dependency.
        # Changes relative to the first version of this script:
        #   * script-level state uses `var`, so re-running the cell should not hit
        #     a redeclaration error where the frontend evaluates the script in the
        #     page's global scope
        #   * a failed microphone request is reported instead of being swallowed
        #   * the AudioContext is closed when recording stops
        #   * an empty recording is rejected instead of being sent to Python
        display(HTML(
            """
<div id=\"audio-recorder\">
  <button id=\"start-recording\" onclick=\"startRecording()\">üé§ Start Recording</button>
  <button id=\"stop-recording\" onclick=\"stopRecording()\" disabled>‚èπ Stop Recording</button>
  <div id=\"recording-status\" style=\"margin-top:6px;\"></div>
  <audio id=\"audio-playback\" controls style=\"display:none; margin-top:8px;\"></audio>
</div>
<script>
var audioContext;
var processor;
var input;
var globalStream;
var recording = false;
var leftChannel = [];
var recordingLength = 0;
var sampleRate = 48000;

// Concatenate the captured chunks into one Float32Array
function mergeBuffers(channelBuffer, recordingLength){
  const result = new Float32Array(recordingLength);
  let offset = 0;
  for (let i = 0; i < channelBuffer.length; i++){
    result.set(channelBuffer[i], offset);
    offset += channelBuffer[i].length;
  }
  return result;
}

// Clamp each sample to [-1, 1] and write it as little-endian signed 16-bit
function floatTo16BitPCM(output, offset, input){
  for (let i = 0; i < input.length; i++, offset += 2){
    let s = Math.max(-1, Math.min(1, input[i]));
    output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
  }
}

function writeString(view, offset, string){
  for (let i = 0; i < string.length; i++){
    view.setUint8(offset + i, string.charCodeAt(i));
  }
}

// Build a 44-byte RIFF/WAVE header followed by mono 16-bit PCM data
function encodeWAV(samples, sampleRate){
  const buffer = new ArrayBuffer(44 + samples.length * 2);
  const view = new DataView(buffer);

  writeString(view, 0, 'RIFF');
  view.setUint32(4, 36 + samples.length * 2, true);
  writeString(view, 8, 'WAVE');
  writeString(view, 12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true); // PCM
  view.setUint16(22, 1, true); // mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate
  view.setUint16(32, 2, true); // block align
  view.setUint16(34, 16, true); // bits per sample
  writeString(view, 36, 'data');
  view.setUint32(40, samples.length * 2, true);
  floatTo16BitPCM(view, 44, samples);
  return view;
}

async function startRecording(){
  leftChannel = [];
  recordingLength = 0;
  const statusEl = document.getElementById('recording-status');
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    // Without this the rejection is unhandled and the UI shows nothing
    statusEl.textContent = 'Could not access the microphone: ' + (err && err.message ? err.message : err);
    return;
  }
  globalStream = stream;
  audioContext = new (window.AudioContext || window.webkitAudioContext)();
  sampleRate = audioContext.sampleRate || 48000;
  input = audioContext.createMediaStreamSource(stream);
  processor = audioContext.createScriptProcessor(4096, 1, 1);
  processor.onaudioprocess = (e) => {
    if (!recording) return;
    const channelData = e.inputBuffer.getChannelData(0);
    // Copy the chunk before storing it
    leftChannel.push(new Float32Array(channelData));
    recordingLength += channelData.length;
  };
  input.connect(processor);
  processor.connect(audioContext.destination);
  recording = true;
  document.getElementById('start-recording').disabled = true;
  document.getElementById('stop-recording').disabled = false;
  statusEl.innerHTML = 'üî¥ Recording...';
}

function stopRecording(){
  recording = false;
  document.getElementById('start-recording').disabled = false;
  document.getElementById('stop-recording').disabled = true;
  document.getElementById('recording-status').innerHTML = 'Processing audio...';

  if (processor){ processor.disconnect(); }
  if (input){ input.disconnect(); }
  if (globalStream){ globalStream.getTracks().forEach(t => t.stop()); }
  // Close the context so repeated recordings do not accumulate open AudioContexts
  if (audioContext && audioContext.state !== 'closed'){ audioContext.close(); }

  if (recordingLength === 0){
    // Nothing was captured; do not send a header-only WAV to Python
    document.getElementById('recording-status').innerHTML = 'No audio was captured. Please try recording again.';
    return;
  }

  const merged = mergeBuffers(leftChannel, recordingLength);
  const wavView = encodeWAV(merged, sampleRate);
  const audioBlob = new Blob([wavView], { type: 'audio/wav' });
  const audioUrl = URL.createObjectURL(audioBlob);

  const audioPlayer = document.getElementById('audio-playback');
  audioPlayer.src = audioUrl;
  audioPlayer.style.display = 'block';

  const reader = new FileReader();
  reader.readAsDataURL(audioBlob);
  reader.onloadend = () => {
    // Strip the data-URL prefix and keep only the base64 payload
    const base64Audio = reader.result.split(',')[1];
    // Find the ipywidgets Textarea by wrapper class and then its textarea
    const wrapper = document.querySelector('.recording-b64');
    const ta = wrapper ? wrapper.querySelector('textarea') : null;
    if (ta){
      ta.value = base64Audio;
      // Fire both input and change to ensure widget sync
      ta.dispatchEvent(new Event('input', { bubbles: true }));
      ta.dispatchEvent(new Event('change', { bubbles: true }));
      document.getElementById('recording-status').innerHTML = '‚úì Recording complete! Saved to Python.';
    } else {
      document.getElementById('recording-status').innerHTML = '‚úó Could not locate widget bridge. Please re-run this cell.';
    }
  };
}
</script>
"""
        ))

# Python-side save handler

# Dedicated area for save feedback. Output produced inside a widget callback is
# not reliably attached to the cell, and clearing it must not remove the
# recorder UI, so the handler writes into its own Output widget.
save_output = widgets.Output()

def _save_recording(change):
    """Decode the base64 WAV pushed from the browser and write it to disk.

    Args:
        change: Change dict delivered by ``b64_box.observe``; ``change['new']``
            holds the base64 text placed in the hidden textarea. An empty
            value is ignored.

    On success the file is written to
    ``config.recordings_dir / "<participant_id>_q<question_idx>.wav"``, the path
    is stored in ``state.recorded_audio_path`` and a player is shown. Any
    failure is reported in ``save_output`` rather than raised.
    """
    b64 = change['new']
    if not b64:
        return
    with save_output:
        try:
            if state.selected_question_idx is None:
                raise ValueError("No question selected. Please select a question.")
            audio_bytes = base64.b64decode(b64)
            # The browser-side encoder always starts the file with 'RIFF';
            # anything else means the payload was corrupted or truncated.
            if audio_bytes[:4] != b'RIFF':
                raise ValueError("Received data is not a WAV file.")
            rec_path = config.recordings_dir / f"{state.participant_id}_q{state.selected_question_idx}.wav"
            with open(rec_path, 'wb') as f:
                f.write(audio_bytes)
            state.recorded_audio_path = str(rec_path)
            clear_output(wait=True)
            display(HTML(f"<p><strong>Saved:</strong> {rec_path.name}</p>"))
            display(Audio(state.recorded_audio_path))
            display(HTML("<p>Proceed to Step 3 to run voice conversion.</p>"))
        except Exception as e:
            # Best-effort: show the problem to the participant instead of raising
            clear_output(wait=True)
            print(f"‚úó Failed to save recording: {e}")

b64_box.observe(_save_recording, names='value')

# Render components
with recording_output:
    display(b64_box)

display(recording_output)
display(save_output)

Output()

### Step 3: Voice Conversion (CLI)

Run this cell to convert your recording with the Seed-VC inference script (`seed-vc/inference.py`), once for each target voice. It expects the checkpoint `DiT_uvit_tat_xlsr_ema.pth` in the notebook's working directory, and the config files `configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml` and `configs/hifigan.yml` inside `seed-vc/`.

In [10]:
# Run Seed-VC CLI for each target voice and display results
import os
import sys
import torch
import subprocess
from pathlib import Path

cli_output = widgets.Output()

with cli_output:
    # Validate source recording
    if state.recorded_audio_path is None or not Path(state.recorded_audio_path).exists():
        print("‚ö† Please record your audio first (Step 2).")
    else:
        source_path = Path(state.recorded_audio_path)
        # Prepare paths
        SEED_VC_PATH = os.path.join(os.getcwd(), 'seed-vc')
        INFERENCE_SCRIPT = Path(SEED_VC_PATH) / 'inference.py'
        checkpoint_path = Path(os.getcwd()) / 'DiT_uvit_tat_xlsr_ema.pth'
        config_path = Path(SEED_VC_PATH) / 'configs' / 'presets' / 'config_dit_mel_seed_uvit_xlsr_tiny.yml'
        # Not passed on the command line; only checked for presence below
        hifi_config = Path(SEED_VC_PATH) / 'configs' / 'hifigan.yml'

        if not INFERENCE_SCRIPT.exists():
            print(f"‚úó Inference script not found: {INFERENCE_SCRIPT}")
        elif not checkpoint_path.exists():
            print(f"‚úó Checkpoint not found: {checkpoint_path}")
        elif not config_path.exists():
            print(f"‚úó Config not found: {config_path}")
        elif not hifi_config.exists():
            print(f"‚úó Missing required file: {hifi_config}")
        else:
            # Parameters
            diffusion_steps = 30
            length_adjust = 1.0
            inference_cfg_rate = 0.7
            use_fp16 = torch.cuda.is_available() or torch.backends.mps.is_available()

            # Output directory
            output_dir = config.recordings_dir / f"{state.participant_id}_cli"
            output_dir.mkdir(exist_ok=True, parents=True)

            print(f"Using:\n- source: {source_path.name}\n- checkpoint: {checkpoint_path.name}\n- config: {config_path.name}\n- hifigan: {hifi_config.name}\n- output: {output_dir}\n")

            # Clear previous converted paths to avoid mixing
            state.converted_audio_paths = {}

            # Run conversion for each target voice
            for voice_name, rel_path in config.target_voices.items():
                target_path = config.target_voices_dir / rel_path
                if not target_path.exists():
                    print(f"‚ö† Skipping {voice_name}: missing {target_path}")
                    continue

                print(f"Converting ‚Üí {voice_name} ...")
                cmd = [
                    sys.executable,
                    str(INFERENCE_SCRIPT),
                    "--source", str(source_path),
                    "--target", str(target_path),
                    "--output", str(output_dir),
                    "--diffusion-steps", str(diffusion_steps),
                    "--length-adjust", str(length_adjust),
                    "--inference-cfg-rate", str(inference_cfg_rate),
                    "--fp16", "True" if use_fp16 else "False",
                    "--checkpoint", str(checkpoint_path),
                    "--config", str(config_path),
                ]

                # Snapshot the folder first so the fallback below can only pick
                # a file that THIS conversion created or rewrote, never one
                # left behind by another voice or an earlier run.
                wav_mtimes_before = {
                    p: p.stat().st_mtime_ns for p in output_dir.glob("*.wav")
                }

                # Run from seed-vc so relative configs resolve
                result = subprocess.run(cmd, capture_output=True, text=True, cwd=SEED_VC_PATH)
                if result.returncode != 0:
                    print(f"‚úó CLI error for {voice_name}:\n{result.stderr.strip() or result.stdout.strip()}")
                    continue

                # Expected output filename pattern from inference.py
                src_stem = source_path.stem
                tgt_stem = target_path.stem
                out_name = f"vc_{src_stem}_{tgt_stem}_{length_adjust}_{diffusion_steps}_{inference_cfg_rate}.wav"
                out_file = output_dir / out_name

                # Fallback: newest wav that is new or modified since the snapshot
                if not out_file.exists():
                    fresh_wavs = [
                        p for p in output_dir.glob("*.wav")
                        if wav_mtimes_before.get(p) != p.stat().st_mtime_ns
                    ]
                    out_file = max(fresh_wavs, key=lambda p: p.stat().st_mtime_ns, default=None)

                if out_file and out_file.exists():
                    state.converted_audio_paths[voice_name] = str(out_file)
                    print(f"‚úì Saved: {out_file.name}")
                    display(Audio(str(out_file)))
                else:
                    print(f"‚úó Could not locate output for {voice_name}")

            if len(state.converted_audio_paths) > 0:
                print("\n‚úì CLI conversions complete. Proceed to Step 4.")
            else:
                print("\n‚ö† No conversions produced output. Check logs above.")

display(cli_output)

Output(outputs=({'name': 'stdout', 'text': 'Using:\n- source: P20251117_211314_q2.wav\n- checkpoint: DiT_uvit_‚Ä¶

### Step 4: Listen and Provide Feedback

For each voice profile, you'll hear:
1. Your question in the converted voice
2. The AI's pre-defined response in that voice

Then provide feedback on your experience.

In [11]:
# Interactive playback and feedback interface
def create_feedback_form(voice_name):
    """Build the rating sliders and comment box for one voice profile.

    Args:
        voice_name: Name of the voice profile the form is for. It is accepted
            for the caller's convenience and not used when building widgets.

    Returns:
        A ``(widgets_dict, questions)`` tuple. ``questions`` maps each rating
        key to its prompt text. ``widgets_dict`` maps the same keys to an
        ``IntSlider`` ranging 1-5 (initially 3) and additionally holds a
        ``'comments'`` ``Textarea``.
    """

    def _rating_slider():
        # One 1-5 slider per question; a fresh widget is needed for each key
        return widgets.IntSlider(
            value=3,
            min=1,
            max=5,
            step=1,
            description='',
            continuous_update=False,
            orientation='horizontal',
            readout=True,
            layout=widgets.Layout(width='400px')
        )

    # Likert scale questions, in display order
    questions = dict([
        # Part 1: The AI's Answer (Content Quality)
        ('relevance', "Was the AI's spoken answer relevant to your question?"),
        ('helpfulness', "Did the advice in the AI's response feel helpful and useful?"),

        # Part 2: The AI's Voice (Audio & Tone Quality)
        ('naturalness', "How natural and human-like did the AI's voice sound?"),
        ('clarity', "Was the AI's voice clear and easy to understand?"),
        ('tone_appropriateness', "Was the emotional tone of the AI's voice appropriate for the situation?"),

        # Part 3: The Overall Interaction (Perception & Feeling)
        ('perceived_understanding', "How well did the AI seem to understand the emotion or feeling behind your question?"),
        ('comfort', "How comfortable did you feel hearing the response in this voice?"),
        ('trustworthiness', "How trustworthy did the AI's voice and response make the advice seem?"),
        ('engagement', "How engaging was this interaction? (e.g., did it make you want to continue the conversation?)"),
    ])

    # Slider widgets keyed exactly like the questions
    widgets_dict = {rating_key: _rating_slider() for rating_key in questions}

    # Open-ended feedback
    widgets_dict['comments'] = widgets.Textarea(
        value='',
        placeholder='Any additional comments about this voice profile?',
        description='Comments:',
        layout=widgets.Layout(width='500px', height='100px')
    )

    return widgets_dict, questions

# Main playback interface
playback_container = widgets.Output()

def make_save_handler(vn, fw, q, question_idx=None, question=None):
    """Create the click handler that stores feedback for one voice profile.

    Args:
        vn: Voice profile name written to the record.
        fw: Widget dict from ``create_feedback_form`` (sliders + 'comments').
        q: Question dict from ``create_feedback_form``; its keys select which
            slider values are saved as ratings.
        question_idx: Index of the question the form was rendered for. When
            ``None`` the value in ``state`` at click time is used.
        question: Text of that question, with the same ``None`` fallback.

    Returns:
        A callable suitable for ``Button.on_click``.
    """
    def save_feedback(btn):
        feedback_data = {
            'participant_id': state.participant_id,
            # Prefer the question captured at render time: the dropdown may
            # have been changed since the participant listened to the audio.
            'question_idx': state.selected_question_idx if question_idx is None else question_idx,
            'question': state.selected_question if question is None else question,
            'voice_profile': vn,
            'timestamp': datetime.now().isoformat(),
            'ratings': {key: fw[key].value for key in q.keys()},
            'comments': fw['comments'].value
        }
        state.responses.append(feedback_data)

        # Save to JSON (the whole list is rewritten on every save)
        results_file = config.results_dir / f"{state.participant_id}_results.json"
        with open(results_file, 'w') as f:
            json.dump(state.responses, f, indent=2)

        # Lock the button so the same form is not submitted twice
        btn.description = '‚úì Saved!'
        btn.button_style = 'info'
        btn.disabled = True

    return save_feedback

with playback_container:
    if len(state.converted_audio_paths) == 0:
        print("‚ö† Please complete voice conversion first!")
    else:
        # Freeze the question this page is rendered for, so the response audio
        # lookup and the saved feedback always refer to the same question.
        shown_question_idx = state.selected_question_idx
        shown_question = state.selected_question

        print("Listen to each response and provide your feedback:\n")
        
        for voice_name, converted_path in state.converted_audio_paths.items():
            # Get response audio path
            response_path = config.responses_dir / f"q{shown_question_idx}_{voice_name}_response.wav"
            
            # Voice profile section
            display(HTML(f"<hr><h3>Voice Profile: {voice_name.replace('_', ' ')}</h3>"))
            
            # Play converted question
            display(HTML("<p><strong>1. Your question in this voice:</strong></p>"))
            if Path(converted_path).exists():
                display(Audio(converted_path))
            else:
                display(HTML("<p style='color:orange;'>‚ö† Converted audio not found</p>"))
            
            # Play AI response
            display(HTML("<p><strong>2. AI response in this voice:</strong></p>"))
            if response_path.exists():
                display(Audio(str(response_path)))
            else:
                display(HTML(f"<p style='color:orange;'>‚ö† Response audio not found: {response_path.name}</p>"))
                display(HTML(f"<p style='font-size:0.9em;'>AI responses will be added later</p>"))
            
            # Feedback form
            display(HTML("<h4>Feedback:</h4>"))
            feedback_widgets, questions = create_feedback_form(voice_name)
            
            # Display questions with sliders
            for key, question in questions.items():
                display(HTML(f"<p><em>{question}</em> (1=Not at all, 5=Very much)</p>"))
                display(feedback_widgets[key])
            
            # Comments
            display(HTML("<p><em>Additional comments:</em></p>"))
            display(feedback_widgets['comments'])
            
            # Save button for this voice profile
            save_btn = widgets.Button(
                description=f'Save Feedback for {voice_name}',
                button_style='success',
                icon='check'
            )
            
            save_btn.on_click(make_save_handler(
                voice_name,
                feedback_widgets,
                questions,
                question_idx=shown_question_idx,
                question=shown_question,
            ))
            display(save_btn)

display(playback_container)

Output(outputs=({'name': 'stdout', 'text': 'Listen to each response and provide your feedback:\n\n', 'output_t‚Ä¶

### Step 5: Final Questionnaire

Overall experience with voice-adaptive AI:

In [12]:
# Final questionnaire
final_questionnaire = widgets.Output()

with final_questionnaire:
    display(HTML("<h4>Overall Experience</h4>"))
    
    # Overall questions
    overall_questions = {
        'overall_preference': 'Which AI response did you prefer? The one you received when using your *original* voice, or the one you received when using a *converted* voice?',
        'bias_importance': 'How important is it that an AI provides the *same* advice and information, regardless of a user\'s voice characteristics (like pitch, tone, or accent)?',
        'future_use': 'Based on this experience, how likely would you be to use an AI that *changes its answers* based on how your voice sounds?',
        'concerns': 'What concerns, if any, do you have about an AI that gives different responses based on a user\'s voice?'
    }
    
    # Preference dropdown. The question offers "original voice" as an answer,
    # so that choice is listed alongside the converted voice profiles.
    # NOTE(review): confirm with the study design that participants do hear a
    # response to their original voice; otherwise the question text needs revising.
    display(HTML(f"<p><strong>{overall_questions['overall_preference']}</strong></p>"))
    preference_dropdown = widgets.Dropdown(
        options=['Prefer not to say', 'Original voice'] + list(config.target_voices.keys()),
        description='Preference:',
        style={'description_width': 'initial'}
    )
    display(preference_dropdown)
    
    # Importance slider
    display(HTML(f"<p><strong>{overall_questions['bias_importance']}</strong> (1=Not important, 5=Very important)</p>"))
    importance_slider = widgets.IntSlider(value=3, min=1, max=5, description='')
    display(importance_slider)
    
    # Likelihood slider
    display(HTML(f"<p><strong>{overall_questions['future_use']}</strong> (1=Very unlikely, 5=Very likely)</p>"))
    likelihood_slider = widgets.IntSlider(value=3, min=1, max=5, description='')
    display(likelihood_slider)
    
    # Concerns
    display(HTML(f"<p><strong>{overall_questions['concerns']}</strong></p>"))
    concerns_text = widgets.Textarea(
        placeholder='Please share any concerns or thoughts...',
        layout=widgets.Layout(width='600px', height='120px')
    )
    display(concerns_text)
    
    # Demographics (optional)
    display(HTML("<h4>Demographics (Optional)</h4>"))
    
    age_dropdown = widgets.Dropdown(
        options=['Prefer not to say', '18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
        description='Age range:',
        value='Prefer not to say'
    )
    display(age_dropdown)
    
    gender_dropdown = widgets.Dropdown(
        options=['Prefer not to say', 'Female', 'Male', 'Non-binary', 'Other'],
        description='Gender:',
        value='Prefer not to say'
    )
    display(gender_dropdown)
    
    background_dropdown = widgets.Dropdown(
        options=['Prefer not to say', 'Computer Science/Engineering', 'Other STEM', 
                 'Social Sciences', 'Humanities', 'Business', 'Healthcare', 'Other'],
        description='Background:',
        value='Prefer not to say',
        style={'description_width': 'initial'}
    )
    display(background_dropdown)
    
    # Final comments
    display(HTML("<h4>Additional Comments</h4>"))
    final_comments = widgets.Textarea(
        placeholder='Any other thoughts about this study or voice-adaptive AI?',
        layout=widgets.Layout(width='600px', height='120px')
    )
    display(final_comments)
    
    # Submit button
    submit_button = widgets.Button(
        description='Submit Final Questionnaire',
        button_style='primary',
        icon='check',
        layout=widgets.Layout(width='300px', height='50px')
    )
    
    submit_output = widgets.Output()
    
    def on_submit(btn):
        """Collect the questionnaire answers and write them to a JSON file.

        Args:
            btn: The submit button that was clicked. It is disabled only after
                the answers have been written successfully.

        The file is ``config.results_dir / "<participant_id>_final.json"``. If
        writing fails, the error is shown in ``submit_output`` and the button
        stays enabled so the participant can retry.
        """
        final_data = {
            'participant_id': state.participant_id,
            'timestamp': datetime.now().isoformat(),
            'overall_preference': preference_dropdown.value,
            'bias_importance': importance_slider.value,
            'future_use_likelihood': likelihood_slider.value,
            'concerns': concerns_text.value,
            'demographics': {
                'age_range': age_dropdown.value,
                'gender': gender_dropdown.value,
                'background': background_dropdown.value
            },
            'final_comments': final_comments.value
        }
        
        # Save final data
        final_file = config.results_dir / f"{state.participant_id}_final.json"
        try:
            with open(final_file, 'w') as f:
                json.dump(final_data, f, indent=2)
        except OSError as e:
            # Do not claim success (or lock the form) when nothing was saved
            with submit_output:
                clear_output()
                print(f"Could not save your responses: {e}. Please notify the researcher and try again.")
            return
        
        with submit_output:
            clear_output()
            display(HTML("""
                <div style='padding: 20px; background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px;'>
                    <h3 style='color: #155724;'>‚úì Thank you for completing the study!</h3>
                    <p>Your responses have been saved.</p>
                    <p><strong>Participant ID:</strong> {}</p>
                </div>
            """.format(state.participant_id)))
        
        btn.disabled = True
    
    submit_button.on_click(on_submit)
    display(submit_button)
    display(submit_output)

display(final_questionnaire)

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<IPython.core.display.HTML object>', '‚Ä¶