In [1]:
# Step 1: Install the Google Gemini AI SDK
!pip install -q google-generativeai

In [2]:
# API key: never hard-code the key in the notebook.
# Keep a key that is already present in the environment; otherwise prompt for
# it without echoing, so it is not stored in the notebook source or output.
import getpass
import os

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [3]:
from google import genai
from google.genai import types

# Create the API client shared by every request cell in this notebook.
# No API key is passed explicitly; presumably the client picks up the
# GOOGLE_API_KEY environment variable set earlier — confirm against the SDK docs.
client = genai.Client()

# Speech Generation

In [4]:
"""
1. text to audio
2. config: GenerateContentConfig
3. response modality: audio
4. GenerateContentConfig: SpeechConfig
5. SpeechConfig: VoiceConfig (for generating output in single voice)
6. VoiceConfig: PrebuiltVoiceConfig
7. PrebuiltVoiceConfig: voiceName
"""

# Single-voice text-to-speech request.
# The config asks for an AUDIO response and nests the voice selection as
# SpeechConfig -> VoiceConfig -> PrebuiltVoiceConfig(voice_name=...).
response_1 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="say politely: Hey how're you doing ?",
    config=types.GenerateContentConfig(
        response_modalities=['AUDIO'],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore"),
            ),
        ),
    ),
)

In [5]:
# Display the raw response object (last expression of the cell) to inspect
# its structure before extracting the audio payload.
response_1

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='ebf_aJPyAp31jMcP2tDo8Qg',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=54,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=54
      ),
    ],
    prompt_token_count=11,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=11
      ),
    ],
    total_token_count=65
  )
)

In [6]:
# Extract the audio payload: the `data` field of the inline blob carried by
# the first part of the first candidate. It is later handed to wave_file,
# which writes it out as WAV frames.
data_1 = response_1.candidates[0].content.parts[0].inline_data.data

In [7]:
# Helper for converting the raw audio bytes into a (.wav) file.
import wave


def wave_file(filename, input, channels=1, rate=24000, sample_width=2):
    """Write raw audio frames to ``filename`` as a WAV file.

    The defaults reproduce the values that were previously hard-coded
    (mono, 2 bytes per sample, 24000 Hz), so existing two-argument calls
    behave exactly as before.

    Args:
        filename: Path of the .wav file to create; opened in 'wb' mode.
        input: Bytes of audio frames to write. (The name shadows the
            builtin ``input`` but is kept so existing callers keep working.)
        channels: Number of channels: 1 (mono) or 2 (stereo).
        rate: Frame rate in hertz.
        sample_width: Bytes per sample: 1 = 8 bits, 2 = 16 bits.
    """
    with wave.open(filename, 'wb') as w:
        w.setnchannels(channels)
        w.setsampwidth(sample_width)
        w.setframerate(rate)
        w.writeframes(input)

In [8]:
# Write the first audio payload to disk with the wave_file helper.
filename_1='Speech_Out_1.wav'
wave_file(filename_1, data_1)

In [9]:
# Play the generated audio inline.
# Reuse filename_1 so playback always points at the file wave_file just wrote,
# instead of a hard-coded absolute /content/... path that only exists on Colab.
from IPython.display import Audio
Audio(filename=filename_1)

In [10]:
# Multi-tone paragraph: each entry carries its own "Say <tone>:" style
# directive. All entries are sent in a single request and spoken by one
# prebuilt voice.
content = [
    "Say cheerfully: Good Morning sneha",
    "Say excitedly: Tomorrow we'll go for a vacation",
    "Say angrily: How dare you to touch my laptop",
    "Say encouragingly: You can do it, i know you're very smart",
    "Say sadly: I wish, we could meet"
]

response_2 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents=content,
    config=types.GenerateContentConfig(
        response_modalities=['AUDIO'],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    # Same spelling of the voice name as the other cells.
                    voice_name="Kore"
                )
            )
        )
    )
)

In [11]:
# Display the raw response object (last expression of the cell) to inspect
# its structure before extracting the audio payload.
response_2

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='gbf_aKSrGraYjMcPlOKwoAw',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=477,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=477
      ),
    ],
    prompt_token_count=57,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=57
      ),
    ],
    total_token_count=534
  )
)

In [12]:
# Extract the audio payload: the `data` field of the inline blob carried by
# the first part of the first candidate.
data_2 = response_2.candidates[0].content.parts[0].inline_data.data

In [13]:
# Write the second audio payload to disk with the wave_file helper.
filename_2='Speech_Out_2.wav'
wave_file(filename_2, data_2)

In [14]:
# Play the generated audio inline.
# Pass the path through the explicit `filename=` keyword and reuse filename_2,
# so playback points at the file wave_file just wrote rather than a
# hard-coded absolute /content/... path that only exists on Colab.
from IPython.display import Audio
Audio(filename=filename_2)

In [16]:
# Multiple Voice (like two people talk with each other)

"""
1. text to audio
2. config: GenerateContentConfig
3. response modality: audio
4. GenerateContentConfig: SpeechConfig
5. SpeechConfig: MultiSpeakerVoiceConfig (for generating output in multiple voice)
6. MultiSpeakerVoiceConfig: SpeakerVoiceConfigs (speaker)
7. SpeakerVoiceConfigs: VoiceConfig
8. VoiceConfig: PrebuiltVoiceConfig
9. PrebuiltVoiceConfig: voiceName
"""

# Two-speaker dialogue; each line is tagged with the name of its speaker.
prompt =  """
<speaker name="Robert">Hey Elizabeth, can you believe we’re finally in college?</speaker>
<speaker name="Elizabeth">I know, right? It feels so strange but exciting at the same time.</speaker>
<speaker name="Robert">Yeah, I was so nervous this morning. I even forgot my ID card.</speaker>
<speaker name="Elizabeth">(laughs) Classic first-day moment. Don’t worry, I almost got lost finding my class!</speaker>
"""

# Multi-speaker request: one SpeakerVoiceConfig per speaker name used in the
# prompt, each mapped to a prebuilt voice (Robert -> Puck, Elizabeth -> Kore).
response_3 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents=prompt,
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                speaker_voice_configs=[
                    types.SpeakerVoiceConfig(
                        speaker=speaker_name,
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name=prebuilt_voice
                            )
                        ),
                    )
                    for speaker_name, prebuilt_voice in (
                        ("Robert", "Puck"),
                        ("Elizabeth", "Kore"),
                    )
                ]
            )
        ),
    ),
)


In [17]:
# Display the raw response object (last expression of the cell) to inspect
# its structure before extracting the audio payload.
response_3

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='k7f_aP20Nc3cjMcP_5bm0Qs',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=486,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=486
      ),
    ],
    prompt_token_count=104,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=104
      ),
    ],
    total_token_count=590
  )
)

In [18]:
# Extract the audio payload: the `data` field of the inline blob carried by
# the first part of the first candidate.
data_3 = response_3.candidates[0].content.parts[0].inline_data.data

In [19]:
# Write the third audio payload to disk with the wave_file helper.
filename_3='Speech_Out_3.wav'
wave_file(filename_3, data_3)

In [20]:
# Play the generated audio inline.
# Pass the path through the explicit `filename=` keyword and reuse filename_3,
# so playback points at the file wave_file just wrote rather than a
# hard-coded absolute /content/... path that only exists on Colab.
from IPython.display import Audio
Audio(filename=filename_3)