In [None]:
%pip install --upgrade eyed3
%pip install --upgrade openai
%pip install --upgrade python-dotenv
%pip install --upgrade pydub
%pip install --upgrade soundfile
%pip install --upgrade openai-whisper
%pip install --upgrade azure-cognitiveservices-speech
%pip install --upgrade playsound

Collecting playsound
  Downloading playsound-1.3.0.tar.gz (7.7 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: playsound
  Building wheel for playsound (setup.py): started
  Building wheel for playsound (setup.py): finished with status 'done'
  Created wheel for playsound: filename=playsound-1.3.0-py3-none-any.whl size=7044 sha256=1e4ad59c8d2b881546b6a3e95a35f6ba765e5102f1fca2042307e3746a923798
  Stored in directory: c:\users\rmendonca\appdata\local\pip\cache\wheels\50\98\42\62753a9e1fb97579a0ce2f84f7db4c21c09d03bb2091e6cef4
Successfully built playsound
Installing collected packages: playsound
Successfully installed playsound-1.3.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import time
from pydub import AudioSegment
from dotenv import load_dotenv
from openai import AzureOpenAI
from playsound import playsound

load_dotenv()

True

In [None]:

# inicialização do cliente Whisper
openai = AzureOpenAI(api_key=os.getenv("AZURE_OPENAI_KEY_NORTHCENTRALUS"),
                      azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT_NORTHCENTRALUS"), 
                      api_version=os.getenv("WHISPER_VERSION"))

# Verifica se o arquivo de áudio precisa ser dividido
def need_to_split(file_path, size_threshold_mb=25, duration_threshold_minutes=3):
    print(f"Checking if {file_path} needs to be split")
    size_threshold_bytes = size_threshold_mb * 1024 * 1024
    duration_threshold_ms = duration_threshold_minutes * 60 * 1000
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    file_size = os.path.getsize(file_path)
    audio = AudioSegment.from_file(file_path)
    audio_duration = len(audio)
    
    if file_size > size_threshold_bytes or audio_duration > duration_threshold_ms:
        print(f"The file {file_path} needs to be split.")
        return True
    else:
        print(f"The file {file_path} does not need to be split.")
        return False

# conversão do áudio para mp3
def compress_audio(file_path, ext="mp3"):
    print(f"Compressing {file_path} to {ext}")
    file_root, _ = os.path.splitext(file_path)
    file_compressed = f"{file_root}.{ext}"
    # Load the audio file
    audio = AudioSegment.from_file(file_path, codec="adpcm_ima_wav")
    audio.export(file_compressed, format=ext, parameters=["-ac","2","-ar","8000"])
    return file_compressed

# Faz o split do arquivo de áudio em arquivos menores
def split_audio(file_path, chunk_size_kb=None, chunk_duration_minutes=None):
    audio = AudioSegment.from_file(file_path)
    
    if chunk_size_kb:
        file_size_kb = os.path.getsize(file_path) / 1024
        num_chunks = int(file_size_kb // chunk_size_kb) + 1
        chunk_duration_ms = len(audio) / num_chunks
    elif chunk_duration_minutes:
        chunk_duration_ms = chunk_duration_minutes * 60 * 1000
        num_chunks = int(len(audio) // chunk_duration_ms) + 1
    else:
        raise ValueError("Either chunk_size_kb or chunk_duration_minutes must be provided.")
    
    chunks = [audio[i * chunk_duration_ms:(i + 1) * chunk_duration_ms] for i in range(num_chunks)]
    
    base_name, ext = os.path.splitext(file_path)
    chunk_files = []
    for i, chunk in enumerate(chunks):
        chunk_file = f"{base_name}_chunk{i + 1}{ext}"
        chunk.export(chunk_file, format=ext[1:])
        chunk_files.append(chunk_file)
    
    return chunk_files
    

def transcript_audio_to_text_splitting(audio_file_path, model="whisper", region="northcentralus"):
    print(f"Transcribing {audio_file_path} with model {model}")
    start_time = time.perf_counter()
    split_files = split_audio(audio_file_path, chunk_size_kb=None, chunk_duration_minutes=5)
    print(f"Split audio into {len(split_files)} chunks")
    transcriptions = []
    for audio_file in split_files:
        transcriptions.append(transcript_audio_to_text(audio_file, model, region))
    
    elapsed_time = time.perf_counter() - start_time
    return {"0.elapsed_time": elapsed_time,"1.transcriptions": transcriptions}


def transcript_audio_to_text(audio_file_path, model="whisper", region="northcentralus"):
    print(f"Transcribing {audio_file_path} with model {model}")
    with open(audio_file_path, "rb") as audio_file:
        transcription = openai.audio.transcriptions.create(
            model=model,
            file=audio_file
    )
    file_root, _ = os.path.splitext(audio_file_path)
    transcript_file = f"{file_root}.txt"
    append_to_file(transcript_file, transcription.text)
    return transcription

def append_to_file(file_path, content):

    with open(file_path, 'a') as file:
        file.write(content)
        file.write('\n')  # Optionally add a newline after the content

In [8]:
audio_file = "C:/Users/rmendonca/Downloads/teste.mp3"

if need_to_split(audio_file):
    transcript = transcript_audio_to_text_splitting(audio_file)
else:
    transcript = transcript_audio_to_text(audio_file)

print("Transcription:", transcript.text)


Checking if C:/Users/rmendonca/Downloads/teste.mp3 needs to be split
The file C:/Users/rmendonca/Downloads/teste.mp3 does not need to be split.
Transcribing C:/Users/rmendonca/Downloads/teste.mp3 with model whisper
Transcription: Esse áudio eu vou usar para testar a capacidade de transcription do GPT-4O ou talvez outros modelos de transcription dentro do Fabric. Para isso eu vou ler um trecho da Wikipedia. Escolhi aqui o trecho que trata da vida de Ana Nery. Ana Justina Ferreira Nery, mais conhecida como Ana Nery. Nascida em 13 de dezembro de 1814, na cidade de Cachoeira. Veio a falecer no dia 20 de maio de 1880, no Rio de Janeiro. Foi uma enfermeira brasileira, pioneira da enfermagem no Brasil. É conhecida como a mãe dos brasileiros. Apelido compartilhado entre outros. Biografia. Antes da guerra do Paraguai. Filha de José Ferreira de Jesus e Luísa Maria das Virgens. Ana Justina Ferreira. Nasceu em Cachoeira da Bahia. Casou-se com o capitão de fragata Isidoro Antônio Nery em 1837. Quan

In [8]:
#custom model

import azure.cognitiveservices.speech as speechsdk

# Creates an instance of a speech config with specified subscription key and service region.
speech_key = os.getenv("CUSTOM_SPEECH_KEY")
service_region = "eastus"

speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
speech_config.endpoint_id = os.getenv("CUSTOM_SPEECH_ENDPOINT_ID")
speech_config.speech_synthesis_voice_name = os.getenv("SPEECH_SYNTESIS_VOICE_NAME")
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio24Khz160KBitRateMonoMp3)
print("Custom voice name:", speech_config.speech_synthesis_voice_name)
print("Custom endpoint id:", speech_config.endpoint_id)
text = "Oi, esta é a minha voz personalizada."
file_name = "sample.wav"

# using the default speaker as audio output.
file_config = speechsdk.audio.AudioOutputConfig(filename=file_name)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config)

result = speech_synthesizer.speak_text_async(text).get()
# Check result
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text, file_name))
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))

Custom voice name: Rodam_v1
Custom endpoint id: 39b3f6c9-41b6-43a2-bd1d-13445e77c06b
Speech synthesized for text [Oi, esta é a minha voz personalizada.], and the audio was saved to [sample.wav]


In [2]:
playsound("sample.wav")


    Error 263 for command:
        open sample.wav
    The specified device is not open or is not recognized by MCI.

    Error 263 for command:
        close sample.wav
    The specified device is not open or is not recognized by MCI.
Failed to close the file: sample.wav


PlaysoundException: 
    Error 263 for command:
        open sample.wav
    The specified device is not open or is not recognized by MCI.