In [3]:
import os
from google.cloud import speech_v1p1beta1 as speech

def transcribe_audio_with_diarization(audio_file_path, credentials_path, output_folder):
    """Transcribe a local audio file with speaker diarization and save the text.

    The audio bytes are sent inline through the synchronous ``recognize`` call,
    the recognized words are grouped by speaker tag, and the grouped text is
    written to ``{output_folder}/{audio file stem}.txt`` with one
    ``Speaker N: ...`` paragraph per speaker, separated by blank lines.

    Args:
        audio_file_path: Path of the local audio file to transcribe.
        credentials_path: Path of the service-account key file. It is exported
            as ``GOOGLE_APPLICATION_CREDENTIALS`` before the client is created.
        output_folder: Directory that receives the transcript; created when
            it does not exist yet.

    Returns:
        None. The transcript path is reported with ``print``.
    """
    # Export the key file location before the client is constructed.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    client = speech.SpeechClient()

    # Load the whole file into memory; it is sent inline with the request.
    with open(audio_file_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,  # Must match the audio file's encoding
        sample_rate_hertz=44100,  # Replace with your audio's sample rate
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,  # Specify the number of speakers
        model="telephony"  # Use "phone_call" if applicable
    )

    response = client.recognize(config=config, audio=audio)

    # With diarization enabled, the word list of every result repeats the words
    # of the results before it, and only the final result carries the complete
    # speaker-tagged list. Reading every result would duplicate words, so only
    # the last one is used.
    results = list(response.results)
    tagged_words = []
    if results:
        alternatives = list(results[-1].alternatives)
        if alternatives:
            tagged_words = alternatives[0].words

    # Group words by speaker, keeping the order in which speakers first appear.
    speaker_transcripts = {}
    for word_info in tagged_words:
        speaker_transcripts.setdefault(word_info.speaker_tag, []).append(word_info.word)

    output_text_str = "\n\n".join(
        f"Speaker {speaker}: {' '.join(words)}"
        for speaker, words in speaker_transcripts.items()
    )

    os.makedirs(output_folder, exist_ok=True)

    # The transcript reuses the audio file's base name with a .txt extension.
    file_name_without_extension = os.path.splitext(os.path.basename(audio_file_path))[0]
    output_file_path = os.path.join(output_folder, f"{file_name_without_extension}.txt")

    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(output_text_str)

    print(f"Transcription saved to {output_file_path}")


# Inputs for this run: audio file, service-account key, and output folder.
audio_file_path = "dataset/1215.MP3"  # Change to your file path
# Prefer a key path already exported in the environment so the notebook is not
# tied to one machine; otherwise fall back to the original local path.
credentials_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "C:/gcloud/service-account-key.json"
)
output_folder = "output"  # Specify the output folder

# Transcribe the file and write the transcript into the output folder.
transcribe_audio_with_diarization(audio_file_path, credentials_path, output_folder)


Transcription saved to output\1215.txt


In [27]:
import os
from google.cloud import speech_v1p1beta1 as speech

def transcribe_audio_with_diarization(audio_file_uri, credentials_path, output_folder):
    """Transcribe audio stored in Cloud Storage with speaker diarization.

    The recording is referenced by URI and processed with
    ``long_running_recognize``. The recognized words are grouped by speaker
    tag and written to ``{output_folder}/{audio file stem}.txt`` with one
    ``Speaker N: ...`` paragraph per speaker, separated by blank lines.

    Args:
        audio_file_uri: URI of the audio object, e.g. ``gs://bucket/name.MP3``.
        credentials_path: Path of the service-account key file. It is exported
            as ``GOOGLE_APPLICATION_CREDENTIALS`` before the client is created.
        output_folder: Directory that receives the transcript; created when
            it does not exist yet.

    Returns:
        None. Progress and the transcript path are reported with ``print``.
    """
    # Export the key file location before the client is constructed.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    client = speech.SpeechClient()

    # The audio is referenced by URI rather than uploaded inline.
    audio = speech.RecognitionAudio(uri=audio_file_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,  # Adjust for your audio file's encoding
        sample_rate_hertz=44100,  # Replace with your audio's sample rate
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,  # Set the number of speakers you expect
        model="telephony"  # Use "phone_call" or other models as necessary
    )

    # Use long_running_recognize for longer audio files
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for the operation to complete...")
    # No timeout is given, so this blocks until the operation finishes.
    response = operation.result()

    # With diarization enabled, the word list of every result repeats the words
    # of the results before it, and only the final result carries the complete
    # speaker-tagged list. Reading every result would duplicate words, so only
    # the last one is used.
    results = list(response.results)
    tagged_words = []
    if results:
        alternatives = list(results[-1].alternatives)
        if alternatives:
            tagged_words = alternatives[0].words

    # Group words by speaker, keeping the order in which speakers first appear.
    speaker_transcripts = {}
    for word_info in tagged_words:
        speaker_transcripts.setdefault(word_info.speaker_tag, []).append(word_info.word)

    output_text_str = "\n\n".join(
        f"Speaker {speaker}: {' '.join(words)}"
        for speaker, words in speaker_transcripts.items()
    )

    os.makedirs(output_folder, exist_ok=True)

    # The transcript reuses the last path component of the URI, minus extension.
    file_name_without_extension = os.path.splitext(os.path.basename(audio_file_uri))[0]
    output_file_path = os.path.join(output_folder, f"{file_name_without_extension}.txt")

    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(output_text_str)

    print(f"Transcription saved to {output_file_path}")

# Inputs for this run: GCS audio URI, service-account key, and output folder.
audio_file_uri = "gs://nigeria-3/1217.MP3"  # Your GCS URI
# Prefer a key path already exported in the environment so the notebook is not
# tied to one machine; otherwise fall back to the original local path.
credentials_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "C:/gcloud/service-account-key.json"
)
output_folder = "output"  # Specify the output folder

# Transcribe the object and write the transcript into the output folder.
transcribe_audio_with_diarization(audio_file_uri, credentials_path, output_folder)


Waiting for the operation to complete...
Transcription saved to output\1217.txt


In [None]:
import os
from google.cloud import speech_v1p1beta1 as speech
import time

def transcribe_audio_with_diarization_async(audio_file_path, credentials_path, output_folder):
    """Transcribe a local audio file via a long-running operation, with diarization.

    The audio bytes are sent inline to ``long_running_recognize`` and the call
    waits up to ten minutes for the result. The recognized words are grouped
    by speaker tag and written to ``{output_folder}/{audio file stem}.txt``
    with one ``Speaker N: ...`` paragraph per speaker, separated by blank lines.

    Args:
        audio_file_path: Path of the local audio file to transcribe.
        credentials_path: Path of the service-account key file. It is exported
            as ``GOOGLE_APPLICATION_CREDENTIALS`` before the client is created.
        output_folder: Directory that receives the transcript; created when
            it does not exist yet.

    Returns:
        None. Progress and the transcript path are reported with ``print``.
    """
    # Export the key file location before the client is constructed.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    client = speech.SpeechClient()

    # Load the whole file into memory; it is sent inline with the request.
    with open(audio_file_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        # NOTE(review): LINEAR16 describes uncompressed 16-bit PCM. An .m4a file
        # normally holds AAC audio, which does not appear among the API's
        # AudioEncoding values, so the input probably has to be converted
        # (e.g. to WAV or FLAC) first. Confirm against the actual file.
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,  # Sample rate for your audio file
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,  # Specify the number of speakers
        # "telephone" is not a documented model name; "telephony" matches the
        # other cells of this notebook. "phone_call" is an alternative.
        model="telephony"
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Processing audio...")

    # Wait for the operation to complete
    response = operation.result(timeout=600)  # Timeout after 10 minutes (adjust if needed)

    # With diarization enabled, the word list of every result repeats the words
    # of the results before it, and only the final result carries the complete
    # speaker-tagged list. Reading every result would duplicate words, so only
    # the last one is used.
    results = list(response.results)
    tagged_words = []
    if results:
        alternatives = list(results[-1].alternatives)
        if alternatives:
            tagged_words = alternatives[0].words

    # Group words by speaker, keeping the order in which speakers first appear.
    speaker_transcripts = {}
    for word_info in tagged_words:
        speaker_transcripts.setdefault(word_info.speaker_tag, []).append(word_info.word)

    output_text_str = "\n\n".join(
        f"Speaker {speaker}: {' '.join(words)}"
        for speaker, words in speaker_transcripts.items()
    )

    os.makedirs(output_folder, exist_ok=True)

    # The transcript reuses the audio file's base name with a .txt extension.
    file_name_without_extension = os.path.splitext(os.path.basename(audio_file_path))[0]
    output_file_path = os.path.join(output_folder, f"{file_name_without_extension}.txt")

    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(output_text_str)

    print(f"Transcription saved to {output_file_path}")


# Inputs for this run: audio file, service-account key, and output folder.
audio_file_path = "dataset/Nigeria/CBM HEAD.m4a"  # Change to your .m4a file path
# Prefer a key path already exported in the environment so the notebook is not
# tied to one machine; otherwise fall back to the original local path.
credentials_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "C:/gcloud/service-account-key.json"
)
output_folder = "output"  # Specify the output folder

# Transcribe the file and write the transcript into the output folder.
transcribe_audio_with_diarization_async(audio_file_path, credentials_path, output_folder)


In [4]:
from google.cloud import speech

# Path to the local audio file read by transcribe_speech(); it is a relative
# path, so it is resolved against the notebook's working directory.
local_file_path = "dataset/1215.MP3"

def transcribe_speech(audio_path=None, timeout=90):
    """Transcribe a local audio file and print one line per recognized segment.

    The audio bytes are sent inline to ``long_running_recognize``; every
    result's top alternative is printed as ``Transcript: ...``.

    Args:
        audio_path: Path of the audio file to transcribe. When omitted, the
            module-level ``local_file_path`` is used, so existing
            ``transcribe_speech()`` calls keep working.
        timeout: Seconds to wait for the long-running operation to finish.

    Returns:
        None. Output is written with ``print``.
    """
    # Fall back to the notebook-level default only when no path is supplied,
    # so the function no longer depends on hidden global state by necessity.
    if audio_path is None:
        audio_path = local_file_path

    client = speech.SpeechClient()

    # Read the audio file into memory
    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code="en-US",
        model="telephony",
        # NOTE(review): per-channel recognition is not enabled here, so the
        # second channel is presumably not transcribed on its own. Confirm
        # whether that is intended.
        audio_channel_count=2,
    )

    audio = speech.RecognitionAudio(content=content)

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=timeout)

    for result in response.results:
        # A result without alternatives has nothing to print; skip it instead
        # of failing on the [0] lookup.
        if not result.alternatives:
            continue
        print("Transcript: {}".format(result.alternatives[0].transcript))

# Run the transcription; each recognized segment is printed as "Transcript: ...".
transcribe_speech()


Waiting for operation to complete...
Transcript: so uh just chat with could you please um Talk a bit more about CBM Nigeria what it is doing when it has started operating in Nigeria and our activities and our view of activities
Transcript:  okay um my name is
