In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


## Audio transcription V2
@forusone (shins777@gmail.com)

### Transcribing audio file
This Colab notebook shows how to transcribe an audio file with the Cloud Speech-to-Text (STT) V2 API.

### Install Vertex AI SDK for Python

In [None]:
!pip install --upgrade --quiet google-cloud-aiplatform[langchain,reasoningengine] \
    google-cloud-speech \
    google-cloud-storage \
    ffmpeg-python

### GCP Authentication

In [None]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id="ai-hangsik")

!gcloud config set project ai-hangsik


Updated property [core/project].


### Initial set up

In [None]:
import base64

import vertexai
from vertexai.generative_models import GenerativeModel

# Project, region and staging bucket used by this notebook.
PROJECT_ID = "ai-hangsik"
LOCATION = "us-central1"
STAGING_BUCKET = "gs://stt_0116"

# Initialise the Vertex AI SDK with the settings above.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=STAGING_BUCKET,
)


### Helper functions



In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and print a confirmation.

    Args:
        bucket_name: Name of the bucket that receives the file.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.
    """
    from google.cloud import storage

    # Resolve client -> bucket -> blob in one chain, then push the file.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    confirmation = f"File {source_file_name} uploaded to {destination_blob_name}."
    print(confirmation)

In [None]:
# @title Cloud Speech to Text V2 for long audio file

import os

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech


def transcribe_batch_gcs_input_inline_output_v2(
    audio_uri: str,
    language_codes: tuple = ("en-US",),
    model: str = "long",
    timeout: float = 120,
) -> cloud_speech.BatchRecognizeFileResult:
    """Transcribe one audio file stored in Cloud Storage with Speech-to-Text V2.

    A batch recognition request is submitted for ``audio_uri`` with inline
    output, the call blocks until the long-running operation finishes, and the
    first alternative of every result is printed.

    Args:
        audio_uri (str): The Google Cloud Storage URI of the input audio file.
            E.g., gs://[BUCKET]/[FILE]
        language_codes: Language codes passed to the recognition config.
            Defaults to ``("en-US",)``.
        model (str): Recognition model name. Defaults to ``"long"``.
        timeout (float): Seconds to wait for the batch operation to complete.
            Defaults to 120; raise it for long recordings.

    Returns:
        cloud_speech.BatchRecognizeFileResult: The per-file result for
        ``audio_uri``; the transcript is available under
        ``inline_result.transcript``.
    """
    # Instantiates a client
    client = SpeechClient()

    config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=list(language_codes),
        model=model,
    )

    file_metadata = cloud_speech.BatchRecognizeFileMetadata(uri=audio_uri)

    request = cloud_speech.BatchRecognizeRequest(
        # PROJECT_ID is the notebook-level constant from the setup cell.
        recognizer=f"projects/{PROJECT_ID}/locations/global/recognizers/_",
        config=config,
        files=[file_metadata],
        recognition_output_config=cloud_speech.RecognitionOutputConfig(
            inline_response_config=cloud_speech.InlineOutputConfig(),
        ),
    )

    # Transcribes the audio into text
    operation = client.batch_recognize(request=request)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=timeout)

    # The response maps each input URI to its own per-file result.
    file_result = response.results[audio_uri]

    # Read the transcript through inline_result, matching the timestamp cell,
    # instead of the deprecated top-level `transcript` field.
    for result in file_result.inline_result.transcript.results:
        if result.alternatives:
            print(f"Transcript: {result.alternatives[0]}")
        else:
            # A result can arrive with no alternatives; report it and move on.
            print("No transcript alternatives found for this segment.")

    return file_result


### Extract the audio track from the video file

In [None]:
!ffmpeg -i './Stephen Hawking Discovers.mp4' './Stephen Hawking Discovers.mp3'

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

## Upload the audio (MP3) file to GCS

In [None]:
# Target bucket, the local MP3 to upload, and the object name it gets in the bucket.
bucket_name = "stt_0116"
source_file_name = "Stephen Hawking Discovers.mp3"
destination_blob_name = f"translation/{source_file_name}"

upload_blob(
    bucket_name=bucket_name,
    source_file_name=source_file_name,
    destination_blob_name=destination_blob_name,
)

File Stephen Hawking Discovers.mp3 uploaded to translation/Stephen Hawking Discovers.mp3.


## STT v2 test

In [None]:
# @title Transcription with confidence
# Build the gs:// URI of the object uploaded above.
audio_uri = f"gs://{bucket_name}/{destination_blob_name}"

# Keep the returned per-file result for the timestamp cell that follows.
outcome = transcribe_batch_gcs_input_inline_output_v2(audio_uri=audio_uri)

Waiting for operation to complete...
Transcript: transcript: "this island King cropp"
confidence: 0.3163113

Transcript: transcript: " what is it"
confidence: 0.142858282

Transcript: transcript: " stop"
confidence: 0.496778071

Transcript: transcript: " more than three times the size of our sun or to end its life how"
confidence: 0.860717356

Transcript: transcript: " the collapse"
confidence: 0.744493306

Transcript: transcript: " the gravitational forces of the entire Mass overcoming the electromagnetic forces of individual atoms and so collapsing inwards"
confidence: 0.923254251

Transcript: transcript: " if the star is massive enough it will continue this collapse creating a black hole"
confidence: 0.952531815

Transcript: transcript: " where the warping of space-time is so great that nothing can escape not even light it gets"
confidence: 0.906234264

Transcript: transcript: " smaller"
confidence: 0.877837

Transcript: transcript: " smaller"
confidence: 0.72185415

Transcript: tra

In [None]:
# @title Transcription with timestamp.
# Only the end offset of each result is read here, so a segment's start is
# taken to be the end offset of the result before it.
start = 0

for result in outcome.inline_result.transcript.results:
    end = int(result.result_end_offset.seconds)
    if result.alternatives:
        print(f"Time [{start}-{end}] : {result.alternatives[0].transcript}")
    # Advance even when a result has no alternatives; otherwise that span
    # would be folded into the next printed segment's range.
    start = end


Time [0-2] : this island King cropp
Time [2-11] :  what is it
Time [11-30] :  stop
Time [30-34] :  more than three times the size of our sun or to end its life how
Time [34-36] :  the collapse
Time [36-45] :  the gravitational forces of the entire Mass overcoming the electromagnetic forces of individual atoms and so collapsing inwards
Time [45-50] :  if the star is massive enough it will continue this collapse creating a black hole
Time [50-57] :  where the warping of space-time is so great that nothing can escape not even light it gets
Time [57-59] :  smaller
Time [59-61] :  smaller
Time [61-70] :  the star in fact gets denser as atoms even subatomic particles get literally crushed into smaller and smaller space
Time [70-73] : and at its end point
Time [73-75] :  what are we left with
Time [75-79] :  space-time singularity
Time [79-84] :  space and time come to a stop
Time [84-115] :  what would happen if you applied Penrose theory about black holes to the entire universe
Time [115-12