In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


## Audio transcription
@forusone (shins777@gmail.com)

### Transcribing an audio file
This Colab notebook shows how to transcribe an audio file with the Google Cloud Speech-to-Text (STT) API.

### Install Vertex AI SDK for Python

In [2]:
# Install (or upgrade) the packages this notebook relies on: the Vertex AI SDK
# with its langchain and reasoningengine extras, the Cloud Speech and Cloud
# Storage client libraries, and the ffmpeg-python package.
# --quiet keeps pip's output to a minimum.
!pip install --upgrade --quiet \
    "google-cloud-aiplatform[langchain,reasoningengine]" \
    google-cloud-speech \
    google-cloud-storage \
    ffmpeg-python

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/305.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m297.0/305.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m305.0/305.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.5/130.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.3/94.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[?25h

### GCP Authentication

In [3]:
import sys

# Only run the interactive Colab login when the google.colab module is already
# loaded, i.e. when this notebook is executing inside Colab.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id="ai-hangsik")

# Set the gcloud CLI's core/project property to the same project.
!gcloud config set project ai-hangsik


Updated property [core/project].


### Initial set up

In [4]:
import base64

import vertexai
from vertexai.generative_models import GenerativeModel

# Settings passed to vertexai.init() and GenerativeModel() below.
PROJECT_ID = "ai-hangsik"
LOCATION = "us-central1"
STAGING_BUCKET = "gs://reasoning_7424"
MODEL = "gemini-1.5-pro-002"

# Initialise the SDK with the project settings, then create the model handle.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=STAGING_BUCKET,
)
model = GenerativeModel(MODEL)

### Export the audio track from the video file

In [None]:
# Run the ffmpeg CLI with ./small_talk.mp4 as input and ./small_talk.mp3 as output.
!ffmpeg -i './small_talk.mp4' './small_talk.mp3'

### Helper functions



In [5]:

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and report it on stdout.

    Args:
        bucket_name: Name of the bucket that receives the file.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Name of the blob to create in the bucket.
    """
    from google.cloud import storage

    # Resolve client -> bucket -> blob in one chain; only the blob is needed.
    target_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(destination_blob_name)
    )
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

def transcribe_audio(
    audio_uri: str,
    language_code: str = "en-US",
    sample_rate_hertz: int = 16000,
    model: str = "video",
    timeout: float = 90,
) -> None:
    """Transcribe an MP3 audio file and print each result's first alternative.

    Submits a long-running recognition request for the audio at ``audio_uri``,
    waits for it to finish, and prints the top transcript of every result.

    Args:
        audio_uri: URI of the audio to transcribe; passed to
            ``speech.RecognitionAudio(uri=...)``.
        language_code: Language code sent in the recognition config.
        sample_rate_hertz: Sample rate sent in the recognition config.
        model: Name of the recognition model to request.
        timeout: Maximum time to wait for the operation result, forwarded to
            ``operation.result(timeout=...)``. Raise it for longer recordings.

    Returns:
        None. Transcripts are written to stdout.
    """
    from google.cloud import speech

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=audio_uri)

    # The encoding stays fixed to MP3; the remaining settings are overridable
    # by the caller and default to the values this notebook used originally.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        model=model,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=timeout)

    for i, result in enumerate(response.results):
        # A result without alternatives has nothing to print; skip it instead
        # of failing on alternatives[0]. The index i still matches the
        # position of the result in the response.
        if not result.alternatives:
            continue
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")

### Transcribe the audio file

In [7]:

# URI of the MP3 file handed to transcribe_audio().
audio_uri = "gs://tests_nov25_2024/translation/small_talk.mp3"

transcribe_audio(audio_uri)

Waiting for operation to complete...
--------------------
First alternative of result 0
Transcript: excuse me I'm trying to relax
--------------------
First alternative of result 1
Transcript:  would you mind
--------------------
First alternative of result 2
Transcript: 
--------------------
First alternative of result 3
Transcript:  Hey kid kid kid
--------------------
First alternative of result 4
Transcript:  just come over here and sit down would you
--------------------
First alternative of result 5
Transcript:  what's your name Mister my name is Adam
--------------------
First alternative of result 6
Transcript:  you look like my grandpa said he's not as old that's very rude
--------------------
First alternative of result 7
Transcript:  Adam who are you
--------------------
First alternative of result 8
Transcript:  Joseph
--------------------
First alternative of result 9
Transcript:  where's your mother she's with her boyfriend
--------------------
First alternative of result