In [1]:
# Copyright 2024 Forusone(shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Video transcription

* [Google Gen AI SDKs](https://cloud.google.com/vertex-ai/generative-ai/docs/sdks/overview)
  * The new Google Gen AI SDK provides a unified interface to Gemini 2.0 through both the Gemini Developer API and the Gemini API on Vertex AI. With a few exceptions, code that runs on one platform will run on both.
  * This means that you can prototype an application using the Developer API and then migrate the application to Vertex AI without rewriting your code.

* [google-genai 0.3.0](https://pypi.org/project/google-genai/)
* [Google Gen AI SDK](https://googleapis.github.io/python-genai/)
* [python-genai : Github](https://github.com/googleapis/python-genai)

## Set configuration

### Package Install
* [google-cloud-aiplatform](https://cloud.google.com/python/docs/reference/aiplatform/latest)

In [2]:
# Install (or upgrade) the Google Gen AI SDK into the kernel's environment.
# NOTE(review): the version is not pinned, so a fresh run installs whatever
# google-genai release is current — consider pinning for reproducibility.
# NOTE(review): with --user a kernel restart may be needed before the new
# package is importable — confirm in your runtime.
%pip install --upgrade --user --quiet google-genai

### Define constants

In [4]:
# Notebook-wide settings. The trailing `# @param` annotations are kept as-is
# because Colab reads them to render form fields.
# PROJECT_ID and LOCATION are passed to genai.Client (and PROJECT_ID to the
# Colab authentication call); MODEL_NAME is the model used by transcribe().
PROJECT_ID = "ai-hangsik"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
MODEL_NAME = "gemini-2.0-flash-exp" # @param {type:"string"}

### Authentication to access GCP
* Only needed when running in Colab from Google Drive.
* This step is not required in Colab Enterprise on Vertex AI.
* Refer to the [authentication methods](https://cloud.google.com/docs/authentication?hl=ko) available in GCP.

In [11]:
# Helpers for rendering LLM output as Markdown in the notebook.
from IPython.display import display, Markdown

# Authenticate with OAuth, but only when the notebook is running inside
# Google Colab (detected by the `google.colab` module being loaded).
# PROJECT_ID must already be defined by the constants cell.
import sys
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)


## Lab Execution

### Import libraries

In [5]:
import base64
import json

from google import genai
from google.genai.types import (
    Content,
    FunctionDeclaration,
    GenerateContentConfig,
    GenerateContentResponse,
    GoogleSearch,
    Part,
    Retrieval,
    SafetySetting,
    Tool,
    VertexAISearch,
)


### Initialize client

* [genai.Client](https://googleapis.github.io/python-genai/genai.html#genai.client.Client)

In [6]:
# Shared Gen AI client, configured to reach Gemini through Vertex AI using
# the project and region chosen in the constants cell.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

### Helper function

In [7]:
def transcribe(query: str,
               video_uri: str,
               response_schema: dict
               ) -> GenerateContentResponse:
  """Send a prompt plus a video reference to the model and return its reply.

  The request is made with the notebook-level `client` against `MODEL_NAME`.
  The model is asked for text output with MIME type `application/json`,
  constrained by `response_schema`.

  Args:
    query: Instruction text sent as the first part of the user message.
    video_uri: URI of the video, attached with MIME type "video/*".
    response_schema: Schema describing the JSON structure the model must emit.

  Returns:
    The GenerateContentResponse returned by `client.models.generate_content`.
  """

  contents = [
    Content(
      role="user",
      parts=[
        # https://googleapis.github.io/python-genai/genai.html#genai.types.Part
        # Arguments are passed by keyword: newer google-genai releases make the
        # Part.from_* factory arguments keyword-only, so a positional call fails.
        Part.from_text(text=query),
        Part.from_uri(file_uri=video_uri, mime_type="video/*"),
      ]
    )
  ]

  # https://googleapis.github.io/python-genai/genai.html#genai.types.SafetySettingDict
  # Every listed harm category has its filter threshold set to "OFF".
  safety_settings = [
    SafetySetting(category=category, threshold="OFF")
    for category in (
      "HARM_CATEGORY_HATE_SPEECH",
      "HARM_CATEGORY_DANGEROUS_CONTENT",
      "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "HARM_CATEGORY_HARASSMENT",
    )
  ]

  # https://googleapis.github.io/python-genai/genai.html#genai.types.GenerateContentConfig
  generate_content_config = GenerateContentConfig(
    temperature = 1,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_modalities = ["TEXT"],
    safety_settings = safety_settings,
    response_schema = response_schema,
    response_mime_type="application/json",
  )

  # https://googleapis.github.io/python-genai/genai.html#genai.models.Models.generate_content
  response = client.models.generate_content(
    model = MODEL_NAME,
    contents = contents,
    config = generate_content_config,
  )

  return response

In [8]:
video_uri = "https://www.youtube.com/watch?v=OoUVSHDbAeM" # Stephen Hawking Discovers The Black Hole Theory

# Korean prompt. In English it says: you are an AI assistant that must analyze
# the video and write a transcript following these guidelines:
#   1. Output every line of dialogue, without omission, in the given format.
#   2. Split the output by the distinct locations that separate the scenes.
#   3. Separate speakers accurately by voice and write the output in English.
#   4. Describe sound effects and emotional expressions in detail, in parentheses.
prompt = """
          당신을 비디오를 분석해서 transcript를 작성해야 하는 AI Assistant 입니다.
          아래 가이드라인에 맞게 transcription을 작성해주세요.

          1. 첨부된 비디오를 분석하여 아래와 같은 포맷으로 모든 대화 내용을 빠짐없이 출력해주세요.
          2. 결과 출력 단위는 비디오 내의 장면이 구분되는 특정 장소를 기준으로 나누어서 출력해주세요.
          3. 목소리를 기반으로 화자(speaker)를 정확하게 분리해서 영어로 출력해주세요.
          4. 목소리외에 다양한 효과음, 감정표현은 괄호를 사용해서 반드시 자세히 표현해주세요.
"""

# Expected answer: an array of scene objects, each holding a list of
# speaker/transcript pairs. Every field that the later DataFrame cell reads
# unconditionally is listed under "required" so the model cannot omit it.
response_schema = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "location": { "type": "STRING",},
            "start_time": { "type": "STRING",},
            "end_time": { "type": "STRING",},
            "elapsed_time": { "type": "STRING",},
            "transcription": {
                "type": "ARRAY",
                "items" : {
                  "type": "OBJECT",
                  "properties": {
                    "speaker": { "type": "STRING",},
                    "transcript": { "type": "STRING",},
                  },
                  "required": ["speaker","transcript"],
                }
            },
        },
        "required": ["start_time","end_time","location","transcription"],
    },
}

responses = transcribe(prompt, video_uri, response_schema)


In [9]:
# Show the raw text of the model's reply (requested as JSON by transcribe()).
print(responses.text)


[
  {
    "elapsed_time": "00:00-00:26",
    "end_time": "00:26",
    "start_time": "00:00",
    "location": "Train Station",
    "transcription": [
      {
        "speaker": "Unknown speaker",
        "transcript": "(train steam sound)"
      },
       {
        "speaker": "Unknown speaker",
        "transcript": "Come on, Stephen. Gonna move on."
      },
       {
        "speaker": "Unknown speaker",
        "transcript": "What's wrong with you, man? (chuckle sound)"
      },
       {
        "speaker": "Unknown speaker",
        "transcript":"(train sound)"
      }
   ]
  },
    {
    "elapsed_time":"00:27-01:27",
    "end_time":"01:27",
    "start_time":"00:27",
    "location": "Classroom",
      "transcription":[
        {
            "speaker":"Lecturer",
            "transcript":"A star, more than three times the size of our sun. Or to end its life, how? With a collapse. The gravitational forces of the entire mass overcoming the electromagnetic forces of individual atoms, and 

In [10]:
from IPython.display import display, Markdown
import pandas as pd

# Flatten the model's JSON reply into one row per utterance: each entry of a
# scene's "transcription" list is combined with that scene's location and times.
transcription_list = json.loads(responses.text)

# "transcription", "speaker" and "transcript" are not marked as required in the
# response schema, so they are read with .get() instead of raising KeyError;
# a scene without a transcription simply contributes no rows.
rows = [
  {
    'location': segment['location'],
    'start_time': segment['start_time'],
    'end_time': segment['end_time'],
    'speaker': utterance.get('speaker'),
    'transcript': utterance.get('transcript'),
  }
  for segment in transcription_list
  for utterance in (segment.get('transcription') or [])
]

# Explicit columns keep the frame's layout stable even when there are no rows.
df = pd.DataFrame(
  rows,
  columns=['location', 'start_time', 'end_time', 'speaker', 'transcript'],
)
df


Unnamed: 0,location,start_time,end_time,speaker,transcript
0,Train Station,00:00,00:26,Unknown speaker,(train steam sound)
1,Train Station,00:00,00:26,Unknown speaker,"Come on, Stephen. Gonna move on."
2,Train Station,00:00,00:26,Unknown speaker,"What's wrong with you, man? (chuckle sound)"
3,Train Station,00:00,00:26,Unknown speaker,(train sound)
4,Classroom,00:27,01:27,Lecturer,"A star, more than three times the size of our ..."
5,Cup of coffee,01:27,01:32,Unknown speaker,(Pouring cream sound)
6,Cup of coffee,01:27,01:32,Unknown speaker,(Coffee sound)
7,Train,01:32,01:55,Unknown speaker,(Train sounds)
8,Garden,01:55,02:44,Stephen,"So, what would happen if you applied Penrose's..."
9,Garden,01:55,02:44,Stephen,"If Einstein is right, if general relativity is..."


In [13]:
# Save the flattened transcript as a UTF-8, comma-separated file in the current
# working directory. `index=False` is not passed, so the DataFrame index is
# written as the first (unnamed) column.
df.to_csv("transcript.csv", sep=',', encoding='utf-8')
